# Add new data to a pretrained model

## Import data

In [1]:
from sklearn import datasets

# Load the bundled iris dataset and unpack the feature matrix and class labels.
iris = datasets.load_iris()
X, y = iris.data, iris.target

## Split data into three parts

* X_train, y_train - training set: 80% of the initial 40% of the data (48 samples)
* X_test, y_test - test set: 20% of the initial 40% of the data (12 samples)
* X2, y2 - new samples: the remaining 60% of the data (90 samples)

In [41]:
from sklearn.model_selection import train_test_split

# Stage 1: hold back 60% of all samples (X2, y2) to play the role of data
# that arrives after the first training round.
X1, X2, y1, y2 = train_test_split(
    X,
    y,
    test_size=0.60,
    random_state=42,
)

# Stage 2: carve the remaining 40% into a training set (80%) and a test set (20%).
X_train, X_test, y_train, y_test = train_test_split(
    X1,
    y1,
    test_size=0.20,
    random_state=42,
)

In [69]:
# Number of samples in each partition: (train, test, new data).
tuple(len(part) for part in (X_train, X_test, X2))

(48, 12, 90)

## First strategy - warm_start

First training with `warm_start=False`: when new samples are added, the model is retrained from scratch.

In [89]:
from sklearn.ensemble import RandomForestClassifier

# A single shallow tree. warm_start=False means every fit() call rebuilds the
# forest from scratch instead of extending the previous one.
model = RandomForestClassifier(
    n_estimators=1,
    max_depth=2,
    warm_start=False,
    random_state=0,
)
model.fit(X_train, y_train)

RandomForestClassifier(max_depth=2, n_estimators=1, random_state=0)

In [90]:
# Baseline accuracy on the held-out test set after the first training round.
model.score(X=X_test, y=y_test)

0.75

Fit the model on the new data only (`X2`, `y2`).

In [91]:
# warm_start is False for this model, so this call replaces the earlier tree
# with one trained on X2, y2 alone.
model.fit(X=X2, y=y2)

RandomForestClassifier(max_depth=2, n_estimators=1, random_state=0)

In [92]:
# Accuracy on the same test set after refitting on the new samples.
model.score(X=X_test, y=y_test)

0.8333333333333334

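Second training with `warm_start=True`: when new samples are added, the trees that are already fitted are kept and an additional tree is trained on the new samples (incremental training).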

In [75]:
# Same single-tree forest as in the cold-start experiment, but warm_start=True
# keeps already-fitted trees when fit() is called again.
model = RandomForestClassifier(
    n_estimators=1,
    max_depth=2,
    warm_start=True,
    random_state=0,
)
model.fit(X=X_train, y=y_train)
model.score(X=X_test, y=y_test)

0.75

In [76]:
# Ask for one tree more than the single tree fitted above. Setting an absolute
# target (instead of `n_estimators += 1`) keeps this cell idempotent: running
# it again leaves the forest at two trees rather than growing it on every run.
model.set_params(n_estimators=2)

# With warm_start=True the existing tree is kept and only the additional tree
# is trained, using X2, y2.
model.fit(X2, y2)
model.score(X_test, y_test)

0.9166666666666666

## Second strategy - partial fit

In [86]:
from sklearn.linear_model import SGDClassifier
import numpy as np

# SGD shuffles the training samples, so pin the seed to make the scores
# reproducible across runs (the forests above also use random_state=0).
# Re-run the following cells to refresh the recorded outputs.
model = SGDClassifier(random_state=0)

# The first partial_fit call must be told every class label up front, because
# later batches are not guaranteed to contain all of them.
model.partial_fit(X_train, y_train, classes=np.unique(y))

SGDClassifier()

In [87]:
# Accuracy after a single partial_fit pass over the initial training set.
model.score(X=X_test, y=y_test)

0.4166666666666667

In [88]:
# Continue training the same linear model on the new samples; `classes` is
# only required on the first partial_fit call.
model.partial_fit(X=X2, y=y2)
model.score(X=X_test, y=y_test)

0.8333333333333334