# Finale

- Choosing a model: always try several candidate models, and tune each one with grid search plus cross-validation before comparing them.
- sklearn algo cheatsheet https://scikit-learn.org/stable/tutorial/machine_learning_map/index.html
- Do not set performance expectations from metrics computed on data the model was trained on: they are not representative of real-world performance. Always evaluate on unseen (hold-out) data.

# Model Persistence

In [1]:
import pandas as pd

In [2]:
# Load the advertising dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('../DATA/Advertising.csv')

In [3]:
df

Unnamed: 0,TV,radio,newspaper,sales
0,230.1,37.8,69.2,22.1
1,44.5,39.3,45.1,10.4
2,17.2,45.9,69.3,9.3
3,151.5,41.3,58.5,18.5
4,180.8,10.8,58.4,12.9
...,...,...,...,...
195,38.2,3.7,13.8,7.6
196,94.2,4.9,8.1,9.7
197,177.0,9.3,6.4,12.8
198,283.6,42.0,66.2,25.5


In [4]:
df.describe()

Unnamed: 0,TV,radio,newspaper,sales
count,200.0,200.0,200.0,200.0
mean,147.0425,23.264,30.554,14.0225
std,85.854236,14.846809,21.778621,5.217457
min,0.7,0.0,0.3,1.6
25%,74.375,9.975,12.75,10.375
50%,149.75,22.9,25.75,12.9
75%,218.825,36.525,45.1,17.4
max,296.4,49.6,114.0,27.0


In [5]:
# Features: every column except the target. Target: the `sales` column.
X = df.drop(columns='sales')
y = df['sales']

In [6]:
X

Unnamed: 0,TV,radio,newspaper
0,230.1,37.8,69.2
1,44.5,39.3,45.1
2,17.2,45.9,69.3
3,151.5,41.3,58.5
4,180.8,10.8,58.4
...,...,...,...
195,38.2,3.7,13.8
196,94.2,4.9,8.1
197,177.0,9.3,6.4
198,283.6,42.0,66.2


In [7]:
# Split into train, validation, and hold-out sets
from sklearn.model_selection import train_test_split

In [13]:
# Target proportions: 70% train / 15% validation / 15% hold-out.
# Step 1: keep 70% for training. The `_unused` suffix marks the remaining 30% as an
# intermediate: it is only split further below and is never scored directly.
X_train, X_test_unused, y_train, y_test_unused = train_test_split(
    X, y, test_size=0.30, random_state=42
)

# Step 2: halve that 30% remainder into validation and hold-out, 15% of all rows each.
# The inputs must be the names produced by step 1; referring to any other name would
# only work when stale variables happen to be left over in the kernel.
X_validation, X_holdout_test, y_validation, y_holdout_test = train_test_split(
    X_test_unused, y_test_unused, test_size=0.50, random_state=42
)

# Sanity check: the three partitions together account for every row.
assert len(X_train) + len(X_validation) + len(X_holdout_test) == len(X)

In [9]:
len(X)

200

In [10]:
len(X_train)

140

In [11]:
len(X_validation)

30

In [12]:
len(X_holdout_test)

30

## Model Training

In [14]:
from sklearn.ensemble import RandomForestRegressor

In [24]:
# Fixed random_state so the forest is reproducible across re-runs; the final refit
# later in this notebook reuses the same hyperparameters (n_estimators=30, random_state=42).
model = RandomForestRegressor(n_estimators=30, random_state=42)

In [25]:
# Fit on the training partition only; validation and hold-out rows stay unseen by the model.
model.fit(X_train, y_train)

RandomForestRegressor(n_estimators=30, random_state=42)

In [18]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [29]:
# Predict on the validation partition (rows the model was not fitted on).
val_preds = model.predict(X_validation)

In [30]:
# Validation MAE: average absolute prediction error, in the same units as `sales`.
mean_absolute_error(y_validation, val_preds)

0.483111111111111

In [31]:
# Validation RMSE: square root of the mean squared error, so it is back in the units of `sales`.
mean_squared_error(y_validation, val_preds) ** 0.5

0.6177971619660723

In [32]:
# Summary statistics of the target, to judge the error sizes above against its scale.
df['sales'].describe()

count    200.000000
mean      14.022500
std        5.217457
min        1.600000
25%       10.375000
50%       12.900000
75%       17.400000
max       27.000000
Name: sales, dtype: float64

In [33]:
# Final metrics: predict on the hold-out set, which was used neither to fit the model
# nor to compute the validation metrics.
holdout_preds = model.predict(X_holdout_test)

In [37]:
# Hold-out MAE: the figure to report as expected absolute error on unseen data.
mean_absolute_error(y_holdout_test, holdout_preds)

0.5649999999999998

In [38]:
# Hold-out RMSE: square root of the mean squared error on the hold-out set.
mean_squared_error(y_holdout_test, holdout_preds) ** 0.5

0.6758333675845999

In [39]:
# Fresh estimator with the same hyperparameters as the model evaluated above,
# to be refit for deployment.
final_model = RandomForestRegressor(n_estimators=30, random_state=42)

In [40]:
# Refit on every available row (train + validation + hold-out). This model has now seen
# all of the data, so it cannot be re-scored here; the hold-out metrics computed earlier
# serve as its performance estimate.
final_model.fit(X, y)

RandomForestRegressor(n_estimators=30, random_state=42)

In [41]:
import joblib

In [42]:
# Persist the fitted estimator to a pickle-format file in the current working directory.
joblib.dump(final_model, 'final_model.pkl')

['final_model.pkl']

In [43]:
X.columns

Index(['TV', 'radio', 'newspaper'], dtype='object')

In [44]:
list(X.columns)

['TV', 'radio', 'newspaper']

In [45]:
# Save the feature column names (in training order) next to the model, so that
# prediction inputs can be built with the same columns later.
joblib.dump(list(X.columns), 'col_names.pkl')

['col_names.pkl']

# Load the model

In [46]:
# Read back the saved feature column names.
new_cols = joblib.load('col_names.pkl')

In [47]:
new_cols

['TV', 'radio', 'newspaper']

In [48]:
# Restore the persisted estimator. joblib.load unpickles the file, and unpickling can
# execute arbitrary code, so only load model files that come from a trusted source.
loaded_model = joblib.load('final_model.pkl')

In [49]:
# Build the query row with the saved column names (TV, radio, newspaper) instead of a bare
# nested list. The model was fitted on a DataFrame, so a named-column input lets
# scikit-learn check the feature names against those seen during fit, rather than
# silently trusting positional order (newer releases warn when the names are missing).
new_ad_spend = pd.DataFrame([[230, 37, 69]], columns=new_cols)
loaded_model.predict(new_ad_spend)

array([21.82666667])