# ***Training the Model***

In this Jupyter we will try and test different models with different parameters to find the best model approach, using RMSE as indicator.

In [89]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error

# Lets import the clean data:

In [31]:
df = pd.read_csv("DATA/clean_data.csv")
df.head()

Unnamed: 0,id,carat,cut,color,clarity,depth,table,price
0,0,1.14,5,4,9,61.0,56.0,9013
1,1,0.76,5,3,7,62.7,57.0,2692
2,2,0.84,5,4,8,61.4,56.0,4372
3,3,1.55,5,3,8,62.0,57.0,13665
4,4,0.3,5,4,5,61.9,57.0,422


# First, we split the DataFrame for train and test...

In [32]:
y=df["price"]
X=df.drop(columns=["price","id"])
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2)

In [33]:
X.head()

Unnamed: 0,carat,cut,color,clarity,depth,table
0,1.14,5,4,9,61.0,56.0
1,0.76,5,3,7,62.7,57.0
2,0.84,5,4,8,61.4,56.0
3,1.55,5,3,8,62.0,57.0
4,0.3,5,4,5,61.9,57.0


In [34]:
y.head()

0     9013
1     2692
2     4372
3    13665
4      422
Name: price, dtype: int64

In [35]:
X.shape

(40455, 6)

In [36]:
y.shape

(40455,)

# ...and prepare a series of different regression models:

In [86]:
models={
    "Linear" : LinearRegression(),
    "Dec Tree" : DecisionTreeRegressor(),
    "KNeighb" : KNeighborsRegressor(),
    "Grad" : GradientBoostingRegressor(),
    "Random Forest" : RandomForestRegressor()
}

In [88]:
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(f"------{name}------")
    print('RMSE : ', np.sqrt(mean_squared_error(y_test, y_pred)))
    print("_")

------Linear------
RMSE :  1240.7525412254759
_
------Dec Tree------
RMSE :  707.092855460467
_
------KNeighb------
RMSE :  1947.0195638144364
_
------Grad------
RMSE :  622.8015805037678
_
------Random Forest------
RMSE :  542.5932241439061
_


Decission tree, Gradient and Random Forest seems to be the best choices. Lets tune some hyperparameters for this models:

# Decission Tree:

In [90]:
for i in range (1,20):
    tree = DecisionTreeRegressor(max_depth=i)
    tree.fit(X_train,y_train)
    y_pred = tree.predict(X_test)
    print(f"Max Depth = {i}  -->  RMSE: ", np.sqrt(mean_squared_error(y_test,y_pred)))

Max Depth = 1  -->  RMSE:  2556.2418321272166
Max Depth = 2  -->  RMSE:  1692.594825562079
Max Depth = 3  -->  RMSE:  1377.3895045281097
Max Depth = 4  -->  RMSE:  1173.179041558432
Max Depth = 5  -->  RMSE:  1012.8938469309877
Max Depth = 6  -->  RMSE:  875.768671943671
Max Depth = 7  -->  RMSE:  796.099885788222
Max Depth = 8  -->  RMSE:  704.9897198199112
Max Depth = 9  -->  RMSE:  655.2900214544294
Max Depth = 10  -->  RMSE:  619.4049372491342
Max Depth = 11  -->  RMSE:  604.2907875566599
Max Depth = 12  -->  RMSE:  601.3268823366212
Max Depth = 13  -->  RMSE:  628.0255544646831
Max Depth = 14  -->  RMSE:  623.7840896141942
Max Depth = 15  -->  RMSE:  653.3557181543457
Max Depth = 16  -->  RMSE:  672.3131812355342
Max Depth = 17  -->  RMSE:  689.0838053750151
Max Depth = 18  -->  RMSE:  683.3976658694658
Max Depth = 19  -->  RMSE:  687.340864630778


Best RMSE comes with max_depth=12

# Gradient Boosting:

In [102]:
for i in [1000,1100,1200]:
    grad = GradientBoostingRegressor(n_estimators=i)
    grad.fit(X_train,y_train)
    y_pred = grad.predict(X_test)
    print(f"n_estimators = {i}  -->  RMSE: ", np.sqrt(mean_squared_error(y_test,y_pred)))

n_estimators = 1000  -->  RMSE:  541.8914556072579
n_estimators = 1100  -->  RMSE:  540.6034230906944
n_estimators = 1200  -->  RMSE:  539.9719182706748


Best RMSE comes with n_estimators=

# Random Forest:

In [81]:
for i in range(12,25):
    forest = RandomForestRegressor(max_depth=i)
    forest.fit(X_train,y_train)
    y_pred = forest.predict(X_test)
    print(f"Max Depth = {i}  -->  RMSE: ", np.sqrt(mean_squared_error(y_test,y_pred)))

Max Depth = 12  -->  RMSE:  540.7544380257997
Max Depth = 13  -->  RMSE:  536.8425996690071
Max Depth = 14  -->  RMSE:  537.8256841050809
Max Depth = 15  -->  RMSE:  539.2720903620158
Max Depth = 16  -->  RMSE:  539.4178061900482
Max Depth = 17  -->  RMSE:  539.4740312642746
Max Depth = 18  -->  RMSE:  538.8441729204425
Max Depth = 19  -->  RMSE:  541.2236025978418
Max Depth = 20  -->  RMSE:  542.7489124731181
Max Depth = 21  -->  RMSE:  542.6827399020762
Max Depth = 22  -->  RMSE:  543.226452410092
Max Depth = 23  -->  RMSE:  541.9298874779803
Max Depth = 24  -->  RMSE:  541.6672664794535


Best RMSE comes with max_depth=18

Fixed max_depth=18, lets try different n_estimators:

In [101]:
for i in [120,130,140,150,200]:
    forest = RandomForestRegressor(max_depth=18,n_estimators=i)
    forest.fit(X_train,y_train)
    y_pred = forest.predict(X_test)
    print(f"Max Depth=18, n_stimators = {i}  -->  RMSE: ", np.sqrt(mean_squared_error(y_test,y_pred)))

Max Depth=18, n_stimators = 120  -->  RMSE:  539.778452725323
Max Depth=18, n_stimators = 130  -->  RMSE:  537.6049987198777
Max Depth=18, n_stimators = 140  -->  RMSE:  539.3855437020056
Max Depth=18, n_stimators = 150  -->  RMSE:  540.7845144840555
Max Depth=18, n_stimators = 200  -->  RMSE:  540.3556017557063


Best RMSE comes with max_depth=18 and n_estimators=130

### Seems like we can set almost infinite combinations of hyper-parameters.
## Lets optimize that with a little help of GridSearchCV :

Parameters for a RandomForest:

In [113]:
parameters = {'bootstrap': [True, False],
              'max_features': ['auto', 'sqrt'],
              'min_samples_leaf': [1, 2, 4],
              'min_samples_split': [2, 5, 10]}

In [114]:
rfr = RandomForestRegressor()

In [115]:
grid = GridSearchCV(rfr, parameters, verbose=1)

In [123]:
### beware that code line can take more than 1 hour to run!!

In [116]:
grid.fit(X_train,y_train)

Fitting 5 folds for each of 36 candidates, totalling 180 fits


GridSearchCV(estimator=RandomForestRegressor(),
             param_grid={'bootstrap': [True, False],
                         'max_features': ['auto', 'sqrt'],
                         'min_samples_leaf': [1, 2, 4],
                         'min_samples_split': [2, 5, 10]},
             verbose=1)

In [117]:
print(grid.best_params_)

{'bootstrap': True, 'max_features': 'auto', 'min_samples_leaf': 2, 'min_samples_split': 10}


Lets re-fit the RandomForest:

In [124]:
forestGS = RandomForestRegressor(max_depth=18, n_estimators=130, bootstrap=True, max_features="auto", min_samples_leaf=2, min_samples_split=10)

we fit the final model with the whole "Train" dataset:

In [127]:
forestGS.fit(X,y)
y_pred = forestGS.predict(X_test)
print(f"RMSE: ", np.sqrt(mean_squared_error(y_test,y_pred)))

RMSE:  372.6392772026178


# Saving and exporting my best model:

In [128]:
import pickle

In [131]:
pickle.dump(forestGS, open("Models/my_model_01", 'wb'))