# Choose the best model

In [24]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
import math
from sklearn.metrics import mean_squared_error
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
#import h2o
#from h2o.automl import H2OAutoML
#from sklearn.ensemble import RandomForestClassifier

In [5]:
cleantrain_df = pd.read_csv("./inputs/cleantrain.csv")
cleantrain_df.drop(columns=["price"]).head()

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z
0,0,0.3,0,0,3,62.8,56.0,4.29,4.31,2.7
1,1,0.34,1,1,2,62.6,55.0,4.46,4.49,2.8
2,2,0.4,0,2,2,60.3,62.0,4.7,4.75,2.85
3,3,0.4,2,3,4,61.8,59.2,4.72,4.74,2.92
4,4,0.9,0,2,2,61.0,63.0,6.1,6.13,3.73


## Checking out the RMSE for each model

### 1- Train Model with n estimators (n = 100, 250, 500, 750, 1000)
### 2- Evaluate lowest RMSE

#### Divide Data to train the model

In [9]:
X = cleantrain_df.drop(columns=["price"])
y = cleantrain_df["price"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(32364, 10) (8091, 10) (32364,) (8091,)


### * Gradient Boosting Model *

In [12]:
boosting_model = {
    "boosting_100": GradientBoostingRegressor(n_estimators=100),
    "boosting_250": GradientBoostingRegressor(n_estimators=250),
    "boosting_500": GradientBoostingRegressor(n_estimators=500),
    "boosting_750": GradientBoostingRegressor(n_estimators=750),
    "boosting_1000": GradientBoostingRegressor(n_estimators=1000)
}

In [14]:
for name, model  in boosting_model.items():
    print(f"Starting training")
    print(f"Training {name}...")
    model.fit(X_train, y_train)
    print(f"Training complete")

Starting training
Training boosting_100...
Training complete
Starting training
Training boosting_250...
Training complete
Starting training
Training boosting_500...
Training complete
Starting training
Training boosting_750...
Training complete
Starting training
Training boosting_1000...
Training complete


In [19]:
printMetric = lambda label,value:print(f"\t {label}: {round(value,4)}")

for name,m in boosting_model.items():
    y_pred = m.predict(X_test)
    print(f"Evaluating model {name}")
    printMetric("RMSE",math.sqrt(mean_squared_error(y_test, y_pred)))

Evaluating model boosting_100
	 RMSE: 685.6292
Evaluating model boosting_250
	 RMSE: 634.9237
Evaluating model boosting_500
	 RMSE: 607.6421
Evaluating model boosting_750
	 RMSE: 594.1696
Evaluating model boosting_1000
	 RMSE: 595.5055


#### I am selecting model boosting_750

### * Random Forest Model *

In [21]:
randomforest_model = {
    "forest_100": RandomForestRegressor(n_estimators=100),
    "forest_250": RandomForestRegressor(n_estimators=250),
    "forest_500": RandomForestRegressor(n_estimators=500),
    "forest_750": RandomForestRegressor(n_estimators=750),
    "forest_1000":RandomForestRegressor(n_estimators=1000),
}

In [22]:
for name, model  in randomforest_model.items():
    print(f"Starting training")
    print(f"Training {name}...")
    model.fit(X_train, y_train)
    print(f"Training complete")

Starting training
Training forest_100...
Training complete
Starting training
Training forest_250...
Training complete
Starting training
Training forest_500...
Training complete
Starting training
Training forest_750...
Training complete
Starting training
Training forest_1000...
Training complete


In [25]:
printMetric = lambda label,value:print(f"\t {label}: {round(value,4)}")

for name,m in randomforest_model.items():
    y_pred = m.predict(X_test)
    print(f"Evaluating model {name}")
    printMetric("RMSE",math.sqrt(mean_squared_error(y_test, y_pred)))

Evaluating model forest_100
	 RMSE: 563.6655
Evaluating model forest_250
	 RMSE: 563.0528
Evaluating model forest_500
	 RMSE: 561.1321
Evaluating model forest_750
	 RMSE: 561.4771
Evaluating model forest_1000
	 RMSE: 562.2579


#### I am selecting model forest_500

### * KNeighbors Model *

In [29]:
kneighbors_model = {
    "neighbor":KNeighborsRegressor()
}

In [30]:
for name, model  in kneighbors_model.items():
    print(f"Starting training")
    print(f"Training {name}...")
    model.fit(X_train, y_train)
    print(f"Training complete")

Starting training
Training neighbor...
Training complete


In [31]:
printMetric = lambda label,value:print(f"\t {label}: {round(value,4)}")

for name,m in kneighbors_model.items():
    y_pred = m.predict(X_test)
    print(f"Evaluating model {name}")
    printMetric("RMSE",math.sqrt(mean_squared_error(y_test, y_pred)))

Evaluating model neighbor
	 RMSE: 3769.6703


### * Decision Tree Model *

In [33]:
randomforest_model = {
    "tree":DecisionTreeRegressor()
}

In [34]:
for name, model  in randomforest_model.items():
    print(f"Starting training")
    print(f"Training {name}...")
    model.fit(X_train, y_train)
    print(f"Training complete")

Starting training
Training tree...
Training complete


In [35]:
printMetric = lambda label,value:print(f"\t {label}: {round(value,4)}")

for name,m in randomforest_model.items():
    y_pred = m.predict(X_test)
    print(f"Evaluating model {name}")
    printMetric("RMSE",math.sqrt(mean_squared_error(y_test, y_pred)))

Evaluating model tree
	 RMSE: 774.9734


### * Extra Trees Model *

In [43]:
from sklearn.ensemble import ExtraTreesRegressor

In [47]:
extratrees_model = {
    "trees_100": ExtraTreesRegressor(n_estimators=100),
    "trees_250": ExtraTreesRegressor(n_estimators=250),
    "trees_500": ExtraTreesRegressor(n_estimators=500),
    "trees_750": ExtraTreesRegressor(n_estimators=750),
    "trees_1000":ExtraTreesRegressor(n_estimators=1000),
    "trees_1500": ExtraTreesRegressor(n_estimators=1500),
    "trees_2000":ExtraTreesRegressor(n_estimators=2000),
}

In [48]:
for name, model  in extratrees_model.items():
    print(f"Starting training")
    print(f"Training {name}...")
    model.fit(X_train, y_train)
    print(f"Training complete")

Starting training
Training trees_100...
Training complete
Starting training
Training trees_250...
Training complete
Starting training
Training trees_500...
Training complete
Starting training
Training trees_750...
Training complete
Starting training
Training trees_1000...
Training complete
Starting training
Training trees_1500...
Training complete
Starting training
Training trees_2000...
Training complete


In [49]:
printMetric = lambda label,value:print(f"\t {label}: {round(value,4)}")

for name,m in extratrees_model.items():
    y_pred = m.predict(X_test)
    print(f"Evaluating model {name}")
    printMetric("RMSE",math.sqrt(mean_squared_error(y_test, y_pred)))

Evaluating model trees_100
	 RMSE: 563.1775
Evaluating model trees_250
	 RMSE: 560.6456
Evaluating model trees_500
	 RMSE: 560.5339
Evaluating model trees_750
	 RMSE: 559.178
Evaluating model trees_1000
	 RMSE: 559.0019
Evaluating model trees_1500
	 RMSE: 560.5129
Evaluating model trees_2000
	 RMSE: 559.4457


## Hyperparameter optimization with GridSearchCV

In [40]:
from sklearn.model_selection import GridSearchCV

### * Model forest_500 * ---> (RMSE=561.1321)

In [36]:
params = {'max_features': ['auto', 'sqrt','log2']}

In [37]:
rfc = RandomForestRegressor(n_estimators=500)

In [None]:
grid = GridSearchCV(rfc,params,verbose=1)
grid.fit(X_train,y_train)

In [None]:
print(grid.best_params_)

### * Model boosting_750 *  ---> (RMSE=594.1696)

In [None]:
params = { 'max_features': ['auto', 'sqrt','log2']}

In [None]:
gbm = GradientBoostingRegressor(n_estimators=750)

In [None]:
grid = GridSearchCV(gbm,params,verbose=1)
grid.fit(X_train,y_train)

In [None]:
print(grid.best_params_)

### * Model trees_1000 * ---> (RMSE=559.0019)

In [None]:
params = { 'max_features': ['auto', 'sqrt','log2']}

In [None]:
gbm = GradientBoostingRegressor(n_estimators=750)

In [None]:
grid = GridSearchCV(gbm,params,verbose=1)
grid.fit(X_train,y_train)

In [None]:
print(grid.best_params_)