# Training and Tuning Predictive Models

### Prepare the data

In [1]:
import pandas as pd
import numpy as np
from sklearn import tree
from sklearn import metrics
import warnings; warnings.simplefilter('ignore')


In [2]:
x_train = pd.read_csv('X_train.csv').iloc[:,1:]
x_test = pd.read_csv('X_valid.csv').iloc[:,1:]
y_train = pd.read_csv('y_train.csv').iloc[:,1:]
y_test = pd.read_csv('y_valid.csv').iloc[:,1:]

#### Decision Tree

In [3]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score


# Create a decision tree and train
model = DecisionTreeRegressor()
model.fit(x_train, y_train)

print('Rsquared on Training', r2_score(y_train, model.predict(x_train)))
print('Rsquared on Testing', r2_score(y_test, model.predict(x_test)))

Rsquared on Training 1.0
Rsquared on Testing 0.6176999225755886


#### Decision Tree Tuning

In [5]:
# Decide what hyperparameter to tune then decide the searching range
param_grid = {'max_depth': np.arange(1,100,5)}

# Create a list of trees
from sklearn.model_selection import GridSearchCV
model = GridSearchCV(DecisionTreeRegressor(), param_grid, cv = 5)
model.fit(x_train, y_train)

# Show the best found paramters
print('Best parameters are:', model.best_params_)

# Print the Rsquared on training and testing
print('Rsquared on Training', r2_score(y_train, model.predict(x_train)))
print('Rsquared on Testing', r2_score(y_test, model.predict(x_test)))

Best parameters are: {'max_depth': 6}
Rsquared on Training 0.8235772077938193
Rsquared on Testing 0.7170904647558012


#### AdaBoost

In [6]:
from sklearn.ensemble import AdaBoostRegressor

# Create a decision tree and train
model = AdaBoostRegressor()
model.fit(x_train, y_train)

print('Rsquared on Training', r2_score(y_train, model.predict(x_train)))
print('Rsquared on Testing', r2_score(y_test, model.predict(x_test)))

Rsquared on Training 0.061125604808840106
Rsquared on Testing 0.4591623124741959


#### Adaboost Tuning

In [7]:
# Decide what hyperparameter to tune then decide the searching range
param_grid = {'n_estimators': np.arange(50,100, 5), 'learning_rate':np.linspace(0.001,.2,5)}

# Create a list of trees
from sklearn.model_selection import GridSearchCV
model = GridSearchCV(AdaBoostRegressor(), param_grid, cv = 3)
model.fit(x_train, y_train)

# Show the best found paramters
print('Best parameters are:', model.best_params_)

# Print the Rsquared on training and testing
print('Rsquared on Training', r2_score(y_train, model.predict(x_train)))
print('Rsquared on Testing', r2_score(y_test, model.predict(x_test)))

Best parameters are: {'learning_rate': 0.05075, 'n_estimators': 60}
Rsquared on Training 0.7157576442323954
Rsquared on Testing 0.7359690287043726


#### Random Forest

In [8]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score


# Create a decision tree and train
model = RandomForestRegressor()
model.fit(x_train, y_train)

print('Rsquared on Training', r2_score(y_train, model.predict(x_train)))
print('Rsquared on Testing', r2_score(y_test, model.predict(x_test)))

Rsquared on Training 0.9557035508541831
Rsquared on Testing 0.7758716255154877


#### Random Forest Tuning

In [9]:
# Decide what hyperparameter to tune then decide the searching range
param_grid = {'n_estimators': np.arange(50,100, 5), 'max_features':np.arange(2,10)}

# Create a list of trees
from sklearn.model_selection import GridSearchCV
model = GridSearchCV(RandomForestRegressor(), param_grid, cv = 3)
model.fit(x_train, y_train)

# Show the best found paramters
print('Best parameters are:', model.best_params_)

# Print the Rsquared on training and testing
print('Rsquared on Training', r2_score(y_train, model.predict(x_train)))
print('Rsquared on Testing', r2_score(y_test, model.predict(x_test)))

Best parameters are: {'max_features': 9, 'n_estimators': 70}
Rsquared on Training 0.9371917245901695
Rsquared on Testing 0.6334379128187273


#### Gradient Boosting

In [10]:
from sklearn.ensemble import GradientBoostingRegressor

# Create a decision tree and train
model = GradientBoostingRegressor()
model.fit(x_train, y_train)

print('Rsquared on Training', r2_score(y_train, model.predict(x_train)))
print('Rsquared on Testing', r2_score(y_test, model.predict(x_test)))

Rsquared on Training 0.8717278002706266
Rsquared on Testing 0.7936213233634128


#### Gradient Boosting Tuning

In [11]:
# Decide what hyperparameter to tune then decide the searching range
param_grid = {'n_estimators': np.arange(50,100, 5), 'learning_rate':np.linspace(0.001,.2,5)}

# Create a list of trees
from sklearn.model_selection import GridSearchCV
model = GridSearchCV(GradientBoostingRegressor(), param_grid, cv = 5)
model.fit(x_train, y_train)

# Show the best found paramters
print('Best parameters are:', model.best_params_)

# Print the Rsquared on training and testing
print('Rsquared on Training', r2_score(y_train, model.predict(x_train)))
print('Rsquared on Testing', r2_score(y_test, model.predict(x_test)))

Best parameters are: {'learning_rate': 0.1005, 'n_estimators': 70}
Rsquared on Training 0.8489119840933748
Rsquared on Testing 0.8094880179627664


#### ElasticNet

In [12]:
from sklearn.linear_model import ElasticNet
model = ElasticNet()
model.fit(x_train, y_train)

print('Rsquared on Training', r2_score(y_train, model.predict(x_train)))
print('Rsquared on Testing', r2_score(y_test, model.predict(x_test)))

Rsquared on Training 0.6308324655303715
Rsquared on Testing 0.7083867549966092


#### ElasticNet Tuning

In [13]:
param_grid = {'alpha':np.linspace(0,10, 10), 'l1_ratio':np.linspace(0,1,10)}
model = GridSearchCV(ElasticNet(), param_grid, cv = 5)
model.fit(x_train, y_train)

# Show the best found paramters
print('Best parameters are:', model.best_params_)

# Print the Rsquared on training and testing
print('Rsquared on Training', r2_score(y_train, model.predict(x_train)))
print('Rsquared on Testing', r2_score(y_test, model.predict(x_test)))

Best parameters are: {'alpha': 1.1111111111111112, 'l1_ratio': 0.8888888888888888}
Rsquared on Training 0.6571211783271504
Rsquared on Testing 0.7170309395631529


#### KNN

In [14]:
from sklearn.neighbors import KNeighborsRegressor
model = KNeighborsRegressor()
model.fit(x_train, y_train)

print('Rsquared on Training', r2_score(y_train, model.predict(x_train)))
print('Rsquared on Testing', r2_score(y_test, model.predict(x_test)))

Rsquared on Training 0.6688856014236269
Rsquared on Testing 0.5987641198971616


#### KNN Tuning

In [15]:
param_grid = {'n_neighbors':np.arange(10,20)}
model = GridSearchCV(KNeighborsRegressor(), param_grid, cv = 3)
model.fit(x_train, y_train)

# Show the best found paramters
print('Best parameters are:', model.best_params_)

# Print the Rsquared on training and testing
print('Rsquared on Training', r2_score(y_train, model.predict(x_train)))
print('Rsquared on Testing', r2_score(y_test, model.predict(x_test)))

Best parameters are: {'n_neighbors': 13}
Rsquared on Training 0.5956103204157177
Rsquared on Testing 0.6328068318400353


## Neural Network

In [16]:
from sklearn.neural_network import MLPRegressor

model = MLPRegressor(hidden_layer_sizes=(10, 20))
model.fit(x_train, y_train)

print('Rsquared on Training', r2_score(y_train, model.predict(x_train)))
print('Rsquared on Testing', r2_score(y_test, model.predict(x_test)))

Rsquared on Training 0.549450112716284
Rsquared on Testing 0.6577945707591298


In [18]:
import os

os.system('jupyter nbconvert --to html few_models.ipynb')


0