# Training and Tuning Predictive Models

## Classification Problem

When we predict a categorical target, we have a classification problem.  We will use the `titanic` dataset as an example for implementing several classifiers in sklearn.

### Prepare the data

In [5]:
import pandas as pd
import numpy as np
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
import warnings; warnings.simplefilter('ignore')


In [2]:
# Import the data
df = pd.read_csv('titanic.csv')

# Assign input variables. Take an explicit copy so the imputations below
# modify X only and are never ambiguous writes into a slice of df.
X = df.loc[:, ['Pclass', 'Sex', 'Age', 'Fare', 'Embarked', 'SibSp', 'Parch']].copy()

# Assign target variable
y = df['Survived']

# Replace missing ages by the median age.
# NOTE(review): the median is computed on all rows, before the train/test
# split further down -- confirm this small leakage is acceptable here.
X["Age"] = X["Age"].fillna(X["Age"].median())

# Impute missing values of the Embarked variable with "S"
X["Embarked"] = X["Embarked"].fillna("S")

In [3]:
# Cast Pclass to object so it is treated as a categorical variable, then
# dummy-encode the categorical columns in a single expression
X = pd.get_dummies(X.astype({'Pclass': object}))

In [4]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. A fixed random_state makes the split
# identical on every run, so results survive Restart Kernel -> Run All.
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=9)

#### Decision Tree

In [5]:
# Create a decision tree and train it on the training split
model = DecisionTreeClassifier()
model.fit(x_train, y_train)

# Accuracy on training data
print('Training Accuracy:', model.score(x_train, y_train))

# Prediction and accuracy on testing data.
# accuracy_score takes (y_true, y_pred): true labels first. Accuracy is
# symmetric, but the documented order stays correct if the metric is later
# replaced by an order-sensitive one such as precision or recall.
y_pred = model.predict(x_test)
print('Testing Accuracy:', metrics.accuracy_score(y_test, y_pred))

Training Accuracy: 0.9803370786516854
Testing Accuracy: 0.7486033519553073


#### Decision Tree Tuning

In [6]:
from sklearn.model_selection import GridSearchCV

# Decide what hyperparameters to tune, then decide the search range of each
param_grid = {'max_depth': range(1,10), 'criterion':['gini', 'entropy']}

# Grid search over the tree hyperparameters with 5-fold cross-validation
model = GridSearchCV(DecisionTreeClassifier(), param_grid, cv = 5)
model.fit(x_train, y_train)

# Show the best found parameters
print('Best parameters are:', model.best_params_)

# Accuracy on training data
print('Training Accuracy:', model.score(x_train, y_train))

# Prediction and accuracy on testing data.
# accuracy_score takes (y_true, y_pred): true labels first, predictions second.
y_pred = model.predict(x_test)
print('Testing Accuracy:', metrics.accuracy_score(y_test, y_pred))

Best parameters are: {'criterion': 'gini', 'max_depth': 6}
Training Accuracy: 0.8820224719101124
Testing Accuracy: 0.770949720670391


#### Adaboost

In [7]:
from sklearn.ensemble import AdaBoostClassifier

# Create an AdaBoost classifier with default settings and train it
model = AdaBoostClassifier()
model.fit(x_train, y_train)

# Accuracy on training data
print('Training Accuracy:', model.score(x_train, y_train))

# Prediction and accuracy on testing data.
# accuracy_score takes (y_true, y_pred): true labels first, predictions second.
y_pred = model.predict(x_test)
print('Testing Accuracy:', metrics.accuracy_score(y_test, y_pred))

Training Accuracy: 0.8384831460674157
Testing Accuracy: 0.8044692737430168


#### Adaboost Tuning

In [8]:
from sklearn.model_selection import GridSearchCV

# Decide what hyperparameters to tune, then decide the search range of each:
# odd ensemble sizes from 1 to 99 and ten learning rates between 0.001 and 1
param_grid = {'n_estimators': np.arange(1,100, 2), 'learning_rate':np.linspace(0.001,1,10)}

# Grid search over the AdaBoost hyperparameters with 5-fold cross-validation
model = GridSearchCV(AdaBoostClassifier(), param_grid, cv = 5)
model.fit(x_train, y_train)

# Show the best found parameters
print('Best parameters are:', model.best_params_)

# Accuracy on training data
print('Training Accuracy:', model.score(x_train, y_train))

# Prediction and accuracy on testing data.
# accuracy_score takes (y_true, y_pred): true labels first, predictions second.
y_pred = model.predict(x_test)
print('Testing Accuracy:', metrics.accuracy_score(y_test, y_pred))

Best parameters are: {'learning_rate': 0.112, 'n_estimators': 75}
Training Accuracy: 0.824438202247191
Testing Accuracy: 0.770949720670391


#### Gradient Boosting

In [9]:
from sklearn.ensemble import GradientBoostingClassifier

# Create a gradient boosting classifier with default settings and train it
model = GradientBoostingClassifier()
model.fit(x_train, y_train)

# Accuracy on training data
print('Training Accuracy:', model.score(x_train, y_train))

# Prediction and accuracy on testing data.
# accuracy_score takes (y_true, y_pred): true labels first, predictions second.
y_pred = model.predict(x_test)
print('Testing Accuracy:', metrics.accuracy_score(y_test, y_pred))

Training Accuracy: 0.9129213483146067
Testing Accuracy: 0.7821229050279329


#### Gradient Boosting Tuning

In [10]:
from sklearn.model_selection import GridSearchCV

# Decide what hyperparameters to tune, then decide the search range of each:
# odd ensemble sizes from 1 to 49 and ten learning rates between 0.001 and 1
param_grid = {'n_estimators': np.arange(1,50, 2), 'learning_rate':np.linspace(0.001,1,10)}

# Grid search over the boosting hyperparameters with 5-fold cross-validation
model = GridSearchCV(GradientBoostingClassifier(), param_grid, cv = 5)
model.fit(x_train, y_train)

# Show the best found parameters
print('Best parameters are:', model.best_params_)

# Accuracy on training data
print('Training Accuracy:', model.score(x_train, y_train))

# Prediction and accuracy on testing data.
# accuracy_score takes (y_true, y_pred): true labels first, predictions second.
y_pred = model.predict(x_test)
print('Testing Accuracy:', metrics.accuracy_score(y_test, y_pred))

Best parameters are: {'learning_rate': 0.445, 'n_estimators': 5}
Training Accuracy: 0.8539325842696629
Testing Accuracy: 0.7821229050279329


#### Random Forest

In [11]:
from sklearn.ensemble import RandomForestClassifier

# Create a random forest classifier with default settings and train it
model = RandomForestClassifier()
model.fit(x_train, y_train)

# Accuracy on training data
print('Training Accuracy:', model.score(x_train, y_train))

# Prediction and accuracy on testing data.
# accuracy_score takes (y_true, y_pred): true labels first, predictions second.
y_pred = model.predict(x_test)
print('Testing Accuracy:', metrics.accuracy_score(y_test, y_pred))

Training Accuracy: 0.9676966292134831
Testing Accuracy: 0.770949720670391


#### Random Forest Tuning

In [12]:
from sklearn.model_selection import GridSearchCV

# Decide what hyperparameters to tune, then decide the search range of each:
# even forest sizes from 2 to 48 and 2 to 9 features considered per split
param_grid = {'n_estimators': np.arange(2,50, 2), 'max_features':np.arange(2,10)}

# Grid search over the forest hyperparameters with 5-fold cross-validation
model = GridSearchCV(RandomForestClassifier(), param_grid, cv = 5)
model.fit(x_train, y_train)

# Show the best found parameters
print('Best parameters are:', model.best_params_)

# Accuracy on training data
print('Training Accuracy:', model.score(x_train, y_train))

# Prediction and accuracy on testing data.
# accuracy_score takes (y_true, y_pred): true labels first, predictions second.
y_pred = model.predict(x_test)
print('Testing Accuracy:', metrics.accuracy_score(y_test, y_pred))

Best parameters are: {'max_features': 9, 'n_estimators': 28}
Training Accuracy: 0.9789325842696629
Testing Accuracy: 0.7932960893854749


#### KNN

In [13]:
from sklearn.neighbors import KNeighborsClassifier

# Create a k-nearest-neighbours classifier with default settings and train it.
# NOTE(review): the features are used unscaled here; distance-based models
# are usually sensitive to feature scale -- consider standardizing first.
model = KNeighborsClassifier()
model.fit(x_train, y_train)

# Accuracy on training data
print('Training Accuracy:', model.score(x_train, y_train))

# Prediction and accuracy on testing data.
# accuracy_score takes (y_true, y_pred): true labels first, predictions second.
y_pred = model.predict(x_test)
print('Testing Accuracy:', metrics.accuracy_score(y_test, y_pred))

Training Accuracy: 0.8216292134831461
Testing Accuracy: 0.7318435754189944


In [14]:
from sklearn.model_selection import GridSearchCV

# Decide what hyperparameter to tune, then decide the search range:
# even neighbourhood sizes from 2 to 18
param_grid = {'n_neighbors': np.arange(2,20, 2)}

# Grid search over the number of neighbours with 5-fold cross-validation
model = GridSearchCV(KNeighborsClassifier(), param_grid, cv = 5)
model.fit(x_train, y_train)

# Show the best found parameters
print('Best parameters are:', model.best_params_)

# Accuracy on training data
print('Training Accuracy:', model.score(x_train, y_train))

# Prediction and accuracy on testing data.
# accuracy_score takes (y_true, y_pred): true labels first, predictions second.
y_pred = model.predict(x_test)
print('Testing Accuracy:', metrics.accuracy_score(y_test, y_pred))

Best parameters are: {'n_neighbors': 2}
Training Accuracy: 0.8356741573033708
Testing Accuracy: 0.6815642458100558


#### ElasticNet

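Notice that the ElasticNet model includes LASSO, Ridge Regression and Logistic Regression as special cases.  Change the values of `alpha` (the overall penalty strength) and `l1_ratio` (the mix between the penalties: `0` is a pure L2/ridge penalty, `1` is a pure L1/LASSO penalty) to obtain the desired model; as `alpha` approaches zero the model approaches plain logistic regression.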

In [8]:
from sklearn.linear_model import SGDClassifier

# Logistic-loss linear classifier with an elastic-net penalty, fitted by SGD.
# NOTE(review): loss='log' is the spelling used by older scikit-learn
# releases; newer releases call it 'log_loss' -- confirm against the
# installed version before changing it.
# NOTE(review): the features are used unscaled; SGD is usually sensitive to
# feature scale -- consider standardizing first.
model = SGDClassifier(loss='log', penalty='elasticnet', alpha=1, l1_ratio=.1)
model.fit(x_train, y_train)

# Accuracy on training data
print('Training Accuracy:', model.score(x_train, y_train))

# Prediction and accuracy on testing data.
# accuracy_score takes (y_true, y_pred): true labels first, predictions second.
y_pred = model.predict(x_test)
print('Testing Accuracy:', metrics.accuracy_score(y_test, y_pred))

Training Accuracy: 0.6671348314606742
Testing Accuracy: 0.6480446927374302


#### Elasticnet Tuning

In [16]:
from sklearn.model_selection import GridSearchCV

# Decide what hyperparameters to tune, then decide the search range of each:
# ten penalty strengths between 0.1 and 10 and ten L1/L2 mixes between 0 and 1
param_grid = {'alpha':np.linspace(0.1,10, 10), 'l1_ratio':np.linspace(0,1,10)}

# Grid search over the penalty settings with 5-fold cross-validation.
# NOTE(review): loss='log' is the spelling used by older scikit-learn
# releases; newer releases call it 'log_loss' -- confirm against the
# installed version before changing it.
model = GridSearchCV(SGDClassifier(loss='log', penalty='elasticnet'), param_grid, cv = 5)
model.fit(x_train, y_train)

# Show the best found parameters
print('Best parameters are:', model.best_params_)

# Accuracy on training data
print('Training Accuracy:', model.score(x_train, y_train))

# Prediction and accuracy on testing data.
# accuracy_score takes (y_true, y_pred): true labels first, predictions second.
y_pred = model.predict(x_test)
print('Testing Accuracy:', metrics.accuracy_score(y_test, y_pred))

Best parameters are: {'alpha': 0.1, 'l1_ratio': 0.0}
Training Accuracy: 0.6615168539325843
Testing Accuracy: 0.6312849162011173


#### Logistic Regression

In [9]:
from sklearn.linear_model import LogisticRegression

# Create a logistic regression classifier with default settings and train it
model = LogisticRegression()
model.fit(x_train, y_train)

# Accuracy on training data
print('Training Accuracy:', model.score(x_train, y_train))

# Prediction and accuracy on testing data.
# accuracy_score takes (y_true, y_pred): true labels first, predictions second.
y_pred = model.predict(x_test)
print('Testing Accuracy:', metrics.accuracy_score(y_test, y_pred))

Training Accuracy: 0.7991573033707865
Testing Accuracy: 0.7988826815642458


## Regression Problem

When we predict a numeric target, we have a regression problem.  We will use the `boston housing` dataset as an example for implementing several regressors in sklearn.

#### Data Preparation

In [3]:
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split

# Load the Boston housing data and put the features into a DataFrame,
# one column per feature name.
# NOTE(review): load_boston is not shipped with newer scikit-learn releases --
# confirm the installed version still provides it.
boston = load_boston()
boston_df = pd.DataFrame(boston.data, columns=boston.feature_names)

# Store the house prices (the dataset's target) in an extra 'Price' column
boston_df['Price'] = boston.target

# Target is the 'Price' column; inputs are all remaining columns
newY = boston_df['Price']
newX = boston_df.drop('Price', axis=1)

# 70/30 train/test split with a fixed seed
x_train, x_test, y_train, y_test = train_test_split(
    newX, newY, test_size=0.3, random_state=9
)

#### Decision Tree

In [6]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score

# Fit a decision tree regressor with default settings
model = DecisionTreeRegressor()
model.fit(x_train, y_train)

# R-squared on the data the tree was fitted on and on the held-out data
train_r2 = r2_score(y_train, model.predict(x_train))
test_r2 = r2_score(y_test, model.predict(x_test))
print('Rsquared on Training', train_r2)
print('Rsquared on Testing', test_r2)

Rsquared on Training 1.0
Rsquared on Testing 0.6275316392243291


#### Decision Tree Tuning

In [19]:
from sklearn.model_selection import GridSearchCV

# Candidate tree depths: 1, 6, 11, ... up to 96
param_grid = dict(max_depth=np.arange(1, 100, 5))

# Grid search over the tree depth with 5-fold cross-validation
model = GridSearchCV(estimator=DecisionTreeRegressor(), param_grid=param_grid, cv=5)
model.fit(x_train, y_train)

# Report the winning setting
print('Best parameters are:', model.best_params_)

# R-squared on the training and testing data
train_r2 = r2_score(y_train, model.predict(x_train))
test_r2 = r2_score(y_test, model.predict(x_test))
print('Rsquared on Training', train_r2)
print('Rsquared on Testing', test_r2)

Best parameters are: {'max_depth': 46}
Rsquared on Training 1.0
Rsquared on Testing 0.6983656152467781


#### AdaBoost

In [20]:
from sklearn.ensemble import AdaBoostRegressor

# Fit an AdaBoost regressor with default settings
model = AdaBoostRegressor()
model.fit(x_train, y_train)

# R-squared on the training and testing data
train_r2 = r2_score(y_train, model.predict(x_train))
test_r2 = r2_score(y_test, model.predict(x_test))
print('Rsquared on Training', train_r2)
print('Rsquared on Testing', test_r2)

Rsquared on Training 0.9092212616586101
Rsquared on Testing 0.849114176702914


#### Adaboost Tuning

In [21]:
from sklearn.model_selection import GridSearchCV

# Candidate settings: odd ensemble sizes from 1 to 99 and ten learning rates
# evenly spaced between 0.001 and 1
param_grid = dict(
    n_estimators=np.arange(1, 100, 2),
    learning_rate=np.linspace(0.001, 1, 10),
)

# Grid search over the AdaBoost settings with 5-fold cross-validation
model = GridSearchCV(estimator=AdaBoostRegressor(), param_grid=param_grid, cv=5)
model.fit(x_train, y_train)

# Report the winning setting
print('Best parameters are:', model.best_params_)

# R-squared on the training and testing data
train_r2 = r2_score(y_train, model.predict(x_train))
test_r2 = r2_score(y_test, model.predict(x_test))
print('Rsquared on Training', train_r2)
print('Rsquared on Testing', test_r2)

Best parameters are: {'learning_rate': 0.445, 'n_estimators': 31}
Rsquared on Training 0.8941910447449928
Rsquared on Testing 0.8006519125223412


#### Random Forest

In [22]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score

# Fit a random forest regressor with default settings
model = RandomForestRegressor()
model.fit(x_train, y_train)

# R-squared on the training and testing data
train_r2 = r2_score(y_train, model.predict(x_train))
test_r2 = r2_score(y_test, model.predict(x_test))
print('Rsquared on Training', train_r2)
print('Rsquared on Testing', test_r2)

Rsquared on Training 0.9617639347910545
Rsquared on Testing 0.7958895500037099


#### Random Forest Tuning

In [23]:
from sklearn.model_selection import GridSearchCV

# Candidate settings: even forest sizes from 2 to 48 and 2 to 9 features
# considered per split
param_grid = dict(
    n_estimators=np.arange(2, 50, 2),
    max_features=np.arange(2, 10),
)

# Grid search over the forest settings with 5-fold cross-validation
model = GridSearchCV(estimator=RandomForestRegressor(), param_grid=param_grid, cv=5)
model.fit(x_train, y_train)

# Report the winning setting
print('Best parameters are:', model.best_params_)

# R-squared on the training and testing data
train_r2 = r2_score(y_train, model.predict(x_train))
test_r2 = r2_score(y_test, model.predict(x_test))
print('Rsquared on Training', train_r2)
print('Rsquared on Testing', test_r2)

Best parameters are: {'max_features': 5, 'n_estimators': 26}
Rsquared on Training 0.9786420237922232
Rsquared on Testing 0.8826499273162934


#### Gradient Boosting

In [24]:
from sklearn.ensemble import GradientBoostingRegressor

# Fit a gradient boosting regressor with default settings
model = GradientBoostingRegressor()
model.fit(x_train, y_train)

# R-squared on the training and testing data
train_r2 = r2_score(y_train, model.predict(x_train))
test_r2 = r2_score(y_test, model.predict(x_test))
print('Rsquared on Training', train_r2)
print('Rsquared on Testing', test_r2)

Rsquared on Training 0.9818535458511707
Rsquared on Testing 0.8857026030279465


#### Gradient Boosting Tuning

In [25]:
from sklearn.model_selection import GridSearchCV

# Candidate settings: odd ensemble sizes from 1 to 19 and ten learning rates
# evenly spaced between 0.001 and 0.3
param_grid = dict(
    n_estimators=np.arange(1, 20, 2),
    learning_rate=np.linspace(0.001, .3, 10),
)

# Grid search over the boosting settings with 5-fold cross-validation
model = GridSearchCV(estimator=GradientBoostingRegressor(), param_grid=param_grid, cv=5)
model.fit(x_train, y_train)

# Report the winning setting
print('Best parameters are:', model.best_params_)

# R-squared on the training and testing data
train_r2 = r2_score(y_train, model.predict(x_train))
test_r2 = r2_score(y_test, model.predict(x_test))
print('Rsquared on Training', train_r2)
print('Rsquared on Testing', test_r2)

Best parameters are: {'learning_rate': 0.223, 'n_estimators': 19}
Rsquared on Training 0.9575133653372899
Rsquared on Testing 0.8572693411649928


#### ElasticNet

In [26]:
from sklearn.linear_model import ElasticNet

# Fit an elastic-net regressor with default settings
model = ElasticNet()
model.fit(x_train, y_train)

# R-squared on the training and testing data
train_r2 = r2_score(y_train, model.predict(x_train))
test_r2 = r2_score(y_test, model.predict(x_test))
print('Rsquared on Training', train_r2)
print('Rsquared on Testing', test_r2)

Rsquared on Training 0.6669924636401611
Rsquared on Testing 0.7041261455492028


#### ElasticNet Tuning

In [27]:
# Candidate settings: ten penalty strengths evenly spaced between 0 and 10
# and ten L1/L2 mixes evenly spaced between 0 and 1.
# NOTE(review): the alpha grid starts at exactly 0 (no penalty); the
# scikit-learn docs advise against alpha=0 for this estimator -- confirm
# whether the lower bound should be a small positive value instead.
param_grid = dict(
    alpha=np.linspace(0, 10, 10),
    l1_ratio=np.linspace(0, 1, 10),
)

# Grid search over the penalty settings with 5-fold cross-validation
model = GridSearchCV(estimator=ElasticNet(), param_grid=param_grid, cv=5)
model.fit(x_train, y_train)

# Report the winning setting
print('Best parameters are:', model.best_params_)

# R-squared on the training and testing data
train_r2 = r2_score(y_train, model.predict(x_train))
test_r2 = r2_score(y_test, model.predict(x_test))
print('Rsquared on Training', train_r2)
print('Rsquared on Testing', test_r2)

Best parameters are: {'alpha': 0.0, 'l1_ratio': 0.0}
Rsquared on Training 0.7176202687340321
Rsquared on Testing 0.7826126074271011


#### KNN

In [7]:
from sklearn.neighbors import KNeighborsRegressor

# Fit a k-nearest-neighbours regressor with default settings
model = KNeighborsRegressor()
model.fit(x_train, y_train)

# R-squared on the training and testing data
train_r2 = r2_score(y_train, model.predict(x_train))
test_r2 = r2_score(y_test, model.predict(x_test))
print('Rsquared on Training', train_r2)
print('Rsquared on Testing', test_r2)

Rsquared on Training 0.6680284285778176
Rsquared on Testing 0.5941848521354716


#### KNN Tuning

In [29]:
# Candidate neighbourhood sizes: 1 to 9
param_grid = dict(n_neighbors=np.arange(1, 10))

# Grid search over the number of neighbours with 5-fold cross-validation
model = GridSearchCV(estimator=KNeighborsRegressor(), param_grid=param_grid, cv=5)
model.fit(x_train, y_train)

# Report the winning setting
print('Best parameters are:', model.best_params_)

# R-squared on the training and testing data
train_r2 = r2_score(y_train, model.predict(x_train))
test_r2 = r2_score(y_test, model.predict(x_test))
print('Rsquared on Training', train_r2)
print('Rsquared on Testing', test_r2)

Best parameters are: {'n_neighbors': 3}
Rsquared on Training 0.7561816517830935
Rsquared on Testing 0.5547519590174173
