# 1. Load dataset

#### In this assignment, you are expected to practice building pipeline, doing k-fold cross validation and performing hyperparameter tuning.
#### You will be working with mobile phone dataset (mobile_train.csv).

In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

df=pd.read_csv('mobile_train.csv')

# 2. Tuning

#### Build a random forest classifier model and perform hyperparameter tuning using grid search. Also apply 5-fold cross validation while doing searching. Use following values for the search:
- n_estimators - 100, 200, 300
- max_depth - 3, 5
- criterion - gini, entropy

In [6]:
# manually searching using for loops

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn import metrics

X=df[['battery_power','int_memory','ram','n_cores','px_height','px_width','three_g','wifi']]  # Features
y=df['price_range']  # Labels

# Split dataset into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

n_estimators=[100,200,300]
max_depths=[3,5]
criterions=['gini', 'entropy']

avg_scores={}

for n_estimator in n_estimators:
    for max_depth in max_depths:
        for criterion in criterions:
            clf=RandomForestClassifier(n_estimators=n_estimator,max_depth=max_depth, criterion=criterion)
            scores=cross_val_score(clf,X,y,cv=5)
            avg_scores[str(n_estimator)+' '+str(max_depth)+' '+criterion] = np.average(scores)

print(avg_scores)
print('\n\nBest params: ', list(avg_scores.keys())[list(avg_scores.values()).index(max(avg_scores.values()))])

{'100 3 gini': 0.8094999999999999, '100 3 entropy': 0.79, '100 5 gini': 0.858, '100 5 entropy': 0.8475000000000001, '200 3 gini': 0.8135, '200 3 entropy': 0.7945, '200 5 gini': 0.8585, '200 5 entropy': 0.8480000000000001, '300 3 gini': 0.8065000000000001, '300 3 entropy': 0.8025, '300 5 gini': 0.857, '300 5 entropy': 0.852}


Best params:  200 5 gini


In [7]:
# using GRID SEARCH 

from sklearn.ensemble import RandomForestClassifier 
from sklearn.model_selection import GridSearchCV

model=RandomForestClassifier()

clf = GridSearchCV(model, {
    'n_estimators':[100,200,300],
    'criterion':['gini','entropy'],
    'max_depth':[3,5]
}, cv=5, return_train_score=False)

clf.fit(X_train, y_train)
grid_search_results = pd.DataFrame(clf.cv_results_)
grid_search_results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.251056,0.03093,0.01759,0.003927,gini,3,100,"{'criterion': 'gini', 'max_depth': 3, 'n_estim...",0.81875,0.784375,0.809375,0.809375,0.803125,0.805,0.011456,7
1,0.372789,0.0409,0.030782,0.013548,gini,3,200,"{'criterion': 'gini', 'max_depth': 3, 'n_estim...",0.825,0.765625,0.803125,0.8,0.790625,0.796875,0.019264,9
2,0.545288,0.078639,0.035581,0.00326,gini,3,300,"{'criterion': 'gini', 'max_depth': 3, 'n_estim...",0.834375,0.775,0.80625,0.7875,0.79375,0.799375,0.020194,8
3,0.220671,0.02662,0.015793,0.001599,gini,5,100,"{'criterion': 'gini', 'max_depth': 5, 'n_estim...",0.85625,0.821875,0.846875,0.8625,0.8625,0.85,0.015181,3
4,0.474927,0.077248,0.026385,0.002058,gini,5,200,"{'criterion': 'gini', 'max_depth': 5, 'n_estim...",0.853125,0.84375,0.85,0.8875,0.86875,0.860625,0.015762,1
5,0.556085,0.047403,0.036977,0.003577,gini,5,300,"{'criterion': 'gini', 'max_depth': 5, 'n_estim...",0.865625,0.8125,0.85,0.871875,0.859375,0.851875,0.020972,2
6,0.18889,0.011235,0.01319,0.00172,entropy,3,100,"{'criterion': 'entropy', 'max_depth': 3, 'n_es...",0.790625,0.8,0.721875,0.790625,0.75,0.770625,0.029883,12
7,0.457939,0.063132,0.029579,0.010832,entropy,3,200,"{'criterion': 'entropy', 'max_depth': 3, 'n_es...",0.81875,0.740625,0.7875,0.803125,0.803125,0.790625,0.026882,11
8,0.580465,0.043701,0.033383,0.002331,entropy,3,300,"{'criterion': 'entropy', 'max_depth': 3, 'n_es...",0.8125,0.771875,0.8,0.81875,0.765625,0.79375,0.021378,10
9,0.264049,0.032902,0.014192,0.002134,entropy,5,100,"{'criterion': 'entropy', 'max_depth': 5, 'n_es...",0.8375,0.840625,0.8375,0.86875,0.859375,0.84875,0.0129,4


#### Get the best score and optimal values for hyperparameters.

In [8]:
clf.best_params_

{'criterion': 'gini', 'max_depth': 5, 'n_estimators': 200}

In [9]:
clf.best_score_

0.8606250000000001

#### Perform hyperparameter tuning using random search. Increase number of iterations if needed. Do not forget about 5-fold validation.

In [11]:
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import RandomizedSearchCV
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

space = dict()
space['n_estimators'] = [100, 200, 300] 
space['criterion'] = ['gini', 'entropy']
space['max_depth'] = [3, 5]

search = RandomizedSearchCV(model, space, n_iter=500, scoring='accuracy', n_jobs=-1, cv=cv, random_state=1)

result = search.fit(X, y)
# summarize result
print('Best Score: %s' % result.best_score_)
print('Best Hyperparameters: %s' % result.best_params_)



Best Score: 0.8571666666666667
Best Hyperparameters: {'n_estimators': 200, 'max_depth': 5, 'criterion': 'gini'}


#### Create a pipeline and add standard scaling and dimensionality reduction. You can use StandardScaler and PCA. Perform tuning by random search. Now you have to provide values for hyperparameters of different components of your pipeline. Find out how you can achieve that. For PCA, one hyperparameter to tune would be number of components. Try to isolate and check the effect of scaling and dimensionality reduction on the model.

In [12]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [13]:
pipeline_lr=Pipeline([('scalar1',StandardScaler()),
                     ('pca1',PCA(n_components=2)),
                     ('lr_classifier',LogisticRegression(random_state=0))])


In [14]:
pipeline_dt=Pipeline([('scalar2',StandardScaler()),
                     ('pca2',PCA(n_components=2)),
                     ('dt_classifier',DecisionTreeClassifier())])

In [15]:
pipeline_randomforest=Pipeline([('scalar3',StandardScaler()),
                     ('pca3',PCA(n_components=2)),
                     ('rf_classifier',RandomForestClassifier())])

In [16]:
## LEts make the list of pipelines
pipelines = [pipeline_lr, pipeline_dt, pipeline_randomforest]

In [17]:
best_accuracy=0.0
best_classifier=0
best_pipeline=""

In [18]:
# Dictionary of pipelines and classifier types for ease of reference
pipe_dict = {0: 'Logistic Regression', 1: 'Decision Tree', 2: 'RandomForest'}

In [19]:
# Fit the pipelines
for pipe in pipelines:
	pipe.fit(X_train, y_train)

In [20]:
for i,model in enumerate(pipelines):
    print("{} Test Accuracy: {}".format(pipe_dict[i],model.score(X_test,y_test)))

Logistic Regression Test Accuracy: 0.345
Decision Tree Test Accuracy: 0.28
RandomForest Test Accuracy: 0.2625


In [21]:
for i,model in enumerate(pipelines):
    if model.score(X_test,y_test)>best_accuracy:
        best_accuracy=model.score(X_test,y_test)
        best_pipeline=model
        best_classifier=i
print('Classifier with best accuracy:{}'.format(pipe_dict[best_classifier]))

Classifier with best accuracy:Logistic Regression


In [22]:
from sklearn.model_selection import GridSearchCV

In [25]:
# Create a pipeline
pipe = Pipeline([("classifier", RandomForestClassifier())])
# Create dictionary with candidate learning algorithms and their hyperparameters
grid_param = [
                {"classifier": [LogisticRegression()],
                 "classifier__penalty": ['l2','l1'],
                 "classifier__C": np.logspace(0, 4, 10)
                 },
                {"classifier": [LogisticRegression()],
                 "classifier__penalty": ['l2'],
                 "classifier__C": np.logspace(0, 4, 10),
                 "classifier__solver":['newton-cg','saga','sag','liblinear'] ##This solvers don't allow L1 penalty
                 },
                {"classifier": [RandomForestClassifier()],
                 "classifier__n_estimators": [100, 200, 300],
                 "classifier__max_depth":[3, 5]}]
# create a gridsearch of the pipeline, the fit the best model
gridsearch = GridSearchCV(pipe, grid_param, cv=5, verbose=0,n_jobs=-1) # Fit grid search
best_model = gridsearch.fit(X_train,y_train)

print(best_model.best_estimator_)
print("The mean accuracy of the model is:",best_model.score(X_test,y_test))

 0.555625      nan 0.5525        nan 0.55125       nan 0.551875      nan
 0.551875      nan 0.55125       nan 0.95     0.53875  0.545625 0.75375
 0.951875 0.539375 0.545    0.781875 0.95375  0.539375 0.54625  0.80125
 0.951875 0.539375 0.545625 0.805    0.955    0.539375 0.545625 0.805
 0.955    0.539375 0.545    0.806875 0.9525   0.53875  0.545625 0.81375
 0.9525   0.539375 0.544375 0.810625 0.95125  0.539375 0.545    0.809375
 0.950625 0.539375 0.546875 0.803125 0.801875 0.800625 0.801875 0.8475
 0.85125  0.854375]


Pipeline(steps=[('classifier',
                 LogisticRegression(C=59.94842503189409, solver='newton-cg'))])
The mean accuracy of the model is: 0.965




In [26]:
from sklearn.pipeline import make_pipeline

In [30]:
# Create a pipeline
pipe = make_pipeline((RandomForestClassifier()))
# Create dictionary with candidate learning algorithms and their hyperparameters
grid_param = [
                {"randomforestclassifier": [RandomForestClassifier()],
                 "randomforestclassifier__n_estimators": [100, 200, 300],
                 "randomforestclassifier__max_depth":[3, 5]}]
# create a gridsearch of the pipeline, the fit the best model
gridsearch = GridSearchCV(pipe, grid_param, cv=5, verbose=0,n_jobs=-1) # Fit grid search
best_model = gridsearch.fit(X_train,y_train)

In [31]:
best_model.score(X_test,y_test)

0.8675