# 1. Load dataset

#### In this assignment, you are expected to practice building pipeline, doing k-fold cross validation and performing hyperparameter tuning.
#### You will be working with mobile phone dataset (mobile_train.csv).

In [63]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

df=pd.read_csv('mobile_train.csv')

# 2. Tuning

#### Build a random forest classifier model and perform hyperparameter tuning using grid search. Also apply 5-fold cross validation while doing searching. Use following values for the search:
- n_estimators - 100, 200, 300
- max_depth - 3, 5
- criterion - gini, entropy

In [61]:
# manually searching using for loops

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn import metrics

X=df[['battery_power','int_memory','ram','n_cores','px_height','px_width','three_g','wifi']]  # Features
y=df['price_range']  # Labels

# Split dataset into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

n_estimators=[100,200,300]
max_depths=[3,5]
criterions=['gini', 'entropy']

avg_scores={}

for n_estimator in n_estimators:
    for max_depth in max_depths:
        for criterion in criterions:
            clf=RandomForestClassifier(n_estimators=n_estimator,max_depth=max_depth, criterion=criterion)
            scores=cross_val_score(clf,X,y,cv=5)
            avg_scores[str(n_estimator)+' '+str(max_depth)+' '+criterion] = np.average(scores)

print(avg_scores)
print('\n\nBest params: ', list(avg_scores.keys())[list(avg_scores.values()).index(max(avg_scores.values()))])

{'100 3 gini': 0.8109999999999999, '100 3 entropy': 0.796, '100 5 gini': 0.849, '100 5 entropy': 0.8525, '200 3 gini': 0.8039999999999999, '200 3 entropy': 0.8039999999999999, '200 5 gini': 0.8605, '200 5 entropy': 0.853, '300 3 gini': 0.8085000000000001, '300 3 entropy': 0.796, '300 5 gini': 0.8560000000000001, '300 5 entropy': 0.852}


Best params:  200 5 gini


In [64]:
# using GRID SEARCH 

from sklearn.ensemble import RandomForestClassifier 
from sklearn.model_selection import GridSearchCV

model=RandomForestClassifier()

clf = GridSearchCV(model, {
    'n_estimators':[100,200,300],
    'criterion':['gini','entropy'],
    'max_depth':[3,5]
}, cv=5, return_train_score=False)

clf.fit(X_train, y_train)
grid_search_results = pd.DataFrame(clf.cv_results_)
grid_search_results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.14292,0.000635,0.012391,0.000798,gini,3,100,"{'criterion': 'gini', 'max_depth': 3, 'n_estim...",0.8375,0.8,0.840625,0.840625,0.79375,0.8225,0.021047,7
1,0.286637,0.00538,0.022985,0.000635,gini,3,200,"{'criterion': 'gini', 'max_depth': 3, 'n_estim...",0.80625,0.834375,0.84375,0.80625,0.778125,0.81375,0.023268,9
2,0.426761,0.000635,0.03238,0.000798,gini,3,300,"{'criterion': 'gini', 'max_depth': 3, 'n_estim...",0.809375,0.81875,0.8125,0.834375,0.803125,0.815625,0.010643,8
3,0.161307,0.001853,0.012793,0.0004,gini,5,100,"{'criterion': 'gini', 'max_depth': 5, 'n_estim...",0.8375,0.840625,0.8875,0.8625,0.821875,0.85,0.022793,5
4,0.322214,0.004311,0.024186,0.000979,gini,5,200,"{'criterion': 'gini', 'max_depth': 5, 'n_estim...",0.8375,0.84375,0.890625,0.871875,0.834375,0.855625,0.021955,2
5,0.482923,0.001465,0.043977,0.018497,gini,5,300,"{'criterion': 'gini', 'max_depth': 5, 'n_estim...",0.825,0.840625,0.871875,0.86875,0.821875,0.845625,0.021158,6
6,0.170898,0.001268,0.012197,0.000403,entropy,3,100,"{'criterion': 'entropy', 'max_depth': 3, 'n_es...",0.7875,0.75625,0.79375,0.79375,0.78125,0.7825,0.013919,12
7,0.341406,0.002802,0.022781,0.000747,entropy,3,200,"{'criterion': 'entropy', 'max_depth': 3, 'n_es...",0.8,0.796875,0.803125,0.796875,0.778125,0.795,0.00875,11
8,0.526298,0.009259,0.033584,0.000797,entropy,3,300,"{'criterion': 'entropy', 'max_depth': 3, 'n_es...",0.809375,0.809375,0.8375,0.81875,0.790625,0.813125,0.015233,10
9,0.213876,0.008119,0.012991,0.000892,entropy,5,100,"{'criterion': 'entropy', 'max_depth': 5, 'n_es...",0.85,0.8375,0.86875,0.875,0.834375,0.853125,0.016298,3


#### Get the best score and optimal values for hyperparameters.

In [65]:
clf.best_params_

{'criterion': 'entropy', 'max_depth': 5, 'n_estimators': 200}

In [66]:
clf.best_score_

0.8574999999999999

#### Inspect the values and check if you can do better.

#### Perform hyperparameter tuning using random search. Increase number of iterations if needed. Do not forget about 5-fold validation.

#### Create a pipeline and add standard scaling and dimensionality reduction. You can use StandardScaler and PCA. Perform tuning by random search. Now you have to provide values for hyperparameters of different components of your pipeline. Find out how you can achieve that. For PCA, one hyperparameter to tune would be number of components. Try to isolate and check the effect of scaling and dimensionality reduction on the model.