# 1. Load dataset

#### In this assignment, you are expected to practice building pipeline, doing k-fold cross validation and performing hyperparameter tuning.
#### You will be working with mobile phone dataset (mobile_train.csv).

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

df=pd.read_csv('mobile_train.csv')
df


Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
0,842,0,2.2,0,1,0,7,0.6,188,2,...,20,756,2549,9,7,19,0,0,1,1
1,1021,1,0.5,1,0,1,53,0.7,136,3,...,905,1988,2631,17,3,7,1,1,0,2
2,563,1,0.5,1,2,1,41,0.9,145,5,...,1263,1716,2603,11,2,9,1,1,0,2
3,615,1,2.5,0,0,0,10,0.8,131,6,...,1216,1786,2769,16,8,11,1,0,0,2
4,1821,1,1.2,0,13,1,44,0.6,141,2,...,1208,1212,1411,8,2,15,1,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,794,1,0.5,1,0,1,2,0.8,106,6,...,1222,1890,668,13,4,19,1,1,0,0
1996,1965,1,2.6,1,0,0,39,0.2,187,4,...,915,1965,2032,11,10,16,1,1,1,2
1997,1911,0,0.9,1,1,1,36,0.7,108,8,...,868,1632,3057,9,1,5,1,1,0,3
1998,1512,0,0.9,0,4,1,46,0.1,145,5,...,336,670,869,18,10,19,1,1,1,0


# 2. Tuning

#### Build a random forest classifier model and perform hyperparameter tuning using grid search. Also apply 5-fold cross validation while doing searching. Use following values for the search:
- n_estimators - 100, 200, 300
- max_depth - 3, 5
- criterion - gini, entropy

In [2]:
# manually searching using for loops

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn import metrics

X=df[['battery_power','int_memory','ram','n_cores','px_height','px_width','three_g','wifi']]  # Features
y=df['price_range']  # Labels

# Split dataset into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

n_estimators=[100,200,300]
max_depths=[3,5]
criterions=['gini', 'entropy']

avg_scores={}

for n_estimator in n_estimators:
    for max_depth in max_depths:
        for criterion in criterions:
            clf=RandomForestClassifier(n_estimators=n_estimator,max_depth=max_depth, criterion=criterion)
            scores=cross_val_score(clf,X,y,cv=5)
            
            avg_scores[str(n_estimator)+' '+str(max_depth)+' '+criterion] = np.average(scores)

print(avg_scores)

print('\n\nBest params: ', list(avg_scores.keys())[list(avg_scores.values()).index(max(avg_scores.values()))])

{'100 3 gini': 0.807, '100 3 entropy': 0.7949999999999999, '100 5 gini': 0.849, '100 5 entropy': 0.858, '200 3 gini': 0.8115, '200 3 entropy': 0.8005000000000001, '200 5 gini': 0.8539999999999999, '200 5 entropy': 0.8540000000000001, '300 3 gini': 0.8119999999999999, '300 3 entropy': 0.7955000000000001, '300 5 gini': 0.8540000000000001, '300 5 entropy': 0.8585}


Best params:  300 5 entropy


In [3]:
# using GRID SEARCH 

from sklearn.ensemble import RandomForestClassifier 
from sklearn.model_selection import GridSearchCV

model=RandomForestClassifier()

clf = GridSearchCV(model, {
    'n_estimators':[100,200,300],
    'criterion':['gini','entropy'],
    'max_depth':[3,5]
}, cv=5, return_train_score=False)

clf.fit(X_train, y_train)
grid_search_results = pd.DataFrame(clf.cv_results_)
grid_search_results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.144117,0.004789,0.011193,0.0004,gini,3,100,"{'criterion': 'gini', 'max_depth': 3, 'n_estim...",0.796875,0.859375,0.76875,0.8125,0.775,0.8025,0.03244,8
1,0.287036,0.00708,0.021586,0.000798,gini,3,200,"{'criterion': 'gini', 'max_depth': 3, 'n_estim...",0.796875,0.859375,0.79375,0.8125,0.79375,0.81125,0.025047,7
2,0.429345,0.004754,0.032193,0.001322,gini,3,300,"{'criterion': 'gini', 'max_depth': 3, 'n_estim...",0.775,0.84375,0.790625,0.803125,0.775,0.7975,0.025418,10
3,0.161906,0.002607,0.01219,0.000394,gini,5,100,"{'criterion': 'gini', 'max_depth': 5, 'n_estim...",0.815625,0.86875,0.828125,0.846875,0.840625,0.84,0.017941,4
4,0.329414,0.018098,0.021788,0.0004,gini,5,200,"{'criterion': 'gini', 'max_depth': 5, 'n_estim...",0.8,0.865625,0.81875,0.84375,0.81875,0.829375,0.022845,6
5,0.473528,0.013616,0.031781,0.001599,gini,5,300,"{'criterion': 'gini', 'max_depth': 5, 'n_estim...",0.803125,0.8875,0.83125,0.86875,0.840625,0.84625,0.029408,1
6,0.161703,0.001599,0.010597,0.000493,entropy,3,100,"{'criterion': 'entropy', 'max_depth': 3, 'n_es...",0.7625,0.828125,0.76875,0.778125,0.784375,0.784375,0.023133,12
7,0.331409,0.020884,0.021786,0.002131,entropy,3,200,"{'criterion': 'entropy', 'max_depth': 3, 'n_es...",0.765625,0.809375,0.8,0.834375,0.7875,0.799375,0.022827,9
8,0.496301,0.006777,0.030803,0.001348,entropy,3,300,"{'criterion': 'entropy', 'max_depth': 3, 'n_es...",0.771875,0.8125,0.775,0.809375,0.78125,0.79,0.017388,11
9,0.202575,0.003897,0.012391,0.000487,entropy,5,100,"{'criterion': 'entropy', 'max_depth': 5, 'n_es...",0.80625,0.8875,0.815625,0.8625,0.834375,0.84125,0.030065,3


#### Get the best score and optimal values for hyperparameters.

In [4]:
clf.best_params_

{'criterion': 'gini', 'max_depth': 5, 'n_estimators': 300}

In [5]:
clf.best_score_

0.8462499999999998

#### Perform hyperparameter tuning using random search. Increase number of iterations if needed. Do not forget about 5-fold validation.

In [6]:
from sklearn.model_selection import RepeatedStratifiedKFold 
from sklearn.model_selection import RandomizedSearchCV 
model = RandomForestClassifier()
# define evaluation
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# define search space
space = dict()
space['n_estimators'] = [100,200,300]
space['criterion'] = ['gini', 'entropy']
space['max_depth'] = [3,5]
# define search
search = RandomizedSearchCV(model, space, n_iter=500, scoring='accuracy', n_jobs=-1, cv=cv, random_state=1)
# execute search
result = search.fit(X, y)
# summarize result
print('Best Score: %s' % result.best_score_)
print('Best Hyperparameters: %s' % result.best_params_)



Best Score: 0.8566666666666664
Best Hyperparameters: {'n_estimators': 300, 'max_depth': 5, 'criterion': 'gini'}


#### Create a pipeline and add standard scaling and dimensionality reduction. You can use StandardScaler and PCA. Perform tuning by random search. Now you have to provide values for hyperparameters of different components of your pipeline. Find out how you can achieve that. For PCA, one hyperparameter to tune would be number of components. Try to isolate and check the effect of scaling and dimensionality reduction on the model.

In [8]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
pipe = Pipeline([("classifier", RandomForestClassifier())])
# Create dictionary with candidate learning algorithms and their hyperparameters
grid_param = [
                {"classifier": [RandomForestClassifier()],
                 "classifier__n_estimators": [100, 200, 300],
                 "classifier__max_depth":[3,5],
                 "classifier__criterion":["gini","entropy"]}]
gridsearch = GridSearchCV(pipe, grid_param, cv=RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1), verbose=0,n_jobs=-1) # Fit grid search
best_model = gridsearch.fit(X_train,y_train)
print(best_model.best_estimator_)
print("The mean accuracy of the model is:",best_model.score(X_test,y_test))

Pipeline(steps=[('classifier',
                 RandomForestClassifier(criterion='entropy', max_depth=5,
                                        n_estimators=300))])
The mean accuracy of the model is: 0.885
