# 1. Load dataset

#### In this assignment, you are expected to practice building pipeline, doing k-fold cross validation and performing hyperparameter tuning.
#### You will be working with mobile phone dataset (mobile_train.csv).

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

df=pd.read_csv('mobile_train.csv')

# 2. Tuning

#### Build a random forest classifier model and perform hyperparameter tuning using grid search. Also apply 5-fold cross validation while doing searching. Use following values for the search:
- n_estimators - 100, 200, 300
- max_depth - 3, 5
- criterion - gini, entropy

In [4]:
# manually searching using for loops

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn import metrics

X=df[['battery_power','int_memory','ram','n_cores','px_height','px_width','three_g','wifi']]  # Features
y=df['price_range']  # Labels

# Split dataset into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

n_estimators=[100,200,300]
max_depths=[3,5]
criterions=['gini', 'entropy']

avg_scores={}

for n_estimator in n_estimators:
    for max_depth in max_depths:
        for criterion in criterions:
            clf=RandomForestClassifier(n_estimators=n_estimator,max_depth=max_depth, criterion=criterion)
            scores=cross_val_score(clf,X,y,cv=5)
            avg_scores[str(n_estimator)+' '+str(max_depth)+' '+criterion] = np.average(scores)

print(avg_scores)
print('\n\nBest params: ', list(avg_scores.keys())[list(avg_scores.values()).index(max(avg_scores.values()))])

KeyboardInterrupt: 

In [3]:
# using GRID SEARCH 

from sklearn.ensemble import RandomForestClassifier 
from sklearn.model_selection import GridSearchCV

model=RandomForestClassifier()

clf = GridSearchCV(model, {
    'n_estimators':[100,200,300],
    'criterion':['gini','entropy'],
    'max_depth':[3,5]
}, cv=5, return_train_score=False)

clf.fit(X_train, y_train)
grid_search_results = pd.DataFrame(clf.cv_results_)
grid_search_results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.298204,0.00754,0.02654,0.008772,gini,3,100,"{'criterion': 'gini', 'max_depth': 3, 'n_estim...",0.80625,0.8125,0.80625,0.80625,0.78125,0.8025,0.010897,8
1,0.596722,0.034765,0.048322,0.001346,gini,3,200,"{'criterion': 'gini', 'max_depth': 3, 'n_estim...",0.815625,0.79375,0.790625,0.825,0.778125,0.800625,0.017162,10
2,0.885575,0.024677,0.071761,0.007334,gini,3,300,"{'criterion': 'gini', 'max_depth': 3, 'n_estim...",0.815625,0.790625,0.790625,0.809375,0.79375,0.8,0.010458,11
3,0.328341,0.006573,0.024226,0.010356,gini,5,100,"{'criterion': 'gini', 'max_depth': 5, 'n_estim...",0.853125,0.84375,0.853125,0.865625,0.85,0.853125,0.007126,5
4,0.6712,0.019518,0.044598,0.006041,gini,5,200,"{'criterion': 'gini', 'max_depth': 5, 'n_estim...",0.865625,0.8375,0.8625,0.8625,0.846875,0.855,0.010933,2
5,0.973848,0.006605,0.070015,0.007736,gini,5,300,"{'criterion': 'gini', 'max_depth': 5, 'n_estim...",0.8625,0.828125,0.85625,0.865625,0.86875,0.85625,0.014658,1
6,0.340063,0.004943,0.025402,0.00757,entropy,3,100,"{'criterion': 'entropy', 'max_depth': 3, 'n_es...",0.79375,0.809375,0.784375,0.828125,0.78125,0.799375,0.017388,12
7,0.690679,0.01763,0.045941,0.007763,entropy,3,200,"{'criterion': 'entropy', 'max_depth': 3, 'n_es...",0.825,0.8,0.790625,0.81875,0.771875,0.80125,0.019223,9
8,1.074413,0.044907,0.066001,0.003526,entropy,3,300,"{'criterion': 'entropy', 'max_depth': 3, 'n_es...",0.825,0.7875,0.790625,0.834375,0.7875,0.805,0.020406,7
9,0.452838,0.048386,0.024354,0.004083,entropy,5,100,"{'criterion': 'entropy', 'max_depth': 5, 'n_es...",0.871875,0.8375,0.853125,0.859375,0.846875,0.85375,0.011592,4


#### Get the best score and optimal values for hyperparameters.

In [4]:
clf.best_params_

{'criterion': 'gini', 'max_depth': 5, 'n_estimators': 300}

In [5]:
clf.best_score_

0.85625

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier 
from sklearn.metrics import accuracy_score

X=df[['battery_power','int_memory','ram','n_cores','px_height','px_width','three_g','wifi']]  # Features
y=df['price_range']  # Labels

# Split dataset into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)


model=RandomForestClassifier(criterion='gini', max_depth=5, n_estimators=300)
model.fit(X_train, y_train)
y_predict=model.predict(X_test)

accuracy_score(y_test, y_predict)

0.8075

#### Perform hyperparameter tuning using random search. Increase number of iterations if needed. Do not forget about 5-fold validation.

In [6]:
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import RandomizedSearchCV
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

space = dict()
space['n_estimators'] = [100, 200, 300] 
space['criterion'] = ['gini', 'entropy']
space['max_depth'] = [3, 5]

search = RandomizedSearchCV(model, space, n_iter=500, scoring='accuracy', n_jobs=-1, cv=cv, random_state=1)

result = search.fit(X, y)
# summarize result
print('Best Score: %s' % result.best_score_)
print('Best Hyperparameters: %s' % result.best_params_)



Best Score: 0.8571666666666667
Best Hyperparameters: {'n_estimators': 300, 'max_depth': 5, 'criterion': 'gini'}


#### Create a pipeline and add standard scaling and dimensionality reduction. You can use StandardScaler and PCA. Perform tuning by random search. Now you have to provide values for hyperparameters of different components of your pipeline. Find out how you can achieve that. For PCA, one hyperparameter to tune would be number of components. Try to isolate and check the effect of scaling and dimensionality reduction on the model.

In [7]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [8]:
pipeline_lr=Pipeline([('scalar1',StandardScaler()),
                     ('pca1',PCA(n_components=2)),
                     ('lr_classifier',LogisticRegression(random_state=0))])


In [9]:
pipeline_dt=Pipeline([('scalar2',StandardScaler()),
                     ('pca2',PCA(n_components=2)),
                     ('dt_classifier',DecisionTreeClassifier())])

In [10]:
pipeline_randomforest=Pipeline([('scalar3',StandardScaler()),
                     ('pca3',PCA(n_components=2)),
                     ('rf_classifier',RandomForestClassifier())])

In [11]:
## LEts make the list of pipelines
pipelines = [pipeline_lr, pipeline_dt, pipeline_randomforest]

In [12]:
best_accuracy=0.0
best_classifier=0
best_pipeline=""

In [13]:
# Dictionary of pipelines and classifier types for ease of reference
pipe_dict = {0: 'Logistic Regression', 1: 'Decision Tree', 2: 'RandomForest'}

In [14]:
# Fit the pipelines
for pipe in pipelines:
	pipe.fit(X_train, y_train)

In [15]:
for i,model in enumerate(pipelines):
    print("{} Test Accuracy: {}".format(pipe_dict[i],model.score(X_test,y_test)))

Logistic Regression Test Accuracy: 0.385
Decision Tree Test Accuracy: 0.355
RandomForest Test Accuracy: 0.3725


In [16]:
for i,model in enumerate(pipelines):
    if model.score(X_test,y_test)>best_accuracy:
        best_accuracy=model.score(X_test,y_test)
        best_pipeline=model
        best_classifier=i
print('Classifier with best accuracy:{}'.format(pipe_dict[best_classifier]))

Classifier with best accuracy:Logistic Regression


In [17]:
from sklearn.model_selection import GridSearchCV

In [18]:
# Create a pipeline
pipe = Pipeline([("classifier", RandomForestClassifier())])
# Create dictionary with candidate learning algorithms and their hyperparameters
grid_param = [
                {"classifier": [LogisticRegression()],
                 "classifier__penalty": ['l2','l1'],
                 "classifier__C": np.logspace(0, 4, 10)
                 },
                {"classifier": [LogisticRegression()],
                 "classifier__penalty": ['l2'],
                 "classifier__C": np.logspace(0, 4, 10),
                 "classifier__solver":['newton-cg','saga','sag','liblinear'] ##This solvers don't allow L1 penalty
                 },
                {"classifier": [RandomForestClassifier()],
                 "classifier__n_estimators": [100, 200, 300],
                 "classifier__max_depth":[3, 5]}]
# create a gridsearch of the pipeline, the fit the best model
gridsearch = GridSearchCV(pipe, grid_param, cv=5, verbose=0,n_jobs=-1) # Fit grid search
best_model = gridsearch.fit(X_train,y_train)

print(best_model.best_estimator_)
print("The mean accuracy of the model is:",best_model.score(X_test,y_test))

50 fits failed out of a total of 330.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
50 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\bashi\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\bashi\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\pipeline.py", line 382, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "C:\Users\bashi\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\linear_model\_logistic.py", line 1091, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  Fi

Pipeline(steps=[('classifier',
                 LogisticRegression(C=7.742636826811269, solver='newton-cg'))])
The mean accuracy of the model is: 0.955




In [19]:
from sklearn.pipeline import make_pipeline

In [20]:
# Create a pipeline
pipe = make_pipeline((RandomForestClassifier()))
# Create dictionary with candidate learning algorithms and their hyperparameters
grid_param = [
                {"randomforestclassifier": [RandomForestClassifier()],
                 "randomforestclassifier__n_estimators": [100, 200, 300],
                 "randomforestclassifier__max_depth":[3, 5]}]
# create a gridsearch of the pipeline, the fit the best model
gridsearch = GridSearchCV(pipe, grid_param, cv=5, verbose=0,n_jobs=-1) # Fit grid search
best_model = gridsearch.fit(X_train,y_train)

In [21]:
best_model.score(X_test,y_test)

0.8675