# Ex01 Gridsearch

In [1]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from tqdm.notebook import tqdm 
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np

## 1. Preprocessing

In [2]:
df_s = pd.read_csv('../data/dayofweek.csv')

In [3]:
df = pd.read_csv('../data/day-of-week-not-scaled.csv')
df.head()

Unnamed: 0,numTrials,hour,uid_user_0,uid_user_1,uid_user_10,uid_user_11,uid_user_12,uid_user_13,uid_user_14,uid_user_15,...,labname_lab02,labname_lab03,labname_lab03s,labname_lab05s,labname_laba04,labname_laba04s,labname_laba05,labname_laba06,labname_laba06s,labname_project1
0,1,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,2,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,3,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,4,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,5,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [4]:
df['dayofweek'] = df_s['dayofweek']

In [5]:
X = df.drop(columns='dayofweek')
y = df['dayofweek']

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size =0.2, random_state=21, stratify=y)

## 2. SVM gridsearch

In [7]:
svc = SVC(probability=True, random_state=21)

param_grid = {
    'kernel': ['linear', 'rbf', 'sigmoid'],
    'C': [0.01, 0.1, 1, 1.5, 5, 10],
    'gamma': ['scale', 'auto'],
    'class_weight': ['balanced', None]
}


In [8]:
grid_search = GridSearchCV(estimator=svc, param_grid=param_grid, cv=5, scoring='accuracy', verbose=1, n_jobs=-1)
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 72 candidates, totalling 360 fits


In [9]:
best_params = grid_search.best_params_
best_params

{'C': 10, 'class_weight': None, 'gamma': 'auto', 'kernel': 'rbf'}

In [10]:
best_score = grid_search.best_score_
best_score

0.8761090458488228

In [11]:
results_df = pd.DataFrame(grid_search.cv_results_)
results_df.sort_values(by='rank_test_score', ascending=True)[['params','mean_test_score']]

Unnamed: 0,params,mean_test_score
70,"{'C': 10, 'class_weight': None, 'gamma': 'auto...",0.876109
64,"{'C': 10, 'class_weight': 'balanced', 'gamma':...",0.863500
58,"{'C': 5, 'class_weight': None, 'gamma': 'auto'...",0.816018
52,"{'C': 5, 'class_weight': 'balanced', 'gamma': ...",0.808608
63,"{'C': 10, 'class_weight': 'balanced', 'gamma':...",0.721052
...,...,...
53,"{'C': 5, 'class_weight': 'balanced', 'gamma': ...",0.129792
65,"{'C': 10, 'class_weight': 'balanced', 'gamma':...",0.115693
41,"{'C': 1.5, 'class_weight': 'balanced', 'gamma'...",0.079380
17,"{'C': 0.1, 'class_weight': 'balanced', 'gamma'...",0.062310


there is huge difference between different combinations in mean accuracy

## 3. Decision tree

In [12]:
tree = DecisionTreeClassifier(random_state=21)

In [13]:
tree_param_grid = {
    'max_depth': [1,5,10,20,30,40,45,49],
    'class_weight': ['balanced', None],
    'criterion':['entropy','gini']
}

In [14]:
grid_search_tree = GridSearchCV(estimator=tree, param_grid=tree_param_grid, cv=5, scoring='accuracy', verbose=1, n_jobs=-1)
grid_search_tree.fit(X_train, y_train)

Fitting 5 folds for each of 32 candidates, totalling 160 fits


In [15]:
best_params = grid_search_tree.best_params_
best_params

{'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 30}

In [16]:
best_score = grid_search_tree.best_score_
best_score

0.8731157923722979

In [17]:
results_tree = pd.DataFrame(grid_search_tree.cv_results_)
results_tree.sort_values(by='rank_test_score', ascending=True)[['params','mean_test_score']]

Unnamed: 0,params,mean_test_score
15,"{'class_weight': 'balanced', 'criterion': 'gin...",0.873116
14,"{'class_weight': 'balanced', 'criterion': 'gin...",0.873116
12,"{'class_weight': 'balanced', 'criterion': 'gin...",0.873116
13,"{'class_weight': 'balanced', 'criterion': 'gin...",0.873116
11,"{'class_weight': 'balanced', 'criterion': 'gin...",0.871632
27,"{'class_weight': None, 'criterion': 'gini', 'm...",0.867922
4,"{'class_weight': 'balanced', 'criterion': 'ent...",0.866463
5,"{'class_weight': 'balanced', 'criterion': 'ent...",0.866463
6,"{'class_weight': 'balanced', 'criterion': 'ent...",0.866463
7,"{'class_weight': 'balanced', 'criterion': 'ent...",0.866463


there is huge difference between different combinations in mean accuracy

## 4. Random forest

In [18]:
forest = RandomForestClassifier(random_state=21)

In [19]:
forest_param_grid = {
    'max_depth': [1,5,10,20,30,40,45,49],
    'n_estimators' : [5,10,50,100],
    'class_weight': ['balanced', None],
    'criterion':['entropy','gini']
}

In [20]:
grid_search_forest = GridSearchCV(estimator=forest, param_grid=forest_param_grid, cv=5, scoring='accuracy', verbose=1, n_jobs=-1)
grid_search_forest.fit(X_train, y_train)

Fitting 5 folds for each of 128 candidates, totalling 640 fits


In [21]:
best_params = grid_search_forest.best_params_
best_params

{'class_weight': 'balanced',
 'criterion': 'gini',
 'max_depth': 30,
 'n_estimators': 50}

In [22]:
best_score = grid_search_forest.best_score_
best_score

0.9028170177612557

In [23]:
results_forest = pd.DataFrame(grid_search_forest.cv_results_)
results_forest.sort_values(by='rank_test_score', ascending=True)[['params','mean_test_score']]

Unnamed: 0,params,mean_test_score
50,"{'class_weight': 'balanced', 'criterion': 'gin...",0.902817
119,"{'class_weight': None, 'criterion': 'gini', 'm...",0.902806
127,"{'class_weight': None, 'criterion': 'gini', 'm...",0.902806
123,"{'class_weight': None, 'criterion': 'gini', 'm...",0.902806
115,"{'class_weight': None, 'criterion': 'gini', 'm...",0.902068
...,...,...
65,"{'class_weight': None, 'criterion': 'entropy',...",0.369404
96,"{'class_weight': None, 'criterion': 'gini', 'm...",0.364219
64,"{'class_weight': None, 'criterion': 'entropy',...",0.353832
32,"{'class_weight': 'balanced', 'criterion': 'gin...",0.283390


there is huge difference between different combinations in mean accuracy

## 5. Progress bar

In [24]:
forest_param_grid = {
    'max_depth': [1,5,10,20,30,40,45,49],
    'n_estimators' : [5,10,50,100],
    'class_weight': ['balanced', None],
    'criterion':['entropy','gini']
}

In [25]:
results = []
for n_estimators in tqdm(forest_param_grid['n_estimators'], desc="n_estimators"):
    for max_depth in forest_param_grid['max_depth']:
        for class_weight in forest_param_grid['class_weight']:
            for criterion in forest_param_grid['criterion']:
                
                model = RandomForestClassifier(
                    n_estimators=n_estimators,
                    max_depth=max_depth,
                    class_weight=class_weight,
                    random_state=21,
                    n_jobs=-1 
                )
                
                scores = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy')
                
                # Добавим результаты в список
                results.append({
                    'n_estimators': n_estimators,
                    'max_depth': max_depth,
                    'class_weight': class_weight,
                    'min_samples_leaf': criterion,
                    'mean_accuracy': np.mean(scores),
                    'std_accuracy': np.std(scores)
                })


n_estimators:   0%|          | 0/4 [00:00<?, ?it/s]

In [26]:
results_df = pd.DataFrame(results)
results_df.sort_values(by='mean_accuracy', ascending=False)

Unnamed: 0,n_estimators,max_depth,class_weight,min_samples_leaf,mean_accuracy,std_accuracy
80,50,30,balanced,entropy,0.902817,0.013554
81,50,30,balanced,gini,0.902817,0.013554
127,100,49,,gini,0.902806,0.010460
126,100,49,,entropy,0.902806,0.010460
122,100,45,,entropy,0.902806,0.010460
...,...,...,...,...,...,...
32,10,1,balanced,entropy,0.381264,0.024786
3,5,1,,gini,0.364219,0.021651
2,5,1,,entropy,0.364219,0.021651
1,5,1,balanced,gini,0.283390,0.011062


## 6. Predictions

In [27]:
best_model = RandomForestClassifier(class_weight='balanced', criterion='gini', max_depth=30, n_estimators=50,
                       random_state=21)

In [28]:
best_model.fit(X_train, y_train)
pred = best_model.predict(X_test)

In [29]:
accuracy_score(pred, y_test)

0.9319526627218935