In [2]:
import pandas as pd
import numpy as np
import sqlite3
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, GridSearchCV, cross_val_score
from sklearn.metrics import accuracy_score
from tqdm import tqdm
from time import sleep 
import pickle

## data preparation

In [3]:
df = pd.read_csv('../data/dayofweek-not-scaled.csv')
y = df['dayofweek'].values
x = df.drop(columns='dayofweek')

## Using train_test_split with parameters test_size=0.2, random_state=21 get X_train, y_train, X_test, y_test. Use the additional parameter stratify.

In [4]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=21, stratify=y)

## Using GridSearchCV try different parameters of kernel (linear, rbf, sigmoid), C (0.01, 0.1, 1, 1.5, 5, 10), gamma (scale, auto), class_weight (balanced, None) use random_state=21 and probability=True and get the best combination of them in terms of accuracy.
## Create a dataframe from the results of the gridsearch and sort it ascendingly by the rank_test_score. 

In [None]:
svc = SVC(random_state=21, probability=True)
params = {'kernel': ['linear', 'rbf', 'sigmoid'], 
               'gamma': ['scale', 'auto'],
               'C': [0.01, 0.1, 1, 1.5, 5, 10],
               'class_weight': ['balanced', None]}
grid_search = GridSearchCV(svc, params, verbose=1)
grid_search.fit(x_train, y_train)

Fitting 5 folds for each of 72 candidates, totalling 360 fits


In [None]:
df = pd.DataFrame(grid_search.cv_results_).sort_values('rank_test_score')
df.to_csv('../data/cv_results_SVC.csv')

## Decision tree
## Using GridSearchCV try different parameters of max_depth (from 1 to 49), class_weight (balanced, None) and criterion (entropy and gini) and get the best combination of them in terms of accuracy. Use random_state=21.
## Create a dataframe from the results of the gridsearch and sort it ascendingly by the rank_test_score.

In [8]:
params = {'max_depth': list(range(1, 50)),  'class_weight': ['balanced', None], 'criterion': ['entropy', 'gini']}
model = DecisionTreeClassifier(random_state=21)
grid_search = GridSearchCV(model, params)
grid_search.fit(x_train, y_train)

GridSearchCV(estimator=DecisionTreeClassifier(random_state=21),
             param_grid={'class_weight': ['balanced', None],
                         'criterion': ['entropy', 'gini'],
                         'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
                                       13, 14, 15, 16, 17, 18, 19, 20, 21, 22,
                                       23, 24, 25, 26, 27, 28, 29, 30, ...]})

In [10]:
df = pd.DataFrame(grid_search.cv_results_).sort_values('rank_test_score')
df.to_csv('../data/cv_results_decision_tree.csv')

## Random forest

## Using GridSearchCV try different parameters of n_estimators (5, 10, 50, 100), max_depth (from 1 to 49), class_weight (balanced, None) and criterion (entropy and gini) and get the best combination of them in terms of accuracy. Use random_state=21.
## Create a dataframe from the results of the gridsearch and sort it ascendengly by the rank_test_score

In [11]:
params = {'n_estimators': [5, 10, 50, 100], 'max_depth': list(range(1, 50)), 'class_weight': ['balanced', None], 
'criterion': ['entropy', 'gini']}
model = RandomForestClassifier(random_state=21)
grid_search = GridSearchCV(model, params)
grid_search.fit(x_train, y_train)


GridSearchCV(estimator=RandomForestClassifier(random_state=21),
             param_grid={'class_weight': ['balanced', None],
                         'criterion': ['entropy', 'gini'],
                         'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
                                       13, 14, 15, 16, 17, 18, 19, 20, 21, 22,
                                       23, 24, 25, 26, 27, 28, 29, 30, ...],
                         'n_estimators': [5, 10, 50, 100]})

In [58]:
df = pd.DataFrame(grid_search.cv_results_).sort_values('rank_test_score')
df.to_csv('../data/cv_results_random_forest.csv')

## Create a manual gridsearch for the same parameters values of random forest iterating through the list of the possible values and calculating cross_val_score for each combination. Try to increase n_jobs. The value cv for cross_val_score is 5.

## Track the progress using the library tqdm.notebook.
## Create a dataframe from the results of the gridsearch with the columns corresponding to the names of the parameters and mean_accuracy and std_accuracy.
## Sort it descendingly by the mean_accuracy, check if there is a huge difference between different combinations (sometimes a simpler model may give a comparable result).

In [39]:
params = {'n_estimators': [5, 10, 50, 100], 'max_depth': list(range(1, 50)), 'class_weight': ['balanced', None], 
'criterion': ['entropy', 'gini'], 'random_state': [21]}
model = RandomForestClassifier(random_state=21)

In [66]:
res = []
for n in tqdm(params['n_estimators']):
    for m in params['max_depth']:
        for c in params['class_weight']:  
            for cr in params['criterion']: 
                parameters = {'n_estimators': n, 
                                'max_depth': m, 
                                'class_weight': c, 
                                'criterion': cr, 
                                'random_state': 21}
                model = RandomForestClassifier(**parameters)
                cvs = cross_val_score(model, x_train, y_train, cv=5)
                res.append([n, m, c, cr, 21, np.mean(cvs), np.std(cvs)])







  0%|          | 0/4 [00:00<?, ?it/s][A[A[A[A[A




 25%|██▌       | 1/4 [00:06<00:20,  6.75s/it][A[A[A[A[A




 50%|█████     | 2/4 [00:17<00:16,  8.06s/it][A[A[A[A[A




 75%|███████▌  | 3/4 [01:04<00:19, 19.55s/it][A[A[A[A[A




100%|██████████| 4/4 [02:34<00:00, 38.64s/it]


In [67]:
df = pd.DataFrame(res, columns=['n_estimators', 'max_depth', 'class_weight', 'criterion', 'random_state', 'mean_accuracy', 'std_accuracy'])
df.sort_values('mean_accuracy', ascending=False)

Unnamed: 0,n_estimators,max_depth,class_weight,criterion,random_state,mean_accuracy,std_accuracy
698,100,28,,entropy,21,0.874629,0.009644
702,100,29,,entropy,21,0.874629,0.009644
718,100,33,,entropy,21,0.874629,0.009644
694,100,27,,entropy,21,0.873887,0.008902
706,100,30,,entropy,21,0.873887,0.008902
...,...,...,...,...,...,...,...
4,5,2,balanced,entropy,21,0.309347,0.009644
197,10,1,balanced,gini,21,0.297478,0.060089
196,10,1,balanced,entropy,21,0.296736,0.031157
0,5,1,balanced,entropy,21,0.247774,0.011869


## Predictions
## Choose the best model and use it to make predictions for the test dataset.
## Calculate the final accuracy.

In [57]:
rfc = RandomForestClassifier(n_estimators=100, criterion='gini', max_depth=30, class_weight=None, random_state=21
).fit(x_train, y_train)
print(f'Final accuracy: {rfc.score(x_test, y_test)}')

Final accuracy: 0.9349112426035503
