# Day 09. Exercise 02
# Metrics

## 0. Imports

In [21]:
import pandas as pd
import numpy as np

from sklearn.model_selection import (
  train_test_split
)

from sklearn.tree import (
  DecisionTreeClassifier,
)

from sklearn.svm import (
  SVC
)

from sklearn.metrics import (
  accuracy_score,
  precision_score,
  recall_score,
  roc_auc_score,
  confusion_matrix,
)


from sklearn.ensemble import (
  RandomForestClassifier,
)

from typing import (
  List,
  Dict
)

import joblib

## 1. Preprocessing

1. Create the same dataframe as in the previous exercise.
2. Using `train_test_split` with parameters `test_size=0.2`, `random_state=21` get `X_train`, `y_train`, `X_test`, `y_test`. Use the additional parameter `stratify`.

In [22]:
df = pd.read_csv('../../datasets/day-of-week-not-scaled.csv')
df['dayofweek'] = pd.read_csv('../../datasets/dayofweek.csv')['dayofweek']
df

Unnamed: 0,numTrials,hour,uid_user_0,uid_user_1,uid_user_10,uid_user_11,uid_user_12,uid_user_13,uid_user_14,uid_user_15,...,labname_lab03,labname_lab03s,labname_lab05s,labname_laba04,labname_laba04s,labname_laba05,labname_laba06,labname_laba06s,labname_project1,dayofweek
0,1,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4
1,2,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4
2,3,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4
3,4,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4
4,5,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1681,9,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,3
1682,6,20,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,3
1683,7,20,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,3
1684,8,20,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,3


In [23]:
X_train, X_test, y_train, y_test = train_test_split(
  df.drop('dayofweek', axis=1),
  df['dayofweek'],
  test_size=0.2,
  stratify=df['dayofweek'],
  random_state=21
)

In [None]:
%store -r svc_params
%store -r tree_params
%store -r forest_params

params = [svc_params, tree_params, forest_params]
params

[{'C': 10,
  'class_weight': None,
  'gamma': 'auto',
  'kernel': 'rbf',
  'probability': True,
  'random_state': 21},
 {'class_weight': 'balanced',
  'criterion': 'gini',
  'max_depth': 21,
  'random_state': 21},
 {'class_weight': 'balanced',
  'criterion': 'entropy',
  'max_depth': 24,
  'n_estimators': 100,
  'random_state': 21}]

## 0. Function

1. Write a function that takes a list of different models and a corresponding list of parameters (dicts) and returns a dict that contains all the 4 metrics for each model.

In [25]:
def choose_best_model(model_list : List, param_list: List[Dict]) -> Dict:
  models = {}
  for model, param in zip(model_list, param_list):
    metriscs = {}
    
    estimator = model(**param)
    estimator.fit(X_train, y_train)
    predict = estimator.predict(X_test)
    proba = estimator.predict_proba(X_test)

    metriscs['accuracy'] = accuracy_score(y_test, predict)
    metriscs['precision'] = precision_score(y_test, predict, average='weighted')
    metriscs['recall'] = recall_score(y_test, predict, average='weighted')
    metriscs['roc_auc'] = roc_auc_score(y_test, proba, multi_class='ovo', average='weighted')

    models[model.__name__] = metriscs

  return models

In [26]:
def print_metrics(metrics: Dict) -> None:
  for k, v in metrics.items():
    print(f'{k} is {v:.5f}')


In [27]:
metrics = choose_best_model([SVC, DecisionTreeClassifier, RandomForestClassifier], params)

## 2. SVM

1. Use the best parameters from the previous exercise and train the model of SVM.
2. You need to calculate `accuracy`, `precision`, `recall`, `ROC AUC`.

 - `precision` and `recall` should be calculated for each class (use `average='weighted'`)
 - `ROC AUC` should be calculated for each class against any other class (all possible pairwise combinations) and then weighted average should be applied for the final metric
 - the code in the cell should display the result as below:

```
accuracy is 0.88757
precision is 0.89267
recall is 0.88757
roc_auc is 0.97878
```

In [28]:
print_metrics(metrics['SVC'])

accuracy is 0.88757
precision is 0.89267
recall is 0.88757
roc_auc is 0.97878


## 3. Decision tree

1. The same task for decision tree

In [29]:
print_metrics(metrics['DecisionTreeClassifier'])

accuracy is 0.88462
precision is 0.88765
recall is 0.88462
roc_auc is 0.93528


## 4. Random forest

1. The same task for random forest.

In [30]:
print_metrics(metrics['RandomForestClassifier'])

accuracy is 0.92604
precision is 0.92754
recall is 0.92604
roc_auc is 0.98939


## 5. Predictions

1. Choose the best model.
2. Analyze: for which `weekday` your model makes the most errors (in % of the total number of samples of that class in your full dataset), for which `labname` and for which `users`.
3. Save the model.

In [31]:
prediction = pd.read_csv('../../datasets/checker_submits.csv', parse_dates=['timestamp'])
prediction['dayofweek'] = prediction['timestamp'].dt.dayofweek


best_model = RandomForestClassifier(**params[2])
best_model.fit(X_train,y_train)
prediction['predict'] = best_model.predict(df.drop('dayofweek', axis=1))
prediction['is_error'] = (prediction['predict'] != prediction['dayofweek'])

prediction

Unnamed: 0,uid,labname,numTrials,timestamp,dayofweek,predict,is_error
0,user_4,project1,1,2020-04-17 05:19:02.744528,4,4,False
1,user_4,project1,2,2020-04-17 05:22:45.549397,4,4,False
2,user_4,project1,3,2020-04-17 05:34:24.422370,4,4,False
3,user_4,project1,4,2020-04-17 05:43:27.773992,4,4,False
4,user_4,project1,5,2020-04-17 05:46:32.275104,4,4,False
...,...,...,...,...,...,...,...
1681,user_19,laba06s,9,2020-05-21 20:01:48.959966,3,3,False
1682,user_1,laba06s,6,2020-05-21 20:18:54.487900,3,3,False
1683,user_1,laba06s,7,2020-05-21 20:19:06.872761,3,3,False
1684,user_1,laba06s,8,2020-05-21 20:22:41.877806,3,3,False


In [32]:
prediction.groupby('dayofweek')['is_error'].sum().sort_values(ascending=False)

dayofweek
0    6
5    5
1    4
3    3
4    3
2    2
6    2
Name: is_error, dtype: int64

In [33]:
prediction.groupby('uid')['is_error'].sum().sort_values(ascending=False)

uid
user_2     3
user_31    2
user_4     2
user_3     2
user_19    2
user_25    2
user_6     2
user_14    1
user_13    1
user_16    1
user_27    1
user_18    1
user_30    1
user_10    1
user_29    1
user_24    1
user_22    1
user_7     0
user_28    0
user_0     0
user_26    0
user_23    0
user_1     0
user_21    0
user_20    0
user_17    0
user_15    0
user_12    0
user_11    0
user_8     0
Name: is_error, dtype: int64

In [34]:
prediction.groupby('labname')['is_error'].sum().sort_values(ascending=False)

labname
project1    10
laba04       6
laba04s      2
laba06s      2
code_rvw     1
lab03        1
lab05s       1
laba05       1
laba06       1
lab02        0
lab03s       0
Name: is_error, dtype: int64

In [35]:
joblib.dump(best_model, 'model.joblib')

['model.joblib']