# Day 09. Exercise 02
# Metrics

## 0. Imports

In [36]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
import joblib

## 1. Preprocessing

1. Create the same dataframe as in the previous exercise.
2. Using `train_test_split` with parameters `test_size=0.2`, `random_state=21` get `X_train`, `y_train`, `X_test`, `y_test`. Use the additional parameter `stratify`.

In [2]:
X = pd.read_csv("../../data/day-of-week-not-scaled.csv")
y = pd.read_csv("../../data/dayofweek.csv")["dayofweek"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=21, stratify=y)

## 2. SVM

1. Use the best parameters from the previous exercise and train the model of SVM.
2. You need to calculate `accuracy`, `precision`, `recall`, `ROC AUC`.

 - `precision` and `recall` should be calculated for each class (use `average='weighted'`)
 - `ROC AUC` should be calculated for each class against any other class (all possible pairwise combinations) and then weighted average should be applied for the final metric
 - the code in the cell should display the result as below:

```
accuracy is 0.88757
precision is 0.89267
recall is 0.88757
roc_auc is 0.97878
```

In [3]:
best_parms_SVM = {'C': 10, 'class_weight': None, 'gamma': 'auto', 'kernel': 'rbf'}
svm_model = SVC(**best_parms_SVM, random_state=21, probability=True)
svm_model.fit(X_train, y_train)

In [4]:
accu = accuracy_score(y_true=y_test, y_pred=svm_model.predict(X_test))
prec = precision_score(y_true=y_test, y_pred=svm_model.predict(X_test), average='weighted')
reca = recall_score(y_true=y_test, y_pred=svm_model.predict(X_test), average='weighted')
ras = roc_auc_score(y_true=y_test, y_score=svm_model.predict_proba(X_test), multi_class='ovo', average='weighted')
print('accuracy is {:.5f}'.format(accu))
print('precision is {:.5f}'.format(prec))
print('recall is {:.5f}'.format(reca))
print('roc_auc is {:.5f}'.format(ras))


accuracy is 0.88757
precision is 0.89267
recall is 0.88757
roc_auc is 0.97878


## 3. Decision tree

1. The same task for decision tree

In [5]:
best_parms_Tree = {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 23}
tree_model = DecisionTreeClassifier(**best_parms_Tree, random_state=21)
tree_model.fit(X_train, y_train)

In [6]:
accu = accuracy_score(y_true=y_test, y_pred=tree_model.predict(X_test))
prec = precision_score(y_true=y_test, y_pred=tree_model.predict(X_test), average='weighted')
reca = recall_score(y_true=y_test, y_pred=tree_model.predict(X_test), average='weighted')
ras = roc_auc_score(y_true=y_test, y_score=tree_model.predict_proba(X_test), multi_class='ovo', average='weighted')
print('accuracy is {:.5f}'.format(accu))
print('precision is {:.5f}'.format(prec))
print('recall is {:.5f}'.format(reca))
print('roc_auc is {:.5f}'.format(ras))

accuracy is 0.89349
precision is 0.89531
recall is 0.89349
roc_auc is 0.93545


## 4. Random forest

1. The same task for random forest.

In [7]:
best_parms_Forest = {'class_weight': None,
 'criterion': 'gini',
 'max_depth': 28,
 'n_estimators': 50}
forest_model = RandomForestClassifier(**best_parms_Tree, random_state=21)
forest_model.fit(X_train, y_train)

In [8]:
accu = accuracy_score(y_true=y_test, y_pred=forest_model.predict(X_test))
prec = precision_score(y_true=y_test, y_pred=forest_model.predict(X_test), average='weighted')
reca = recall_score(y_true=y_test, y_pred=forest_model.predict(X_test), average='weighted')
ras = roc_auc_score(y_true=y_test, y_score=forest_model.predict_proba(X_test), multi_class='ovo', average='weighted')
print('accuracy is {:.5f}'.format(accu))
print('precision is {:.5f}'.format(prec))
print('recall is {:.5f}'.format(reca))
print('roc_auc is {:.5f}'.format(ras))

accuracy is 0.92899
precision is 0.93136
recall is 0.92899
roc_auc is 0.98549


## 5. Predictions

1. Choose the best model.
2. Analyze: for which `weekday` your model makes the most errors (in % of the total number of samples of that class in your full dataset), for which `labname` and for which `users`.
3. Save the model.

In [27]:
res = forest_model.predict(X)
user_cols = list(filter(lambda x: x.startswith("uid_user_"), X.columns))
lab_cols = list(filter(lambda x: x.startswith("labname_"), X.columns))
result = pd.DataFrame()
result['user'] = X[user_cols].idxmax(axis=1)
result['labname'] = X[lab_cols].idxmax(axis=1)
result['true'] = y
result['pred'] = res
result['is_error'] = (result['true'] != result['pred'])
result

Unnamed: 0,user,labname,true,pred,is_error
0,uid_user_4,labname_project1,4,4,False
1,uid_user_4,labname_project1,4,4,False
2,uid_user_4,labname_project1,4,4,False
3,uid_user_4,labname_project1,4,4,False
4,uid_user_4,labname_project1,4,4,False
...,...,...,...,...,...
1681,uid_user_19,labname_laba06s,3,3,False
1682,uid_user_1,labname_laba06s,3,3,False
1683,uid_user_1,labname_laba06s,3,3,False
1684,uid_user_1,labname_laba06s,3,3,False


In [29]:
result.groupby(by='true')['is_error'].sum()

true
0    7
1    4
2    2
3    3
4    3
5    4
6    1
Name: is_error, dtype: int64

In [35]:
result.groupby(by='labname')['is_error'].sum().sort_values(ascending=False)

labname
labname_project1    10
labname_laba04       6
labname_code_rvw     1
labname_lab03        1
labname_lab03s       1
labname_lab05s       1
labname_laba04s      1
labname_laba05       1
labname_laba06       1
labname_laba06s      1
labname_lab02        0
Name: is_error, dtype: int64

In [32]:
result.groupby(by='user')['is_error'].sum().sort_values(ascending=False)

user
uid_user_19    3
uid_user_31    2
uid_user_4     2
uid_user_3     2
uid_user_2     2
uid_user_16    2
uid_user_25    2
uid_user_14    1
uid_user_13    1
uid_user_27    1
uid_user_18    1
uid_user_6     1
uid_user_30    1
uid_user_10    1
uid_user_29    1
uid_user_24    1
uid_user_7     0
uid_user_28    0
uid_user_0     0
uid_user_22    0
uid_user_26    0
uid_user_23    0
uid_user_1     0
uid_user_21    0
uid_user_20    0
uid_user_17    0
uid_user_15    0
uid_user_12    0
uid_user_11    0
uid_user_8     0
Name: is_error, dtype: int64

In [37]:
joblib.dump(forest_model, "model")

['model']

## 6. Function

1. Write a function that takes a list of different models and a corresponding list of parameters (dicts) and returns a dict that contains all the 4 metrics for each model.

In [51]:
models = [SVC, DecisionTreeClassifier, RandomForestClassifier]
parametres = [
    {'C': 10, 'class_weight': None, 'gamma': 'auto', 'kernel': 'rbf', 'probability': True},
    {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 23},
    {'class_weight': None,
 'criterion': 'gini',
 'max_depth': 28,
 'n_estimators': 50}
]

In [57]:
def func(l:list, params:list):
    result = {'accuracy':[], 'precision':[], 'recall':[], 'roc_auc':[]}
    for model, param in zip(l, params):
        m = model(**param, random_state = 21)
        m.fit(X_train, y_train)
        result['accuracy'].append(accuracy_score(y_true=y_test, y_pred=m.predict(X_test)))
        result['precision'].append(precision_score(y_true=y_test, y_pred=m.predict(X_test), average='weighted'))
        result['recall'].append(recall_score(y_true=y_test, y_pred=m.predict(X_test), average='weighted'))
        result['roc_auc'].append(float(roc_auc_score(y_true=y_test, y_score=m.predict_proba(X_test), multi_class='ovo', average='weighted')))
    return result
        

In [58]:
func(models, parametres)

{'accuracy': [0.8875739644970414, 0.893491124260355, 0.9289940828402367],
 'precision': [0.8926729169690374, 0.8953094285678733, 0.9300865038851309],
 'recall': [0.8875739644970414, 0.893491124260355, 0.9289940828402367],
 'roc_auc': [0.9787793228216216, 0.9354475758928515, 0.9903274757720744]}