# Day 09. Exercise 03
# Ensembles

## 0. Imports

In [87]:
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.model_selection import StratifiedKFold


## 1. Preprocessing

1. Create the same dataframe as in the previous exercise.
2. Using `train_test_split` with parameters `test_size=0.2`, `random_state=21` get `X_train`, `y_train`, `X_test`, `y_test` and then get `X_train`, `y_train`, `X_valid`, `y_valid` from the previous `X_train`, `y_train`. Use the additional parameter `stratify`.

In [3]:
X = pd.read_csv("../../data/day-of-week-not-scaled.csv")
y = pd.read_csv("../../data/dayofweek.csv")["dayofweek"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=21, stratify=y)
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.2, random_state=21, stratify=y_train)

## 2. Individual classifiers

1. Train SVM, decision tree and random forest again with the best parameters that you got from the 01 exercise with `random_state=21` for all of them.
2. Evaluate `accuracy`, `precision`, and `recall` for them on the validation set.
3. The result of each cell of the section should look like this:

```
accuracy is 0.87778
precision is 0.88162
recall is 0.87778
```

In [29]:
def print_scores(m, X, y):
    accu = accuracy_score(y_true=y, y_pred=m.predict(X))
    prec = precision_score(y_true=y, y_pred=m.predict(X), average='weighted')
    reca = recall_score(y_true=y, y_pred=m.predict(X), average='weighted')
    print('accuracy is {:.5f}'.format(accu))
    print('precision is {:.5f}'.format(prec))
    print('recall is {:.5f}'.format(reca))

In [30]:
best_parms_SVM = {'C': 10, 'class_weight': None, 'gamma': 'auto', 'kernel': 'rbf'}
svm_model = SVC(**best_parms_SVM, random_state=21, probability=True)
svm_model.fit(X_train, y_train)
print_scores(svm_model, X_valid, y_valid)

accuracy is 0.87778
precision is 0.88162
recall is 0.87778


In [31]:
best_parms_Tree = {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 23}
tree_model = DecisionTreeClassifier(**best_parms_Tree, random_state=21)
tree_model.fit(X_train, y_train)
print_scores(tree_model, X_valid, y_valid)

accuracy is 0.85926
precision is 0.86306
recall is 0.85926


In [32]:
best_parms_Forest = {'class_weight': None,
 'criterion': 'gini',
 'max_depth': 28,
 'n_estimators': 50}
forest_model = RandomForestClassifier(**best_parms_Tree, random_state=21)
forest_model.fit(X_train, y_train)
print_scores(forest_model, X_valid, y_valid)

accuracy is 0.91111
precision is 0.91087
recall is 0.91111


## 3. Voting classifiers

1. Using `VotingClassifier` and the three models that you have just trained, calculate the `accuracy`, `precision`, and `recall` on the validation set.
2. Play with the other parameteres.
3. Calculate the `accuracy`, `precision` and `recall` on the test set for the model with the best weights in terms of accuracy (if there are several of them with equal values, choose the one with the higher precision).

In [23]:
vot = VotingClassifier([('svm', svm_model),
                        ('tree', tree_model),
                        ('forest', forest_model)])
vot.fit(X_train, y_train)

In [33]:
print_scores(vot, X_valid, y_valid)

accuracy is 0.90370
precision is 0.90288
recall is 0.90370


In [60]:
from itertools import product
from tqdm.notebook import tqdm

best_acc = 0
best_prec = 0
vot_s = VotingClassifier([('svm', svm_model),
                        ('tree', tree_model),
                        ('forest', forest_model)], voting='soft')
vot_s.fit(X_train, y_train)

for w in tqdm(product(range(1, 5), range(1, 5), range(1, 5)), total=4*4*4):
    vot_s.weights = w
    acc = accuracy_score(y_true=y_valid, y_pred=vot_s.predict(X_valid))
    prec = precision_score(y_true=y_valid, y_pred=vot_s.predict(X_valid), average='weighted')

    if acc > best_acc or (acc == best_acc and prec > best_prec):
        best_w, best_acc, best_prec = w, acc, prec

print(best_w)
print(best_acc)

  0%|          | 0/64 [00:00<?, ?it/s]

(3, 1, 3)
0.9148148148148149


In [61]:
best_vot = VotingClassifier([('svm', svm_model),
                        ('tree', tree_model),
                        ('forest', forest_model)], voting='soft', weights=best_w)
best_vot.fit(X_train, y_train)
print_scores(best_vot, X_test, y_test)

accuracy is 0.90237
precision is 0.90567
recall is 0.90237


## 4. Bagging classifiers

1. Using `BaggingClassifier` and `SVM` with the best parameters create an ensemble, try different values of the `n_estimators`, use `random_state=21`.
2. Play with the other parameters.
3. Calculate the `accuracy`, `precision`, and `recall` for the model with the best parameters (in terms of accuracy) on the test set (if there are several of them with equal values, choose the one with the higher precision)

In [66]:
bag = BaggingClassifier(svm_model, random_state=21)
bag.fit(X_train, y_train)
print_scores(bag, X_valid, y_valid)

accuracy is 0.88519
precision is 0.89427
recall is 0.88519


In [None]:
best_acc = 0
best_prec = 0

for n in tqdm(range(1, 60), total=59):
    bag_s = BaggingClassifier(svm_model, random_state=21, n_estimators=n)
    bag_s.fit(X_train, y_train)
    acc = accuracy_score(y_true=y_valid, y_pred=bag_s.predict(X_valid))
    prec = precision_score(y_true=y_valid, y_pred=bag_s.predict(X_valid), average='weighted')

    if acc > best_acc or (acc == best_acc and prec > best_prec):
        best_n, best_acc, best_prec = n, acc, prec

print(best_n)
print(best_acc)

  0%|          | 0/59 [00:00<?, ?it/s]

43
0.8888888888888888


In [69]:
best_bag = BaggingClassifier(svm_model, random_state=21, n_estimators=43)
best_bag.fit(X_train, y_train)
print_scores(bag, X_test, y_test)

accuracy is 0.86391
precision is 0.86966
recall is 0.86391


## 5. Stacking classifiers

1. To achieve reproducibility in this case you will have to create an object of cross-validation generator: `StratifiedKFold(n_splits=n, shuffle=True, random_state=21)`, where `n` you will try to optimize (the details are below).
2. Using `StackingClassifier` and the three models that you have recently trained, calculate the `accuracy`, `precision` and `recall` on the validation set, try different values of `n_splits` `[2, 3, 4, 5, 6, 7]` in the cross-validation generator and parameter `passthrough` in the classifier itself,
3. Calculate the `accuracy`, `precision`, and `recall` for the model with the best parameters (in terms of accuracy) on the test set (if there are several of them with equal values, choose the one with the higher precision). Use `final_estimator=LogisticRegression(solver='liblinear')`.

In [81]:
best_acc = 0
best_prec = 0
for p in tqdm(product([2, 3, 4, 5, 6, 7], [True, False]), total = 12):
    stac_s = StackingClassifier(estimators=[('svm', svm_model), ('tree', tree_model), ('forest', forest_model)],
                            final_estimator=LogisticRegression(solver='liblinear', random_state=21),
                            passthrough=p[1],
                            cv = StratifiedKFold(n_splits = p[0], shuffle=True, random_state=21))
    stac_s.fit(X_train, y_train)
    acc = accuracy_score(y_true=y_valid, y_pred=bag_s.predict(X_valid))
    prec = precision_score(y_true=y_valid, y_pred=bag_s.predict(X_valid), average='weighted')

    if acc > best_acc or (acc == best_acc and prec > best_prec):
        best_p, best_acc, best_prec = p, acc, prec

print(best_p)
print(best_acc)


  0%|          | 0/12 [00:00<?, ?it/s]

(2, True)
0.8814814814814815


In [82]:
best_stac = StackingClassifier(estimators=[('svm', svm_model), ('tree', tree_model), ('forest', forest_model)],
                            final_estimator=LogisticRegression(solver='liblinear', random_state=21),
                            passthrough=True,
                            cv = StratifiedKFold(n_splits = 2, shuffle=True, random_state=21))
best_stac.fit(X_train, y_train)
print_scores(best_stac, X_test, y_test)

accuracy is 0.89941
precision is 0.90502
recall is 0.89941


## 6. Predictions

1. Choose the best model in terms of accuracy (if there are several of them with equal values, choose the one with the higher precision).
2. Analyze: for which weekday your model makes the most errors (in % of the total number of samples of that class in your full dataset), for which labname and for which users.
3. Save the model.

In [83]:
res = best_vot.predict(X)
user_cols = list(filter(lambda x: x.startswith("uid_user_"), X.columns))
lab_cols = list(filter(lambda x: x.startswith("labname_"), X.columns))
result = pd.DataFrame()
result['user'] = X[user_cols].idxmax(axis=1)
result['labname'] = X[lab_cols].idxmax(axis=1)
result['true'] = y
result['pred'] = res
result['is_error'] = (result['true'] != result['pred'])
result

Unnamed: 0,user,labname,true,pred,is_error
0,uid_user_4,labname_project1,4,4,False
1,uid_user_4,labname_project1,4,4,False
2,uid_user_4,labname_project1,4,4,False
3,uid_user_4,labname_project1,4,4,False
4,uid_user_4,labname_project1,4,4,False
...,...,...,...,...,...
1681,uid_user_19,labname_laba06s,3,3,False
1682,uid_user_1,labname_laba06s,3,3,False
1683,uid_user_1,labname_laba06s,3,3,False
1684,uid_user_1,labname_laba06s,3,3,False


In [84]:
result.groupby(by='true')['is_error'].sum()

true
0    13
1    12
2     6
3     7
4     3
5    13
6    11
Name: is_error, dtype: int64

In [85]:
result.groupby(by='labname')['is_error'].sum().sort_values(ascending=False)

labname
labname_project1    24
labname_laba04      10
labname_laba04s     10
labname_code_rvw     5
labname_laba06s      5
labname_lab05s       4
labname_laba05       3
labname_laba06       3
labname_lab03        1
labname_lab02        0
labname_lab03s       0
Name: is_error, dtype: int64

In [86]:
result.groupby(by='user')['is_error'].sum().sort_values(ascending=False)

user
uid_user_2     8
uid_user_14    6
uid_user_17    5
uid_user_25    5
uid_user_4     4
uid_user_26    3
uid_user_29    3
uid_user_3     3
uid_user_19    3
uid_user_22    3
uid_user_31    3
uid_user_6     3
uid_user_18    2
uid_user_16    2
uid_user_30    2
uid_user_13    2
uid_user_24    2
uid_user_27    1
uid_user_8     1
uid_user_23    1
uid_user_1     1
uid_user_15    1
uid_user_10    1
uid_user_28    0
uid_user_21    0
uid_user_20    0
uid_user_12    0
uid_user_11    0
uid_user_7     0
uid_user_0     0
Name: is_error, dtype: int64

In [89]:
joblib.dump(best_vot, "model")

['model']