# Day 09. Exercise 03
# Ensembles

## 0. Imports

In [126]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import (
  accuracy_score, 
  precision_score, 
  recall_score, 
)
from sklearn.ensemble import (
  VotingClassifier,
  BaggingClassifier,
  StackingClassifier,
  RandomForestClassifier
)

from itertools import product

In [131]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [110]:
RANDOM_STATE = 21
TEST_SIZE = 0.2
PROBABILIY = True

## 1. Preprocessing

1. Create the same dataframe as in the previous exercise.
2. Using `train_test_split` with parameters `test_size=0.2`, `random_state=21` get `X_train`, `y_train`, `X_test`, `y_test` and then get `X_train`, `y_train`, `X_valid`, `y_valid` from the previous `X_train`, `y_train`. Use the additional parameter `stratify`.

In [111]:
df = pd.read_csv("../data/day-of-week-not-scaled.csv")
prev_df = pd.read_csv("../data/dayofweek.csv")
df.insert(0, 'dayofweek', prev_df['dayofweek'])
df

Unnamed: 0,dayofweek,numTrials,hour,uid_user_0,uid_user_1,uid_user_10,uid_user_11,uid_user_12,uid_user_13,uid_user_14,...,labname_lab02,labname_lab03,labname_lab03s,labname_lab05s,labname_laba04,labname_laba04s,labname_laba05,labname_laba06,labname_laba06s,labname_project1
0,4,1,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,4,2,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,4,3,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,4,4,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,4,5,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1681,3,9,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1682,3,6,20,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1683,3,7,20,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1684,3,8,20,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [112]:
X = df.drop('dayofweek', axis=1)
y = df['dayofweek']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=y)
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=TEST_SIZE, random_state=RANDOM_STATE)

## 2. Individual classifiers

1. Train SVM, decision tree and random forest again with the best parameters that you got from the 01 exercise with `random_state=21` for all of them.
2. Evaluate `accuracy`, `precision`, and `recall` for them on the validation set.
3. The result of each cell of the section should look like this:

```
accuracy is 0.87778
precision is 0.88162
recall is 0.87778
```

In [113]:
models = [SVC, DecisionTreeClassifier, RandomForestClassifier]
models_params = [
  { 'C':10,
    'kernel':'rbf',
    'gamma':'auto',
    'probability':True,
    'class_weight':None,
    'random_state':RANDOM_STATE
  },
  {
    'criterion':'gini',
    'max_depth':22,
    'class_weight':'balanced',
    'random_state':RANDOM_STATE
  },
  {
    'n_estimators':50,
    'criterion':'gini',
    'max_depth':28,
    'class_weight':None,
    'random_state':RANDOM_STATE
  }
]

In [114]:
def calculate_metrics(y_true, y_pred) -> dict:
  return {
    'accuracy': accuracy_score(y_true, y_pred),
    'precision': precision_score(y_true, y_pred, average='weighted'),
    'recall': recall_score(y_true, y_pred, average='weighted')
  }

def print_metrics(metrics: dict) -> None:
  for key, value in metrics.items():
    print(f"{key} is {value:.5f}")

In [115]:
for model_class, params in zip(models, models_params):
  model = model_class(**params)
  model.fit(X_train, y_train)
  y_pred = model.predict(X_valid)

  model_metrics = calculate_metrics(y_valid, y_pred)
  
  print(model_class.__name__)
  print_metrics(model_metrics)
  print('\n')

SVC
accuracy is 0.86667
precision is 0.86968
recall is 0.86667


DecisionTreeClassifier
accuracy is 0.86296
precision is 0.87211
recall is 0.86296


RandomForestClassifier
accuracy is 0.87407
precision is 0.88324
recall is 0.87407




## 3. Voting classifiers

1. Using `VotingClassifier` and the three models that you have just trained, calculate the `accuracy`, `precision`, and `recall` on the validation set.
2. Play with the other parameteres.
3. Calculate the `accuracy`, `precision` and `recall` on the test set for the model with the best weights in terms of accuracy (if there are several of them with equal values, choose the one with the higher precision).

In [116]:
voting_clf = VotingClassifier(
  estimators=[
    ('svc', SVC(**models_params[0])),
    ('dtc', DecisionTreeClassifier(**models_params[1])),
    ('rfc', RandomForestClassifier(**models_params[2]))
  ],
  voting='hard'
)

voting_clf.fit(X_train, y_train)
y_pred_voting = voting_clf.predict(X_valid)

voting_metrics = calculate_metrics(y_valid, y_pred_voting)
print_metrics(voting_metrics)

accuracy is 0.88889
precision is 0.89482
recall is 0.88889


## Choosing best parameters for Voting Classifier

In [133]:
def find_best_ensemble_params(ensemble_model, param_grid, X_train, y_train, cv=5, scoring='accuracy'):
  grid = GridSearchCV(
    estimator=ensemble_model,
    param_grid=param_grid,
    cv=cv,
    scoring=scoring,
    n_jobs=-1
  )
  grid.fit(X_train, y_train)

  best_params = grid.best_params_
  best_score = grid.best_score_

  return best_params, best_score

In [118]:
param_grid_voting = {
  'voting': ['soft', 'hard'],
  'weights': [list(w) for w in product(range(1,5), repeat=3)]
}

best_params, best_score = find_best_ensemble_params(voting_clf, param_grid_voting, X_train, y_train)
print(f"best parameters: {best_params}\nbest score: {best_score}")

best parameters: {'voting': 'soft', 'weights': [3, 1, 4]}
best score: 0.8998363479758827


## Evaluating best Voting Classifier on test set

In [120]:
best_voting_clf = VotingClassifier(
  estimators=[
    ('svc', SVC(**models_params[0])),
    ('dtc', DecisionTreeClassifier(**models_params[1])),
    ('rfc', RandomForestClassifier(**models_params[2]))
  ],
  voting='soft',
  weights=[3, 1, 4]
)
best_voting_clf.fit(X_train, y_train)
y_pred_voting = best_voting_clf.predict(X_test)

print_metrics(calculate_metrics(y_test, y_pred_voting))

accuracy is 0.91124
precision is 0.91518
recall is 0.91124


## 4. Bagging classifiers

1. Using `BaggingClassifier` and `SVM` with the best parameters create an ensemble, try different values of the `n_estimators`, use `random_state=21`.
2. Play with the other parameters.
3. Calculate the `accuracy`, `precision`, and `recall` for the model with the best parameters (in terms of accuracy) on the test set (if there are several of them with equal values, choose the one with the higher precision)

## Finding Best Parameters for Bagging classifier

In [123]:
base_model = SVC(**models_params[0])
bagging_params_grid = {
    'n_estimators': [10, 50, 100, 200],
    'max_samples': [0.5, 0.7, 0.9, 1.0],
    'max_features': [0.5, 0.7, 0.9, 1.0],
    'bootstrap': [True, False],
    'bootstrap_features': [True, False],
}
bagging_clf = BaggingClassifier(
  estimator=base_model,
  random_state=RANDOM_STATE
)
best_params, best_score = find_best_ensemble_params(
  ensemble_model=bagging_clf,
  param_grid=bagging_params_grid,
  X_train=X_train,
  y_train=y_train
)
print(f"best parameters: {best_params}\nbest score: {best_score}")

best parameters: {'bootstrap': False, 'bootstrap_features': False, 'max_features': 0.9, 'max_samples': 1.0, 'n_estimators': 50}
best score: 0.8516236003445306


In [124]:
best_bagging_clf = BaggingClassifier(
  estimator=base_model,
  n_estimators=50,
  max_samples=1.0,
  max_features=0.9,
  bootstrap=False,
  bootstrap_features=False
)
best_bagging_clf.fit(X_train, y_train)
y_pred_bagging = best_bagging_clf.predict(X_valid)
print_metrics(calculate_metrics(y_valid, y_pred_bagging))

accuracy is 0.85926
precision is 0.86051
recall is 0.85926


## 5. Stacking classifiers

  1. To achieve reproducibility in this case you will have to create an object of cross-validation generator: `StratifiedKFold(n_splits=n, shuffle=True, random_state=21)`, where `n` you will try to optimize (the details are below).
  2. Using `StackingClassifier` and the three models that you have recently trained, calculate the `accuracy`, `precision` and `recall` on the validation set, try different values of `n_splits` `[2, 3, 4, 5, 6, 7]` in the cross-validation generator and parameter `passthrough` in the classifier itself,
  3. Calculate the `accuracy`, `precision`, and `recall` for the model with the best parameters (in terms of accuracy) on the test set (if there are several of them with equal values, choose the one with the higher precision). Use `final_estimator=LogisticRegression(solver='liblinear')`.

In [134]:
stacking_clf = StackingClassifier(
  estimators=[
    ('svc', SVC(**models_params[0])),
    ('dtc', DecisionTreeClassifier(**models_params[1])),
    ('rfc', RandomForestClassifier(**models_params[2]))
  ],
  final_estimator=LogisticRegression(solver='liblinear')
)
stacking_params = {
  'passthrough': [True, False],
  'cv': [StratifiedKFold(n_splits=n, shuffle=True, random_state=RANDOM_STATE) for n in range(2, 8)]
}

best_params, best_score = find_best_ensemble_params(
  ensemble_model=stacking_clf,
  param_grid=stacking_params,
  X_train=X_train,
  y_train=y_train
)

print(f"best parameters: {best_params}\nbest score: {best_score}")

best parameters: {'cv': StratifiedKFold(n_splits=5, random_state=21, shuffle=True), 'passthrough': False}
best score: 0.8933505598621878


In [135]:
best_stacking_model = StackingClassifier(
  estimators=[
    ('svc', SVC(**models_params[0])),
    ('dtc', DecisionTreeClassifier(**models_params[1])),
    ('rfc', RandomForestClassifier(**models_params[2]))
],
  final_estimator=LogisticRegression(solver='liblinear'),
  passthrough=False,
  cv=5,
)
best_stacking_model.fit(X_train, y_train)
y_pred_stacking = best_stacking_model.predict(X_valid)

print_metrics(calculate_metrics(y_valid, y_pred_stacking))

accuracy is 0.90000
precision is 0.90508
recall is 0.90000


## 6. Predictions

1. Choose the best model in terms of accuracy (if there are several of them with equal values, choose the one with the higher precision).
2. Analyze: for which weekday your model makes the most errors (in % of the total number of samples of that class in your full dataset), for which labname and for which users.
3. Save the model.

- voting classifier has the best accuracy - 0.91124

In [143]:
final_df = pd.concat([
  pd.DataFrame({
    'dayofweek': y_test,
    'best_prediction': best_voting_clf.predict(X_test)
  }),
  X_test.drop(['numTrials', 'hour'], axis=1)
], axis=1)
final_df['is_error'] = final_df['dayofweek'] != final_df['best_prediction']
final_df

Unnamed: 0,dayofweek,best_prediction,uid_user_0,uid_user_1,uid_user_10,uid_user_11,uid_user_12,uid_user_13,uid_user_14,uid_user_15,...,labname_lab03,labname_lab03s,labname_lab05s,labname_laba04,labname_laba04s,labname_laba05,labname_laba06,labname_laba06s,labname_project1,is_error
1087,1,1,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,False
16,5,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,False
563,6,6,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,False
1381,3,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,False
1199,2,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1411,3,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,False
1079,1,1,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,False
1222,2,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,False
1064,1,1,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,False


## Day of week errors

In [146]:
dayofweek_errors = (
  final_df.groupby('dayofweek')['is_error']
  .mean() * 100
)

for dayofweek, error in dayofweek_errors.items():
  print(f"Dayofweek {dayofweek}: {error:.2f}% error")

Dayofweek 0: 18.52% error
Dayofweek 1: 7.27% error
Dayofweek 2: 10.00% error
Dayofweek 3: 3.75% error
Dayofweek 4: 19.05% error
Dayofweek 5: 11.11% error
Dayofweek 6: 7.04% error


## Labname errors

In [152]:
lab_cols = [col for col in final_df.columns if col.startswith("labname_")]

labs_long = final_df.melt(
  id_vars=["dayofweek", "best_prediction", "is_error"],
  value_vars=lab_cols,
  var_name="labname",
  value_name="active"
)

labs_active = labs_long[labs_long["active"] == 1]

labs_error = (
  labs_active.groupby("labname")["is_error"]
  .mean() * 100
)

for labname, error in labs_error.items():
  print(f"{labname}: {error:.2f}% error")

labname_code_rvw: 7.69% error
labname_lab03: 100.00% error
labname_lab03s: 0.00% error
labname_lab05s: 16.67% error
labname_laba04: 25.71% error
labname_laba04s: 16.00% error
labname_laba05: 8.51% error
labname_laba06: 11.11% error
labname_laba06s: 6.67% error
labname_project1: 4.30% error


## User errors

In [148]:
user_cols = [col for col in final_df.columns if col.startswith("uid_user_")]

users_long = final_df.melt(
    id_vars=["dayofweek", "best_prediction", "is_error"],
    value_vars=user_cols,
    var_name="user",
    value_name="active"
)

users_active = users_long[users_long["active"] == 1]

user_errors = (
    users_active.groupby("user")["is_error"]
    .mean() * 100
)

for user, error in user_errors.items():
  print(f"{user}: {error:.2f}% error")

uid_user_1: 0.00% error
uid_user_10: 8.33% error
uid_user_12: 0.00% error
uid_user_13: 11.76% error
uid_user_14: 3.23% error
uid_user_15: 0.00% error
uid_user_16: 20.00% error
uid_user_17: 28.57% error
uid_user_18: 16.67% error
uid_user_19: 15.79% error
uid_user_2: 10.71% error
uid_user_20: 5.00% error
uid_user_21: 0.00% error
uid_user_22: 0.00% error
uid_user_23: 0.00% error
uid_user_24: 9.09% error
uid_user_25: 4.55% error
uid_user_26: 0.00% error
uid_user_27: 50.00% error
uid_user_28: 0.00% error
uid_user_29: 9.09% error
uid_user_3: 14.29% error
uid_user_30: 25.00% error
uid_user_31: 5.56% error
uid_user_4: 7.41% error
uid_user_6: 50.00% error
uid_user_8: 0.00% error
