# Day 09. Exercise 03
# Ensembles

## 0. Imports

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, recall_score, precision_score
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
import joblib

## 1. Preprocessing

1. Create the same dataframe as in the previous exercise.
2. Using `train_test_split` with parameters `test_size=0.2`, `random_state=21` get `X_train`, `y_train`, `X_test`, `y_test` and then get `X_train`, `y_train`, `X_valid`, `y_valid` from the previous `X_train`, `y_train`. Use the additional parameter `stratify`.

In [2]:
df = pd.read_csv("../data/day-of-week-not-scaled.csv")
dayofweek = pd.read_csv("../data/dayofweek.csv")
df['dayofweek'] = dayofweek['dayofweek'].values
df.head()

Unnamed: 0,numTrials,hour,uid_user_0,uid_user_1,uid_user_10,uid_user_11,uid_user_12,uid_user_13,uid_user_14,uid_user_15,...,labname_lab03,labname_lab03s,labname_lab05s,labname_laba04,labname_laba04s,labname_laba05,labname_laba06,labname_laba06s,labname_project1,dayofweek
0,1,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4
1,2,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4
2,3,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4
3,4,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4
4,5,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4


In [3]:
X = df.drop(columns='dayofweek')
y = df['dayofweek']
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=21,
    stratify=y
)

X_train, X_valid, y_train, y_valid = train_test_split(
    X_train, y_train,
    test_size=0.2,
    random_state=21,
    stratify=y_train
)

## 2. Individual classifiers

1. Train SVM, decision tree and random forest again with the best parameters that you got from the 01 exercise with `random_state=21` for all of them.
2. Evaluate `accuracy`, `precision`, and `recall` for them on the validation set.
3. The result of each cell of the section should look like this:

```
accuracy is 0.87778
precision is 0.88162
recall is 0.87778
```

In [4]:
svm_model = SVC(
    probability=True, 
    random_state=21, 
    C=10, 
    class_weight=None,
    gamma='auto',
    kernel='rbf'
)
svm_model.fit(X_train, y_train)
y_valid_pred = svm_model.predict(X_valid)

print(f"accuracy is {accuracy_score(y_valid, y_valid_pred):.5f}")
print(f"precision is {precision_score(y_valid, y_valid_pred, average='weighted'):.5f}")
print(f"recall is {recall_score(y_valid, y_valid_pred, average='weighted'):.5f}")

accuracy is 0.87778
precision is 0.88162
recall is 0.87778


In [5]:
tree_model = DecisionTreeClassifier(
    random_state=21,
    class_weight='balanced',
    criterion='gini',
    max_depth=23
)
tree_model.fit(X_train, y_train)
y_valid_pred = tree_model.predict(X_valid)

print(f"accuracy is {accuracy_score(y_valid, y_valid_pred):.5f}")
print(f"precision is {precision_score(y_valid, y_valid_pred, average='weighted'):.5f}")
print(f"recall is {recall_score(y_valid, y_valid_pred, average='weighted'):.5f}")

accuracy is 0.85926
precision is 0.86306
recall is 0.85926


In [6]:
forest_model = RandomForestClassifier(
    random_state=21, 
    class_weight=None,
    criterion='gini',
    max_depth=28,
    n_estimators=50
)
forest_model.fit(X_train, y_train)
y_valid_pred = forest_model.predict(X_valid)

print(f"accuracy is {accuracy_score(y_valid, y_valid_pred):.5f}")
print(f"precision is {precision_score(y_valid, y_valid_pred, average='weighted'):.5f}")
print(f"recall is {recall_score(y_valid, y_valid_pred, average='weighted'):.5f}")

accuracy is 0.88889
precision is 0.88952
recall is 0.88889


## 3. Voting classifiers

1. Using `VotingClassifier` and the three models that you have just trained, calculate the `accuracy`, `precision`, and `recall` on the validation set.

In [7]:
voting = VotingClassifier([('svc', svm_model), ('tree', tree_model), ('forest', forest_model)])
voting.fit(X_train, y_train)
y_valid_pred = voting.predict(X_valid)

print(f"accuracy is {accuracy_score(y_valid, y_valid_pred):.5f}")
print(f"precision is {precision_score(y_valid, y_valid_pred, average='weighted'):.5f}")
print(f"recall is {recall_score(y_valid, y_valid_pred, average='weighted'):.5f}")

accuracy is 0.88889
precision is 0.88872
recall is 0.88889


2. Play with the other parameteres.

In [8]:
weights_list = [
    (1, 1, 1),
    (2, 1, 1),
    (1, 2, 1),
    (1, 1, 2),
    (1, 2, 3),
    (1, 3, 2),
    (2, 1, 3),
    (2, 3, 1),
    (3, 1, 2), 
    (3, 2, 1)
]

results = []

for weight in weights_list:
    voting = VotingClassifier(
        estimators=[
            ('svc', SVC(probability=True, random_state=21, C=10, class_weight=None, gamma='auto', kernel='rbf')),
            ('tree', DecisionTreeClassifier(random_state=21, class_weight='balanced', criterion='gini', max_depth=23)),
            ('forest', RandomForestClassifier(random_state=21, class_weight=None, criterion='gini', max_depth=28, n_estimators=50))
        ],
        voting='soft',
        weights=weight
    )

    voting.fit(X_train, y_train)
    y_valid_pred = voting.predict(X_valid)

    accuracy = accuracy_score(y_valid, y_valid_pred)
    precision = precision_score(y_valid, y_valid_pred, average='weighted')
    recall = recall_score(y_valid, y_valid_pred, average='weighted')
    
    results.append({
        'weights': weight,
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall
    })

results

[{'weights': (1, 1, 1),
  'accuracy': 0.8777777777777778,
  'precision': 0.8816801525543417,
  'recall': 0.8777777777777778},
 {'weights': (2, 1, 1),
  'accuracy': 0.9037037037037037,
  'precision': 0.9051418757467145,
  'recall': 0.9037037037037037},
 {'weights': (1, 2, 1),
  'accuracy': 0.8592592592592593,
  'precision': 0.8630597652455576,
  'recall': 0.8592592592592593},
 {'weights': (1, 1, 2),
  'accuracy': 0.8888888888888888,
  'precision': 0.8922192012801332,
  'recall': 0.8888888888888888},
 {'weights': (1, 2, 3),
  'accuracy': 0.8814814814814815,
  'precision': 0.885320847679682,
  'recall': 0.8814814814814815},
 {'weights': (1, 3, 2),
  'accuracy': 0.8592592592592593,
  'precision': 0.8630597652455576,
  'recall': 0.8592592592592593},
 {'weights': (2, 1, 3),
  'accuracy': 0.8962962962962963,
  'precision': 0.8969316157973241,
  'recall': 0.8962962962962963},
 {'weights': (2, 3, 1),
  'accuracy': 0.8592592592592593,
  'precision': 0.8630597652455576,
  'recall': 0.859259259259

3. Calculate the `accuracy`, `precision` and `recall` on the test set for the model with the best weights in terms of accuracy (if there are several of them with equal values, choose the one with the higher precision).

In [9]:
sorted_results = sorted(results, key=lambda x: (x['accuracy'], x['precision']), reverse=True)
sorted_results

[{'weights': (2, 1, 1),
  'accuracy': 0.9037037037037037,
  'precision': 0.9051418757467145,
  'recall': 0.9037037037037037},
 {'weights': (3, 1, 2),
  'accuracy': 0.9,
  'precision': 0.9023637434101694,
  'recall': 0.9},
 {'weights': (2, 1, 3),
  'accuracy': 0.8962962962962963,
  'precision': 0.8969316157973241,
  'recall': 0.8962962962962963},
 {'weights': (1, 1, 2),
  'accuracy': 0.8888888888888888,
  'precision': 0.8922192012801332,
  'recall': 0.8888888888888888},
 {'weights': (3, 2, 1),
  'accuracy': 0.8888888888888888,
  'precision': 0.8912888354513078,
  'recall': 0.8888888888888888},
 {'weights': (1, 2, 3),
  'accuracy': 0.8814814814814815,
  'precision': 0.885320847679682,
  'recall': 0.8814814814814815},
 {'weights': (1, 1, 1),
  'accuracy': 0.8777777777777778,
  'precision': 0.8816801525543417,
  'recall': 0.8777777777777778},
 {'weights': (1, 2, 1),
  'accuracy': 0.8592592592592593,
  'precision': 0.8630597652455576,
  'recall': 0.8592592592592593},
 {'weights': (1, 3, 2),

In [10]:
X_train_concat = pd.concat([X_train, X_valid])
y_train_concat = pd.concat([y_train, y_valid])

In [11]:
voting = VotingClassifier(
    estimators=[
        ('svc', SVC(probability=True, random_state=21, C=10, class_weight=None, gamma='auto', kernel='rbf')),
        ('tree', DecisionTreeClassifier(random_state=21, class_weight='balanced', criterion='gini', max_depth=23)),
        ('forest', RandomForestClassifier(random_state=21, class_weight=None, criterion='gini', max_depth=28, n_estimators=50))
    ],
    voting='soft',
    weights=(2, 1, 1)
)
voting.fit(X_train_concat, y_train_concat)
y_pred = voting.predict(X_test)

print(f"accuracy is {accuracy_score(y_test, y_pred):.5f}")
print(f"precision is {precision_score(y_test, y_pred, average='weighted'):.5f}")
print(f"recall is {recall_score(y_test, y_pred, average='weighted'):.5f}")

accuracy is 0.92012
precision is 0.92378
recall is 0.92012


## 4. Bagging classifiers

1. Using `BaggingClassifier` and `SVM` with the best parameters create an ensemble, try different values of the `n_estimators`, use `random_state=21`.

In [12]:
bagging = BaggingClassifier(
    estimator=SVC(
        probability=True, 
        random_state=21, 
        C=10, 
        class_weight=None,
        gamma='auto',
        kernel='rbf'
    ),
    n_estimators=10,
    random_state=21
)

bagging.fit(X_train, y_train)
y_valid_pred = bagging.predict(X_valid)

print(f"accuracy is {accuracy_score(y_valid, y_valid_pred):.5f}")
print(f"precision is {precision_score(y_valid, y_valid_pred, average='weighted'):.5f}")
print(f"recall is {recall_score(y_valid, y_valid_pred, average='weighted'):.5f}")

accuracy is 0.88519
precision is 0.89427
recall is 0.88519


In [13]:
bagging = BaggingClassifier(
    estimator=SVC(
        probability=True, 
        random_state=21, 
        C=10, 
        class_weight=None,
        gamma='auto',
        kernel='rbf'
    ),
    n_estimators=30,
    random_state=21
)

bagging.fit(X_train, y_train)
y_valid_pred = bagging.predict(X_valid)

print(f"accuracy is {accuracy_score(y_valid, y_valid_pred):.5f}")
print(f"precision is {precision_score(y_valid, y_valid_pred, average='weighted'):.5f}")
print(f"recall is {recall_score(y_valid, y_valid_pred, average='weighted'):.5f}")

accuracy is 0.88889
precision is 0.89718
recall is 0.88889


2. Play with the other parameters.

In [14]:
param_combinations = [
    {'n_estimators': 30, 'max_samples': 0.5, 'max_features': 0.5},
    {'n_estimators': 30, 'max_samples': 0.5, 'max_features': 0.7},
    {'n_estimators': 30, 'max_samples': 0.5, 'max_features': 1.0},
    {'n_estimators': 30, 'max_samples': 0.7, 'max_features': 0.5},
    {'n_estimators': 30, 'max_samples': 0.7, 'max_features': 0.7},
    {'n_estimators': 30, 'max_samples': 0.7, 'max_features': 1.0},
    {'n_estimators': 30, 'max_samples': 1.0, 'max_features': 0.5},
    {'n_estimators': 30, 'max_samples': 1.0, 'max_features': 0.7},
    {'n_estimators': 30, 'max_samples': 1.0, 'max_features': 1.0},
]

for params in param_combinations:
    bagging = BaggingClassifier(
    estimator=SVC(
        probability=True, 
        random_state=21, 
        C=10, 
        class_weight=None,
        gamma='auto',
        kernel='rbf'
    ),
    random_state=21,
    **params
    )

    bagging.fit(X_train, y_train)
    y_valid_pred = bagging.predict(X_valid)

    print(f"params: {params}")
    print(f"accuracy is {accuracy_score(y_valid, y_valid_pred):.5f}")
    print(f"precision is {precision_score(y_valid, y_valid_pred, average='weighted'):.5f}")
    print(f"recall is {recall_score(y_valid, y_valid_pred, average='weighted'):.5f}")

params: {'n_estimators': 30, 'max_samples': 0.5, 'max_features': 0.5}
accuracy is 0.65556
precision is 0.74354
recall is 0.65556
params: {'n_estimators': 30, 'max_samples': 0.5, 'max_features': 0.7}
accuracy is 0.74074
precision is 0.79057
recall is 0.74074
params: {'n_estimators': 30, 'max_samples': 0.5, 'max_features': 1.0}
accuracy is 0.82222
precision is 0.83976
recall is 0.82222
params: {'n_estimators': 30, 'max_samples': 0.7, 'max_features': 0.5}
accuracy is 0.68148
precision is 0.75594
recall is 0.68148
params: {'n_estimators': 30, 'max_samples': 0.7, 'max_features': 0.7}
accuracy is 0.78889
precision is 0.81714
recall is 0.78889
params: {'n_estimators': 30, 'max_samples': 0.7, 'max_features': 1.0}
accuracy is 0.86296
precision is 0.87310
recall is 0.86296
params: {'n_estimators': 30, 'max_samples': 1.0, 'max_features': 0.5}
accuracy is 0.74815
precision is 0.78441
recall is 0.74815
params: {'n_estimators': 30, 'max_samples': 1.0, 'max_features': 0.7}
accuracy is 0.84074
precisi

3. Calculate the `accuracy`, `precision`, and `recall` for the model with the best parameters (in terms of accuracy) on the test set (if there are several of them with equal values, choose the one with the higher precision)

In [15]:
best_model = BaggingClassifier(
    estimator=SVC(
        probability=True, 
        random_state=21, 
        C=10, 
        class_weight=None,
        gamma='auto',
        kernel='rbf'
    ),
    n_estimators=30,
    max_samples=1.0,
    max_features=1.0,
    random_state=21
)

best_model.fit(X_train_concat, y_train_concat)
y_pred = best_model.predict(X_test)


print(f"accuracy is {accuracy_score(y_test, y_pred):.5f}")
print(f"precision is {precision_score(y_test, y_pred, average='weighted'):.5f}")
print(f"recall is {recall_score(y_test, y_pred, average='weighted'):.5f}")

accuracy is 0.90237
precision is 0.90532
recall is 0.90237


## 5. Stacking classifiers

1. To achieve reproducibility in this case you will have to create an object of cross-validation generator: `StratifiedKFold(n_splits=n, shuffle=True, random_state=21)`, where `n` you will try to optimize (the details are below).
2. Using `StackingClassifier` and the three models that you have recently trained, calculate the `accuracy`, `precision` and `recall` on the validation set, try different values of `n_splits` `[2, 3, 4, 5, 6, 7]` in the cross-validation generator and parameter `passthrough` in the classifier itself,
3. Calculate the `accuracy`, `precision`, and `recall` for the model with the best parameters (in terms of accuracy) on the test set (if there are several of them with equal values, choose the one with the higher precision). Use `final_estimator=LogisticRegression(solver='liblinear')`.

In [16]:
estimators = [
    ('svc', SVC(probability=True, random_state=21, C=10, class_weight=None, gamma='auto', kernel='rbf')),
    ('tree', DecisionTreeClassifier(random_state=21, class_weight='balanced', criterion='gini', max_depth=23)),
    ('forest', RandomForestClassifier(random_state=21, class_weight=None, criterion='gini', max_depth=28, n_estimators=50))
]

n_splits = [2, 3, 4, 5, 6, 7]
final_estimator = LogisticRegression(solver='liblinear')

In [17]:
results = []

for n in n_splits:
    for passthrough in [True, False]:
        generator = StratifiedKFold(n_splits=n, shuffle=True, random_state=21)
        stacking_model = StackingClassifier(
            estimators=estimators,
            cv = generator,
            final_estimator=final_estimator, 
            passthrough=passthrough
        )

        stacking_model.fit(X_train, y_train)
        y_valid_pred = stacking_model.predict(X_valid)

        accuracy = accuracy_score(y_valid, y_valid_pred)
        precision = precision_score(y_valid, y_valid_pred, average='weighted')
        recall = recall_score(y_valid, y_valid_pred, average='weighted')

        results.append({
            'n_splits': n,
            'passthrough': passthrough,
            'accuracy': accuracy,
            'precision': precision,
            'recall': recall
        })

results = sorted(results, key=lambda x: (x['accuracy'], x['precision']), reverse=True)
results
    

[{'n_splits': 5,
  'passthrough': True,
  'accuracy': 0.9111111111111111,
  'precision': 0.9145229215229215,
  'recall': 0.9111111111111111},
 {'n_splits': 5,
  'passthrough': False,
  'accuracy': 0.9111111111111111,
  'precision': 0.913185162319288,
  'recall': 0.9111111111111111},
 {'n_splits': 6,
  'passthrough': True,
  'accuracy': 0.9037037037037037,
  'precision': 0.9059595959595959,
  'recall': 0.9037037037037037},
 {'n_splits': 7,
  'passthrough': True,
  'accuracy': 0.9037037037037037,
  'precision': 0.9058052974522796,
  'recall': 0.9037037037037037},
 {'n_splits': 6,
  'passthrough': False,
  'accuracy': 0.9037037037037037,
  'precision': 0.9048202189937163,
  'recall': 0.9037037037037037},
 {'n_splits': 4,
  'passthrough': True,
  'accuracy': 0.9,
  'precision': 0.9032536559418279,
  'recall': 0.9},
 {'n_splits': 4,
  'passthrough': False,
  'accuracy': 0.9,
  'precision': 0.9030959964293298,
  'recall': 0.9},
 {'n_splits': 2,
  'passthrough': True,
  'accuracy': 0.9,
  'pr

In [18]:
best_model = StackingClassifier(
    estimators=estimators,
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=21),
    final_estimator=final_estimator,
    passthrough=True
)

best_model.fit(X_train_concat, y_train_concat)
y_pred = best_model.predict(X_test)

print(f"accuracy is {accuracy_score(y_test, y_pred):.5f}")
print(f"precision is {precision_score(y_test, y_pred, average='weighted'):.5f}")
print(f"recall is {recall_score(y_test, y_pred, average='weighted'):.5f}")

accuracy is 0.92899
precision is 0.93045
recall is 0.92899


## 6. Predictions

1. Choose the best model in terms of accuracy (if there are several of them with equal values, choose the one with the higher precision).
2. Analyze: for which weekday your model makes the most errors (in % of the total number of samples of that class in your full dataset), for which labname and for which users.
3. Save the model.

In [19]:
df_analyze = X_test.copy()
df_analyze['true'] = y_test
df_analyze['pred'] = y_pred
df_analyze['is_correct'] = df_analyze['true'] == df_analyze['pred']
df_analyze.head()

Unnamed: 0,numTrials,hour,uid_user_0,uid_user_1,uid_user_10,uid_user_11,uid_user_12,uid_user_13,uid_user_14,uid_user_15,...,labname_lab05s,labname_laba04,labname_laba04s,labname_laba05,labname_laba06,labname_laba06s,labname_project1,true,pred,is_correct
1087,67,17,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1,1,True
16,1,13,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,5,5,True
563,14,10,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,6,6,True
1381,20,15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,3,3,True
1199,9,13,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2,2,True


In [20]:
weekday_error = df_analyze.groupby('true')['is_correct'].mean().sort_values(ascending=False) * 100
weekday_error

true
3    97.500000
6    97.183099
2    93.333333
1    90.909091
5    90.740741
4    90.476190
0    77.777778
Name: is_correct, dtype: float64

ОТВЕТ: больше всего модель ошибается по понедельникам

In [21]:
joblib.dump(best_model, 'stacking_model.pkl')

['stacking_model.pkl']