# Day 09. Exercise 03
# Ensembles

## 0. Imports

In [353]:
from sklearn.model_selection import train_test_split,StratifiedKFold
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier,VotingClassifier,BaggingClassifier,StackingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score,precision_score,recall_score
from sklearn.linear_model import LogisticRegression
import joblib

## 1. Preprocessing

1. Create the same dataframe as in the previous exercise.
2. Using `train_test_split` with parameters `test_size=0.2`, `random_state=21` get `X_train`, `y_train`, `X_test`, `y_test` and then get `X_train`, `y_train`, `X_valid`, `y_valid` from the previous `X_train`, `y_train`. Use the additional parameter `stratify`.

In [354]:
df_x = pd.read_csv('../data/day-of-week-not-scaled.csv')
df_y = pd.read_csv('../data/dayofweek.csv')


In [355]:
X = df_x
y = df_y['dayofweek']


In [356]:
X_train_full, X_test, y_train_full,y_test = train_test_split(X,y,test_size=0.2,random_state=21,stratify=y)

In [357]:
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train_full, y_train_full, test_size=0.2, random_state=21, stratify=y_train_full
)

## 2. Individual classifiers

1. Train SVM, decision tree and random forest again with the best parameters that you got from the 01 exercise with `random_state=21` for all of them.
2. Evaluate `accuracy`, `precision`, and `recall` for them on the validation set.
3. The result of each cell of the section should look like this:

```
accuracy is 0.87778
precision is 0.88162
recall is 0.87778
```

In [358]:
svc = SVC(C=10,class_weight=None,gamma='auto',probability=True,kernel='rbf', random_state=21).fit(X_train,y_train)
y_pred = svc.predict(X_valid)
accuracy = accuracy_score(y_pred,y_valid)
precision = precision_score(y_pred,y_valid,average='weighted')
recall = recall_score(y_pred,y_valid,average='weighted')
print(f'accuracy is {accuracy:.5f}')
print(f'precision is {precision:.5f}')
print(f'recall is {recall:.5f}')

accuracy is 0.87778
precision is 0.88461
recall is 0.87778


In [359]:
dt = DecisionTreeClassifier(class_weight='balanced',criterion='gini',max_depth=22,random_state=21).fit(X_train,y_train)
y_pred = dt.predict(X_valid)
accuracy = accuracy_score(y_pred,y_valid)
precision = precision_score(y_pred,y_valid,average='weighted')
recall = recall_score(y_pred,y_valid,average='weighted')
print(f'accuracy is {accuracy:.5f}')
print(f'precision is {precision:.5f}')
print(f'recall is {recall:.5f}')

accuracy is 0.86667
precision is 0.86892
recall is 0.86667


In [360]:
rf = RandomForestClassifier(n_estimators=100,max_depth=24,class_weight='balanced',criterion='entropy',random_state=21).fit(X_train,y_train)
y_pred = rf.predict(X_valid)
accuracy = accuracy_score(y_pred,y_valid)
precision = precision_score(y_pred,y_valid,average='weighted')
recall = recall_score(y_pred,y_valid,average='weighted')
print(f'accuracy is {accuracy:.5f}')
print(f'precision is {precision:.5f}')
print(f'recall is {recall:.5f}')


accuracy is 0.88519
precision is 0.88875
recall is 0.88519


## 3. Voting classifiers

1. Using `VotingClassifier` and the three models that you have just trained, calculate the `accuracy`, `precision`, and `recall` on the validation set.
2. Play with the other parameteres.
3. Calculate the `accuracy`, `precision` and `recall` on the test set for the model with the best weights in terms of accuracy (if there are several of them with equal values, choose the one with the higher precision).

A voting classifier is a machine learning model that gains experience by training on a collection of several models and forecasts an output (class) based on the class with the highest likelihood of becoming the output. To forecast the output class based on the largest majority of votes, it averages the results of each classifier provided into the voting classifier.

Hard Voting: In hard voting, the predicted output class is a class with the highest majority of votes, i.e., the class with the highest probability of being predicted by each classifier. For example, let's say classifiers predicted the output classes as (Cat, Dog, Dog). As the classifiers predicted class "dog" a maximum number of times, we will proceed with Dog as our final prediction.

Soft Voting: In this, the average probabilities of the classes determine which one will be the final prediction. For example, let's say the probabilities of the class being a "dog" is (0.30, 0.47, 0.53) and a "cat" is (0.20, 0.32, 0.40). So, the average for a class dog is 0.4333, and the cat is 0.3067, from this, we can confirm our final prediction to be a dog as it has the highest average probability.

In [361]:
voting_clf = VotingClassifier(
    estimators=
    [('svc',svc),('dt',dt),('rf',rf)],
    voting='hard')
voting_clf.fit(X_train,y_train)
y_pred = voting_clf.predict(X_valid)
accuracy = accuracy_score(y_pred,y_valid)
precision = precision_score(y_pred,y_valid,average='weighted')
recall = recall_score(y_pred,y_valid,average='weighted')
print(f'accuracy is {accuracy:.5f}')
print(f'precsion is {precision:.5f}')
print(f'recall is {recall:.5f}')

accuracy is 0.90000
precsion is 0.90185
recall is 0.90000


In [362]:
voting_clf_soft =VotingClassifier(
    estimators=
    [('svc',svc),('dt',dt),('rf',rf)],
    voting='soft'
)
voting_clf_soft.fit(X_train,y_train)
y_pred = voting_clf_soft.predict(X_valid)
accuracy = accuracy_score(y_pred,y_valid)
precision = precision_score(y_pred,y_valid,average='weighted')
recall = recall_score(y_pred,y_valid,average='weighted')
print(f'accuracy is {accuracy:.5f}')
print(f'precsion is {precision:.5f}')
print(f'recall is {recall:.5f}')


accuracy is 0.88519
precsion is 0.88599
recall is 0.88519


In [363]:

best_score = 0
best_metrics = {}
best_model = None
best_weight = None
best_param = {}

weight_options = [
    (1, 1, 1),
    (2, 1, 1), (1, 2, 1), (1, 1, 2),
    (2, 2, 1), (2, 1, 2), (1, 2, 2),
    (3, 1, 1), (1, 3, 1), (1, 1, 3),
    (4, 1, 1), (1, 4, 1),
    (4, 1, 4), (4, 4, 1), (1, 4, 4)

]



voting_options = ['hard', 'soft']

for weight in weight_options:
    for voting in voting_options:
        model = VotingClassifier(
            estimators=
            [('svc',svc),('dt',dt),('rf',rf)],
            voting= voting,
            weights=weight
        )
        model.fit(X_train,y_train)
        y_pred = model.predict(X_valid)
        accuracy = accuracy_score(y_pred,y_valid)
        precision = precision_score(y_pred,y_valid,average='weighted')
        recall = recall_score(y_pred,y_valid,average='weighted')

        if accuracy > best_score or (accuracy == best_score and precision > best_metrics.get('precision',0)):
            best_score = accuracy
            best_model = model
            best_param = {'voting':voting,'weights':weight}
            best_metrics = {'accuracy':accuracy,'precision':precision,'recall':recall}



print('Best Voting Classifier')
print(f'best param used is {best_param}')
print(f'accuracy is {best_metrics['accuracy']:.5f}')
print(f'precision is {best_metrics['precision']:.5f}')
print(f'recall is {best_metrics['recall']:.5f}')

Best Voting Classifier
best param used is {'voting': 'soft', 'weights': (4, 1, 4)}
accuracy is 0.90741
precision is 0.91275
recall is 0.90741


## 4. Bagging classifiers

1. Using `BaggingClassifier` and `SVM` with the best parameters create an ensemble, try different values of the `n_estimators`, use `random_state=21`.
2. Play with the other parameters.
3. Calculate the `accuracy`, `precision`, and `recall` for the model with the best parameters (in terms of accuracy) on the test set (if there are several of them with equal values, choose the one with the higher precision)

#### Bagging Classifier
Bagging or Bootstrap aggregating is a type of ensemble learning in which multiple base models are trained independently and parallelly on different subsets of training data. Each subset is generated using bootstrap sampling in which data points are picked at randomly with replacement. In bagging classifier the final prediction is made by aggregating the predictions of all base model using majority voting. In the models of regression the final prediction is made by averaging the predictions of the all base model and that is known as bagging regression.

In [364]:
bagging_clf = BaggingClassifier(svc,n_estimators=50,
                                random_state=21)


bagging_clf.fit(X_train,y_train)
y_pred = bagging_clf.predict(X_valid)
accuracy = accuracy_score(y_pred,y_valid)
precision = precision_score(y_pred,y_valid,average='weighted')
recall = recall_score(y_pred,y_valid,average='weighted')
print(f'accuracy is {accuracy:.5f}')
print(f'precsion is {precision:.5f}')
print(f'recall is {recall:.5f}')

accuracy is 0.88148
precsion is 0.89042
recall is 0.88148


In [365]:

best_score = 0
best_metrics = {}
best_model = None
best_param = None

param_grid = [
    {'n_estimators':50,'max_samples':1.0,'max_features':1.0},
    {'n_estimators':100,'max_samples':0.8,'max_features':0.8},
    {'n_estimators':150,'max_samples':0.9,'max_features':1.0},
    {'n_estimators':150,'max_samples':1.0,'max_features':1.0},
    ]

for param in param_grid:
    model = BaggingClassifier(svc,
                              n_estimators=param['n_estimators'],
                              max_samples=param['max_samples'],
                              max_features=param['max_features'],
                              random_state=21,
                              n_jobs=8)
    
    model.fit(X_train_full,y_train_full)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_pred,y_test)
    precision = precision_score(y_pred,y_test,average='weighted')
    recall = recall_score(y_pred,y_test,average='weighted')

    if accuracy > best_score or (accuracy == best_score and precision > best_metrics.get('precision',0)):
        best_score = accuracy
        best_model = model
        best_param = param
        best_metrics = {'accuracy':accuracy,
                        'precision':precision,
                        'recall':recall}
        
print('Best Bagging classifier')
print(f'best params used is {best_param}')
print(f'accuracy is {best_metrics['accuracy']}')
print(f'precision is {best_metrics['precision']}')
print(f'recall is {best_metrics['recall']}')

Best Bagging classifier
best params used is {'n_estimators': 50, 'max_samples': 1.0, 'max_features': 1.0}
accuracy is 0.908284023668639
precision is 0.9130698211142415
recall is 0.908284023668639


## 5. Stacking classifiers

1. To achieve reproducibility in this case you will have to create an object of cross-validation generator: `StratifiedKFold(n_splits=n, shuffle=True, random_state=21)`, where `n` you will try to optimize (the details are below).
2. Using `StackingClassifier` and the three models that you have recently trained, calculate the `accuracy`, `precision` and `recall` on the validation set, try different values of `n_splits` `[2, 3, 4, 5, 6, 7]` in the cross-validation generator and parameter `passthrough` in the classifier itself,
3. Calculate the `accuracy`, `precision`, and `recall` for the model with the best parameters (in terms of accuracy) on the test set (if there are several of them with equal values, choose the one with the higher precision). Use `final_estimator=LogisticRegression(solver='liblinear')`.

Stacking is a ensemble learning technique where the final model known as the “stacked model" combines the predictions from multiple base models. The goal is to create a stronger model by using different models and combining them.

In [366]:
estimators= [('svc',svc),('dt',dt),('rf',rf)]

best_score = 0

results = []
best_metrics = {}
best_param = {}
final_estimator = LogisticRegression(solver='liblinear')

for n in [2,3,4,5,6,7]:

# for n in [4]:
    cv = StratifiedKFold(n_splits=n,random_state=21,shuffle=True)
    for passthrough in [True,False]:
        stacking_clf = StackingClassifier(
            estimators=estimators,
            cv=cv,
            final_estimator=final_estimator,
            passthrough=passthrough,
            n_jobs=8
        )

        stacking_clf.fit(X_train,y_train)
        y_pred = stacking_clf.predict(X_valid)
        accuracy = accuracy_score(y_pred,y_valid)
        precision = precision_score(y_pred,y_valid,average='weighted')
        recall = recall_score(y_pred,y_valid,average='weighted')

        if accuracy > best_score or (accuracy == best_score and precision > best_metrics.get('precision',0)):
            best_score = accuracy

            best_param = {'n_splits':n, 'passthrough':passthrough}
            best_metrics = {'accuracy':accuracy,'precision':precision,'recall':recall}

print("Best Stacking Classifier")
print(f'Best params used is {best_param}')
print(f'accuracy is {best_metrics['accuracy']}')
print(f'precision is {best_metrics['precision']}')
print(f'recall is {best_metrics['recall']}')


Best Stacking Classifier
Best params used is {'n_splits': 4, 'passthrough': False}
accuracy is 0.9111111111111111
precision is 0.9148447718623157
recall is 0.9111111111111111


## 6. Predictions

1. Choose the best model in terms of accuracy (if there are several of them with equal values, choose the one with the higher precision).
2. Analyze: for which weekday your model makes the most errors (in % of the total number of samples of that class in your full dataset), for which labname and for which users.
3. Save the model.

In [367]:
bagging_clf = BaggingClassifier(svc,n_estimators=50,
                                random_state=21)


bagging_clf.fit(X_train,y_train)
y_pred = bagging_clf.predict(X_test)
accuracy = accuracy_score(y_pred,y_test)
precision = precision_score(y_pred,y_test,average='weighted')
recall = recall_score(y_pred,y_test,average='weighted')
print("Bagging Classifier")
print(f'accuracy is {accuracy:.5f}')
print(f'precsion is {precision:.5f}')
print(f'recall is {recall:.5f}')

Bagging Classifier
accuracy is 0.88462
precsion is 0.89252
recall is 0.88462


In [368]:
estimators= [('svc',svc),('dt',dt),('rf',rf)]

final_estimator = LogisticRegression(solver='liblinear')

n = 4

cv = StratifiedKFold(n_splits=n,random_state=21,shuffle=True)

stacking_clf = StackingClassifier(
    estimators=estimators,
    cv=cv,
    final_estimator=final_estimator,
    passthrough=True,
    n_jobs=8
    )

stacking_clf.fit(X_train,y_train)
y_pred = stacking_clf.predict(X_test)
accuracy = accuracy_score(y_pred,y_test)
precision = precision_score(y_pred,y_test,average='weighted')
recall = recall_score(y_pred,y_test,average='weighted')

     

print("Stacking Classifier")
print(f'accuracy is {accuracy:.5f}')
print(f'precsion is {precision:.5f}')
print(f'recall is {recall:.5f}')


Stacking Classifier
accuracy is 0.90828
precsion is 0.91306
recall is 0.90828


In [369]:
voting_clf = VotingClassifier(
    estimators=
    [('svc',svc),('dt',dt),('rf',rf)],
    voting='hard',
    weights=(4,1,4))
voting_clf.fit(X_train,y_train)
y_pred = voting_clf.predict(X_test)
accuracy = accuracy_score(y_pred,y_test)
precision = precision_score(y_pred,y_test,average='weighted')
recall = recall_score(y_pred,y_test,average='weighted')

print('Best model is Voting Classifier')
print(f'accuracy is {accuracy:.5f}')
print(f'precsion is {precision:.5f}')
print(f'recall is {recall:.5f}')

Best model is Voting Classifier
accuracy is 0.91124
precsion is 0.91370
recall is 0.91124


In [370]:
results = pd.DataFrame({'true': y_test, 'pred': y_pred})
results

Unnamed: 0,true,pred
1087,1,1
16,5,5
563,6,6
1381,3,3
1199,2,2
...,...,...
1411,3,3
1079,1,1
1222,2,2
1064,1,1


In [371]:

weekday = {0: "Monday", 1: "Tuesday", 2: "Wednesday", 3: "Thursday",
           4: "Friday", 5: "Saturday", 6: "Sunday"}


errors = results[results['true'] != results['pred']]['true'].value_counts()
totals = results['true'].value_counts()

errors


true
5    7
0    7
6    5
1    3
4    3
2    3
3    2
Name: count, dtype: int64

In [372]:
error_percent = (errors / totals * 100).round(2).fillna(0)

print("Misclassification percentages per weekday:")
for day_num in error_percent.sort_values(ascending=False).index:
    print(f"{weekday[day_num]}: {error_percent[day_num]}%")

worst_day = error_percent.idxmax()
print(f"\nWeekday with most errors: {weekday[worst_day]} ({error_percent[worst_day]}%)")

Misclassification percentages per weekday:
Monday: 25.93%
Friday: 14.29%
Saturday: 12.96%
Wednesday: 10.0%
Sunday: 7.04%
Tuesday: 5.45%
Thursday: 2.5%

Weekday with most errors: Monday (25.93%)


In [373]:
lab_cols = [col for col in X_test.columns if col.startswith('labname_')]
results['labname'] = X_test[lab_cols].idxmax(axis=1).str.replace('labname_', '')

In [374]:
lab_errors = results[results['true'] != results['pred']]['labname'].value_counts()
lab_totals = results['labname'].value_counts()
lab_error_percent = (lab_errors / lab_totals * 100).round(2).fillna(0)
worst_lab = lab_error_percent.idxmax()
print(f"Lab with most errors: {worst_lab} ({lab_error_percent[worst_lab]}%)")

Lab with most errors: lab03 (100.0%)


In [375]:
user_cols = [col for col in X_test.columns if col.startswith('uid_user_')]
results['user'] = X_test[user_cols].idxmax(axis=1)

In [376]:
user_errors = results[results['true'] != results['pred']]['user'].value_counts()
user_totals = results['user'].value_counts()
user_error_percent = (user_errors / user_totals * 100).round(2).fillna(0)
worst_user = user_error_percent.idxmax()
print(f"User with most errors: {worst_user} ({user_error_percent[worst_user]}%)")


User with most errors: uid_user_6 (50.0%)


In [377]:
joblib.dump(best_model,"../data/VotingClassifier_bestmodel.pkl")

['../data/VotingClassifier_bestmodel.pkl']