# Ex03 Ensembles

In [1]:
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.ensemble import StackingClassifier, RandomForestClassifier, BaggingClassifier
from sklearn.svm import SVC
from sklearn.metrics import precision_score, recall_score, accuracy_score
from sklearn import metrics
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import BaggingClassifier
import numpy as np
import joblib
from sklearn.tree import DecisionTreeClassifier
import pandas as pd

## 1. Preprocessing

In [2]:
df_s = pd.read_csv('../data/dayofweek.csv')

In [3]:
df = pd.read_csv('../data/day-of-week-not-scaled.csv')
df.head()

Unnamed: 0,numTrials,hour,uid_user_0,uid_user_1,uid_user_10,uid_user_11,uid_user_12,uid_user_13,uid_user_14,uid_user_15,...,labname_lab02,labname_lab03,labname_lab03s,labname_lab05s,labname_laba04,labname_laba04s,labname_laba05,labname_laba06,labname_laba06s,labname_project1
0,1,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,2,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,3,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,4,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,5,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [4]:
df['dayofweek'] = df_s['dayofweek']

In [5]:
X = df.drop(columns='dayofweek')
y = df['dayofweek']

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size =0.2, random_state=21, stratify=y)

## 2. Individual classifiers

In [7]:
svc = SVC(C=10, gamma='auto', class_weight= None, kernel='rbf',probability=True, random_state=21)

In [8]:
svc.fit(X_train, y_train)
pred = svc.predict(X_test)

In [9]:
def print_scores(y_test, pred):
    print(f"accuracy is {accuracy_score(y_test, pred):.5f}")
    print(f"precision is {precision_score(y_test, pred, average='weighted', zero_division=0):.5f}")
    print(f"recall is {recall_score(y_test, pred, average='weighted', zero_division=0):.5f}")

In [10]:
print_scores(y_test,pred)

accuracy is 0.88757
precision is 0.89267
recall is 0.88757


In [11]:
tree = DecisionTreeClassifier(class_weight='balanced', max_depth=30, criterion='gini' ,random_state=21)

In [12]:
tree.fit(X_train, y_train)
tree_pred = tree.predict(X_test)

In [13]:
print_scores(y_test,tree_pred)

accuracy is 0.89349
precision is 0.89620
recall is 0.89349


In [14]:
forest = RandomForestClassifier(class_weight='balanced', criterion='gini', max_depth=30, n_estimators=50,
                       random_state=21)

In [15]:
forest.fit(X_train, y_train)
forest_pred = forest.predict(X_test)

In [16]:
print_scores(y_test,forest_pred)

accuracy is 0.93195
precision is 0.93382
recall is 0.93195


## 3. Voting classifiers

Hard Voting (Жесткое голосование): каждый классификатор предсказывает класс, и итоговый класс выбирается на основе большинства голосов.

Soft Voting (Мягкое голосование): каждый классификатор предсказывает вероятности для каждого класса, и итоговое предсказание вычисляется как среднее вероятностей для каждого класса (выбирается класс с максимальной вероятностью).

In [17]:
voting_clf = VotingClassifier(estimators=[('forest', forest), ('svc', svc), ('tree', tree)], voting='hard')

In [18]:
voting_clf.fit(X_train, y_train)

In [19]:
voting_pred = voting_clf.predict(X_test)

In [20]:
print_scores(y_test,voting_pred)

accuracy is 0.93195
precision is 0.93234
recall is 0.93195


In [21]:
voting_clf_soft = VotingClassifier(estimators=[('forest', forest), ('svc', svc), ('tree', tree)], voting='soft')
voting_clf_soft.fit(X_train, y_train)
voting_soft_pred = voting_clf_soft.predict(X_test)
print_scores(y_test,voting_soft_pred)


accuracy is 0.91716
precision is 0.91908
recall is 0.91716


Calculate the accuracy, precision and recall on the test set for the model with the best weights in terms of accuracy (if there are several of them with equal values, choose the one with the higher precision).

In [22]:
weights_list = [
    [1, 1, 1], 
    [2, 1, 1], # forest
    [1, 2, 1], # svc
    [1, 1, 2]  # tree
]

In [23]:
best_accuracy = 0
best_model = None
best_precision = 0

for weights in weights_list:
    voting = VotingClassifier(estimators=[('forest', forest), ('svc', svc), ('tree', tree)], voting='hard', weights=weights)
    voting.fit(X_train, y_train)
    y_pred = voting.predict(X_test)
    
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted', zero_division=1)
    
    if accuracy > best_accuracy or (accuracy == best_accuracy and precision > best_precision):
        best_accuracy = accuracy
        best_precision = precision
        best_model = weights

Лучшая модель, где модели имеют равные веса

In [24]:
best_model

[1, 1, 1]

In [25]:
best_accuracy

0.9319526627218935

In [26]:
best_precision

0.9323380605872633

## 4. Bagging classifiers

Bagging (Bootstrap Aggregating) — это метод ансамблирования, который использует несколько экземпляров одной и той же модели, обученных на различных подмножествах обучающих данных, и затем агрегирует их результаты. 

In [27]:
n_estimators_list = [10, 50, 100, 200]

In [28]:
best_accuracy = 0
best_precision = 0
best_recall = 0
best_model = None
for n_estimators in n_estimators_list:
    bagging_clf = BaggingClassifier(estimator=svc, n_estimators=n_estimators, random_state=21)
    bagging_clf.fit(X_train, y_train)
    y_pred = bagging_clf.predict(X_test)
    
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted', zero_division=1)
    recall = recall_score(y_test, y_pred, average='weighted', zero_division=1)
    
    if accuracy > best_accuracy or (accuracy == best_accuracy and precision > best_precision):
        best_accuracy = accuracy
        best_precision = precision
        best_recall = recall
        best_model = bagging_clf
        
    print(f"n_estimators={n_estimators} => Accuracy: {accuracy:.5f}, Precision: {precision:.5f}, Recall: {recall:.5f}")

n_estimators=10 => Accuracy: 0.88757, Precision: 0.89182, Recall: 0.88757
n_estimators=50 => Accuracy: 0.90828, Precision: 0.91091, Recall: 0.90828
n_estimators=100 => Accuracy: 0.90828, Precision: 0.91091, Recall: 0.90828
n_estimators=200 => Accuracy: 0.91124, Precision: 0.91379, Recall: 0.91124


In [29]:
best_model

In [30]:
best_accuracy

0.9112426035502958

In [31]:
best_precision

0.9137854259945866

In [32]:
best_recall

0.9112426035502958

## 5. Stacking classifiers

StackingClassifier — это метод ансамблирования, который объединяет несколько базовых классификаторов для улучшения предсказательной способности модели. Этот метод использует стратегию "стэкинга" (stacking), которая заключается в обучении нескольких моделей на обучающих данных, а затем использовании их предсказаний в качестве входных данных для другого классификатора, называемого финальным классификатором.

In [33]:
n_splits_values = [2, 3, 4, 5, 6, 7]

In [34]:
best_accuracy = 0
best_precision = 0
best_recall = 0
best_model = None
for n_splits in n_splits_values:
    for passthrough in [True, False]:
        skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=21)
        
        stacking_clf = StackingClassifier(
            estimators=[('svс', svc), ('forest', forest), ('tree', tree)],
            passthrough=passthrough,
            final_estimator=LogisticRegression(solver='liblinear')
        )
        for train_index, val_index in skf.split(X_train, y_train):
            X_train_fold, X_val_fold = X_train.iloc[train_index], X_train.iloc[val_index]
            y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]
            
            stacking_clf.fit(X_train_fold, y_train_fold)
            y_pred = stacking_clf.predict(X_val_fold)
            
            accuracy = accuracy_score(y_val_fold, y_pred)
            precision = precision_score(y_val_fold, y_pred, average='weighted', zero_division=1)
            recall = recall_score(y_val_fold, y_pred, average='weighted', zero_division=1)

            if accuracy > best_accuracy or (accuracy == best_accuracy and precision > best_precision):
                best_accuracy = accuracy
                best_precision = precision
                best_recall = recall
                best_model = stacking_clf

In [35]:
best_predict = best_model.predict(X_test)
print(f"Best model: {best_model}")
print(f"Best accuracy: {accuracy_score(y_test, best_predict)}")
print(f"Best precision: {precision_score(y_test, best_predict, average='weighted', zero_division=1)}")
print(f"Best recall : {recall_score(y_test, best_predict, average='weighted', zero_division=1)}")


Best model: StackingClassifier(estimators=[('svс',
                                SVC(C=10, gamma='auto', probability=True,
                                    random_state=21)),
                               ('forest',
                                RandomForestClassifier(class_weight='balanced',
                                                       max_depth=30,
                                                       n_estimators=50,
                                                       random_state=21)),
                               ('tree',
                                DecisionTreeClassifier(class_weight='balanced',
                                                       max_depth=30,
                                                       random_state=21))],
                   final_estimator=LogisticRegression(solver='liblinear'))
Best accuracy: 0.893491124260355
Best precision: 0.8967357338075304
Best recall : 0.893491124260355


## 6. Predictions

Analyze: for which weekday your model makes the most errors (in % of the total number of samples of that class in your full dataset), for which labname and for which users.

In [36]:
predict = best_model.predict(X)

In [37]:
df['predict'] = predict

the most errors(weekday): Saturday(5) , 0,7117%

In [38]:
df[df.predict != df.dayofweek].dayofweek.value_counts() / len(predict)

dayofweek
5    0.007117
0    0.005931
6    0.005338
1    0.004745
4    0.004745
2    0.004152
3    0.004152
Name: count, dtype: float64

In [39]:
error = df[df.predict != df.dayofweek]
error.head()

Unnamed: 0,numTrials,hour,uid_user_0,uid_user_1,uid_user_10,uid_user_11,uid_user_12,uid_user_13,uid_user_14,uid_user_15,...,labname_lab03s,labname_lab05s,labname_laba04,labname_laba04s,labname_laba05,labname_laba06,labname_laba06s,labname_project1,dayofweek,predict
20,4,15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,5,6
27,5,15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,5,6
38,6,18,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,5,6
43,7,19,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,5,6
46,8,21,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,5,6


In [40]:
users_list = [i for i in df.columns if i.startswith('uid')]
lab_list = [i for i in df.columns if i.startswith('labname')]

In [41]:
max_error = 0
max_user = ''
for user in users_list:
    error_perc = error[user].sum() / len(predict)
    if error_perc > max_error:
        max_error = error_perc
        max_user = user
print(f"max user error: {max_user}, error percent: {max_error * 100}%")

max user error: uid_user_19, error percent: 0.4744958481613286%


In [42]:
max_error = 0
max_lab = ''
for lab in lab_list:
    error_perc = error[lab].sum() / len(predict)
    if error_perc > max_error:
        max_error = error_perc
        max_lab = lab
print(f"max lab error: {max_lab}, error percent: {max_error * 100}%")

max lab error: labname_project1, error percent: 1.3048635824436536%


In [43]:
joblib.dump(forest, 'best_model_03.joblib')

['best_model_03.joblib']