This notebook contains the detailed classification pipeline.

- **Author**: Benkirane Ismail
- **Email**: [ibenkirane@mgb.org](mailto:ibenkirane@mgb.org)
- **Version**: 1.0.0
- **Date**: 2023-10-19

## Imports

In [None]:
import sys
import pandas as pd
import numpy as np

from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier, AdaBoostClassifier, StackingClassifier


from xgboost import XGBClassifier

sys.path.append('../')

from utils import CLASSIFIER,  UTILITIES

In [None]:
# Build the two project helpers imported from `utils`.
# `classifier` drives every pipeline call below; `utilities` is not referenced
# by the cells visible in this section.
utilities, classifier = UTILITIES(), CLASSIFIER()

## Get extracted features

In [None]:
# Load the pre-computed feature tables (paths are relative to the notebook's
# working directory). `stand_features` is the table fed to the classifier
# below -- presumably the standardized version of `all_features`; TODO confirm.
all_features, stand_features = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../computed_features/all_features.csv',
        '../computed_features/stand_features.csv',
    )
)

# Subject IDs per analysis group, keyed by group name.
# NOTE(review): subject 1042 is listed in 'Cluster 3' but not in 'All Subjects'
# -- confirm whether that exclusion is intentional.
groups = {
    'Cluster 1': [1003, 1007, 1013, 1015, 1020, 1024, 1026],
    'Cluster 2': [1001, 1031, 1032, 1037, 1039],
    'Cluster 3': [1008, 1017, 1022, 1025, 1033, 1040, 1041, 1042],
    'All Subjects': [
        1001, 1003, 1007, 1008, 1013, 1015, 1017, 1020, 1022, 1024,
        1025, 1026, 1031, 1032, 1033, 1037, 1039, 1040, 1041,
    ],
}

## Express Classification

In [None]:
# "Express" run: hand the standardized features and the full `groups` mapping
# to the project classifier in a single call.
# NOTE(review): `save=True` presumably writes the results somewhere on disk --
# confirm the destination in `utils.CLASSIFIER`.
classification_performances = classifier.get_classification_performances(
    stand_features,
    groups,
    augment_data=False,
    feature_selection=True,
    nb_features=15,
    reduce_dim=True,
    verbose=False,
    save=True,
)

## Detailed Classification

### Training & Testing

In [None]:
# Classification task and subject group used by the detailed run.
# `group` must be one of the keys of `groups`.
objective, group = 'All Emotions', 'All Subjects'

In [None]:
# Build the train/test split for the chosen objective and subject group.
# Available objectives:
#   'Positive vs Negative' 'Neutral vs Non-Neutral' 'Shame vs Others' 'All Emotions'
X_train, y_train, X_test, y_test, selected_features = classifier.get_classification_sets(
    stand_features,
    objective,
    groups[group],
    augment_data=False,
    feature_selection=True,
    nb_features=136,
    reduce_dim=True,
)

##### Random Forest Classifier

In [None]:
# Random-forest hyper-parameters (presumably the outcome of an earlier search
# -- TODO confirm provenance).
best_rf_params = {
    'n_estimators': 50,
    'max_depth': 15,
    'max_features': 'sqrt',
    'min_samples_split': 3,
    'criterion': 'gini',
}

# argmax over axis 1 turns each label row into a single class index
# (assumes y_train / y_test are one-hot encoded -- TODO confirm).
rf_train_labels = np.argmax(y_train, axis=1)
rf_test_labels = np.argmax(y_test, axis=1)

rf_model = RandomForestClassifier(random_state=42, **best_rf_params)
rf_model.fit(X_train, rf_train_labels)

# Score on the held-out set.
rf_predictions = rf_model.predict(X_test)
rf_accuracy = accuracy_score(rf_test_labels, rf_predictions)
print(f"Random Forest Accuracy: {rf_accuracy * 100}%")

##### Support Vector Machine

In [None]:
# SVM hyper-parameters (presumably the outcome of an earlier search -- TODO confirm).
best_svm_params = {
    'C': 1000,
    'kernel': 'rbf',
    'gamma': 'scale',
}

# argmax over axis 1 turns each label row into a single class index
# (assumes y_train / y_test are one-hot encoded -- TODO confirm).
svm_train_labels = np.argmax(y_train, axis=1)
svm_test_labels = np.argmax(y_test, axis=1)

# probability=True exposes predict_proba, which the soft-voting ensemble
# further down relies on.
svm_model = SVC(probability=True, random_state=42, **best_svm_params)
svm_model.fit(X_train, svm_train_labels)

# Score on the held-out set.
svm_predictions = svm_model.predict(X_test)
svm_accuracy = accuracy_score(svm_test_labels, svm_predictions)

print(f"SVM Accuracy: {svm_accuracy * 100}%")

##### Gradient Boosting Machines (XGBoost)

In [None]:
# XGBoost classifier.
# `objective` is deliberately not passed: the scikit-learn wrapper defaults to
# 'binary:logistic' and switches to a multiclass objective on its own when
# `fit` sees more than two classes. Hard-coding 'binary:logistic' was
# misleading for a multiclass objective such as 'All Emotions'.
xgb_model = XGBClassifier(
    n_estimators=5,
    learning_rate=0.1,
    max_depth=1,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=0.005,
    random_state=42
)

# argmax over axis 1 turns each label row into a single class index
# (assumes y_train / y_test are one-hot encoded -- TODO confirm).
xgb_train_labels = np.argmax(y_train, axis=1)
xgb_test_labels = np.argmax(y_test, axis=1)

xgb_model.fit(X_train, xgb_train_labels)

# Score on the held-out set.
xgb_predictions = xgb_model.predict(X_test)
xgb_accuracy = accuracy_score(xgb_test_labels, xgb_predictions)

print(f"XGBoost Accuracy: {xgb_accuracy * 100}%")

##### Ada Boost

In [None]:
# AdaBoost over depth-6 decision trees.
ada_base_tree = DecisionTreeClassifier(max_depth=6)
ada_model = AdaBoostClassifier(
    estimator=ada_base_tree,
    n_estimators=50,
    learning_rate=1,
    random_state=42,
)

# argmax over axis 1 turns each label row into a single class index
# (assumes y_train / y_test are one-hot encoded -- TODO confirm).
ada_train_labels = np.argmax(y_train, axis=1)
ada_test_labels = np.argmax(y_test, axis=1)

ada_model.fit(X_train, ada_train_labels)

# Score on the held-out set.
ada_predictions = ada_model.predict(X_test)
ada_accuracy = accuracy_score(ada_test_labels, ada_predictions)

print(f"AdaBoost Accuracy: {ada_accuracy * 100}%")

##### Combination

In [None]:
# Stacked ensemble: the random forest and XGBoost models are the base
# learners, and a gradient-boosting classifier combines their predictions.
base_models = [('rf', rf_model), ('xgb', xgb_model)]
# Seed the meta-learner like every other model in this notebook so the
# stacking score is reproducible across runs.
meta_model = GradientBoostingClassifier(random_state=42)

# argmax over axis 1 turns each label row into a single class index
# (assumes y_train / y_test are one-hot encoded -- TODO confirm).
stacking_train_labels = np.argmax(y_train, axis=1)
stacking_test_labels = np.argmax(y_test, axis=1)

stacking_model = StackingClassifier(estimators=base_models, final_estimator=meta_model, cv=5)
stacking_model.fit(X_train, stacking_train_labels)

# Score on the held-out set.
stacking_predictions = stacking_model.predict(X_test)
stacking_accuracy = accuracy_score(stacking_test_labels, stacking_predictions)
print(f"Adjusted Stacking Ensemble Accuracy: {stacking_accuracy * 100}%")

In [None]:
# Voting ensemble over all four individual models defined above.
voting_members = [
    ('rf', rf_model),
    ('svm', svm_model),
    ('xgb', xgb_model),
    ('ada', ada_model),
]
voting_clf_extended = VotingClassifier(
    estimators=voting_members,
    voting='soft',  # or 'hard'
)

# argmax over axis 1 turns each label row into a single class index
# (assumes y_train / y_test are one-hot encoded -- TODO confirm).
voting_train_labels = np.argmax(y_train, axis=1)
voting_test_labels = np.argmax(y_test, axis=1)

voting_clf_extended.fit(X_train, voting_train_labels)

# Score on the held-out set.
voting_predictions_extended = voting_clf_extended.predict(X_test)
voting_accuracy_extended = accuracy_score(voting_test_labels, voting_predictions_extended)
print(f"Extended Voting Classifier Accuracy: {voting_accuracy_extended * 100}%")