# ML lab 08
# Random Forest and Ensembles

In [1]:
# Mount Google Drive at /content/drive (Colab only); every data and model path below lives under it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from IPython.core.interactiveshell import InteractiveShell
# Show floats in pandas output with 3 decimal places.
pd.set_option('display.precision', 3)
# Display the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

In [3]:
# Extra imports
from pandas import read_csv
from sklearn.metrics import confusion_matrix,\
        accuracy_score, classification_report, f1_score
from sklearn.ensemble import RandomForestClassifier,VotingClassifier,GradientBoostingClassifier,StackingClassifier,ExtraTreesClassifier
from sklearn.naive_bayes import GaussianNB
from matplotlib import pyplot as plt

from time import time
from datetime import timedelta


from joblib import dump, load
import os
import json
import pickle as pkl


In [4]:
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides any library warnings raised while fitting the models below — confirm that is intended.
warnings.filterwarnings('ignore')

In [5]:
def confusion(true, pred):
    """
    Build a labelled, square confusion matrix for pretty printing.

    Parameters
    ----------
    true : array-like
        Ground-truth class labels.
    pred : array-like
        Predicted class labels, paired position by position with ``true``.

    Returns
    -------
    pandas.DataFrame
        Counts with the true classes on the rows (axis named ``target``) and
        the predicted classes on the columns (axis named ``predicted``).
        Both axes carry the same labels in the same order, so the diagonal
        holds the correct predictions. A class that appears on only one side
        (for example one that is never predicted) gets a zero-filled
        row/column instead of being dropped or raising ``KeyError``.
    """
    # Discard any incoming index so the two series are matched by position.
    true = pd.Series(true, name='target').reset_index(drop=True)
    pred = pd.Series(pred, name='predicted').reset_index(drop=True)

    cm = pd.crosstab(true, pred)

    # crosstab only lists the labels it saw on each axis. Reindex both axes to
    # the union of labels so the matrix is always square and aligned.
    all_labels = list(cm.index.union(cm.columns))
    cm = cm.reindex(index=all_labels, columns=all_labels, fill_value=0)

    # Set the axis names explicitly so they do not depend on reindex details.
    cm.index.name = 'target'
    cm.columns.name = 'predicted'
    return cm

In [6]:
# Google Drive folder that holds the saved models (.joblib files) read and written below.
models_dir = '/content/drive/MyDrive/Machine Learning Deliverable/models'

In [7]:
# Load the dataset CSV; its 'genre' column supplies the class labels used below.
df = pd.read_csv('/content/drive/MyDrive/Machine Learning Deliverable/data/spotify_ml_21_5.csv')
df.shape

(37667, 14)

In [8]:
# Load the metrics recorded so far (one row per algorithm); the ensembles below add their rows to this table.
results_df = pd.read_csv('/content/drive/MyDrive/Machine Learning Deliverable/data/ML_results21_5.csv')
results_df.shape
results_df

(19, 17)

Unnamed: 0,Algorithm,Accuracy,F1_Avg,F1_Dark Trap,F1_Emo,F1_Trap Metal,F1_techhouse,F1_Underground Rap,F1_Hiphop,F1_psytrance,F1_techno,F1_hardstyle,F1_trap,F1_trance,F1_RnB,F1_dnb,F1_Rap
0,VotingHard,0.651,0.665,0.524,0.672,0.369,0.852,0.486,0.458,0.898,0.832,0.84,0.776,0.826,0.387,0.955,0.438
1,Stacking,0.654,0.659,0.5,0.669,0.317,0.84,0.535,0.446,0.895,0.817,0.845,0.77,0.826,0.361,0.962,0.445
2,MLP-best,0.654,0.659,0.492,0.653,0.35,0.841,0.535,0.46,0.899,0.826,0.821,0.772,0.819,0.375,0.947,0.436
3,ExtrTrees-best2,0.638,0.655,0.47,0.7,0.329,0.843,0.45,0.4,0.91,0.83,0.852,0.779,0.831,0.419,0.961,0.4
4,ExtrTrees-best,0.637,0.654,0.47,0.698,0.327,0.842,0.45,0.4,0.911,0.827,0.851,0.778,0.831,0.408,0.96,0.404
5,RandomForest-best,0.636,0.654,0.486,0.669,0.366,0.84,0.44,0.405,0.908,0.827,0.84,0.779,0.829,0.396,0.954,0.417
6,RandomForest-default,0.633,0.645,0.477,0.69,0.271,0.846,0.471,0.377,0.907,0.832,0.845,0.778,0.825,0.364,0.955,0.392
7,MLP-default,0.641,0.644,0.502,0.638,0.341,0.801,0.53,0.402,0.884,0.825,0.8,0.759,0.781,0.413,0.929,0.416
8,SVM,0.607,0.63,0.475,0.619,0.393,0.813,0.373,0.41,0.887,0.827,0.779,0.749,0.796,0.403,0.925,0.372
9,ExtraTrees-default,0.619,0.627,0.455,0.68,0.247,0.82,0.478,0.349,0.883,0.827,0.81,0.77,0.782,0.355,0.939,0.379


In [11]:
# Seed NumPy's global random number generator.
np.random.seed(6046)

In [12]:
# Class labels in order of first appearance in the dataset; this order is the order of the per-class F1 values.
labels = list(df['genre'].unique())
# results_df = pd.DataFrame()
def compute_metrics(y_true, y_pred, labels, algorithm_name, results_df=None):
    """
    Score one set of predictions and add it as a row to a results table.

    Parameters
    ----------
    y_true : array-like
        Ground-truth class labels.
    y_pred : array-like
        Predicted class labels.
    labels : list
        Class labels to report; one ``F1_<label>`` column is produced per
        entry, in this order.
    algorithm_name : str
        Value stored in the ``Algorithm`` column of the new row.
    results_df : pandas.DataFrame, optional
        Existing results table. When ``None`` or empty, a new table is
        started.

    Returns
    -------
    pandas.DataFrame
        A new table holding the previous rows followed by the new one, with
        the row index renumbered from 0. The frame passed in is not modified.
        Columns: ``Algorithm``, ``Accuracy``, ``F1_Avg`` (macro average) and
        one ``F1_<label>`` per label.
    """
    accuracy = accuracy_score(y_true, y_pred)
    f1_scores = f1_score(y_true, y_pred, average=None, labels=labels)
    f1_score_avg = f1_score(y_true, y_pred, average='macro')

    # Key every value by its column name. Appending a bare list relies on the
    # existing table having exactly the same column order as `labels`, and
    # silently writes scores under the wrong class when it does not.
    row = {'Algorithm': algorithm_name, 'Accuracy': accuracy, 'F1_Avg': f1_score_avg}
    row.update({f'F1_{label}': score for label, score in zip(labels, f1_scores)})
    new_row = pd.DataFrame([row])

    # Nothing to extend: the single row is the whole table.
    if results_df is None or results_df.empty:
        return new_row

    # concat aligns on column names and never reuses an existing index label,
    # so earlier rows cannot be overwritten.
    return pd.concat([results_df, new_row], ignore_index=True)

In [13]:
# Load the pre-computed train / validation / test splits; each pickle unpacks into a (features, labels) pair.
# NOTE: pickle.load can execute arbitrary code, so only open files from a trusted source.
with open("/content/drive/MyDrive/Machine Learning Deliverable/data/split/train.pkl", "rb") as f:
    X_train, y_train = pkl.load(f)

with open("/content/drive/MyDrive/Machine Learning Deliverable/data/split/val.pkl", "rb") as f:
    X_val, y_val = pkl.load(f)

with open("/content/drive/MyDrive/Machine Learning Deliverable/data/split/test.pkl", "rb") as f:
    X_test, y_test = pkl.load(f)

## Load models

In [None]:
# Load the previously saved base models from models_dir; the ensembles below combine a subset of them.
# NOTE: joblib.load is pickle-based, so only load files from a trusted source.
decision_trees = load(os.path.join(models_dir, "DecisionTrees_best.joblib"))
extra_trees = load(os.path.join(models_dir, "ExtraTrees2_best.joblib"))
random_forest = load(os.path.join(models_dir, "RandomForest_best.joblib"))
qda = load(os.path.join(models_dir, "QDA.joblib"))
lda = load(os.path.join(models_dir, "LDA.joblib"))
knn = load(os.path.join(models_dir, "KNN_best.joblib"))
gauss_nb = load(os.path.join(models_dir, "GaussianNB_best.joblib"))
log_regression = load(os.path.join(models_dir, "LogisticRegression_best.joblib"))
mlp = load(os.path.join(models_dir, "MLP_best.joblib"))
svm = load(os.path.join(models_dir, "SVM.joblib"))

## Ensembles

### Voting Hard

In [None]:
# Majority-vote ensemble (VotingClassifier's default voting mode) over four base models.
# The other loaded candidates (lda, qda, knn, lr, gnb, extratrees) are left out of this ensemble.
init_time = time()
voting_hard = VotingClassifier(
    estimators=[
        ('dt', decision_trees),
        ('rf', random_forest),
        ('mlp', mlp),
        ('svm', svm),
    ],
)
voting_hard.fit(X_train, y_train)

# The elapsed time covers fitting only; prediction on the validation split follows.
print(timedelta(seconds=time() - init_time))
y_pred = voting_hard.predict(X_val)

In [None]:
# Add the hard-voting validation scores to the table and keep it ranked by macro F1, best first.
results_df = compute_metrics(
    y_val, y_pred, labels, "VotingHard", results_df
).sort_values(by='F1_Avg', ascending=False)
results_df

In [None]:
# Persist the hard-voting ensemble to models_dir.
dump(voting_hard, os.path.join(models_dir, 'VotingHard.joblib'))

### Voting Soft

In [None]:
# Soft-voting ensemble over four base models.
# The other loaded candidates (lda, qda, knn, lr, gnb, extratrees) are left out of this ensemble.
init_time = time()
voting_soft = VotingClassifier(
    estimators=[
        ('dt', decision_trees),
        ('rf', random_forest),
        ('mlp', mlp),
        ('svm', svm),
    ],
    voting='soft',
)
voting_soft.fit(X_train, y_train)

# The elapsed time covers fitting only; prediction on the validation split follows.
print(timedelta(seconds=time() - init_time))
y_pred = voting_soft.predict(X_val)

In [None]:
# Add the soft-voting validation scores to the table and keep it ranked by macro F1, best first.
results_df = compute_metrics(
    y_val, y_pred, labels, "VotingSoft", results_df
).sort_values(by='F1_Avg', ascending=False)
results_df

In [None]:
# Persist the soft-voting ensemble to models_dir.
dump(voting_soft, os.path.join(models_dir, 'VotingSoft.joblib'))

In [None]:
# Write the updated results table back to the CSV it was loaded from (the file is overwritten).
# NOTE(review): re-running the ensemble cells adds another row with the same Algorithm name before this save.
results_df.to_csv('/content/drive/MyDrive/Machine Learning Deliverable/data/ML_results21_5.csv', index=False)

### Stacking

In [None]:
# Stacked ensemble: four base models feed a gradient-boosting final estimator.
init_time = time()
stacky = StackingClassifier(
    estimators=[
        ('dt', decision_trees),
        ('rf', random_forest),
        ('mlp', mlp),
        ('svc', svm),
    ],
    final_estimator=GradientBoostingClassifier(),
)
stacky.fit(X_train, y_train)

# The elapsed time covers fitting only; prediction on the validation split follows.
print(timedelta(seconds=time() - init_time))
y_pred = stacky.predict(X_val)

In [None]:
# Add the stacking validation scores to the table and keep it ranked by macro F1, best first.
results_df = compute_metrics(
    y_val, y_pred, labels, "Stacking", results_df
).sort_values(by='F1_Avg', ascending=False)
results_df

In [None]:
# Persist the stacking ensemble to models_dir.
dump(stacky, os.path.join(models_dir, 'Stacking.joblib'))

In [None]:
# Write the updated results table back to the CSV it was loaded from (the file is overwritten).
# NOTE(review): re-running the ensemble cells adds another row with the same Algorithm name before this save.
results_df.to_csv('/content/drive/MyDrive/Machine Learning Deliverable/data/ML_results21_5.csv', index=False)

## Final Model

Load the final model from Google Drive.

In [9]:
# Load the saved final hard-voting ensemble instead of retraining it.
best = load(os.path.join(models_dir, "VotingHardFinal.joblib"))

Or train it on the combined training and validation sets. <br/>
<b>Note:</b> Training took 6 hours on a Mac with an M2 chip and 16 GB of memory.

In [None]:
# Stack the training and validation splits row-wise so the final model is fitted on both.
X_train_val = np.concatenate([X_train, X_val])
y_train_val = np.concatenate([y_train, y_val])

print("Training Voting Hard...")
init_time = time()
voting_hard = VotingClassifier(
    estimators=[
        ('dt', decision_trees),
        ('rf', random_forest),
        ('mlp', mlp),
        ('svc', svm),
    ],
)
voting_hard.fit(X_train_val, y_train_val)

# Report how long the fit took.
print(timedelta(seconds=time() - init_time))

# Save the fitted ensemble so later sessions can load it instead of retraining.
dump(voting_hard, os.path.join(models_dir, 'VotingHardFinal.joblib'))
print("Voting Hard has executed successfully!\n")
best = voting_hard

In [15]:
# Score the final model on the test split. No results_df is passed, so a fresh one-row table is returned.
y_pred = best.predict(X_test)
final_result = compute_metrics(y_test, y_pred, labels, "VotingHard")
final_result

Unnamed: 0,Algorithm,Accuracy,F1_Avg,F1_Dark Trap,F1_Emo,F1_Trap Metal,F1_techhouse,F1_Underground Rap,F1_Hiphop,F1_psytrance,F1_techno,F1_hardstyle,F1_trap,F1_trance,F1_RnB,F1_dnb,F1_Rap
0,VotingHard,0.658,0.674,0.533,0.7,0.361,0.846,0.476,0.471,0.913,0.845,0.848,0.784,0.814,0.444,0.962,0.44


In [16]:
# Save the final model's test metrics to their own CSV file.
final_result.to_csv('/content/drive/MyDrive/Machine Learning Deliverable/data/final_model_result.csv', index=False)