# Models Training 

- In this notebook, we will train various models and measure their performance by $MAE$, $MSE$, and $R^2$.
- As the purpose of this project is to predict `popularity`, we decided that a regression model is the most appropriate. 
- Our selected models are the following:
    1. Decision Tree
    2. AdaBoost
    3. Random Forest
    4. Gradient Boosting (scikit-learn)
    5. Hist Gradient Boosting (scikit-learn)
    6. XGBoost
    7. LightGBM 
    8. CatBoost
    9. K-Nearest Neighbors (KNN)
    10. Multilayer Perceptron (MLP)

---

## Preparing Helper Functions

In [2]:
import pandas as pd
import numpy as np
import seaborn as sb
import matplotlib.pyplot as plt
import sklearn
import math

In [72]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Normalizer

def split_data(df, LDS=False, S=False, N=False, random_state=42):
    """Split ``df`` into 80/20 train/test features and targets.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``popularity`` column, which is used as the target.
        When ``LDS`` is true it must also contain a ``weight`` column holding
        the per-sample weights.
    LDS : bool, default False
        When true, the weights are split together with the data, the features
        are row-normalized with ``Normalizer`` and six values are returned
        instead of four.
    S, N : bool, default False
        Currently unused; kept so existing calls keep working.
    random_state : int or None, default 42
        Seed forwarded to ``train_test_split`` so both branches shuffle the
        rows the same, reproducible way. Pass ``None`` for an unseeded split.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)`` when ``LDS`` is false, otherwise
        ``(X_train, X_test, y_train, y_test, weights_train, weights_test)``
        with ``X_train``/``X_test`` being the normalized arrays.
    """
    y = df['popularity']
    # `weight` is a per-sample weight, not a predictor: leaving it in X would
    # hand it to every model as a feature. `errors='ignore'` keeps frames
    # without a `weight` column (the non-LDS datasets) working.
    X = df.drop(columns=['popularity', 'weight'], errors='ignore')

    if not LDS:
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=random_state
        )
        return X_train, X_test, y_train, y_test

    # Split the weights alongside the data so they stay aligned row by row.
    X_train, X_test, y_train, y_test, weights_train, weights_test = train_test_split(
        X, y, df['weight'], test_size=0.2, random_state=random_state
    )
    normalizer = Normalizer()
    normalizer.fit(X_train)
    X_train = normalizer.transform(X_train)
    X_test = normalizer.transform(X_test)
    return X_train, X_test, y_train, y_test, weights_train, weights_test

In [73]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

def model_performace(model_name, y_test, y_pred, weights_test=None):
    """Print MAE, MSE and R^2 of ``y_pred`` against ``y_test``.

    ``weights_test`` is forwarded as ``sample_weight`` to each metric; leave
    it as ``None`` for unweighted scores. Nothing is returned.
    """
    # Every metric shares the same call signature, so evaluate them in order.
    mae, mse, r2 = (
        metric(y_test, y_pred, sample_weight=weights_test)
        for metric in (mean_absolute_error, mean_squared_error, r2_score)
    )
    print(f"Model Performance ({model_name}):\nMAE = {mae}\nMSE = {mse}\nR^2 = {r2}\n")

---

## Models

In [74]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import HistGradientBoostingRegressor
from xgboost import XGBRegressor
import lightgbm as lgb
from catboost import CatBoostRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor

In [81]:
class Models:
    """Train and evaluate a fixed suite of regressors on one dataset.

    Instantiating the class splits ``df`` with ``split_data``, prepares
    standardized and normalized copies of the features and then immediately
    fits every model, printing its metrics (nothing is returned or stored
    besides the splits).

    Parameters
    ----------
    df : pandas.DataFrame
        Dataset handed to ``split_data``.
    LDS : bool, default False
        When true, a second, weighted split is created and every model that
        goes through ``run_model`` is additionally fitted with sample weights
        and scored with weighted metrics.
    """

    def __init__(self, df, LDS=False):
        self.df = df
        self.LDS = LDS
        self.X_train, self.X_test, self.y_train, self.y_test = split_data(df)

        # Fit the scaler on the training split only, then apply it to both
        # splits. Used by the MLP.
        scaler = StandardScaler()
        scaler.fit(self.X_train)
        self.X_train_standardized = scaler.transform(self.X_train)
        self.X_test_standardized = scaler.transform(self.X_test)

        # Normalized copies of the features. Used by KNN.
        normalizer = Normalizer()
        normalizer.fit(self.X_train)
        self.X_train_normalized = normalizer.transform(self.X_train)
        self.X_test_normalized = normalizer.transform(self.X_test)

        if self.LDS:
            (
                self.X_train_LDS,
                self.X_test_LDS,
                self.y_train_LDS,
                self.y_test_LDS,
                self.w_train,
                self.w_test,
            ) = split_data(df, LDS=True)

        self.DecisionTree()
        self.AdaBoost()
        self.RandomForest()
        self.GB()
        self.HistGB()
        self.XGBoost()
        self.LGBM()
        self.CatBoost()
        self.KNN()
        self.MLP()

    def run_model(self, regr, model_name):
        """Fit ``regr`` on the plain split and print its test metrics.

        When ``self.LDS`` is set, the same estimator is then refitted on the
        weighted split (``sample_weight=self.w_train``) and scored again with
        the test weights, reported as ``"<model_name> (with LDS)"``.
        """
        regr.fit(self.X_train, self.y_train)
        y_pred = regr.predict(self.X_test)
        model_performace(model_name=f"{model_name}", y_test=self.y_test, y_pred=y_pred)
        if not self.LDS:
            return
        regr.fit(self.X_train_LDS, self.y_train_LDS, sample_weight=self.w_train)
        y_pred = regr.predict(self.X_test_LDS)
        # Call the reporting helper by its actual name, `model_performace`;
        # the previously used `model_performance` is not defined and raised
        # NameError as soon as LDS was enabled.
        model_performace(
            model_name=f"{model_name} (with LDS)",
            y_test=self.y_test_LDS,
            y_pred=y_pred,
            weights_test=self.w_test,
        )

    def DecisionTree(self):
        """Evaluate a default ``DecisionTreeRegressor``."""
        regr = DecisionTreeRegressor()
        self.run_model(regr, "Decision Tree")

    def AdaBoost(self):
        """Evaluate AdaBoost with 50 decision-tree base estimators."""
        regr = AdaBoostRegressor(DecisionTreeRegressor(), n_estimators=50)
        self.run_model(regr, "AdaBoost")

    def RandomForest(self):
        """Evaluate a 100-tree random forest."""
        regr = RandomForestRegressor(n_estimators=100)
        self.run_model(regr, "Random Forest")

    def GB(self):
        """Evaluate scikit-learn gradient boosting with 100 estimators."""
        regr = GradientBoostingRegressor(n_estimators=100)
        self.run_model(regr, "Gradient Boosting")

    def HistGB(self):
        """Evaluate histogram-based gradient boosting (100 iterations)."""
        regr = HistGradientBoostingRegressor(max_iter=100)
        self.run_model(regr, "Hist Gradient Boosting")

    def XGBoost(self):
        """Evaluate XGBoost with squared-error objective and 100 estimators."""
        regr = XGBRegressor(objective='reg:squarederror', n_estimators=100)
        self.run_model(regr, "XGBoost")

    def LGBM(self):
        """Evaluate a default LightGBM regressor on the plain split only."""
        regr = lgb.LGBMRegressor()
        # NOTE(review): the test split is passed as the evaluation set. No
        # early stopping is configured here, so this presumably only logs the
        # metric - confirm, and prefer a separate validation split if early
        # stopping is ever added.
        regr.fit(self.X_train, self.y_train, eval_set=[(self.X_test, self.y_test)], eval_metric='mse')
        y_pred = regr.predict(self.X_test, num_iteration=regr.best_iteration_)
        model_performace(model_name="LightGBM", y_test=self.y_test, y_pred=y_pred)

    def CatBoost(self):
        """Evaluate a default CatBoost regressor on the plain split only."""
        regr = CatBoostRegressor(verbose=0)
        # NOTE(review): with use_best_model=True the iteration count is chosen
        # on eval_set, which here is the test split, so the scores printed
        # below are likely optimistic. Consider carving a validation split out
        # of the training data instead.
        regr.fit(self.X_train, self.y_train, eval_set=(self.X_test, self.y_test), use_best_model=True)
        y_pred = regr.predict(self.X_test)
        model_performace(model_name="CatBoost", y_test=self.y_test, y_pred=y_pred)

    def KNN(self):
        """Evaluate distance-weighted 5-NN on the normalized features."""
        regr = KNeighborsRegressor(n_neighbors=5, weights='distance')
        regr.fit(self.X_train_normalized, self.y_train)
        y_pred = regr.predict(self.X_test_normalized)
        model_performace(model_name="K-Nearest Neighbors", y_test=self.y_test, y_pred=y_pred)

    def MLP(self):
        """Evaluate a two-hidden-layer MLP on the standardized features."""
        params = {
            'hidden_layer_sizes': [10, 10],
            'activation': 'relu',
            'solver': 'adam',
            'alpha': 0.0,
            'batch_size': 10,
            'random_state': 0,
            'tol': 0.0001,
            'nesterovs_momentum': False,
            'learning_rate': 'constant',
            'learning_rate_init': 0.01,
            'max_iter': 1000,
            'shuffle': True,
            'n_iter_no_change': 50,
            'verbose': False,
        }
        regr = MLPRegressor(**params)
        regr.fit(self.X_train_standardized, self.y_train)
        y_pred = regr.predict(self.X_test_standardized)
        model_performace(model_name="Multilayer Perceptron", y_test=self.y_test, y_pred=y_pred)

In [82]:
# Load the prepared dataset, then train and report every model on it
# (constructing Models runs the whole suite).
clean_df = pd.read_csv(filepath_or_buffer="./data/data_smogn_05.csv")
model = Models(df=clean_df)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000421 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2550
[LightGBM] [Info] Number of data points in the train set: 85095, number of used features: 10
[LightGBM] [Info] Start training from score 58.119126
Model Performance (LightGBM):
MAE = 15.355236282429157
MSE = 366.60797458236794
R^2 = 0.6442629189061209

Model Performance (CatBoost):
MAE = 14.165357546553388
MSE = 325.87661093465897
R^2 = 0.6837864901800835

Model Performance (K-Nearest Neighbors):
MAE = 11.076730730513027
MSE = 306.73285513191905
R^2 = 0.7023625831256848

Model Performance (Multilayer Perceptron):
MAE = 16.253345010523233
MSE = 422.04907163627735
R^2 = 0.590465796622931

