# Model Exploration - Multiclass (ML50-2023)

## __Importing libraries__

In [13]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from sklearn.pipeline import Pipeline
from imblearn.pipeline import Pipeline as ImbPipeline
from scipy.stats import randint
import time 
import random

# Sklearn
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.feature_selection import RFE
from sklearn.model_selection import RandomizedSearchCV
from sklearn.utils import resample

# Model imports
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

# Metrics imports
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report


#### Importing the data

In [81]:
train = pd.read_csv('../Data/train_multiclass.csv', index_col=0)
test = pd.read_csv('../Data/test_multiclass.csv', index_col=0)
b_target = pd.read_csv('../Data/train_cleaned.csv', index_col=0)['b_target']

# Droping the target from the train and test
c_target = train['c_target']
train = train.drop('c_target', axis=1)  
test = test.drop('c_target', axis=1)

### __Small processing__

## __Functions__

In [65]:
def get_train_val(X_, y_, test_size=0.3, scaler=MinMaxScaler(), sampler='simple', minority_factor=1, verbose=False, return_scaler=False):    
    X = X_.copy()
    y = y_.copy()
    multiclass = False
    # Check if y is multi-class
    if len(y.unique()) > 2:
        multiclass = True
        print('Multiclass variable') if verbose else None

    numeric_cols = [col for col in X.columns if col.startswith('n_')]
    if scaler:
        X[numeric_cols] = scaler.fit_transform(X[numeric_cols])
        
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=test_size, random_state=42, stratify=y)

    # Simple doesnt work for multiclass
    if sampler == 'simple' and multiclass:
        print('Multiclass sampling not supported for simple sampler') if verbose else None
        return None
    
    # Simple sampler 
    if sampler == 'simple':
        print(f'Simple sampling with minority factor {minority_factor}') if verbose else None
        X_res = pd.concat([X_train, y_train], axis=1)
        minority_class = X_res[X_res[y.name] == 1]
        majority_class = X_res[X_res[y.name] == 0]
        # Using the resample function to upsample the minority class
        minority_upsampled = resample(minority_class, replace=True, n_samples=int(len(majority_class)*minority_factor), random_state=42)

        upsampled = pd.concat([majority_class, minority_upsampled])
        X_train = upsampled.drop(columns=y.name)
        y_train = upsampled[y.name]
    
    # Multiclass sampler
    if sampler == 'multiclass':
        print('Using multiclass sampling') if verbose else None
        X_res = pd.concat([X_train, y_train], axis=1)
        max_class_count = X_res[y.name].value_counts().max()
        to_upsample = X_res[X_res[y.name] == X_res[y.name].value_counts().idxmax()]
        # Iterate over each class
        for class_index, group in X_res.groupby(y.name):
            if len(group) < max_class_count:
                # Upsample minority class
                minority_upsampled = resample(group, replace=True, 
                                            n_samples=max_class_count, 
                                            random_state=42)
                # Replace the original samples of the class in the dataframe with the upsampled data
                to_upsample = pd.concat([to_upsample, minority_upsampled])
        X_train = to_upsample.drop(columns=y.name)
        y_train = to_upsample[y.name]
        
    # Using SMOTE to upsample the minority class, or other IMBLEARN samplers
    elif sampler:
        print('Using {} sampler'.format(sampler)) if verbose else None
        X_train, y_train = sampler.fit_resample(X_train, y_train)

    if return_scaler:
        return X_train, X_val, y_train, y_val, scaler
    else:
        return X_train, X_val, y_train, y_val 

In [37]:
def test_model(model, X_, y_, just_score=False, scaler=MinMaxScaler(), test_size=0.3, sampler='simple', minority_factor=1, verbose=False, return_scaler=False, return_X=False, average='weighted'): 
    X = X_.copy()
    y = y_.copy()

    multiclass = False
    # Check if y is multi-class
    if len(y.unique()) > 2:
        multiclass = True
        print('Multiclass variable') if verbose else None
    
    if return_scaler:
        X_train, X_test, y_train, y_test, scaler = get_train_val(X_=X, y_=y, test_size=test_size, scaler=scaler, sampler=sampler, minority_factor=minority_factor, verbose=verbose, return_scaler=return_scaler)
    else:
        X_train, X_test, y_train, y_test = get_train_val(X_=X, y_=y, test_size=test_size, scaler=scaler, sampler=sampler, minority_factor=minority_factor, verbose=verbose, return_scaler=return_scaler)
    
    fit = model.fit(X_train, y_train)
    y_pred = fit.predict(X_test)
    if just_score:
        if multiclass:
            return fit, f1_score(y_test, y_pred, average=average)
        return fit, f1_score(y_test, y_pred)  
    
    print('Accuracy: ', accuracy_score(y_test, y_pred))
    print('Precision: ', precision_score(y_test, y_pred)) if not multiclass else print('Precision: ', precision_score(y_test, y_pred, average=average))
    print('Recall: ', recall_score(y_test, y_pred)) if not multiclass else print('Recall: ', recall_score(y_test, y_pred, average=average))
    print('F1: ', f1_score(y_test, y_pred)) if not multiclass else print('F1: ', f1_score(y_test, y_pred, average=average))
    print('Confusion matrix: \n', confusion_matrix(y_test, y_pred))
    print('Classification report: \n', classification_report(y_test, y_pred))
    if return_scaler and return_X:
        return fit, scaler, X_test, y_test
    elif return_scaler:
        return fit, scaler
    elif return_X:
        return fit, X_test, y_test
    else :
        return fit

## __To help with the models__

### __Feature Elimination__

#### __RFE - Logistic Regression__

In [82]:
test_model(RandomForestClassifier(), train, b_target, scaler=MinMaxScaler(), test_size=0.3, sampler=RandomOverSampler(), minority_factor=1, verbose=False, return_scaler=False, return_X=False)

ValueError: Found input variables with inconsistent numbers of samples: [78303, 71233]

In [16]:
# Perform recursive feature elimination to select the best features
# Create the RFE object and rank each pixel
model = LogisticRegression(solver='lbfgs', multi_class='multinomial', max_iter=1000)
rfe = RFE(model, n_features_to_select=1, verbose=1, step=1)
rfe = rfe.fit(train, c_target)

Fitting estimator with 150 features.
Fitting estimator with 149 features.
Fitting estimator with 148 features.
Fitting estimator with 147 features.
Fitting estimator with 146 features.
Fitting estimator with 145 features.
Fitting estimator with 144 features.
Fitting estimator with 143 features.
Fitting estimator with 142 features.
Fitting estimator with 141 features.
Fitting estimator with 140 features.
Fitting estimator with 139 features.
Fitting estimator with 138 features.
Fitting estimator with 137 features.
Fitting estimator with 136 features.
Fitting estimator with 135 features.
Fitting estimator with 134 features.
Fitting estimator with 133 features.
Fitting estimator with 132 features.
Fitting estimator with 131 features.
Fitting estimator with 130 features.
Fitting estimator with 129 features.
Fitting estimator with 128 features.
Fitting estimator with 127 features.
Fitting estimator with 126 features.
Fitting estimator with 125 features.
Fitting estimator with 124 features.
F

### __Feature Selection__

### __Functions__

## __Models__

## __Conclusion__