This implementation of the Random Forests algorithm was done on my home PC CPU. The validation of the model was done calculating the average AUC for every random forest algorithm trained

# Setup

In [None]:
from pathlib import Path
import os

iskaggle = os.environ.get('KAGGLE_KERNEL_RUN_TYPE', '')
if iskaggle:
    !pip install -Uqq fastai
    path = Path('/kaggle/input/playground-series-s4e3')
else:
    import zipfile,kaggle
    path = Path('playground-series-s4e3')
    kaggle.api.competition_download_cli(str(path))
    zipfile.ZipFile(f'{path}.zip').extractall(path)

import pandas as pd
import numpy as np
import warnings

import matplotlib as plt
import seaborn as sns

# from fastai.imports import *
# from fastai.tabular.all import *

# from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_predict, cross_validate, cross_val_score, GridSearchCV
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score, make_scorer, classification_report

# import xgboost as xgb
# from xgboost import XGBClassifier

import optuna

In [None]:
train_df = pd.read_csv(path/'train.csv')
test_df = pd.read_csv(path/'test.csv')
target_classes = ["Pastry", "Z_Scratch", "K_Scatch", "Stains", "Dirtiness", "Bumps", "Other_Faults"]
targets_df = train_df[target_classes]
X_train, X_test, y_train, y_test = train_test_split(train_df.drop(target_classes + ['id'], axis=1), 
                                                    targets_df, test_size=0.1, random_state=40)

categorical = ['TypeOfSteel_A300', 'TypeOfSteel_A400', 'Outside_Global_Index']
numerical = list(set(train_df.columns) - set(categorical) - set(target_classes))
numerical.remove('id')

X = pd.get_dummies(X_train, columns=categorical)
y = y_train
X_t = pd.get_dummies(X_test, columns=categorical)
X_t.insert(len(X_t.columns)-1, 'Outside_Global_Index_0.7', 0)
y_t = y_test

# Baseline model

In [None]:
The SKLearn implementation of the Random Forest algorithm, does not natively support multiclass prediction. So in order to use it for this dataset we needed to train a different classifier for each outcome. This does multiply the compute needed with the number of possible targets.

https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html

In [5]:
%%time

classifiers = []
for i in range(y_train.shape[1]):  
    rf = RandomForestClassifier(n_estimators=500, random_state=42)
    rf.fit(X, y.iloc[:, i])  
    classifiers.append(rf)

CPU times: user 1min 43s, sys: 264 ms, total: 1min 44s
Wall time: 1min 44s


In [6]:
auc_scores = []

for i, clf in enumerate(classifiers):
    y_pred_prob = clf.predict_proba(X_t)[:, 1]
    auc = roc_auc_score(y_t.iloc[:, i], y_pred_prob)
    auc_scores.append(auc)

# Calculate the average AUC score across all labels
average_auc = np.mean(auc_scores)
print(f'Average AUC: {average_auc}')

Average AUC: 0.8812101677924596


#### The only parameter I tweaked here was the n_estimators. I tested this using different numbers of estimators and found the following, performance got better using more estimators, plateauing aroung 500:
AUC for n Estimators, along with how long it took to train them
* 50: 0.868462071012809 (wall time: 11.7s)
* 100: 0.8755977066908903 (wall tim: 23.3s)
* 200: 0.8766660708906941 (wall time: 46.5s)
* 500: 0.8812101677924596 (wall time: 1min 44s)
* 1000: 0.8817353228186906 (wall time: 3min 51s)

# Hyperparameter tuning with Optuna

## Trial 1

In [8]:
def objective(trial):
    # We known that more is probably always better, to a point of diminishing returns, so I just set this at 100 and we can use more when we finally train the model
    # n_estimators = trial.suggest_int('n_estimators', 100, 1000)
    max_depth = trial.suggest_int('max_depth', 10, 100)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 20)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 10)
    max_features = trial.suggest_categorical('max_features', ['sqrt', 'log2'])

    scores = []
    # Train a classifier for each label
    for i in range(y_train.shape[1]):
        X_train_opt, X_test_opt, y_train_opt, y_test_opt = train_test_split(X, y.iloc[:, i], test_size=0.2, random_state=42)
        
        clf = RandomForestClassifier(
            n_estimators=100,
            max_depth=max_depth,
            min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf,
            max_features=max_features,
            random_state=42
        )
        clf.fit(X_train_opt, y_train_opt)
        y_pred_opt = clf.predict(X_test_opt)
        score = roc_auc_score(y_test_opt, y_pred_opt)
        scores.append(score)
    
    # Average score across all classifiers
    average_score = np.mean(scores)
    return average_score

In [9]:
%%time
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)
print('Best hyperparameters:', study.best_params)

[I 2024-05-18 15:56:04,450] A new study created in memory with name: no-name-1bbbe08d-a431-447a-96d7-af38b10d5770
[I 2024-05-18 15:56:18,728] Trial 0 finished with value: 0.6624035249766821 and parameters: {'max_depth': 48, 'min_samples_split': 6, 'min_samples_leaf': 9, 'max_features': 'log2'}. Best is trial 0 with value: 0.6624035249766821.
[I 2024-05-18 15:56:32,442] Trial 1 finished with value: 0.6629278838308331 and parameters: {'max_depth': 13, 'min_samples_split': 18, 'min_samples_leaf': 4, 'max_features': 'log2'}. Best is trial 1 with value: 0.6629278838308331.
[I 2024-05-18 15:56:48,414] Trial 2 finished with value: 0.6668814878126674 and parameters: {'max_depth': 93, 'min_samples_split': 12, 'min_samples_leaf': 2, 'max_features': 'log2'}. Best is trial 2 with value: 0.6668814878126674.
[I 2024-05-18 15:57:02,677] Trial 3 finished with value: 0.6624035249766821 and parameters: {'max_depth': 79, 'min_samples_split': 3, 'min_samples_leaf': 9, 'max_features': 'sqrt'}. Best is tria

Best hyperparameters: {'max_depth': 92, 'min_samples_split': 8, 'min_samples_leaf': 2, 'max_features': 'sqrt'}
CPU times: user 12min 38s, sys: 679 ms, total: 12min 39s
Wall time: 12min 39s


Best hyperparameters: {'max_depth': 36, 'min_samples_split': 4, 'min_samples_leaf': 1, 'max_features': 'sqrt'}
CPU times: user 14min 21s, sys: 1.02 s, total: 14min 22s
Wall time: 14min 21s

### Validation

In [10]:
%%time

classifiers = []
auc_scores = []
best_params = {'max_depth': 36, 'min_samples_split': 4, 
               'min_samples_leaf': 1, 'max_features': 'sqrt'}

for i in range(y_train.shape[1]):  
    rf = RandomForestClassifier(**best_params, n_estimators=500, random_state=42)
    rf.fit(X, y.iloc[:, i])  
    classifiers.append(rf)


for i, clf in enumerate(classifiers):
    y_pred_prob = clf.predict_proba(X_t)[:, 1]
    auc = roc_auc_score(y_t.iloc[:, i], y_pred_prob)
    auc_scores.append(auc)

average_auc = np.mean(auc_scores)
print(f'Average AUC: {average_auc}')

Average AUC: 0.8823182876895908
CPU times: user 1min 43s, sys: 110 ms, total: 1min 43s
Wall time: 1min 43s


Average AUC: 0.8823182876895908
CPU times: user 2min 3s, sys: 129 ms, total: 2min 3s
Wall time: 2min 

Improved performance compared to 0.8812 using 500 estimators with default hyperparameters

## Trial 2 - narrowing down the trial objective range of parameters

In [11]:
def objective(trial):
    # n_estimators = trial.suggest_int('n_estimators', 100, 1000)
    max_depth = trial.suggest_int('max_depth', 30, 50)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 8)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 4)
    # max_features = trial.suggest_categorical('max_features', ['sqrt', 'log2'])

    scores = []
    # Train a classifier for each label
    for i in range(y_train.shape[1]):
        X_train_opt, X_test_opt, y_train_opt, y_test_opt = train_test_split(X, y.iloc[:, i], test_size=0.2, random_state=42)
        
        clf = RandomForestClassifier(
            n_estimators=100,
            max_depth=max_depth,
            min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf,
            # max_features=max_features,                      
            random_state=42
        )
        clf.fit(X_train_opt, y_train_opt)
        y_pred_opt = clf.predict(X_test_opt)
        score = roc_auc_score(y_test_opt, y_pred_opt)
        scores.append(score)
    
    # Average score across all classifiers
    average_score = np.mean(scores)
    return average_score

In [12]:
%%time
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)
print('Best hyperparameters:', study.best_params)

[I 2024-05-18 16:10:27,220] A new study created in memory with name: no-name-f0c3f0ef-02cb-4604-8a5d-21f8b314d153
[I 2024-05-18 16:10:42,629] Trial 0 finished with value: 0.6667830007286936 and parameters: {'max_depth': 42, 'min_samples_split': 7, 'min_samples_leaf': 4}. Best is trial 0 with value: 0.6667830007286936.
[I 2024-05-18 16:10:58,840] Trial 1 finished with value: 0.6699642409088071 and parameters: {'max_depth': 44, 'min_samples_split': 4, 'min_samples_leaf': 2}. Best is trial 1 with value: 0.6699642409088071.
[I 2024-05-18 16:11:14,391] Trial 2 finished with value: 0.6639741561469414 and parameters: {'max_depth': 31, 'min_samples_split': 6, 'min_samples_leaf': 3}. Best is trial 1 with value: 0.6699642409088071.
[I 2024-05-18 16:11:29,948] Trial 3 finished with value: 0.6655028698925082 and parameters: {'max_depth': 44, 'min_samples_split': 2, 'min_samples_leaf': 3}. Best is trial 1 with value: 0.6699642409088071.
[I 2024-05-18 16:11:46,444] Trial 4 finished with value: 0.667

Best hyperparameters: {'max_depth': 31, 'min_samples_split': 3, 'min_samples_leaf': 2}
CPU times: user 13min 8s, sys: 320 ms, total: 13min 9s
Wall time: 13min 8s


Best hyperparameters: {'max_depth': 35, 'min_samples_split': 4, 'min_samples_leaf': 1} CPU times: user 15min 46s, sys: 1.01 s, total: 15min 47s Wall time: 15min 46s2s

### Validation 

In [13]:
%%time

classifiers = []
auc_scores = []
best_params = {'max_depth': 35, 'min_samples_split': 4, 'min_samples_leaf': 1}


for i in range(y_train.shape[1]):  
    rf = RandomForestClassifier(**best_params, n_estimators=500, random_state=42, oob_score=roc_auc_score)
    rf.fit(X, y.iloc[:, i])  
    classifiers.append(rf)


for i, clf in enumerate(classifiers):
    # Predict the probability of the positive class
    y_pred_prob = clf.predict_proba(X_t)[:, 1]
    # Calculate the AUC score
    auc = roc_auc_score(y_t.iloc[:, i], y_pred_prob)
    auc_scores.append(auc)

# Calculate the average AUC score across all labels
average_auc = np.mean(auc_scores)
print(f'Average AUC: {average_auc}')

Average AUC: 0.882186685671311
CPU times: user 1min 48s, sys: 103 ms, total: 1min 48s
Wall time: 1min 48s
