# Imports and Setup

In [1]:
### NOTEBOOK CONFIGURATION

# Location of the training CSV on the mounted Google Drive.
# NOTE(review): the original note says the data is expressed as Z-scores of a
# given distribution -- confirm against the CSV itself.
TRAIN_FILENAME = "/content/drive/MyDrive/Reservations_Final1.csv"

# Name of the target ('Y') column in the training data.
# NOTE(review): the original note says this column is translated to Z-scores -- confirm.
TARGET_COL = "canceled"

# Metric treated as the most important one when ranking results.
METRIC = "AUC"

In [None]:
!pip install pycaret

import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.utils import resample
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

from google.colab import drive

In [4]:
# Mount Google Drive (forcing a remount) so the training CSV is reachable under /content/drive
drive.mount('/content/drive', force_remount=True)

# Read the training CSV into the working DataFrame
path = TRAIN_FILENAME
df = pd.read_csv(filepath_or_buffer=path)

Mounted at /content/drive


# Undersampling

In [None]:
# Separate input features and target
y = df[TARGET_COL]
X = df.drop(columns=TARGET_COL)

# Hold out 30% of the rows for testing (fixed seed for a reproducible split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=27)

# Re-attach the target so whole rows can be filtered and resampled together
train = pd.concat([X_train, y_train], axis=1)
test = pd.concat([X_test, y_test], axis=1)

# Balance on complete rows only: rows with missing values are dropped before model
# fitting anyway, and dropping them *after* balancing would leave the two classes
# unequal in size again.
train_complete = train.dropna()

# Split the training rows by class
not_canc = train_complete[train_complete[TARGET_COL] == 0]
canc = train_complete[train_complete[TARGET_COL] == 1]

# Resample the not-canceled rows to the size of the canceled class.
# Draw without replacement whenever enough rows exist (true undersampling): sampling
# with replacement would duplicate some rows while discarding others, and duplicated
# rows can end up on both sides of a cross-validation split later on.
# Replacement is only used as a fallback when not_canc has fewer rows than canc.
# NOTE: `upsampled` keeps its original name because other cells may reference it.
upsampled = resample(not_canc,
                     replace=len(not_canc) < len(canc),
                     n_samples=len(canc),  # match the size of the canceled class
                     random_state=27)  # reproducible results

# Combine the canceled rows with the resampled not-canceled rows
newdf = pd.concat([canc, upsampled])

# Hyperparameter Optimization

In [None]:
# Drop rows with missing values before fitting
newdf = newdf.dropna()
test = test.dropna()

# Report how many training and test rows remain
print(len(newdf), len(test))

# Split features and target again, now from the balanced training frame
y_train = newdf[TARGET_COL]
X_train = newdf.drop(columns=TARGET_COL)

y_test = test[TARGET_COL]
X_test = test.drop(columns=TARGET_COL)

# Hyperparameter grid for the random forest.
# For min_samples_split / min_samples_leaf an int is an absolute sample count,
# while a float is interpreted by scikit-learn as a fraction of the samples.
hp_grid = {'n_estimators': [64, 80, 96, 100, 112, 128],
           'max_features': ["sqrt", "log2"],
           'min_samples_split': [2, 0.1, 0.2, 0.3, 0.4, 0.5],
           'min_samples_leaf': [1, 0.1, 0.125, 0.15, 0.175, 0.2]}

# One grid search per scoring metric.
# The forest is seeded (same seed as the split and the resampling) so the selected
# hyperparameters are reproducible across runs; n_jobs=-1 runs the candidate fits
# in parallel on all available cores.
tuned_rf_auc = GridSearchCV(estimator=RandomForestClassifier(random_state=27),
                            param_grid=hp_grid, scoring='roc_auc', n_jobs=-1)
tuned_rf_accuracy = GridSearchCV(estimator=RandomForestClassifier(random_state=27),
                                 param_grid=hp_grid, scoring='accuracy', n_jobs=-1)
tuned_rf_recall = GridSearchCV(estimator=RandomForestClassifier(random_state=27),
                               param_grid=hp_grid, scoring='recall', n_jobs=-1)

In [None]:
# Grid search scored on ROC AUC: fit on the training data, show the winning
# parameter combination, then report classification metrics on the test set
print("AUC")
tuned_rf_auc.fit(X=X_train, y=y_train)
print(tuned_rf_auc.best_params_)
pred_auc = tuned_rf_auc.predict(X=X_test)
print(classification_report(y_true=y_test, y_pred=pred_auc))

In [None]:
# Grid search scored on accuracy: fit on the training data, show the winning
# parameter combination, then report classification metrics on the test set
print("Accuracy")
tuned_rf_accuracy.fit(X=X_train, y=y_train)
print(tuned_rf_accuracy.best_params_)
pred_accuracy = tuned_rf_accuracy.predict(X=X_test)
print(classification_report(y_true=y_test, y_pred=pred_accuracy))

In [None]:
# Grid search scored on recall: fit on the training data, show the winning
# parameter combination, then report classification metrics on the test set
print("Recall")
tuned_rf_recall.fit(X=X_train, y=y_train)
print(tuned_rf_recall.best_params_)
pred_recall = tuned_rf_recall.predict(X=X_test)
print(classification_report(y_true=y_test, y_pred=pred_recall))