In [5]:

import pandas as pd
import numpy as np
from pathlib import Path
import os
import optuna
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Absolute path of the directory the notebook is executed from.
path = Path(os.path.abspath(''))
# The pickled datasets live in the sibling `data` directory, one level above
# this notebook's folder.
data_path = path.resolve().parent.joinpath('data')

# Expected layout: this notebook in data_processing/, the .pkl files in data/.

In [7]:
# Encodings to be fitted on train_df and compared against each other
# (implementations: https://github.com/scikit-learn-contrib/category_encoders):
#   - Generalized Linear Mixed Model Encoder
#   - Target Encoder
#   - Leave One Out Encoder
#   - James Stein
#   - Weight of evidence
#   - M-estimate
train_df = pd.read_pickle(data_path.joinpath('train.pkl'))
validation_df = pd.read_pickle(data_path.joinpath('validation.pkl'))
# The one-hot-encoded frames are the baseline every other encoding is compared with.
train_df_OHE = pd.read_pickle(data_path.joinpath('train_OHE.pkl'))
validation_df_OHE = pd.read_pickle(data_path.joinpath('validation_OHE.pkl'))

In [None]:
# insert encoding steps here




In [None]:
# Separate the 'label' column (target) from the remaining columns (features)
# for both the training and the validation frame.
X_train, y_train = train_df.drop(columns=['label']), train_df['label']
X_valid, y_valid = validation_df.drop(columns=['label']), validation_df['label']


In [None]:
def objective(trial, X_train, y_train, X_valid, y_valid):
    """Optuna objective: fit a random forest and score it on the validation data.

    Parameters
    ----------
    trial
        Optuna trial used to sample ``n_estimators``, ``min_samples_split``,
        ``min_samples_leaf`` and ``max_features``.
    X_train, y_train
        Features and labels the forest is fitted on.
    X_valid, y_valid
        Features and labels the fitted forest is scored on.

    Returns
    -------
    The value of ``roc_auc_score`` computed from ``y_valid`` and the second
    column of ``predict_proba(X_valid)`` (the probability of ``classes_[1]``).
    """
    n_estimators = trial.suggest_int('n_estimators', 100, 400)
    # suggest_float samples from a closed interval, but scikit-learn rejects a
    # fraction of exactly 0.0, so the lower bound is a small positive value.
    min_samples_split = trial.suggest_float('min_samples_split', 1e-4, 1.0)
    # A leaf fraction above 0.5 makes every split impossible (both children
    # would each need more than half of the samples), so the range stops there.
    min_samples_leaf = trial.suggest_float('min_samples_leaf', 1e-4, 0.5)
    max_features = trial.suggest_categorical('max_features', ['sqrt', 'log2'])

    rf = RandomForestClassifier(
        n_estimators=n_estimators,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        max_features=max_features,
        random_state=42,  # fixed seed so a given parameter set always scores the same
        n_jobs=-1,
    )
    rf.fit(X_train, y_train)

    # Column 1 of predict_proba holds the probability of the second class.
    positive_probs = rf.predict_proba(X_valid)[:, 1]
    return roc_auc_score(y_true=y_valid, y_score=positive_probs)

In [None]:
# Name the study after the encoding method used;
# create a new study for each encoding method.
# NOTE(review): the Hyperband pruner can only stop trials that report
# intermediate values; `objective` in this notebook never calls trial.report,
# so as written no trial is pruned -- confirm whether pruning is intended.
study = optuna.create_study(
    study_name='label_encoder',
    # Seed the sampler (TPE is Optuna's default for single-objective studies)
    # so that re-running the notebook explores the same parameter sequence.
    sampler=optuna.samplers.TPESampler(seed=42),
    pruner=optuna.pruners.HyperbandPruner(min_resource=1, reduction_factor=3),
    direction="maximize",
)

In [None]:
def _run_trial(trial):
    """Evaluate one Optuna trial on the notebook's train/validation split."""
    return objective(trial, X_train, y_train, X_valid, y_valid)


# Run 60 trials of the hyper-parameter search.
study.optimize(_run_trial, n_trials=60)


In [None]:
# These 2 should be recorded for each study.
# A notebook cell only renders its final expression, so both values are
# emitted together instead of as two separate expression lines.
{'best_params': study.best_params, 'best_value': study.best_value}