In [1]:

import pandas as pd
import numpy as np
from pathlib import Path
import os
import optuna
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
path = Path(os.path.abspath(''))
data_path = path.resolve().parents[0]/'../data/'
data_path
# this notebook should be in data_processing, the pkl files should be in data

PosixPath('/Users/yangliran/Desktop/DSA4266/Project2/dsa4266_wooper/data_processing/../data')

In [8]:
# perform various encodings based on train_df
# https://github.com/scikit-learn-contrib/category_encoders
# Generalized Linear Mixed Model Encoder
# Target Encoder
# Leave One Out Encoder
# James Stein
# Weight of evidence
# M-estimate 
train_df = pd.read_pickle(data_path/'train.pkl')
validation_df = pd.read_pickle(data_path/'validation.pkl')
# OHE is the baseline to compare with
train_df_OHE = pd.read_pickle(data_path/'train_OHE.pkl')
validation_df_OHE = pd.read_pickle(data_path/'validation_OHE.pkl')

X_train = train_df.drop(['label'], axis=1)
y_train = train_df['label']
X_valid = validation_df.drop(['label'], axis=1)
y_valid = validation_df['label']

##Below is the Objective and study optimizer, rerun it evertime to try a new encoder

In [None]:
def objective(trial, X_train, y_train, X_valid, y_valid):
    n_estimators = trial.suggest_int('n_estimators', 100,400)
    min_samples_split = trial.suggest_float('min_samples_split',0.0,1.0)
    min_samples_leaf = trial.suggest_float('min_samples_leaf', 0.0,1.0)
    max_features = trial.suggest_categorical('max_features', ['sqrt', 'log2'])
    rf = RandomForestClassifier(n_estimators= n_estimators, min_samples_split = min_samples_split, min_samples_leaf= min_samples_leaf, random_state=42, n_jobs= -1, max_features= max_features)
    rf.fit(X_train, y_train)
    rf_probs = rf.predict_proba(X_valid)
    true_probs = [entry[1] for entry in rf_probs]
    return roc_auc_score(y_true=y_valid,y_score=true_probs)

In [None]:
# name study after encoding method used
# create a new study for each encoding method 
study = optuna.create_study(study_name = 'label_encoder',pruner = optuna.pruners.HyperbandPruner(
        min_resource=1, reduction_factor=3
    ), direction = "maximize", )    

'''
Example run of study to optimize params 
-----------

study.optimize(lambda trial: objective(trial, X_train, y_train, X_valid, y_valid), n_trials= 60)

#these 2 should be recorded for each study:
#study.best_params
#study.best_value
'''

[I 2023-11-01 22:30:04,113] A new study created in memory with name: label_encoder


##One Hot Encoder

In [28]:
##One HOt's Results
study.optimize(lambda trial: objective(trial, train_df_OHE.drop("label",axis=1),train_df_OHE["label"], validation_df_OHE.drop("label",axis=1),validation_df_OHE["label"]), n_trials= 60)
#Best is trial 31 with value: 0.8629663021197282.

[I 2023-11-01 16:59:58,371] Trial 0 finished with value: 0.5 and parameters: {'n_estimators': 149, 'min_samples_split': 0.6563482542554723, 'min_samples_leaf': 0.15326087971968738, 'max_features': 'log2'}. Best is trial 0 with value: 0.5.
[I 2023-11-01 17:00:00,112] Trial 1 finished with value: 0.5 and parameters: {'n_estimators': 147, 'min_samples_split': 0.3456083716568995, 'min_samples_leaf': 0.5546559515315573, 'max_features': 'sqrt'}. Best is trial 0 with value: 0.5.
[I 2023-11-01 17:00:02,575] Trial 2 finished with value: 0.5 and parameters: {'n_estimators': 214, 'min_samples_split': 0.5087600334141884, 'min_samples_leaf': 0.8032553174231851, 'max_features': 'sqrt'}. Best is trial 0 with value: 0.5.
[I 2023-11-01 17:00:06,224] Trial 3 finished with value: 0.5 and parameters: {'n_estimators': 372, 'min_samples_split': 0.6475955285958578, 'min_samples_leaf': 0.48349455729801716, 'max_features': 'sqrt'}. Best is trial 0 with value: 0.5.
[I 2023-11-01 17:00:07,445] Trial 4 finished w

##Count Encoding

In [35]:
# insert encoding steps here

#train_df.columns
#start with count encoding: 
'''
Count encoding for categorical features.
For a given categorical feature, replace the names of the groups with the group counts.
'''
import pandas as pd
from count import CountEncoder

enc=CountEncoder(cols=['sequence', 'm1_seq', 'p1_seq']).fit(X_train,y_train)
count_X_train=enc.transform(X_train)
count_X_validation=CountEncoder(cols=['sequence', 'm1_seq', 'p1_seq']).fit_transform(X_valid,y_valid)

##Count Encoding's Results
study.optimize(lambda trial: objective(trial, count_X_train, y_train, count_X_validation,y_valid), n_trials= 60)
count_best_params=study.best_params
count_best_value=study.best_value
count_best_value

#Best is trial 13 with value: 0.8196568600225005.




[I 2023-11-01 17:20:47,333] Trial 0 finished with value: 0.5 and parameters: {'n_estimators': 117, 'min_samples_split': 0.4672112300669361, 'min_samples_leaf': 0.796829116606089, 'max_features': 'log2'}. Best is trial 0 with value: 0.5.
[I 2023-11-01 17:20:56,755] Trial 1 finished with value: 0.7713252854871397 and parameters: {'n_estimators': 243, 'min_samples_split': 0.07162120984005937, 'min_samples_leaf': 0.18830043100233151, 'max_features': 'sqrt'}. Best is trial 1 with value: 0.7713252854871397.
[I 2023-11-01 17:20:58,672] Trial 2 finished with value: 0.5 and parameters: {'n_estimators': 185, 'min_samples_split': 0.9206712941073871, 'min_samples_leaf': 0.8054461103217903, 'max_features': 'log2'}. Best is trial 1 with value: 0.7713252854871397.
[I 2023-11-01 17:21:00,644] Trial 3 finished with value: 0.5 and parameters: {'n_estimators': 190, 'min_samples_split': 0.87083417957451, 'min_samples_leaf': 0.8003897849542819, 'max_features': 'sqrt'}. Best is trial 1 with value: 0.7713252

0.8196568600225005

##Backward Difference Encoder

In [40]:
#Backward Difference Contrast encoding
from backward_difference import BackwardDifferenceEncoder

enc=BackwardDifferenceEncoder(cols=['sequence', 'm1_seq', 'p1_seq']).fit(X_train,y_train)
BackwardDC_X_train=enc.transform(X_train)
BackwardDC_X_validation=BackwardDifferenceEncoder(cols=['sequence', 'm1_seq', 'p1_seq']).fit_transform(X_valid,y_valid)

##Count Encoding's Results
study.optimize(lambda trial: objective(trial, BackwardDC_X_train, y_train, BackwardDC_X_validation,y_valid), n_trials= 60)
BackwardDC_best_params=study.best_params
BackwardDC_best_value=study.best_value
BackwardDC_best_value






[I 2023-11-01 21:30:42,636] Trial 0 finished with value: 0.5 and parameters: {'n_estimators': 215, 'min_samples_split': 0.04173264809932142, 'min_samples_leaf': 0.6544912748111981, 'max_features': 'sqrt'}. Best is trial 0 with value: 0.5.
[I 2023-11-01 21:30:43,983] Trial 1 finished with value: 0.5 and parameters: {'n_estimators': 152, 'min_samples_split': 0.7149978743366856, 'min_samples_leaf': 0.20948191249869064, 'max_features': 'log2'}. Best is trial 0 with value: 0.5.
[I 2023-11-01 21:30:46,938] Trial 2 finished with value: 0.5 and parameters: {'n_estimators': 374, 'min_samples_split': 0.747349574852959, 'min_samples_leaf': 0.14362523892914347, 'max_features': 'log2'}. Best is trial 0 with value: 0.5.
[I 2023-11-01 21:30:51,689] Trial 3 finished with value: 0.6770849505878949 and parameters: {'n_estimators': 362, 'min_samples_split': 0.09647831442818222, 'min_samples_leaf': 0.29872181228667927, 'max_features': 'log2'}. Best is trial 3 with value: 0.6770849505878949.
[I 2023-11-01 

0.7447328413865615

##Helmert Encoder

In [43]:
#Helmert encoder
from helmert import HelmertEncoder

enc=HelmertEncoder(cols=['sequence', 'm1_seq', 'p1_seq']).fit(X_train,y_train)
Helmert_X_train=enc.transform(X_train)
Helmert_X_validation=HelmertEncoder(cols=['sequence', 'm1_seq', 'p1_seq']).fit_transform(X_valid,y_valid)

##Count Encoding's Results
study.optimize(lambda trial: objective(trial, Helmert_X_train, y_train, Helmert_X_validation,y_valid), n_trials= 60)
Helmert_best_params=study.best_params
Helmert_best_value=study.best_value
Helmert_best_value

#Best is trial 28 with value: 0.7584990629072738.


[I 2023-11-01 22:01:04,602] Trial 0 finished with value: 0.5 and parameters: {'n_estimators': 336, 'min_samples_split': 0.3941660497255557, 'min_samples_leaf': 0.3639684337275443, 'max_features': 'sqrt'}. Best is trial 0 with value: 0.5.
[I 2023-11-01 22:01:08,995] Trial 1 finished with value: 0.5 and parameters: {'n_estimators': 396, 'min_samples_split': 0.4861562494061705, 'min_samples_leaf': 0.3700159993258436, 'max_features': 'sqrt'}. Best is trial 0 with value: 0.5.
[I 2023-11-01 22:01:12,116] Trial 2 finished with value: 0.5 and parameters: {'n_estimators': 283, 'min_samples_split': 0.5733549108186827, 'min_samples_leaf': 0.5140082038446863, 'max_features': 'log2'}. Best is trial 0 with value: 0.5.
[I 2023-11-01 22:01:13,822] Trial 3 finished with value: 0.5 and parameters: {'n_estimators': 140, 'min_samples_split': 0.08845895311957175, 'min_samples_leaf': 0.519157524969812, 'max_features': 'log2'}. Best is trial 0 with value: 0.5.
[I 2023-11-01 22:01:22,411] Trial 4 finished wit

0.7584990629072738

##Leave One Out Encoding

In [46]:
#Leave one out encoding 
from leave_one_out import LeaveOneOutEncoder

enc=LeaveOneOutEncoder(cols=['sequence', 'm1_seq', 'p1_seq']).fit(X_train,y_train)
LOO_X_train=enc.transform(X_train)
LOO_X_validation=LeaveOneOutEncoder(cols=['sequence', 'm1_seq', 'p1_seq']).fit_transform(X_valid,y_valid)

##Count Encoding's Results
study.optimize(lambda trial: objective(trial, LOO_X_train, y_train, LOO_X_validation,y_valid), n_trials= 60)
LOO_best_params=study.best_params
LOO_best_value=study.best_value
LOO_best_value




[I 2023-11-01 22:30:13,438] Trial 0 finished with value: 0.5 and parameters: {'n_estimators': 236, 'min_samples_split': 0.31083294735728484, 'min_samples_leaf': 0.32420480311207334, 'max_features': 'sqrt'}. Best is trial 0 with value: 0.5.
[I 2023-11-01 22:30:22,047] Trial 1 finished with value: 0.7803918029250726 and parameters: {'n_estimators': 143, 'min_samples_split': 0.3325012113669469, 'min_samples_leaf': 0.041896038406206926, 'max_features': 'sqrt'}. Best is trial 1 with value: 0.7803918029250726.
[I 2023-11-01 22:30:23,330] Trial 2 finished with value: 0.5 and parameters: {'n_estimators': 112, 'min_samples_split': 0.7052503079265777, 'min_samples_leaf': 0.6317970840342333, 'max_features': 'log2'}. Best is trial 1 with value: 0.7803918029250726.
[I 2023-11-01 22:30:27,601] Trial 3 finished with value: 0.5 and parameters: {'n_estimators': 397, 'min_samples_split': 0.053554546240364376, 'min_samples_leaf': 0.7727768398134881, 'max_features': 'sqrt'}. Best is trial 1 with value: 0.

0.8412413152622763