In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import os
import optuna
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
path = Path(os.path.abspath(''))
data_path = path.resolve().parents[0]/'data'
data_path
# this notebook should be in data_processing, the pkl files should be in data

WindowsPath('C:/Users/dongm/Documents/GitHub/dsa4266_wooper/data')

In [4]:
# perform various encodings based on train_df
# https://github.com/scikit-learn-contrib/category_encoders
# Generalized Linear Mixed Model Encoder
# Target Encoder
# Leave One Out Encoder
# James Stein
# Weight of evidence
# M-estimate
train_df = pd.read_pickle(data_path/'train.pkl')
validation_df = pd.read_pickle(data_path/'validation.pkl')
# OHE is the baseline to compare with
train_df_OHE = pd.read_pickle(data_path/'train_OHE.pkl')
validation_df_OHE = pd.read_pickle(data_path/'validation_OHE.pkl')
X_train = train_df.drop(['label'], axis=1)
y_train = train_df['label']
X_valid = validation_df.drop(['label'], axis=1)
y_valid = validation_df['label']
def objective(trial, X_train, y_train, X_valid, y_valid):
    n_estimators = trial.suggest_int('n_estimators', 100,400)
    min_samples_split = trial.suggest_float('min_samples_split',0.0,1.0)
    min_samples_leaf = trial.suggest_float('min_samples_leaf', 0.0,1.0)
    max_features = trial.suggest_categorical('max_features', ['sqrt', 'log2'])
    rf = RandomForestClassifier(n_estimators= n_estimators, min_samples_split = min_samples_split, min_samples_leaf= min_samples_leaf, random_state=42, n_jobs= -1, max_features= max_features)
    rf.fit(X_train, y_train)
    rf_probs = rf.predict_proba(X_valid)
    true_probs = [entry[1] for entry in rf_probs]
    return roc_auc_score(y_true=y_valid,y_score=true_probs)

In [7]:
# name study after encoding method used
# create a new study for each encoding method 
study = optuna.create_study(study_name = 'label_encoder',pruner = optuna.pruners.HyperbandPruner(
        min_resource=1, reduction_factor=3
    ), direction = "maximize", )
#study.optimize(lambda trial: objective(trial, X_train, y_train, X_valid, y_valid), n_trials= 60)
# these 2 should be recorded for each study
#study.best_params
#study.best_value    

[I 2023-11-02 03:00:23,215] A new study created in memory with name: label_encoder


Polynomial Encoder

In [10]:
import pandas as pd
from Dongmen_Encoder_Exp.polynomial import PolynomialEncoder

enc=PolynomialEncoder(cols=['sequence', 'm1_seq', 'p1_seq']).fit(X_train,y_train)
polynomial_X_train=enc.transform(X_train)
polynomial_X_validation=PolynomialEncoder(cols=['sequence', 'm1_seq', 'p1_seq']).fit_transform(X_valid,y_valid)

##Count Encoding's Results
study.optimize(lambda trial: objective(trial, polynomial_X_train, y_train, polynomial_X_validation,y_valid), n_trials= 60)
polynomial_best_params=study.best_params
polynomial_best_value=study.best_value
polynomial_best_value
#best result:0.6563492391530387

[I 2023-11-02 03:11:28,396] Trial 0 finished with value: 0.5637307011210237 and parameters: {'n_estimators': 176, 'min_samples_split': 0.5930264847966491, 'min_samples_leaf': 0.04322578889509421, 'max_features': 'sqrt'}. Best is trial 0 with value: 0.5637307011210237.
[I 2023-11-02 03:11:31,683] Trial 1 finished with value: 0.5 and parameters: {'n_estimators': 352, 'min_samples_split': 0.7649948974824452, 'min_samples_leaf': 0.4022096364477704, 'max_features': 'log2'}. Best is trial 0 with value: 0.5637307011210237.
[I 2023-11-02 03:11:40,877] Trial 2 finished with value: 0.5751156027553497 and parameters: {'n_estimators': 305, 'min_samples_split': 0.1917928821384235, 'min_samples_leaf': 0.1358745109748808, 'max_features': 'sqrt'}. Best is trial 2 with value: 0.5751156027553497.
[I 2023-11-02 03:11:42,327] Trial 3 finished with value: 0.5 and parameters: {'n_estimators': 107, 'min_samples_split': 0.467997065825871, 'min_samples_leaf': 0.39319765602854706, 'max_features': 'sqrt'}. Best 

0.6563492391530387

Rankhot Encoder

In [12]:
import pandas as pd
from Dongmen_Encoder_Exp.rankhot import RankHotEncoder
#assign ranks to each nucleotide and then use a one-hot encoding approach
enc=RankHotEncoder(cols=['sequence', 'm1_seq', 'p1_seq']).fit(X_train,y_train)
rank_X_train=enc.transform(X_train)
rank_X_validation=RankHotEncoder(cols=['sequence', 'm1_seq', 'p1_seq']).fit_transform(X_valid,y_valid)

##Count Encoding's Results
study.optimize(lambda trial: objective(trial, rank_X_train, y_train, rank_X_validation,y_valid), n_trials= 60)
rank_best_params=study.best_params
rank_best_value=study.best_value
rank_best_value
#best result:0.8576752479646811

[I 2023-11-02 03:24:32,743] Trial 74 finished with value: 0.8496969141168401 and parameters: {'n_estimators': 222, 'min_samples_split': 0.07549267368672152, 'min_samples_leaf': 0.00395279368785505, 'max_features': 'sqrt'}. Best is trial 72 with value: 0.8513686332912684.
[I 2023-11-02 03:24:44,728] Trial 75 finished with value: 0.8460822936619983 and parameters: {'n_estimators': 215, 'min_samples_split': 0.09052269424971494, 'min_samples_leaf': 0.009337855356464982, 'max_features': 'sqrt'}. Best is trial 72 with value: 0.8513686332912684.
[I 2023-11-02 03:24:55,642] Trial 76 finished with value: 0.8457447852824129 and parameters: {'n_estimators': 208, 'min_samples_split': 0.0908197503800126, 'min_samples_leaf': 0.007210264364580917, 'max_features': 'sqrt'}. Best is trial 72 with value: 0.8513686332912684.
[I 2023-11-02 03:25:09,084] Trial 77 finished with value: 0.8499149447883825 and parameters: {'n_estimators': 225, 'min_samples_split': 0.07537567438517338, 'min_samples_leaf': 0.0017

0.8576752479646811

Target Encoder

In [13]:
import pandas as pd
from Dongmen_Encoder_Exp.target_encoder import TargetEncoder
enc=TargetEncoder(cols=['sequence', 'm1_seq', 'p1_seq']).fit(X_train,y_train)
target_X_train=enc.transform(X_train)
target_X_validation=TargetEncoder(cols=['sequence', 'm1_seq', 'p1_seq']).fit_transform(X_valid,y_valid)

##Count Encoding's Results
study.optimize(lambda trial: objective(trial, target_X_train, y_train, target_X_validation,y_valid), n_trials= 60)
target_best_params=study.best_params
target_best_value=study.best_value
target_best_value
#best result:0.8481842617504712

[I 2023-11-02 03:33:08,348] Trial 134 finished with value: 0.8101093640815135 and parameters: {'n_estimators': 141, 'min_samples_split': 0.028831907067002525, 'min_samples_leaf': 0.06002447547037434, 'max_features': 'sqrt'}. Best is trial 107 with value: 0.8576752479646811.
[I 2023-11-02 03:33:13,674] Trial 135 finished with value: 0.8017005229894485 and parameters: {'n_estimators': 111, 'min_samples_split': 0.02232648932172857, 'min_samples_leaf': 0.04137170876708874, 'max_features': 'sqrt'}. Best is trial 107 with value: 0.8576752479646811.
[I 2023-11-02 03:33:21,056] Trial 136 finished with value: 0.8053539898450406 and parameters: {'n_estimators': 141, 'min_samples_split': 0.0897143122845134, 'min_samples_leaf': 0.021084678455162546, 'max_features': 'sqrt'}. Best is trial 107 with value: 0.8576752479646811.
[I 2023-11-02 03:33:26,728] Trial 137 finished with value: 0.801764847204157 and parameters: {'n_estimators': 178, 'min_samples_split': 0.03822899369671011, 'min_samples_leaf': 

0.8576752479646811

Woe encoder

In [14]:
import pandas as pd
from Dongmen_Encoder_Exp.woe import WOEEncoder
enc=WOEEncoder(cols=['sequence', 'm1_seq', 'p1_seq']).fit(X_train,y_train)
woe_X_train=enc.transform(X_train)
woe_X_validation=WOEEncoder(cols=['sequence', 'm1_seq', 'p1_seq']).fit_transform(X_valid,y_valid)

##Count Encoding's Results
study.optimize(lambda trial: objective(trial, woe_X_train, y_train, woe_X_validation,y_valid), n_trials= 60)
woe_best_params=study.best_params
woe_best_value=study.best_value
woe_best_value
#best result:0.898842357882881

[I 2023-11-02 03:42:15,484] Trial 194 finished with value: 0.8475322363772809 and parameters: {'n_estimators': 231, 'min_samples_split': 0.059416698727336834, 'min_samples_leaf': 0.045502485176644104, 'max_features': 'sqrt'}. Best is trial 107 with value: 0.8576752479646811.
[I 2023-11-02 03:42:24,741] Trial 195 finished with value: 0.8402348285914676 and parameters: {'n_estimators': 238, 'min_samples_split': 0.1802793614899026, 'min_samples_leaf': 0.021668595328057323, 'max_features': 'sqrt'}. Best is trial 107 with value: 0.8576752479646811.
[I 2023-11-02 03:42:26,216] Trial 196 finished with value: 0.5 and parameters: {'n_estimators': 225, 'min_samples_split': 0.09991215763448531, 'min_samples_leaf': 0.40487740260622956, 'max_features': 'sqrt'}. Best is trial 107 with value: 0.8576752479646811.
[I 2023-11-02 03:42:32,220] Trial 197 finished with value: 0.8450037651623669 and parameters: {'n_estimators': 152, 'min_samples_split': 0.038427114355494583, 'min_samples_leaf': 0.0490935323

0.898842357882881