In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import os
import optuna
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
path = Path(os.path.abspath(''))
data_path = path.resolve().parents[0]/'data'
data_path
# this notebook should be in data_processing, the pkl files should be in data

WindowsPath('C:/Users/65829/OneDrive/Desktop/Y4S1/DSA4266/Project2/dsa4266_wooper/data')

In [3]:
# perform various encodings based on train_df
# https://github.com/scikit-learn-contrib/category_encoders
# Generalized Linear Mixed Model Encoder
# Target Encoder
# Leave One Out Encoder
# James Stein
# Weight of evidence
# M-estimate
train_df = pd.read_pickle(data_path/'train.pkl')
validation_df = pd.read_pickle(data_path/'validation.pkl')
# OHE is the baseline to compare with
train_df_OHE = pd.read_pickle(data_path/'train_OHE.pkl')
validation_df_OHE = pd.read_pickle(data_path/'validation_OHE.pkl')
X_train = train_df.drop(['label'], axis=1)
y_train = train_df['label']
X_valid = validation_df.drop(['label'], axis=1)
y_valid = validation_df['label']
def objective(trial, X_train, y_train, X_valid, y_valid):
    n_estimators = trial.suggest_int('n_estimators', 100,400)
    min_samples_split = trial.suggest_float('min_samples_split',0.0,1.0)
    min_samples_leaf = trial.suggest_float('min_samples_leaf', 0.0,1.0)
    max_features = trial.suggest_categorical('max_features', ['sqrt', 'log2'])
    rf = RandomForestClassifier(n_estimators= n_estimators, min_samples_split = min_samples_split, min_samples_leaf= min_samples_leaf, random_state=42, n_jobs= -1, max_features= max_features)
    rf.fit(X_train, y_train)
    rf_probs = rf.predict_proba(X_valid)
    true_probs = [entry[1] for entry in rf_probs]
    return roc_auc_score(y_true=y_valid,y_score=true_probs)

In [7]:
# name study after encoding method used
# create a new study for each encoding method 
study = optuna.create_study(study_name = 'label_encoder',pruner = optuna.pruners.HyperbandPruner(
        min_resource=1, reduction_factor=3
    ), direction = "maximize", )
#study.optimize(lambda trial: objective(trial, X_train, y_train, X_valid, y_valid), n_trials= 60)
# these 2 should be recorded for each study
#study.best_params
#study.best_value    

[I 2023-11-02 03:00:23,215] A new study created in memory with name: label_encoder


Polynomial Encoder

In [10]:
import pandas as pd
from Dongmen_Encoder_Exp.polynomial import PolynomialEncoder

enc=PolynomialEncoder(cols=['sequence', 'm1_seq', 'p1_seq']).fit(X_train,y_train)
polynomial_X_train=enc.transform(X_train)
polynomial_X_validation=PolynomialEncoder(cols=['sequence', 'm1_seq', 'p1_seq']).fit_transform(X_valid,y_valid)

##Count Encoding's Results
study.optimize(lambda trial: objective(trial, polynomial_X_train, y_train, polynomial_X_validation,y_valid), n_trials= 60)
polynomial_best_params=study.best_params
polynomial_best_value=study.best_value
polynomial_best_value
#best result:0.6563492391530387

[I 2023-11-02 03:11:28,396] Trial 0 finished with value: 0.5637307011210237 and parameters: {'n_estimators': 176, 'min_samples_split': 0.5930264847966491, 'min_samples_leaf': 0.04322578889509421, 'max_features': 'sqrt'}. Best is trial 0 with value: 0.5637307011210237.
[I 2023-11-02 03:11:31,683] Trial 1 finished with value: 0.5 and parameters: {'n_estimators': 352, 'min_samples_split': 0.7649948974824452, 'min_samples_leaf': 0.4022096364477704, 'max_features': 'log2'}. Best is trial 0 with value: 0.5637307011210237.
[I 2023-11-02 03:11:40,877] Trial 2 finished with value: 0.5751156027553497 and parameters: {'n_estimators': 305, 'min_samples_split': 0.1917928821384235, 'min_samples_leaf': 0.1358745109748808, 'max_features': 'sqrt'}. Best is trial 2 with value: 0.5751156027553497.
[I 2023-11-02 03:11:42,327] Trial 3 finished with value: 0.5 and parameters: {'n_estimators': 107, 'min_samples_split': 0.467997065825871, 'min_samples_leaf': 0.39319765602854706, 'max_features': 'sqrt'}. Best 

0.6563492391530387

Rankhot Encoder

In [6]:
import pandas as pd
from Dongmen_Encoder_Exp.rankhot import RankHotEncoder
#assign ranks to each nucleotide and then use a one-hot encoding approach
study = optuna.create_study(study_name = 'rankhot_encoder',pruner = optuna.pruners.HyperbandPruner(
        min_resource=1, reduction_factor=3
    ), direction = "maximize", )
enc=RankHotEncoder(cols=['sequence', 'm1_seq', 'p1_seq']).fit(X_train,y_train)
rank_X_train=enc.transform(X_train, y_train)
rank_X_validation=enc.transform(X_valid)

##Count Encoding's Results
study.optimize(lambda trial: objective(trial, rank_X_train, y_train, rank_X_validation,y_valid), n_trials= 60, n_jobs = -1)
rank_best_params=study.best_params
rank_best_value=study.best_value
rank_best_value
#best result:0.8610708690098074

[I 2023-11-02 10:23:48,377] A new study created in memory with name: rankhot_encoder
[I 2023-11-02 10:25:18,291] Trial 4 finished with value: 0.5 and parameters: {'n_estimators': 183, 'min_samples_split': 0.9749149289109404, 'min_samples_leaf': 0.7573780966839984, 'max_features': 'log2'}. Best is trial 4 with value: 0.5.
[I 2023-11-02 10:25:20,050] Trial 5 finished with value: 0.5 and parameters: {'n_estimators': 193, 'min_samples_split': 0.26996606788636524, 'min_samples_leaf': 0.3413719957513729, 'max_features': 'log2'}. Best is trial 4 with value: 0.5.
[I 2023-11-02 10:25:20,833] Trial 3 finished with value: 0.5 and parameters: {'n_estimators': 219, 'min_samples_split': 0.8529480676474303, 'min_samples_leaf': 0.8153887287150288, 'max_features': 'log2'}. Best is trial 4 with value: 0.5.
[I 2023-11-02 10:25:22,032] Trial 7 finished with value: 0.5 and parameters: {'n_estimators': 239, 'min_samples_split': 0.7944592176567694, 'min_samples_leaf': 0.6531675310266504, 'max_features': 'sqr

0.8610708690098074

Target Encoder

In [8]:
import pandas as pd
from Dongmen_Encoder_Exp.target_encoder import TargetEncoder
enc=TargetEncoder(cols=['sequence', 'm1_seq', 'p1_seq']).fit(X_train,y_train)
target_X_train=enc.transform(X_train, y_train)
target_X_validation=enc.transform(X_valid)

study = optuna.create_study(study_name = 'target_encoder',pruner = optuna.pruners.HyperbandPruner(
        min_resource=1, reduction_factor=3
    ), direction = "maximize", )
##Count Encoding's Results
study.optimize(lambda trial: objective(trial, target_X_train, y_train, target_X_validation,y_valid), n_trials= 60, n_jobs = -1)
target_best_params=study.best_params
target_best_value=study.best_value
target_best_value
#best result:0.8451159450428376

[I 2023-11-02 11:12:49,730] A new study created in memory with name: target_encoder
[I 2023-11-02 11:13:00,787] Trial 2 finished with value: 0.5 and parameters: {'n_estimators': 111, 'min_samples_split': 0.3472721227732767, 'min_samples_leaf': 0.4908186036601687, 'max_features': 'sqrt'}. Best is trial 2 with value: 0.5.
[I 2023-11-02 11:13:05,063] Trial 6 finished with value: 0.5 and parameters: {'n_estimators': 186, 'min_samples_split': 0.9194043076804765, 'min_samples_leaf': 0.45335606555075836, 'max_features': 'sqrt'}. Best is trial 2 with value: 0.5.
[I 2023-11-02 11:13:05,862] Trial 1 finished with value: 0.5 and parameters: {'n_estimators': 206, 'min_samples_split': 0.39121474161664194, 'min_samples_leaf': 0.8923821730244467, 'max_features': 'sqrt'}. Best is trial 2 with value: 0.5.
[I 2023-11-02 11:13:05,927] Trial 7 finished with value: 0.5 and parameters: {'n_estimators': 210, 'min_samples_split': 0.8956393370664795, 'min_samples_leaf': 0.2836572229553319, 'max_features': 'sqr

0.8451159450428376

Woe encoder

In [10]:
import pandas as pd
from Dongmen_Encoder_Exp.woe import WOEEncoder
enc=WOEEncoder(cols=['sequence', 'm1_seq', 'p1_seq']).fit(X_train,y_train)
woe_X_train=enc.transform(X_train, y_train)
woe_X_validation=enc.transform(X_valid)


study = optuna.create_study(study_name = 'target_encoder',pruner = optuna.pruners.HyperbandPruner(
        min_resource=1, reduction_factor=3
    ), direction = "maximize", )

##Count Encoding's Results
study.optimize(lambda trial: objective(trial, woe_X_train, y_train, woe_X_validation,y_valid), n_trials= 60, n_jobs = -1)
woe_best_params=study.best_params
woe_best_value=study.best_value
woe_best_value
#best result:0.8447989093299819

[I 2023-11-02 11:31:22,291] A new study created in memory with name: target_encoder
[I 2023-11-02 11:31:35,010] Trial 1 finished with value: 0.5 and parameters: {'n_estimators': 107, 'min_samples_split': 0.8314763750628194, 'min_samples_leaf': 0.6226989601168583, 'max_features': 'sqrt'}. Best is trial 1 with value: 0.5.
[I 2023-11-02 11:31:38,010] Trial 2 finished with value: 0.5 and parameters: {'n_estimators': 154, 'min_samples_split': 0.6941817578257329, 'min_samples_leaf': 0.7740834361195589, 'max_features': 'log2'}. Best is trial 1 with value: 0.5.
[I 2023-11-02 11:31:40,140] Trial 3 finished with value: 0.5 and parameters: {'n_estimators': 177, 'min_samples_split': 0.17091405085194056, 'min_samples_leaf': 0.7235280567593415, 'max_features': 'sqrt'}. Best is trial 1 with value: 0.5.
[I 2023-11-02 11:31:43,425] Trial 0 finished with value: 0.80315269380253 and parameters: {'n_estimators': 142, 'min_samples_split': 0.41614907946287105, 'min_samples_leaf': 0.22813077215550448, 'max_f

0.8447989093299819