In [2]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
import pandas as pd
from sklearn.preprocessing import StandardScaler
from imblearn.pipeline import Pipeline
from sklearn.metrics import precision_recall_curve,auc, roc_auc_score
from sklearn.model_selection import RandomizedSearchCV

In [3]:
train_df = pd.read_csv('label_df_train.csv')

In [4]:
train_df

Unnamed: 0,transcript_id,transcript_position,five_mer,weighted_mean_neg1,weighted_mean_0,weighted_mean_1,weighted_sd_neg1,weighted_sd_0,weighted_sd_1,mean_25_neg1,...,5-mer_window1_GACTA,5-mer_window1_GACTC,5-mer_window1_GACTG,5-mer_window1_GACTT,A_freq,C_freq,G_freq,T_freq,gene_id,label
0,ENST00000000233,244,AAGACCA,123.762870,125.793483,80.775369,5.209304,8.511144,5.686085,123.0,...,0.0,0.0,0.0,0.0,4,2,1,0,ENSG00000004059,0
1,ENST00000000233,261,CAAACTG,109.924484,108.101783,94.108586,3.910096,3.735377,3.442957,108.0,...,0.0,0.0,0.0,0.0,3,2,1,1,ENSG00000004059,0
2,ENST00000000233,316,GAAACAG,105.450998,99.426169,89.309704,3.400684,3.910397,2.501662,105.0,...,0.0,0.0,0.0,0.0,4,1,2,0,ENSG00000004059,0
3,ENST00000000233,332,AGAACAT,129.548782,97.842815,89.096953,6.542432,3.132313,2.372598,128.0,...,0.0,0.0,0.0,0.0,4,1,1,1,ENSG00000004059,0
4,ENST00000000233,368,AGGACAA,118.217577,121.925694,84.996204,7.490865,6.224289,4.516338,116.0,...,0.0,0.0,0.0,0.0,4,1,2,0,ENSG00000004059,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
97035,ENST00000641834,1348,GGGACAT,117.736860,116.644041,81.604529,3.679951,5.318444,4.342628,117.0,...,0.0,0.0,0.0,0.0,2,1,3,1,ENSG00000167747,1
97036,ENST00000641834,1429,CTGACAC,111.417384,115.248207,80.183728,5.835194,9.597531,3.848341,110.0,...,0.0,0.0,0.0,0.0,2,3,1,1,ENSG00000167747,0
97037,ENST00000641834,1531,TGGACAC,113.627173,113.706528,84.184182,4.313860,5.075660,2.339143,113.0,...,0.0,0.0,0.0,0.0,2,2,2,1,ENSG00000167747,1
97038,ENST00000641834,1537,CTGACCA,109.836890,123.712610,82.354740,3.270741,7.163837,2.847985,108.0,...,0.0,0.0,0.0,0.0,2,3,1,1,ENSG00000167747,0


In [5]:
train_df = train_df.drop(columns = ['transcript_id', 'transcript_position','gene_id','five_mer'])

In [6]:
X_train, y_train = train_df[[i for i in train_df.columns if i!='label']],train_df['label']

In [7]:
scaler = StandardScaler()
smote = SMOTE()
mlp_classifier = MLPClassifier(verbose = 2)
pipeline = Pipeline([('scaler',scaler),('smote', smote),('mlp', mlp_classifier)])

In [8]:
params_grid = {
    "smote__sampling_strategy": [0.25,0.33,0.5,1],
    "mlp__hidden_layer_sizes":[(4,4),(8,8),(16,16),(32,32),(64,64)],
    "mlp__alpha":[1e-5,1e-4,1e-3,1e-2,1e-1],
    "mlp__learning_rate_init" :[1e-4,1e-3,1e-2,1e-1],
    "mlp__activation" : ['logistic', 'tanh', 'relu']

}

In [17]:
clf = RandomizedSearchCV(pipeline, n_iter=50, param_distributions=params_grid, scoring = ["average_precision","roc_auc"], n_jobs = -2, verbose = 4, refit = False)

In [18]:
clf.fit(X_train, y_train)

Fitting 5 folds for each of 50 candidates, totalling 250 fits


In [24]:
df_results = pd.DataFrame(clf.cv_results_)

In [29]:
df_results.columns

Index(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time',
       'param_smote__sampling_strategy', 'param_mlp__learning_rate_init',
       'param_mlp__hidden_layer_sizes', 'param_mlp__alpha',
       'param_mlp__activation', 'params', 'split0_test_average_precision',
       'split1_test_average_precision', 'split2_test_average_precision',
       'split3_test_average_precision', 'split4_test_average_precision',
       'mean_test_average_precision', 'std_test_average_precision',
       'rank_test_average_precision', 'split0_test_roc_auc',
       'split1_test_roc_auc', 'split2_test_roc_auc', 'split3_test_roc_auc',
       'split4_test_roc_auc', 'mean_test_roc_auc', 'std_test_roc_auc',
       'rank_test_roc_auc'],
      dtype='object')

In [77]:
df_results[df_results['rank_test_average_precision']==1][['mean_test_roc_auc','mean_test_average_precision']]

Unnamed: 0,mean_test_roc_auc,mean_test_average_precision
45,0.911728,0.45301


In [78]:
df_results[df_results['rank_test_average_precision']==1][['param_smote__sampling_strategy', 'param_mlp__learning_rate_init',
       'param_mlp__hidden_layer_sizes', 'param_mlp__alpha',
       'param_mlp__activation', 'params']]

Unnamed: 0,param_smote__sampling_strategy,param_mlp__learning_rate_init,param_mlp__hidden_layer_sizes,param_mlp__alpha,param_mlp__activation,params
45,0.33,0.001,"(16, 16)",0.1,logistic,"{'smote__sampling_strategy': 0.33, 'mlp__learn..."


In [46]:
df_results.to_csv('hyperparameter_results.csv')

In [56]:
new_scaler = StandardScaler()
new_smote = SMOTE(sampling_strategy=0.33)
new_mlp_classifier = MLPClassifier(verbose = 2, hidden_layer_sizes=(16,16),learning_rate_init=0.001, activation="logistic",alpha = 0.1)
new_pipeline = Pipeline([('scaler',new_scaler),('smote', new_smote),('mlp', new_mlp_classifier)])
new_pipeline

In [57]:
new_pipeline.fit(X_train, y_train)

Iteration 1, loss = 0.50319981
Iteration 2, loss = 0.40473220
Iteration 3, loss = 0.36631454
Iteration 4, loss = 0.35136028
Iteration 5, loss = 0.34363762
Iteration 6, loss = 0.33862319
Iteration 7, loss = 0.33574388
Iteration 8, loss = 0.33356770
Iteration 9, loss = 0.33162309
Iteration 10, loss = 0.33032831
Iteration 11, loss = 0.32881886
Iteration 12, loss = 0.32777715
Iteration 13, loss = 0.32640979
Iteration 14, loss = 0.32549122
Iteration 15, loss = 0.32432163
Iteration 16, loss = 0.32366104
Iteration 17, loss = 0.32271177
Iteration 18, loss = 0.32185730
Iteration 19, loss = 0.32140422
Iteration 20, loss = 0.32076744
Iteration 21, loss = 0.32053082
Iteration 22, loss = 0.31975793
Iteration 23, loss = 0.31900387
Iteration 24, loss = 0.31849174
Iteration 25, loss = 0.31814309
Iteration 26, loss = 0.31788074
Iteration 27, loss = 0.31714873
Iteration 28, loss = 0.31698294
Iteration 29, loss = 0.31689869
Iteration 30, loss = 0.31637012
Iteration 31, loss = 0.31619427
Iteration 32, los

In [58]:
test_df = pd.read_csv('label_df_valid.csv')
X_test = test_df[[i for i in X_train]]
y_pred_proba = new_pipeline.predict_proba(X_test)[:,1]
roc_auc_score(test_df['label'], y_pred_proba)

0.8969866829147723

In [68]:
roc_auc_score(y_train,pipeline.predict_proba(X_train)[:,1])

0.9659837982928497

In [69]:
precision, recall, thresholds = precision_recall_curve(test_df['label'],y_pred_proba)
auc(recall, precision)

0.4161257740946135

In [70]:
precision, recall, thresholds = precision_recall_curve(y_train,pipeline.predict_proba(X_train)[:,1])
auc(recall, precision)

0.5466424724811536