In [49]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
import pandas as pd
from sklearn.preprocessing import StandardScaler
from imblearn.pipeline import Pipeline
from sklearn.metrics import precision_recall_curve,auc, roc_auc_score
from sklearn.model_selection import RandomizedSearchCV

## Read Data

In [50]:
train_df = pd.read_csv('label_df_train.csv')
test_df = pd.read_csv('label_df_valid.csv')

In [51]:
train_df = train_df.drop(columns = ['transcript_id', 'transcript_position','gene_id','five_mer'])

In [52]:
X_train, y_train = train_df[[i for i in train_df.columns if i!='label']],train_df['label']

## Hyperparameter Tuning

In [7]:
scaler = StandardScaler()
smote = SMOTE()
mlp_classifier = MLPClassifier(verbose = 2)
pipeline = Pipeline([('scaler',scaler),('smote', smote),('mlp', mlp_classifier)])

In [8]:
params_grid = {
    "smote__sampling_strategy": [0.25,0.33,0.5,1],
    "mlp__hidden_layer_sizes":[(4,4),(8,8),(16,16),(32,32),(64,64)],
    "mlp__alpha":[1e-5,1e-4,1e-3,1e-2,1e-1],
    "mlp__learning_rate_init" :[1e-4,1e-3,1e-2,1e-1],
    "mlp__activation" : ['logistic', 'tanh', 'relu']

}

In [17]:
clf = RandomizedSearchCV(pipeline, n_iter=50, param_distributions=params_grid, scoring = ["average_precision","roc_auc"], n_jobs = -2, verbose = 4, refit = False)

In [18]:
clf.fit(X_train, y_train)

Fitting 5 folds for each of 50 candidates, totalling 250 fits


In [24]:
df_results = pd.DataFrame(clf.cv_results_)

In [29]:
df_results.columns

Index(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time',
       'param_smote__sampling_strategy', 'param_mlp__learning_rate_init',
       'param_mlp__hidden_layer_sizes', 'param_mlp__alpha',
       'param_mlp__activation', 'params', 'split0_test_average_precision',
       'split1_test_average_precision', 'split2_test_average_precision',
       'split3_test_average_precision', 'split4_test_average_precision',
       'mean_test_average_precision', 'std_test_average_precision',
       'rank_test_average_precision', 'split0_test_roc_auc',
       'split1_test_roc_auc', 'split2_test_roc_auc', 'split3_test_roc_auc',
       'split4_test_roc_auc', 'mean_test_roc_auc', 'std_test_roc_auc',
       'rank_test_roc_auc'],
      dtype='object')

## Best Results

In [77]:
df_results[df_results['rank_test_average_precision']==1][['mean_test_roc_auc','mean_test_average_precision']]

Unnamed: 0,mean_test_roc_auc,mean_test_average_precision
45,0.911728,0.45301


## Best Parameters

In [78]:
df_results[df_results['rank_test_average_precision']==1][['param_smote__sampling_strategy', 'param_mlp__learning_rate_init',
       'param_mlp__hidden_layer_sizes', 'param_mlp__alpha',
       'param_mlp__activation', 'params']]

Unnamed: 0,param_smote__sampling_strategy,param_mlp__learning_rate_init,param_mlp__hidden_layer_sizes,param_mlp__alpha,param_mlp__activation,params
45,0.33,0.001,"(16, 16)",0.1,logistic,"{'smote__sampling_strategy': 0.33, 'mlp__learn..."


In [46]:
#Save results
df_results.to_csv('hyperparameter_results.csv')

## Refitting

In [57]:
new_scaler = StandardScaler()
new_smote = SMOTE(sampling_strategy=0.33)
new_mlp_classifier = MLPClassifier(verbose = 2, hidden_layer_sizes=(16,16),learning_rate_init=0.001, activation="logistic", alpha  = 0.1)
new_pipeline = Pipeline([('scaler',new_scaler),('smote', new_smote),('mlp', new_mlp_classifier)])
new_pipeline

In [58]:
new_pipeline.fit(X_train, y_train)

Iteration 1, loss = 0.48530685
Iteration 2, loss = 0.39078938
Iteration 3, loss = 0.35943566
Iteration 4, loss = 0.34817361
Iteration 5, loss = 0.34166358
Iteration 6, loss = 0.33847754
Iteration 7, loss = 0.33579705
Iteration 8, loss = 0.33356104
Iteration 9, loss = 0.33197716
Iteration 10, loss = 0.33047760
Iteration 11, loss = 0.32920695
Iteration 12, loss = 0.32815921
Iteration 13, loss = 0.32721347
Iteration 14, loss = 0.32641818
Iteration 15, loss = 0.32540623
Iteration 16, loss = 0.32461615
Iteration 17, loss = 0.32368250
Iteration 18, loss = 0.32343881
Iteration 19, loss = 0.32252644
Iteration 20, loss = 0.32204165
Iteration 21, loss = 0.32176935
Iteration 22, loss = 0.32087545
Iteration 23, loss = 0.32044128
Iteration 24, loss = 0.32039476
Iteration 25, loss = 0.31970333
Iteration 26, loss = 0.31911936
Iteration 27, loss = 0.31939201
Iteration 28, loss = 0.31878871
Iteration 29, loss = 0.31875175
Iteration 30, loss = 0.31824769
Iteration 31, loss = 0.31817562
Iteration 32, los

## Evaluating on Unseen Test

In [59]:
X_test = test_df[[i for i in X_train]]
y_pred_proba = new_pipeline.predict_proba(X_test)[:,1]
roc = roc_auc_score(test_df['label'], y_pred_proba)
roc

0.9006655805047761

In [60]:
precision, recall, thresholds = precision_recall_curve(test_df['label'],y_pred_proba)
pr = auc(recall, precision)
pr

0.41832941238863297

In [61]:
0.5*(pr + roc)

0.6594974964467045