In [1]:
import os

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
from IPython.core.debugger import set_trace

In [4]:
import matplotlib.pyplot as plt
%matplotlib notebook

In [5]:
from pathlib import Path
import glob
import regex
import dill
import gzip
import toolz, itertools, more_itertools
from collections import Counter, OrderedDict

In [6]:
import optuna
import sklearn

In [26]:
import math
import pandas as pd 
import numpy as np
import seaborn as sns
from imblearn.combine import SMOTETomek
import statistics
import shap
np.random.seed(42)
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import roc_curve, precision_recall_curve, precision_recall_fscore_support
from sklearn.metrics import average_precision_score
from matplotlib import pyplot
from sklearn.metrics import plot_roc_curve
from sklearn.metrics import classification_report


from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

## DATA directory
Note: Please change the DIR path accordingly  

In [49]:
DIR = Path(r'C:\Users\Abhij\OneDrive\Documents\GitHub\DNA-structure-prediction')
os.chdir(DIR)
assert DIR.exists()
DATA = DIR/"data"

## Helper Functions to read pickled data

In [50]:
try:
    _file = DIR / "data"
    os.mkdir(_file / "pkl")
    os.mkdir(_file/"results")
except FileExistsError:
    print("Dir exists")


def pkl_it(dataframe, filebase):
    with open(DIR / "data" / "pkl" / str(filebase + ".pkl"), "wb") as fh:
        dill.dump(dataframe, fh)
    return


def unpkl_it(filebase):
    with open(DIR / "data" / "pkl" / str(filebase + ".pkl"), "rb") as fh:
        return dill.load(fh)

Dir exists


## A flexible helper Class for running different ML algorithms  
It automatically chooses the best threshold for classification by locating the arg_max (index) of the best F-score  

In [46]:
import logging
from collections import defaultdict
logging.basicConfig(filename=DATA/'results.log', level=logging.INFO)
from utils.Evaluator import Call_Plot 


## Read curated dataset

In [14]:
curated_data = unpkl_it("curated_dataset")
curated_data

Unnamed: 0,target,sequence,AA/TT,GG/CC,AC/GT,CA/TG,AT/AT,TA/TA,AG/CT,GA/TC,CG/CG,GC/GC,item
6RSO,0,TCGGCGCCGA,0,2,0,0,0,0,0,2,3,2,B
6QJR,0,CGCAAAAAAGCG,5,0,0,1,0,0,1,0,2,2,B
6F3C,0,CGTACG,0,0,2,0,0,1,0,0,2,0,B
6GIM,0,AAATTT,4,0,0,0,1,0,0,0,0,0,B
6ASF,0,CCAAGATAG,1,1,0,1,1,1,2,1,0,0,B
...,...,...,...,...,...,...,...,...,...,...,...,...,...
117D,1,GCGTACGTACGC,0,0,4,0,0,2,0,0,3,2,A
118D,1,GTGCGCAC,0,0,2,2,0,0,0,0,1,2,A
1D26,1,GCCCGGGC,0,4,0,0,0,0,0,0,1,2,A
2D47,1,CCCCCGCGGGGG,0,8,0,0,0,0,0,0,2,1,A


In [15]:
#Check if any sequence has duplicate features
curated_data.drop_duplicates(subset=['AA/TT', 'GG/CC', 'AC/GT',
                                          'CA/TG', 'AT/AT', 'TA/TA', 'AG/CT', 'GA/TC', 'CG/CG', 'GC/GC'], keep='last')

Unnamed: 0,target,sequence,AA/TT,GG/CC,AC/GT,CA/TG,AT/AT,TA/TA,AG/CT,GA/TC,CG/CG,GC/GC,item
6RSO,0,TCGGCGCCGA,0,2,0,0,0,0,0,2,3,2,B
6QJR,0,CGCAAAAAAGCG,5,0,0,1,0,0,1,0,2,2,B
6F3C,0,CGTACG,0,0,2,0,0,1,0,0,2,0,B
6GIM,0,AAATTT,4,0,0,0,1,0,0,0,0,0,B
6ASF,0,CCAAGATAG,1,1,0,1,1,1,2,1,0,0,B
...,...,...,...,...,...,...,...,...,...,...,...,...,...
117D,1,GCGTACGTACGC,0,0,4,0,0,2,0,0,3,2,A
118D,1,GTGCGCAC,0,0,2,2,0,0,0,0,1,2,A
1D26,1,GCCCGGGC,0,4,0,0,0,0,0,0,1,2,A
2D47,1,CCCCCGCGGGGG,0,8,0,0,0,0,0,0,2,1,A


## Nested Cross-validation

In [65]:
from imblearn.combine import SMOTEENN, SMOTETomek
from sklearn.model_selection import RepeatedStratifiedKFold

from utils.NestedCV import gen_data, gen_data_for_tuningHP

The function gen_data is a flexible generator that implements outer fold of Nested CV  
Here, we are using 5-fold stratified Nested cross validation (n_splits = 5)  

This generator implements inner fold of Nested CV, where we tune hyperparameters.

## Read the best Logistic Regression hyperparameters

In [56]:
best_LR_params = pd.read_csv(DATA/"tuned_hyperparameters"/"best_LR_params.csv", index_col=0)

In [57]:
best_LR_params

Unnamed: 0,C,class_weight,dual,fit_intercept,intercept_scaling,max_iter,multi_class,penalty,random_state,solver,tol,verbose,warm_start
Model_1,0.4,balanced,False,True,1,100,auto,l2,42,lbfgs,0.0001,False,False
Model_2,0.5,balanced,False,True,1,100,auto,l2,42,lbfgs,0.0001,False,False
Model_3,0.2,balanced,False,True,1,100,auto,l2,42,lbfgs,0.0001,False,False
Model_4,0.2,balanced,False,True,1,100,auto,l2,42,lbfgs,0.0001,False,False
Model_5,0.9,balanced,False,True,1,100,auto,l2,42,lbfgs,0.0001,False,False


## Set up Logistic Regression training

In [43]:
#Class weight factor:
SCALE_POS_WEIGHT = lambda T,P: (100*(T-P)/T) / (100*P/T)

In [21]:
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.model_selection import cross_val_score, cross_val_predict
import optuna
from optuna.pruners import HyperbandPruner
import copy

In [75]:
def trainer(data, param_updater):
    train_x, train_y = data["X_train"], data["y_train"]
    
    param = {
        "class_weight": "balanced",
        "random_state": 42,
        "verbose": False,
    }
    param.update(param_updater)
    model = LogisticRegression(**param)
    #model = make_pipeline(StandardScaler(), model)
    model.fit(train_x, train_y)
    return model





## Run and evaluate performance of Logistic Regression using tuned hyperparameters

In [76]:
plt.close()
# *************OUTER*************
plot_Model_LR_test = Call_Plot(repeated_k_fold=False, model_name="LR", DIR=DIR)
for outer_idx, elem in enumerate(gen_data(curated_data.drop(labels=["item", "sequence"], axis=1), RESAMPLING=False)):
    # ***********INNER************
    model = trainer(elem, best_LR_params.T.to_dict()
                    [f"Model_{outer_idx + 1}"])
    plot_Model_LR_test.Plot(elem, model, outer_idx)
plot_Model_LR_test.post_Plot()

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

              precision    recall  f1-score   support

       B-DNA       0.90      0.96      0.93        27
       A-DNA       0.90      0.75      0.82        12

    accuracy                           0.90        39
   macro avg       0.90      0.86      0.87        39
weighted avg       0.90      0.90      0.89        39

Average PR:  0.9108435545935546
AUC PR:  0.9070993820993821
Best Threshold_f-score=0.686987, F-Score=0.870
AUC:  0.9506172839506173
Best Threshold_ROC=0.521127, G-Mean_ROC=0.903
Accuracy:  0.8974358974358975
F1:  0.8181818181818182
MCC:  0.7535992817105361
              precision    recall  f1-score   support

       B-DNA       0.83      0.96      0.89        26
       A-DNA       0.89      0.62      0.73        13

    accuracy                           0.85        39
   macro avg       0.86      0.79      0.81        39
weighted avg       0.85      0.85      0.84        39

Average PR:  0.8160816035816034
AUC PR:  0.8070745671707209
Best Threshold_f-score=0.8357

In [77]:
pd.DataFrame(plot_Model_LR_test.results)

Unnamed: 0,Average PR,AUC PR,ROC AUC,Accuracy,F1,MCC,cohen_kappa_score
0,0.910844,0.907099,0.950617,0.897436,0.818182,0.753599,0.747573
1,0.816082,0.807075,0.887574,0.846154,0.727273,0.645497,0.625
2,0.96097,0.95951,0.977564,0.921053,0.869565,0.814697,0.813115
3,0.966346,0.965118,0.980769,0.921053,0.857143,0.820008,0.804124
4,0.896601,0.892854,0.929487,0.894737,0.833333,0.75641,0.75641


In [78]:
pd.DataFrame(plot_Model_LR_test.results).mean()

Average PR           0.910168
AUC PR               0.906331
ROC AUC              0.945202
Accuracy             0.896086
F1                   0.821099
MCC                  0.758042
cohen_kappa_score    0.749244
dtype: float64

## Running from scratch - Run and evaluate performance of Logistic Regression under 5-fold stratified Nested CV

In [79]:
import optuna

import copy

def objective(data, trial):
    train_x, valid_x, train_y, valid_y = data["X_train"], data["X_val"], data["y_train"], data["y_val"]


    param = {
        "C": trial.suggest_discrete_uniform("C",0.1,1.0,0.1),
        "class_weight": "balanced",
        "random_state": 42,
        "verbose": False,
        
    }
    
    model = LogisticRegression(**param)
    #model = make_pipeline(StandardScaler(), model)
    model.fit(train_x, train_y)
    return sklearn.metrics.roc_auc_score(valid_y, model.predict(valid_x))



In [80]:
import collections
Trial = collections.namedtuple("Trial",["value", "parameters"])

In [81]:
plt.close()
optuna.logging.set_verbosity(optuna.logging.WARNING)
results = []
outer_models = {}
best_models_LR = {}

# *************OUTER*************
plot_Model_LR = Call_Plot(sklearn_model=True, model_name="LR", repeated_k_fold=True, DIR=DIR)
for outer_idx, elem in enumerate(gen_data(curated_data.drop(labels=["item", "sequence"], axis=1), RESAMPLING = False)):
    
    study_dict = {}
    # ***********INNER************
    for idx, data_in in enumerate(gen_data_for_tuningHP(elem, RESAMPLING=False)):
        
        study = optuna.create_study(pruner=HyperbandPruner(max_resource="auto"),
                            direction="maximize")
        study.optimize((toolz.curry(objective)(data_in)), n_trials=100)

        #print("Number of finished trials: {}".format(len(study.trials)))

        trial = study.best_trial
        study_dict[idx] = Trial(trial.value, trial.params)
    arg_max = max(study_dict, key=lambda d: study_dict[d].value) #max for AUC
    best_models_LR[outer_idx] = trainer(elem, study_dict[arg_max].parameters)
    
    plot_Model_LR.Plot(elem, best_models_LR[outer_idx], outer_idx)
plot_Model_LR.post_Plot()


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

              precision    recall  f1-score   support

       B-DNA       0.90      0.96      0.93        27
       A-DNA       0.90      0.75      0.82        12

    accuracy                           0.90        39
   macro avg       0.90      0.86      0.87        39
weighted avg       0.90      0.90      0.89        39

Average PR:  0.9212602212602213
AUC PR:  0.9182600963850965
Best Threshold_f-score=0.702217, F-Score=0.870
AUC:  0.9537037037037037
Best Threshold_ROC=0.527326, G-Mean_ROC=0.903
Accuracy:  0.8974358974358975
F1:  0.8181818181818182
MCC:  0.7535992817105361
              precision    recall  f1-score   support

       B-DNA       0.83      0.96      0.89        26
       A-DNA       0.89      0.62      0.73        13

    accuracy                           0.85        39
   macro avg       0.86      0.79      0.81        39
weighted avg       0.85      0.85      0.84        39

Average PR:  0.8217326476941862
AUC PR:  0.8128290833098524
Best Threshold_f-score=0.8217

In [82]:
pd.DataFrame(plot_Model_LR_test.results)

Unnamed: 0,Average PR,AUC PR,ROC AUC,Accuracy,F1,MCC,cohen_kappa_score
0,0.910844,0.907099,0.950617,0.897436,0.818182,0.753599,0.747573
1,0.816082,0.807075,0.887574,0.846154,0.727273,0.645497,0.625
2,0.96097,0.95951,0.977564,0.921053,0.869565,0.814697,0.813115
3,0.966346,0.965118,0.980769,0.921053,0.857143,0.820008,0.804124
4,0.896601,0.892854,0.929487,0.894737,0.833333,0.75641,0.75641


In [83]:
pd.DataFrame(plot_Model_LR_test.results).mean()

Average PR           0.910168
AUC PR               0.906331
ROC AUC              0.945202
Accuracy             0.896086
F1                   0.821099
MCC                  0.758042
cohen_kappa_score    0.749244
dtype: float64