In [15]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn import svm, datasets, metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import auc, roc_auc_score, roc_curve, confusion_matrix
from sklearn.model_selection import StratifiedKFold, cross_validate, train_test_split
from sklearn.preprocessing import StandardScaler, minmax_scale
from sklearn.tree import DecisionTreeClassifier

from imblearn.combine import *
from imblearn.over_sampling import *


## Training set

In [16]:
# Read total_df.csv into a DataFrame; the bare name on the last line renders it.
# NOTE(review): the rendered frame has an 'Unnamed: 0' first column. The feature
# slice further down (df.iloc[:, 11:]) is positional and counts that column, so
# do not drop it here without adjusting that offset.
df = pd.read_csv('total_df.csv')
df

Unnamed: 0,smiles,toxicity,ref,raw_MW,raw_ALOGP,raw_HBA,raw_HBD,raw_PSA,raw_ROTB,raw_AROM,...,1022,1023,MW,ALOGP,HBA,HBD,PSA,ROTB,AROM,ALERTS
0,CC1=NN=C(O1)C(=O)NC(C)(C)C2=N/C(=C(/NCC3=CC=C(...,1,DILIrank,444.423,0.98202,8,3,150.02,6,2,...,0.0,0.0,0.162797,-0.305971,0.405810,0.099910,0.346782,0.007493,0.434034,1.775999
1,C1=CC2=C(C=C1OC(F)(F)F)SC(=N2)N,1,DILIrank,234.202,2.77710,3,1,48.14,1,2,...,0.0,0.0,-0.434222,0.251281,-0.399196,-0.292517,-0.334802,-0.491163,0.434034,-0.893772
2,CCC1C(=O)N(CC(=O)N(C(C(=O)NC(C(=O)N(C(C(=O)NC(...,1,DILIrank,1202.635,3.26900,12,5,278.80,15,0,...,0.0,0.0,2.316086,0.403983,1.049814,0.492336,1.208329,0.905074,-1.283533,-0.003848
3,CC(C1=CC(=CC=C1)OC2=CC=CC=C2)C(=O)O,1,DILIrank,242.274,3.66700,3,1,46.53,4,2,...,0.0,0.0,-0.411298,0.527535,-0.399196,-0.292517,-0.345573,-0.191969,0.434034,-0.893772
4,CC(=O)NC1=NN=C(S1)S(=O)(=O)N,1,DILIrank,222.251,-0.85610,5,2,115.04,2,1,...,0.0,0.0,-0.468162,-0.876584,-0.077194,-0.096304,0.112764,-0.391432,-0.424749,-0.003848
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1845,c1ccc(cc1)CC2C(=O)NC(C(=O)NC(C(=O)NC(CSSCCC(=O...,0,Liew,1069.238,-4.34170,16,13,437.91,19,2,...,0.0,0.0,1.937244,-1.958630,1.693819,2.062043,2.272785,1.303999,0.434034,2.665922
1846,CC1C(C(CC(O1)OC2CCC3(C(C2)CCC4C3CC(C5(C4(CCC5C...,0,Liew,943.090,0.04230,19,9,282.21,10,0,...,0.0,0.0,1.578989,-0.597691,2.176822,1.277190,1.231142,0.406418,-1.283533,0.886075
1847,COc1cc(cc(c1OC)OC)C(=O)OC2CC3CN4CCc5c6ccccc6[n...,0,Liew,578.662,4.16250,9,1,108.55,7,3,...,0.0,0.0,0.544030,0.681355,0.566811,-0.292517,0.069345,0.107225,1.292817,-0.003848
1848,CN(CCCCCCCCCCN(C)C(=O)Oc1cccc(c1)[N+](C)(C)C)C...,0,Liew,556.792,6.76220,4,0,59.08,15,2,...,0.0,0.0,0.481920,1.488388,-0.238195,-0.488730,-0.261613,0.905074,0.434034,0.886075


In [17]:
# Target vector: the 'toxicity' column (0/1 in the rows displayed above).
y = df['toxicity']
# Feature matrix: every column from position 11 onward.
# NOTE(review): 11 is a positional offset; the columns hidden behind "..." in the
# rendered frame are not visible here, so confirm which column it starts at.
x = df.iloc[:, 11:]

# Parameter tuning

In [18]:
import joblib

# Load the previously saved estimator from disk.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary code;
# only load files from a trusted source.
model = joblib.load('random_LR.pkl')

# Stratified 10-fold splitter; shuffling with a fixed seed keeps the folds
# identical from run to run.
cv = StratifiedKFold(shuffle=True, random_state=8, n_splits=10)

In [28]:
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score


def specificity_score(y_true, y_pred):
    """Return the specificity (true-negative rate), TN / (TN + FP).

    Labels are assumed to be binary with 0 as the negative class and 1 as the
    positive class, which matches the 'toxicity' values shown in this notebook.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels.

    Returns
    -------
    float
        TN / (TN + FP), or 0.0 when ``y_true`` contains no negative samples
        (the ratio is undefined in that case).
    """
    # Pin the label set so the matrix is always 2x2. Without it, a fold whose
    # truth and predictions contain a single class yields a smaller matrix and
    # the four-way unpack below raises ValueError.
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
    negatives = tn + fp
    if negatives == 0:
        # No actual negatives: avoid the 0/0 division, which would produce NaN
        # and poison the mean over folds.
        return 0.0
    return tn / negatives

# model evaluation
# Map each metric name to a scorer object; make_scorer wraps a plain
# metric(y_true, y_pred) function so the cross-validation helpers can call it.
scoring = dict(
    accuracy=make_scorer(accuracy_score),
    precision=make_scorer(precision_score),
    recall=make_scorer(recall_score),
    f1=make_scorer(f1_score),
    specificity=make_scorer(specificity_score),
)

# cross validation
# The estimator is `model` (loaded from 'random_LR.pkl'); the previous version
# passed `lr`, which no visible cell defines, so a fresh-kernel run raised
# NameError.
# cross_validate fits once per fold and evaluates every scorer on that same fit,
# instead of refitting the estimator separately for each metric. Results are
# keyed as 'test_<metric name>'.
cv_results = cross_validate(model, x, y, cv=cv, scoring=scoring)
for metric in scoring:
    scores = cv_results[f'test_{metric}']
    print(f'{metric} scores: {scores}')
    print(f'Mean {metric} score: {np.mean(scores)}')
    print('---')

accuracy scores: [0.73706897 0.74137931 0.73160173 0.72294372 0.67965368 0.70995671
 0.70562771 0.69264069]
Mean accuracy score: 0.715109064785789
---
precision scores: [0.74166667 0.76576577 0.7394958  0.74774775 0.6744186  0.72413793
 0.72566372 0.71818182]
Mean precision score: 0.7296347561476414
---
recall scores: [0.74789916 0.71428571 0.7394958  0.69747899 0.73109244 0.70588235
 0.68907563 0.66386555]
Mean recall score: 0.7111344537815126
---
f1 scores: [0.74476987 0.73913043 0.7394958  0.72173913 0.7016129  0.71489362
 0.70689655 0.68995633]
Mean f1 score: 0.7198118302328321
---
specificity scores: [0.72566372 0.7699115  0.72321429 0.75       0.625      0.71428571
 0.72321429 0.72321429]
Mean specificity score: 0.7193129740834386
---
