In [60]:
import time
# Wall-clock reference; the driver cell further down prints the seconds elapsed since this point.
start_time = time.time()


###Loading packages
import os
import numpy as np
import pandas as pd
import math
import itertools
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.utils import class_weight
from sklearn import metrics
from sklearn.metrics import confusion_matrix, f1_score, roc_curve

from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC


from sklearn.metrics import roc_auc_score
import warnings
# NOTE(review): this silences every warning category for the rest of the session,
# including pandas/NumPy deprecation and chained-assignment warnings.
warnings.filterwarnings('ignore')
from numpy.random import seed
# Seed NumPy's global random state.
seed(1)


# NOTE(review): itertools is already imported above; this second import is redundant.
import itertools

In [61]:
def measurements(y_test, y_pred, y_pred_prob):
    """Compute binary-classification metrics for one set of predictions.

    Parameters
    ----------
    y_test : array-like
        True labels (the positive class is 1, as used by recall/precision/F1 defaults).
    y_pred : array-like
        Predicted class labels.
    y_pred_prob : array-like
        Predicted score/probability of the positive class, used for the ROC AUC.

    Returns
    -------
    list
        [TN, FP, FN, TP, accuracy, AUC, sensitivity, specificity, precision,
        NPV, F1, MCC]. Ratios whose denominator is zero, and the AUC when
        y_test holds a single class, are reported as NaN.
    """
    acc = metrics.accuracy_score(y_test, y_pred)
    sensitivity = metrics.recall_score(y_test, y_pred)

    cm = confusion_matrix(y_test, y_pred)
    if cm.shape != (2, 2):
        # Only one class occurs in y_test and y_pred together, so the inferred
        # matrix is not 2x2 and cannot be unpacked into four cells. Pin the
        # 0/1 label order to get a full matrix.
        cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
    TN, FP, FN, TP = cm.ravel()

    nan = float('nan')
    specificity = TN / (TN + FP) if (TN + FP) > 0 else nan
    precision = metrics.precision_score(y_test, y_pred)
    f1 = metrics.f1_score(y_test, y_pred)
    mcc = metrics.matthews_corrcoef(y_test, y_pred)
    # ROC AUC is undefined when y_test contains a single class.
    auc = roc_auc_score(y_test, y_pred_prob) if len(np.unique(y_test)) > 1 else nan
    npv = TN / (TN + FN) if (TN + FN) > 0 else nan
    return [TN, FP, FN, TP, acc, auc, sensitivity, specificity, precision, npv, f1, mcc]

def model_predict(X, y, y_index, model, col_name):
    """Score X with a fitted classifier and return predictions plus metrics.

    Parameters
    ----------
    X : array-like
        Feature matrix passed to ``model.predict_proba``.
    y : array-like
        True labels for the rows of X.
    y_index : array-like
        Row identifiers stored in the ``id`` column of the result.
    model : object
        Fitted estimator exposing ``predict_proba``.
    col_name : str
        Suffix for the probability and class column names.

    Returns
    -------
    tuple
        (DataFrame with columns id, y_true, prob_<col_name>, class_<col_name>;
        metric list produced by ``measurements``).
    """
    # Second column of predict_proba: probability of the positive outcome.
    positive_prob = model.predict_proba(X)[:, 1]
    # Label a row positive when its probability exceeds 0.5.
    predicted_label = np.where(positive_prob > 0.5, 1, 0)

    ###create dataframe, one column at a time in a fixed order
    pred_result = pd.DataFrame()
    column_values = (
        ('id', y_index),
        ('y_true', y),
        ('prob_'+col_name, positive_prob),
        ('class_'+col_name, predicted_label),
    )
    for column, values in column_values:
        pred_result[column] = values

    return pred_result, measurements(y, predicted_label, positive_prob)

In [65]:

## Load the DICTrank table; columns from position 13 onward are used as features
## (presumably the Mold2 descriptors -- TODO confirm against the spreadsheet).
DICT = pd.read_excel(r'C:/Users/Yanyan.Qu/Desktop/Yanyan/Cardiotoxicity/DICTrank/mold2/DICTrank_Mold2_1006.xlsx')
print(DICT.shape)

## Remove the "Ambiguous-DICT concern" drugs. Take an explicit copy so the label
## assignment below modifies an independent frame.
DICT = DICT.loc[DICT['DICT'] != 'Ambiguous'].copy()

## Classify the "Less-" and "Most-DICT concern" drugs as cardiotox positive (label 1)
## and the "No-DICT concern" drugs as cardiotox negative (label 0).
label_map = {'Less': 1, 'Most': 1, 'No': 0}
unexpected_labels = set(DICT['DICT'].unique()) - set(label_map)
if unexpected_labels:
    # Fail here rather than later, when the labels are cast to integers for training.
    raise ValueError('Unexpected DICT labels: %r' % sorted(unexpected_labels, key=str))
## Store the labels as real integers instead of an object column of mixed values.
DICT['DICT'] = DICT['DICT'].map(label_map).astype(int)

cols = DICT.columns[13:]
data = DICT[['DICT', *cols]]
print(data.shape)

## Drop features that are 0 for every drug. The check runs on the feature
## columns only, so the label column can never be removed by it.
features = data.iloc[:, 1:]
zero_cols = features.columns[(features == 0).all()]
data = data.drop(columns=zero_cols)
print(data.shape)

X1 = data.iloc[:, 1:]
y1 = data["DICT"]

## data and split(random and stratify)
## X, X_test, y, y_test = train_test_split(X1, y1, test_size=.2,random_state=42)
X, X_test, y, y_test = train_test_split(X1, y1, test_size=.2, stratify=y1, random_state=42)
print('X_train shape:', X.shape)
print('X_test shape:', X_test.shape)
print('y_train shape:', y.shape)
print('y_test shape:', y_test.shape)


(1006, 790)
(928, 778)
(928, 636)
X_train shape: (742, 635)
X_test shape: (186, 635)
y_train shape: (742,)
y_test shape: (186,)


In [56]:
from sklearn.preprocessing import LabelEncoder

## label_encoder = LabelEncoder()
## y = label_encoder.fit_transform(y)

In [67]:
def para_selection(var, para, X, X_test, y, y_test):
    """Evaluate one KNN setting with repeated stratified 5-fold CV and save the results.

    For each of 20 shuffle seeds and each of the 5 folds, a MinMaxScaler is fit
    on the training fold, a KNeighborsClassifier(n_neighbors=para) is trained,
    and both the validation fold and the held-out test set are scored with
    ``model_predict``.

    Parameters
    ----------
    var : str
        Tag appended to the output directory name and used in column/file names.
    para : int
        Number of neighbours for the classifier.
    X, y : pandas DataFrame / Series
        Training features and labels (indexed positionally with ``.iloc``).
    X_test, y_test : pandas DataFrame / Series
        Held-out features and labels; ``y_test.index`` supplies the row ids.

    Returns
    -------
    None
        Results are written as CSV files under the ``knn<var>`` directory:
        per-fold validation predictions, validation metrics, test predictions
        and test metrics.
    """
    base_path = '/Users/Yanyan.Qu/Desktop/Yanyan/Cardiotoxicity/DICTrank/result/KNN/knn'+ var

    path10 = base_path + '/training_performance'
    path20 = base_path + '/test_performance'

    path1 = base_path + '/training_class'
    path2 = base_path + '/test_class'

    ###make the directories; makedirs also creates missing parents and does not
    ###fail when the cell is re-run and the directories already exist
    for directory in (base_path, path10, path20, path1, path2):
        os.makedirs(directory, exist_ok=True)

    #initial performance dictionary
    train_results = {}
    test_results = {}
    # Test-set prediction frames, concatenated once after the loops.
    test_frames = []

    # np.int was removed in NumPy 1.24; the builtin int is the supported spelling.
    y_int = np.array(y, dtype=int)
    y_test_int = np.array(y_test, dtype=int)

    ### file-level name (the '_K_' suffix is not part of it)
    col_name2 = 'knn_'+'paras_'+var
    ## +'_K_'+str(para)

    for i in range(20):
        skf = StratifiedKFold(n_splits=5, random_state=i, shuffle=True)
        for j, (train_index, validation_index) in enumerate(skf.split(X, y_int)):
            ###get train, validation dataset
            X_train, X_validation = X.iloc[train_index,:], X.iloc[validation_index,:]
            y_train, y_validation = y.iloc[train_index], y.iloc[validation_index]

            ### scale the input; the scaler is fit on the training fold only
            sc = MinMaxScaler()
            sc.fit(X_train)
            X_train = sc.transform(X_train)
            X_validation = sc.transform(X_validation)
            X_test_s = sc.transform(X_test)

            ### define column name
            col_name = 'knn_'+'seed_'+str(i)+'_skf_'+str(j)+'_paras_'+var+'_K_'+str(para)

            ###create classifier
            clf = KNeighborsClassifier(n_neighbors=para)
            clf.fit(X_train, np.array(y_train, dtype=int))

            ### predict validation results
            train_class, train_result = model_predict(
                X_validation, np.array(y_validation, dtype=int), y_validation.index, clf, col_name)
            train_results[col_name] = train_result

            ### predict test results
            test_class, test_result = model_predict(
                X_test_s, y_test_int, y_test.index, clf, col_name)
            test_results[col_name] = test_result

            test_frames.append(test_class)
            train_class.to_csv(path1+'/train_'+col_name+'.csv')

    # One concat over all folds instead of growing a frame inside the loop.
    pred_test_df = pd.concat(test_frames, axis=1, sort=False)

    ###save the result of validation results
    pd.DataFrame(data=list(train_results.items())).to_csv(path10+'/train_'+col_name2+'.csv')
    pred_test_df.to_csv(path2+'/test_'+col_name2+'.csv')
    pd.DataFrame(data=list(test_results.items())).to_csv(path20+'/test_'+col_name2+'.csv')

In [68]:
## Candidate neighbour counts; each one's position in the list doubles as its run tag.
paras = [3, 5, 7, 9, 11]

for var, para in enumerate(paras):
    print(var)
    print(para)
    para_selection(str(var), para, X, X_test, y, y_test)

## Time elapsed since start_time was recorded in the first cell.
elapsed_seconds = time.time() - start_time
print("--- %s seconds ---" % elapsed_seconds)

0
3
1
5
2
7
3
9
4
11
--- 367.9961750507355 seconds ---


In [213]:
## data analysis: load a saved validation-performance table
## NOTE(review): this path (".../ML Model/KNN/...", file name ending in "_K_3") differs
## from where para_selection writes its files (".../DICTrank/result/KNN/...", no "_K_"
## suffix in the file name) -- confirm this is the intended file.
knn0_csv = '/Users/Yanyan.Qu/Desktop/Yanyan/Cardiotoxicity/ML Model/KNN/knn0/training_performance/train_knn_paras_0_K_3.csv'
knn0 = pd.read_csv(knn0_csv)
## knn0=knn0.rename(columns={'0':'name', '1':'value'})

## df= knn0['value'].astype('str')
print(knn0.columns)

Index(['Unnamed: 0', '0', '1'], dtype='object')


In [210]:
## Expand the stringified metric list of each row (e.g. "[9, 31, 14, ...]") into one
## column per metric. The values are kept as the text fragments between the commas.
cols = ['TN', 'FP', 'FN', 'TP', 'Accuracy', 'AUC', 'Sensitivity', 'Specificity', 'PPV', 'NPV', 'F1', 'MCC']

## The metric lists are in column 'value' when the rename has been applied,
## otherwise in the column pandas read back as '1'.
value_col = 'value' if 'value' in knn0.columns else '1'
df = knn0[value_col].astype('str')

## Remove the surrounding brackets once, then split on commas; field i belongs to cols[i].
fields = df.str.strip('[]').str.split(',')
for i, col in enumerate(cols):
    knn0[col] = fields.str[i].values

## print(knn0)

    Unnamed: 0                           name  \
0            0   knn_seed_0_skf_0_paras_0_K_3   
1            1   knn_seed_0_skf_1_paras_0_K_3   
2            2   knn_seed_0_skf_2_paras_0_K_3   
3            3   knn_seed_0_skf_3_paras_0_K_3   
4            4   knn_seed_0_skf_4_paras_0_K_3   
..         ...                            ...   
95          95  knn_seed_19_skf_0_paras_0_K_3   
96          96  knn_seed_19_skf_1_paras_0_K_3   
97          97  knn_seed_19_skf_2_paras_0_K_3   
98          98  knn_seed_19_skf_3_paras_0_K_3   
99          99  knn_seed_19_skf_4_paras_0_K_3   

                                                value  TN   FP   FN   TP  \
0   [9, 31, 14, 95, 0.697986577181208, 0.586238532...   9   31   14   95   
1   [12, 28, 10, 99, 0.7449664429530202, 0.6826834...  12   28   10   99   
2   [12, 27, 18, 91, 0.6959459459459459, 0.6604328...  12   27   18   91   
3   [11, 28, 13, 96, 0.722972972972973, 0.64714184...  11   28   13   96   
4   [7, 32, 18, 91, 0.662162162