In [1]:
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt

### import data

def txt_to_df(filename):
    
    path = "/home/hermuba/res/data/data_from_dr_wu/"
    i = 0
    with open(path + filename) as f:
        for line in f:
            l = line.replace('\n', '').split('\t')
            if i == 0:
                l[0] = "Genome ID"
                l[1] = "Resistant Phenotype"
                df = pd.DataFrame(columns = l)
 
            else:
                ID = l[0]
                
                df.loc[i-1, :] = l
            i += 1
    return(df)

In [3]:
def random_split(df, portion):
    arr = np.arange(len(df.index))
    np.random.shuffle(arr)
    shuffle_index = arr
    train_size = round(len(df.index)*portion)
    
    return(shuffle_index[:train_size], shuffle_index[train_size:])


In [23]:
to_binary={'Resistant': 1,
          'Susceptible': 0}

In [50]:
# throw into naive bayes
from sklearn.naive_bayes import BernoulliNB
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.metrics import *   
def train_nb(df_ris, portion):
    
    # hold out test set
    train_index, test_index = random_split(df_ris, portion)
    X = df_ris.drop(["Genome ID", "Resistant Phenotype"], axis = 1)
    y = df_ris['Resistant Phenotype'].map(to_binary)
    X_train = X.loc[train_index, :]
    X_test = X.loc[test_index, :]
    y_train = y[train_index]
    y_test = y[test_index]
        
        #X_train, X_test, y_train, y_test = train_test_split(
        #df_ris.drop(["Genome ID", "Resistant Phenotype"], axis = 1)
        #, df_ris['Resistant Phenotype']
        #, test_size=0.2 
        #, random_state = 1
        #, startify = df_ris['Resistant Phenotype']) 
    # test if random
    ### i dont really know pseudo random status mean (please read about it), random status 20 yields best result?? 0.7
    ### it saids it is "random sampling test set but it is not (WTF)
    
    X_train.reset_index(inplace = True, drop = True)
    y_train.reset_index(inplace = True, drop = True)
    
    # seperate train and test into k fold
    n=10
    cv = ShuffleSplit(n_splits=n, test_size=0.3, random_state = 0)
    ### n_split = 10, test_size, does not change result; random_state yeilds best result 0.75
    
    
    # bnb.fit
    bnb = BernoulliNB()
    val_score = 0
    for train_index, test_index in cv.split(X_train, y_train):      
        bnb.fit(X_train.iloc[train_index,:], y_train[train_index])
        val = bnb.predict(X_train.iloc[test_index])
        score = accuracy_score(y_train[test_index], val)
        val_score = val_score + score
    
    v = val_score/n
    
    # model.predict()
    
    accuracy = accuracy_score(y_test, bnb.predict(X_test))
    precision = precision_score(y_test, bnb.predict(X_test))
    f = f1_score(y_test, bnb.predict(X_test))
    recall = recall_score(y_test, bnb.predict(X_test))
    
    return([v,accuracy, precision, f, recall])

In [27]:
train_nb(txt_to_df("gene_pattern.trimethoprim_sulfamethoxazole"), 0.8)

41    1
26    1
28    1
12    1
38    1
10    1
29    1
24    0
Name: Resistant Phenotype, dtype: int64


[0.67272727272727273,
 0.5,
 0.80000000000000004,
 0.66666666666666663,
 0.5714285714285714]

In [32]:
from sklearn import svm
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import learning_curve
from sklearn.model_selection import StratifiedKFold # this is the problem!!

    
def train_SVM(df_ris, portion):
    
    # split test, train
    train_index, test_index = random_split(df_ris, portion)
    X = df_ris.drop(["Genome ID", "Resistant Phenotype"], axis = 1)
    y = df_ris['Resistant Phenotype'].map(to_binary)
    X_train = X.loc[train_index, :]
    X_test = X.loc[test_index, :]
    y_train = y[train_index]
    y_test = y[test_index]
    
    # choose estimator (our model)
    clf = svm.SVC(kernel='linear', C=1).fit(X_train, y_train)
    
    # cross validation
    skf = StratifiedKFold(n_splits=3, random_state=None, shuffle=False)
    cv = skf.split(X_train, y_train)
   
    # tune hyperparameters
    gammas = np.logspace(-6, -1, 10)
    tuned_parameters = [{'kernel': ['rbf'], 'gamma': gammas,
                     'C': [1, 10, 100, 1000]},
                    {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]
    classifier = GridSearchCV(estimator=clf, cv=cv, param_grid=tuned_parameters)
    v = classifier.cv_results_['mean_test_score'][classifier.best_index_]
    
    classifier.fit(X_train.values, np.asarray(y_train))
    
    param = classifier.best_params_
    #print(param)
    
    
    
    accuracy = accuracy_score(y_test, classifier.predict(X_test))
    precision = precision_score(y_test, classifier.predict(X_test))
    f = f1_score(y_test, classifier.predict(X_test))
    recall = recall_score(y_test, classifier.predict(X_test))
    
    
    return([v,accuracy, precision, f, recall])
    

In [33]:
train_SVM(txt_to_df("gene_pattern.trimethoprim_sulfamethoxazole"), 0.8)

[0.76470588235294112, 0.5, 0.5, 0.66666666666666663, 1.0]

In [36]:
def run_hundred(model, df, portion):
    sum_score = np.zeros(5)
    for i in range(100):
        scoring_matrix = np.asarray(model(df, portion))
        sum_score = sum_score + scoring_matrix
    return(sum_score/100)

In [37]:
run_hundred(train_nb, txt_to_df("gene_pattern.trimethoprim_sulfamethoxazole"), 0.8)

22    0
21    1
26    1
41    1
0     0
10    1
24    0
15    0
Name: Resistant Phenotype, dtype: int64
11    0
12    1
23    1
17    1
13    0
7     0
8     0
22    0
Name: Resistant Phenotype, dtype: int64
32    1
5     1
12    1
9     0
14    1
4     0
27    0
38    1
Name: Resistant Phenotype, dtype: int64


array([ 0.6969697 ,  0.5       ,  0.5       ,  0.66045066,  1.        ])

In [71]:
#those drugs have nearly 50 data, therefore their data are selected for training
train_drug = ['meropenem','cefepime','ceftazidime' ,'gentamicin', 'ciprofloxacin',
              'trimethoprim_sulfamethoxazole', 'ampicillin', 'cefazolin', 'ampicillin_sulbactam']
train_drug  = train_drug[5:]
prefix = ['card_pattern', 'gene_pattern', 'acc_pattern', 'acc_card_pattern']
portion = [0.6, 0.8]
model = [train_SVM, train_nb]

In [72]:
total = pd.DataFrame(columns = ['drug', 'feature', 'train_size', 'model', 'validation_accuracy', 'accuracy', 'precision', 'F1_score', 'recall'])
i = 0
for d in train_drug:
    for pre in prefix:
        for p in portion:
            for m in model:
                result = list(run_hundred(m, txt_to_df(pre+'.'+d), p))
                param = [d,pre,p,str(m)]
                total.loc[i, :] = param + result
                i +=1

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [39]:
[1,2,3]+[4,5,6]

[1, 2, 3, 4, 5, 6]

In [66]:
total.to_excel("/home/hermuba/res/ml_data2.xlsx")


In [67]:
total.shape

(48, 9)

In [59]:
total

Unnamed: 0,drug,feature,train_size,model,validation_accuracy,accuracy,precision,F1_score,recall
0,meropenem,card_pattern,0.6,<function train_SVM at 0x7f4089afaf28>,0.878788,0.772727,0.92963,0.750446,0.636946
1,meropenem,card_pattern,0.6,<function train_nb at 0x7f4089b00268>,0.67,0.69697,0.761905,0.562564,0.518519
2,meropenem,card_pattern,0.8,<function train_SVM at 0x7f4089afaf28>,0.848485,0.878788,0.888889,0.901688,0.933333
3,meropenem,card_pattern,0.8,<function train_nb at 0x7f4089b00268>,0.65,0.636364,0.77381,0.670707,0.677381
4,meropenem,gene_pattern,0.6,<function train_SVM at 0x7f4089afaf28>,0.808081,0.818182,0.853535,0.812448,0.786869
5,meropenem,gene_pattern,0.6,<function train_nb at 0x7f4089b00268>,0.603333,0.636364,0.655556,0.501851,0.437037
6,meropenem,gene_pattern,0.8,<function train_SVM at 0x7f4089afaf28>,0.818182,0.848485,1.0,0.809524,0.716667
7,meropenem,gene_pattern,0.8,<function train_nb at 0x7f4089b00268>,0.661905,0.727273,0.833333,0.705128,0.711111
8,meropenem,acc_pattern,0.6,<function train_SVM at 0x7f4089afaf28>,0.838384,0.727273,0.797721,0.70313,0.657576
9,meropenem,acc_pattern,0.6,<function train_nb at 0x7f4089b00268>,0.69,0.606061,0.703704,0.536895,0.493136


In [68]:
total.to_pickle("meropenem_cefepime2.xlsx")