In [56]:
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt

### import data

# ris contain RIS data: Genome_ID - Drug - RIS
ris = pd.read_pickle("/home/hermuba/resistanceExp/data/anno_sps_df").loc[ris['Species'] == 'Escherichia']
# df contain Genome_ID - cdhit(0101010)
df = pd.read_pickle("/home/hermuba/resistanceExp/EcoliGenomes/cdhitResult/ec0913_df")
# card contain Genome_ID - card_ARO(0101010)
card = pd.read_pickle('/home/hermuba/resistanceExp/data/aro_pattern_df')
# cluster_detail contain cdhit - prevalance - card
cluster_detail = pd.read_pickle("../../cdhitResult/cluster_detail_tmp1010")

In [None]:
#those drugs have nearly 50 data, therefore their data are selected for training
train_drug = ['meropenem', 'gentamicin', 'ciprofloxacin', 'trimethoprim/sulfamethoxaole', 'ampicillin', 'cefazolin']
                                                
### Function to join X with y
# Input: X(dataframe with Genome ID, 01010); y(Genome ID, RIS), abx(drug name)
    # X can be df(all genes), acc(accessory only), card(card AROs), card AND acc, card merge with acc
# Output: df(aligned Genome_ID - X - y)

def join_df(X, y, abx):
    # subset y
    ris_need = ris.loc[ris['Antibiotic'] == abx][['Genome ID', 'Resistant Phenotype']] 
    # join X with y
    df_ris = pd.merge(X, ris_need, left_index = True, right_on = "Genome ID")
    # reset_index to prevent problems with cross validation and train/test split. Drop the old index
    df_ris = df_ris.reset_index(drop = True)
    return(df_ris)

### Feature selection with existing knowledge
# select accessory genes
acc_index = cluster_detail.loc[cluster_detail['prevalance'] < 1].index
# select gene clusters that are identified by card
card_index = cluster_detail.loc[cluster_detail['card_portion'] > 0].index
#
acc_card_intersect_index = list(set(card_index) & set(acc_index))

### adding feature: merging two X dataframe
card_and_acc_X = pd.merge(card,df[acc_index], left_index = True, right_index = True)


In [71]:
# throw into naive bayes
from sklearn.naive_bayes import BernoulliNB
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.metrics import accuracy_score     
def train_nb(df_ris):
    # hold out test set
    
    X_train, X_test, y_train, y_test = train_test_split(
        df_ris.drop(["Genome ID", "Resistant Phenotype"], axis = 1)
        , df_ris['Resistant Phenotype']
        , test_size=0.4 
        , random_state = 0) 
    ### i dont really know pseudo random status mean (please read about it), random status 20 yields best result?? 0.7
    ### it saids it is "random sampling test set but it is not (WTF)
    
    X_train.reset_index(inplace = True, drop = True)
    y_train.reset_index(inplace = True, drop = True)
    
    # seperate train and test into k fold
    n=10
    cv = StratifiedShuffleSplit(n_splits=n, test_size=0.3, random_state = 0)
    ### n_split = 10, test_size, does not change result; random_state yeilds best result 0.75
    
    
    # bnb.fit
    bnb = BernoulliNB()
    val_score = 0
    for train_index, test_index in cv.split(X_train, y_train):      
        bnb.fit(X_train.iloc[train_index,:], y_train[train_index])
        val = bnb.predict(X_train.iloc[test_index])
        score = accuracy_score(y_train[test_index], val)
        val_score = val_score + score
    
    print('validation accuracy is ', val_score/n)
    
    # model.predict()
    
    score = accuracy_score(y_test, bnb.predict(X_test))
    print('test set accuracy is', score)

In [75]:
train_nb(join_df(card,ris,"meropenem"))

ValueError: The least populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2.

In [51]:
from sklearn import svm
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import learning_curve
from sklearn.model_selection import StratifiedKFold

    
def train_SVM(df):
    
    # split test, train
    X_train, X_test, y_train, y_test = train_test_split(
        df.drop(["Genome ID", "Resistant Phenotype"], axis = 1)
        , df['Resistant Phenotype']
        , test_size=0.4 
        , random_state = 0)
    
    # choose estimator (our model)
    clf = svm.SVC(kernel='linear', C=1).fit(X_train, y_train)
    
    # cross validation
    skf = StratifiedKFold(n_splits=3, random_state=None, shuffle=False)
    cv = skf.split(X_train, y_train)
   
    # tune hyperparameters
    gammas = np.logspace(-6, -1, 10)
    tuned_parameters = [{'kernel': ['rbf'], 'gamma': gammas,
                     'C': [1, 10, 100, 1000]},
                    {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]
    classifier = GridSearchCV(estimator=clf, cv=cv, param_grid=tuned_parameters)
    classifier.fit(X_train.values, np.asarray(y_train))
    
    # print rankings
    rankDf = pd.DataFrame.from_dict(classifier.cv_results_)
    rankDf = rankDf.sort_values('rank_test_score')
    print(rankDf.head())

    # final evaluation with out test set: default: accuracy
    print('best params:', classifier.best_params_, 'best scorer', classifier.scorer_, 'best gamma', classifier.best_estimator_.gamma)
    print('classifier score', classifier.score(X_test, y_test))

In [29]:

train_nb(join_acc("meropenem"))
train_nb(join_acc("gentamicin"))
train_nb(join_acc("ciprofloxacin"))


train_nb(join_card("meropenem"))
train_nb(join_card("gentamicin"))
train_nb(join_card("ciprofloxacin"))


train_nb(card_and_acc("meropenem"))
train_nb(card_and_acc("gentamicin"))
train_nb(card_and_acc("ciprofloxacin"))

train_nb(join_acc_AND_card("meropenem"))
train_nb(join_acc_AND_card("gentamicin"))
train_nb(join_acc_AND_card("ciprofloxacin"))

validation accuracy is  0.736363636364
test set accuracy is 0.541666666667
validation accuracy is  0.622222222222
test set accuracy is 0.8
validation accuracy is  0.433333333333
test set accuracy is 0.55
validation accuracy is  0.654545454545
test set accuracy is 0.416666666667
validation accuracy is  0.677777777778
test set accuracy is 0.6
validation accuracy is  0.588888888889
test set accuracy is 0.6
validation accuracy is  0.572727272727
test set accuracy is 0.583333333333
validation accuracy is  0.572727272727
test set accuracy is 0.583333333333
validation accuracy is  0.572727272727
test set accuracy is 0.583333333333
validation accuracy is  0.654545454545
test set accuracy is 0.708333333333
validation accuracy is  0.666666666667
test set accuracy is 0.8
validation accuracy is  0.511111111111
test set accuracy is 0.5


In [30]:
# feature selection using pvalue
from scipy import stats
def p_value_list(df):
    r = df.loc[df['Resistant Phenotype'] == "Resistant"]
    not_r = df.loc[df['Resistant Phenotype'] != "Resistant"]
    
    p_value_ma = []
    for cluster_name in df.columns[:-2]:
        p_value = stats.ttest_ind(r[cluster_name], not_r[cluster_name])[1]
        p_value_ma.append(p_value)
    p = pd.Series(data = p_value_ma, index = df.columns[:-2])
    return(p.sort_values(ascending = True).index) # p value small are in the front

In [31]:
p = p_value_list(join_df("meropenem"))
list(p[0:3])

['Cluster 4703', 'Cluster 327', 'Cluster 5512']

In [32]:
list(p[0:3])+['Resistant Phenotype', 'Genome ID']

['Cluster 4703',
 'Cluster 327',
 'Cluster 5512',
 'Resistant Phenotype',
 'Genome ID']

In [33]:
def p_feature_selection(complete_set):
    
    p = p_value_list(complete_set)
    for a in [5,10,15,30,60,120,180]:
        list_of_feature = list(p[0:a])
        list_of_col_name = list_of_feature + ['Resistant Phenotype', 'Genome ID']
        print("----------using ", a, ' features----------')
        train_nb(complete_set[list_of_col_name])
    

In [34]:
p_feature_selection(join_df("meropenem"))

----------using  5  features----------
validation accuracy is  0.809090909091
test set accuracy is 0.708333333333
----------using  10  features----------
validation accuracy is  0.809090909091
test set accuracy is 0.708333333333
----------using  15  features----------
validation accuracy is  0.845454545455
test set accuracy is 0.708333333333
----------using  30  features----------
validation accuracy is  0.8
test set accuracy is 0.708333333333
----------using  60  features----------
validation accuracy is  0.736363636364
test set accuracy is 0.625
----------using  120  features----------
validation accuracy is  0.727272727273
test set accuracy is 0.583333333333
----------using  180  features----------
validation accuracy is  0.709090909091
test set accuracy is 0.666666666667


In [35]:
p_feature_selection(join_card("meropenem"))

----------using  5  features----------
validation accuracy is  0.736363636364
test set accuracy is 0.708333333333
----------using  10  features----------
validation accuracy is  0.781818181818
test set accuracy is 0.666666666667
----------using  15  features----------
validation accuracy is  0.809090909091
test set accuracy is 0.666666666667
----------using  30  features----------
validation accuracy is  0.809090909091
test set accuracy is 0.666666666667
----------using  60  features----------
validation accuracy is  0.781818181818
test set accuracy is 0.666666666667
----------using  120  features----------
validation accuracy is  0.709090909091
test set accuracy is 0.625
----------using  180  features----------
validation accuracy is  0.654545454545
test set accuracy is 0.458333333333


In [36]:
p_feature_selection(card_and_acc("meropenem"))

----------using  5  features----------
validation accuracy is  0.736363636364
test set accuracy is 0.708333333333
----------using  10  features----------
validation accuracy is  0.854545454545
test set accuracy is 0.666666666667
----------using  15  features----------
validation accuracy is  0.736363636364
test set accuracy is 0.708333333333
----------using  30  features----------
validation accuracy is  0.627272727273
test set accuracy is 0.583333333333
----------using  60  features----------
validation accuracy is  0.581818181818
test set accuracy is 0.583333333333
----------using  120  features----------
validation accuracy is  0.554545454545
test set accuracy is 0.583333333333
----------using  180  features----------
validation accuracy is  0.536363636364
test set accuracy is 0.583333333333


In [52]:
train_SVM(join_df("meropenem"))

    mean_fit_time  mean_score_time  mean_test_score  mean_train_score param_C  \
17       0.014175         0.005897         0.794118               1.0      10   
37       0.013424         0.005727         0.794118               1.0    1000   
36       0.013877         0.005704         0.794118               1.0    1000   
16       0.013930         0.006143         0.794118               1.0      10   
7        0.015290         0.006339         0.794118               1.0       1   

   param_gamma param_kernel  \
17  0.00774264          rbf   
37  0.00774264          rbf   
36  0.00215443          rbf   
16  0.00215443          rbf   
7   0.00774264          rbf   

                                               params  rank_test_score  \
17  {'C': 10, 'gamma': 0.00774263682681, 'kernel':...                1   
37  {'C': 1000, 'gamma': 0.00774263682681, 'kernel...                1   
36  {'C': 1000, 'gamma': 0.00215443469003, 'kernel...                1   
16  {'C': 10, 'gamma': 0.00215

In [54]:
def p_feature_selection_svm(complete_set):
    
    p = p_value_list(complete_set)
    for a in [5,10,15,30,60,120,180]:
        list_of_feature = list(p[0:a])
        list_of_col_name = list_of_feature + ['Resistant Phenotype', 'Genome ID']
        print("----------using ", a, ' features----------')
        train_SVM(complete_set[list_of_col_name])

In [55]:
p_feature_selection_svm(join_df("meropenem"))

----------using  5  features----------
    mean_fit_time  mean_score_time  mean_test_score  mean_train_score param_C  \
25       0.000417         0.000319         0.823529          0.822793     100   
17       0.000441         0.000321         0.823529          0.822793      10   
8        0.000488         0.000345         0.823529          0.822793       1   
9        0.000389         0.000298         0.823529          0.822793       1   
16       0.000427         0.000327         0.823529          0.853096      10   

    param_gamma param_kernel  \
25  0.000599484          rbf   
17   0.00774264          rbf   
8     0.0278256          rbf   
9           0.1          rbf   
16   0.00215443          rbf   

                                               params  rank_test_score  \
25  {'C': 100, 'gamma': 0.000599484250319, 'kernel...                1   
17  {'C': 10, 'gamma': 0.00774263682681, 'kernel':...                1   
8   {'C': 1, 'gamma': 0.0278255940221, 'kernel': '...      