In [2]:
import numpy as np
import pandas as pd
from datetime import datetime
import random
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score # AUC score
from sklearn.metrics import average_precision_score # AUPR score
from sklearn.metrics import precision_recall_fscore_support # precision, recall
from imblearn.metrics import sensitivity_specificity_support # sensitivity, specificity
from sklearn.metrics import roc_curve # to draw auc curve
from sklearn.metrics import precision_recall_curve # to draw aupr curve
from imblearn.under_sampling import RandomUnderSampler
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
#2017.5.2.
path = "/DAS_Storage1/aschoi/data/Drug_Repositioning/8_new_training/2_similarity_based-PREDICT/"
tn = pd.read_table(path + "4_5_tn_local_norm_1_1_descriptor.tsv")
no_tn = pd.read_table(path + "4_5_2_noTn_only_local_norm_1_1_descriptor.tsv")

In [4]:
#2017.5.2. independent dataset
path = "/DAS_Storage1/aschoi/data/Drug_Repositioning/8_new_training/2_similarity_based-PREDICT/"
indep = pd.read_table(path+"3_2_indep_descriptor.tsv")
indep_x = indep.values[:, 3:].astype(float)
indep_y = indep.values[:,2].astype(int)

In [5]:
f = open("/home/share/aschoi/nas/users/asolchoi/data/Drug_Repositioning/8_new_training/5_final/tn+noise_1by1_idex.txt", "r")
indices = list()
while True:
    line=f.readline()
    if not line : break
    indices.append(line.lstrip('[').rstrip(']\n').split(', '))
f.close()

lindices = list()
for i in indices:
    tmp = list()
    for j in i:
        tmp.append(int(j))
    lindices.append(tmp)

In [7]:
idx = lindices[1]
filtered = no_tn.filter(items = idx, axis=0) # 뽑은 index만큼 filter
noise_tn = pd.concat([tn,filtered])

x_whole_data = noise_tn[noise_tn.columns.values[3:].tolist()].values
y_whole_data = noise_tn["association"].values

In [None]:
# pick random noise by 1:1 herbal compounds (P:TN:Noise = 1:1:1 --> P:N = 1:2)
# SVM with RBF Kernel
print datetime.now().strftime('%Y-%m-%d %H:%M:%S')
path = '/home/share/aschoi/nas/users/asolchoi/data/Drug_Repositioning/8_new_training/5_final/'
with open(path + "6_herbal_tn+noise_1by1.txt", 'w') as fd :
    user_estimator = 300
    i=1
    
    fd.write("<{} : independent>\r\n".format(i))
    
    fp_results = dict()
    
    classifier = RandomForestClassifier(n_estimators=user_estimator, n_jobs=-1, class_weight='balanced')
    classifier.fit(x_whole_data, y_whole_data)
    indep_y_predicted_proba = classifier.predict_proba(herbal_x)
    indep_y_predicted_label = classifier.predict(herbal_x)
    fp_results['predicted_proba'] = indep_y_predicted_proba
    fp_results['Predicted_label'] = indep_y_predicted_label
    fp_results['y_true'] = herbal_y
    
    #write_output(fd, folds_results, i, user_estimator)
#li_noise1.append(predicted_results(fp_results))
print datetime.now().strftime('%Y-%m-%d %H:%M:%S')

In [None]:
params = {'k': k, 'kernel': 'rbf', 'user_c': i, 'degree':3}

In [8]:
def SVM_K_fold_graph(X, y, params):
       
    skf = StratifiedKFold(n_splits=params['k'], shuffle=True) # n_splits = k (k fold라서.), pos:neg의 비율을 고려해서 k 개의 subgroup으로 나누어줌.
    folds_results = {'acc':[], 'auc':[], 'aupr':[], 'confusion_matrix':[], 'sn':[], 'sp':[], 'precision':[], 'recall':[]}
    draw_results = {'fpr':[], 'tpr':[], 'precision_vec':[], 'recall_vec':[]}
    for training_index, validation_index in skf.split(X, y):
        x_training_set = X[training_index]
        y_training_set = y[training_index]
        x_validation_set = X[validation_index]
        y_validation_set = y[validation_index]
        
        classifier = SVC(C=params['user_c'], n_jobs=-1, kernel = params['kernel'], degree = params['degree']) #n_jobs=-1 이면 모든 node 쓰는것, 신경쓰이면 30개 정도.
        classifier.fit(x_training_set, y_training_set)
        
        y_predicted_proba = classifier.predict_proba(x_validation_set) # [0에 대한 확률, 1에 대한 확률], shpae = [n_samples, n_class]
        y_predicted_label = classifier.predict(x_validation_set) # 예측된 label을 보여줌, shpae = [n_samples]
        
        # Accuracy 
        current_acc = classifier.score(x_validation_set, y_validation_set)
        folds_results['acc'].append(current_acc)

        # AUC
        current_auc = roc_auc_score(y_validation_set, y_predicted_proba[:,1])
        folds_results['auc'].append(current_auc)
        
        # Sensitivity, Specificity
        sn, sp, support = sensitivity_specificity_support(y_validation_set, y_predicted_label)
        folds_results['sn'].append(sn)
        folds_results['sp'].append(sp)
        
        # AUPR
        current_aupr = average_precision_score(y_validation_set, y_predicted_proba[:,1])
        folds_results['aupr'].append(current_aupr)
        
        # Precision, Recall
        precision, recall, _, _ =  precision_recall_fscore_support(y_validation_set, y_predicted_label, average = 'binary')
        folds_results['precision'].append(precision)
        folds_results['recall'].append(recall)
        
        # Confusion Matrix
        current_confusion_matrix = confusion_matrix(y_validation_set,y_predicted_label)
        folds_results['confusion_matrix'].append(current_confusion_matrix)
        
        #draw graph
        fpr, tpr, thresholds = roc_curve(y_validation_set, y_predicted_proba[:, 1], pos_label=1)
        draw_results['fpr'].append(fpr)
        draw_results['tpr'].append(tpr)
        
        precision_vec, recall_vec, _ = precision_recall_curve(y_validation_set, y_predicted_proba[:, 1])
        draw_results['precision_vec'].append(precision_vec)
        draw_results['recall_vec'].append(recall_vec)
        
    return folds_results, draw_results

SyntaxError: invalid syntax (<ipython-input-8-eef8ed9b3241>, line 12)