In [1]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import StratifiedKFold

In [2]:
association_data = pd.read_table("/DAS_Storage1/aschoi/data/Drug_Repositioning/7_new_association/9_gold_learning_association.tsv")
descriptor_data = pd.read_table("/DAS_Storage1/aschoi/data/Drug_Repositioning/8_new_training/2_1_descriptor_combined.tsv", header=None)
x_whole_data = descriptor_data.values
y_whole_data = association_data["association"].values

In [29]:
from sklearn.metrics import roc_curve, auc, average_precision_score

def libsvm_10_fold(x_whole_data, y_whole_data, user_c):
    print("{}----------------------------------------------------".format(user_c))
    skf = StratifiedKFold(n_splits=10, shuffle=True) # n_splits = 10 (10 fold라서.) data가 nega : pos = 76 : 1이기 때문에 validation 뽑을때 그 비율 지켜서 stratified로 한다.
    whole_accuracy = list()
    whole_auc = list()
    whole_aupr = list()
    whole_confusion_mat = list()
    for training_index, validation_index in skf.split(x_whole_data, y_whole_data):
        x_training_set = x_whole_data[training_index] # 0.9에 해당하는 부분
        y_training_set = y_whole_data[training_index]
        x_validation_set = x_whole_data[validation_index] # 0.1에 해당하는 부분
        y_validation_set = y_whole_data[validation_index]

        #classifier = LogisticRegression(penalty='l1', C=0.0001, n_jobs=-1) #n_jobs = -1이면 모든 node 쓰는것. 신경쓰이면 30개 정도.
        #classifier = RandomForestClassifier(n_estimators=120, n_jobs=-1, class_weight='balanced')
        classifier = LinearSVC(penalty='l2', C=user_c, class_weight='balanced')
        classifier.fit(x_training_set, y_training_set)
        #y_predicted_proba = classifier.predict_proba(x_validation_set) # return값이 probability 각각에 대한 확률 결과값 [[0.8(0에대한 확률),0.2(1에대한 확률)],[0.7,0.3],... ]
        y_predicted_label = classifier.predict(x_validation_set) # return이 label임. 위 확률 결과에서 확률이 높은거에 대한 label을 보여줌.
    
        current_accuracy = classifier.score(x_validation_set, y_validation_set) # accuracy score를 보여줌.
        fpr, tpr, thresholds = roc_curve(y_validation_set, y_predicted_label, pos_label=1) # powitive label만 본것.
        current_auc = auc(fpr, tpr)
        current_confusion_matrix = confusion_matrix(y_validation_set,
                                                    y_predicted_label)
        current_aupr = average_precision_score(y_validation_set, y_predicted_label)
    
        whole_accuracy.append(current_accuracy)
        whole_auc.append(current_auc)
        whole_aupr.append(current_aupr)
        whole_confusion_mat.append((current_confusion_matrix))
        
        print("Accuracy of this fold: {}".format(current_accuracy))
        print("AUC of this fold: {}".format(current_auc))
        print("AUPR of this fold: {}".format(current_aupr))
        print("Confustion matrix of this fold")
        print(current_confusion_matrix)
        print("\n")
    print("Accuracy avg : {}").format(sum(whole_accuracy)/len(whole_accuracy))
    print("AUC avg : {}").format(sum(whole_auc)/len(whole_auc))
    print("AUPR avg : {}").format(sum(whole_aupr)/len(whole_aupr))
    print("Confustion matrix")
    print(sum(whole_confusion_mat))
    
    
    return [whole_accuracy, whole_auc, whole_aupr, whole_confusion_mat]

In [28]:
results = list()
for user_c in [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000]:
    results.append(libsvm_10_fold(x_whole_data, y_whole_data, user_c))

0.0001----------------------------------------------------
Accuracy of this fold: 0.777687443541
AUC of this fold: 0.716055738605
AUPR of this fold: 0.347409719856
Confustion matrix of this fold
[[8515 2411]
 [  50   94]]


0.0001----------------------------------------------------
Accuracy of this fold: 0.777938386485
AUC of this fold: 0.699050661073
AUPR of this fold: 0.329369336571
Confustion matrix of this fold
[[8522 2403]
 [  55   89]]


0.0001----------------------------------------------------
Accuracy of this fold: 0.774505375373
AUC of this fold: 0.731576086957
AUPR of this fold: 0.365194468654
Confustion matrix of this fold
[[8474 2451]
 [  45   99]]


0.0001----------------------------------------------------
Accuracy of this fold: 0.774505375373
AUC of this fold: 0.66304697432
AUPR of this fold: 0.292978735273
Confustion matrix of this fold
[[8494 2431]
 [  65   79]]


0.0001----------------------------------------------------
Accuracy of this fold: 0.785507770148
AUC of t

In [30]:
indep = pd.read_table("/DAS_Storage1/aschoi/data/Drug_Repositioning/8_new_training/2_indep_asso_descriptor.tsv")

from sklearn.metrics import roc_curve, auc, average_precision_score

x_whole_data = descriptor_data.values
y_whole_data = association_data["association"].values
indep_x = indep.values[:, 3:].astype(float)
indep_y = indep.values[:,2].astype(int)

#classifier = LogisticRegression(penalty='l1', C=0.1, n_jobs = -1)
#classifier = RandomForestClassifier(n_estimators=800, n_jobs=-1, class_weight='balanced')
classifier = LinearSVC(penalty='l2', C=0.001, class_weight='balanced')
classifier.fit(x_whole_data, y_whole_data)
#indep_y_predicted_proba = classifier.predict_proba(indep_x)
indep_y_predicted_label = classifier.predict(indep_x)

indep_accuracy = classifier.score(indep_x, indep_y)
indep_fpr, indep_tpr, indep_thresholds = roc_curve(indep_y, indep_y_predicted_proba[:, 1], pos_label = 1)
indep_auc = auc(indep_fpr, indep_tpr)
indep_confusion_matirx = confusion_matrix(indep_y, indep_y_predicted_label)
indep_aupr = average_precision_score(indep_y, indep_y_predicted_proba[:, 1])

print("Independent Accuracy : {}".format(indep_accuracy))
print("Independent AUC : {}".format(indep_auc))
print("Independent AUPR : {}".format(indep_aupr))
print("Independent Confustion matrix")
print(indep_confusion_matirx)
print("\t")

AttributeError: 'LinearSVC' object has no attribute 'predict_proba'

In [32]:
indep_y_predicted_label = classifier.predict(indep_x)

indep_accuracy = classifier.score(indep_x, indep_y)
indep_fpr, indep_tpr, indep_thresholds = roc_curve(indep_y, indep_y_predicted_label, pos_label = 1)
indep_auc = auc(indep_fpr, indep_tpr)
indep_confusion_matirx = confusion_matrix(indep_y, indep_y_predicted_label)
indep_aupr = average_precision_score(indep_y, indep_y_predicted_label)

print("Independent Accuracy : {}".format(indep_accuracy))
print("Independent AUC : {}".format(indep_auc))
print("Independent AUPR : {}".format(indep_aupr))
print("Independent Confustion matrix")
print(indep_confusion_matirx)
print("\t")

Independent Accuracy : 0.65988258317
Independent AUC : 0.642453464707
Independent AUPR : 0.35118848282
Independent Confustion matrix
[[1628  834]
 [  35   58]]
	
