In [1]:
import numpy as np
import pandas as pd
from datetime import datetime
import random
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score # AUC score
from sklearn.metrics import average_precision_score # AUPR score
from sklearn.metrics import precision_recall_fscore_support # precision, recall
from imblearn.metrics import sensitivity_specificity_support # sensitivity, specificity
from sklearn.metrics import roc_curve # to draw auc curve
from sklearn.metrics import precision_recall_curve # to draw aupr curve
from imblearn.under_sampling import RandomUnderSampler
import matplotlib.pyplot as plt
from collections import Counter
%matplotlib inline

In [2]:
gold_pos = pd.read_table("/DAS_Storage1/aschoi/data/Drug_Repositioning/data/desc_12/gold_pos_desc12.tsv")
gold_neg = pd.read_table("/DAS_Storage1/aschoi/data/Drug_Repositioning/data/desc_12/gold_neg_desc12.tsv")
gold = pd.concat([gold_pos, gold_neg])
x_whole_data = gold[gold.columns.values[3:].tolist()].values
y_whole_data = gold["association"].values

In [3]:
indep = pd.read_table("/DAS_Storage1/aschoi/data/Drug_Repositioning/data/desc_12/indep_desc12.tsv")
indep_x = indep.values[:, 3:].astype(float)
indep_y = indep.values[:,2].astype(int)

In [4]:
# undersampler
rus = RandomUnderSampler(ratio=0.5)

In [5]:
#x_resampled, y_resampled = rus.fit_sample(x_whole_data, y_whole_data)
x_rs = list()
y_rs = list()
for i in range(30):
    x_resampled, y_resampled = rus.fit_sample(x_whole_data, y_whole_data)
    x_rs.append(x_resampled)
    y_rs.append(y_resampled)

In [7]:
# Random forest 2017.6.4. # 2017.6.11.
print datetime.now().strftime('%Y-%m-%d %H:%M:%S')
path = '/home/share/aschoi/nas/users/asolchoi/data/Drug_Repositioning/8_new_training/7_non/'
user_estimator = 1200
classifier = RandomForestClassifier(n_estimators=user_estimator, n_jobs=-1, class_weight='balanced')
trials = dict()
print "start " + datetime.now().strftime('%Y-%m-%d %H:%M:%S')
for x_resampled, y_resampled, i in zip(x_rs, y_rs, range(30)):
    if i% 10 == 0:
        print "   start : {} trials ".format(i) + datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    classifier.fit(x_resampled, y_resampled)
    indep_y_predicted_proba = classifier.predict_proba(indep_x)
    indep_y_predicted_label = classifier.predict(indep_x)
    fp_results = dict()
    fp_results['predicted_proba'] = indep_y_predicted_proba
    fp_results['Predicted_label'] = indep_y_predicted_label
    fp_results['y_true'] = indep_y
    trials[i] = fp_results
print "end : " + datetime.now().strftime('%Y-%m-%d %H:%M:%S')


2017-06-11 15:49:37
start 2017-06-11 15:49:37
   start : 0 trials 2017-06-11 15:49:37
   start : 10 trials 2017-06-11 15:50:31
   start : 20 trials 2017-06-11 15:51:26
end : 2017-06-11 15:52:21


In [None]:
###### 회수별로 공통적으로 1이라고 판명된 drug를 찾는다. 

In [24]:
def predicted_results(fp_results):
    df_predicted_result = pd.DataFrame({'Drug_id':indep['drug_id'],'Disease_id':indep['disease_id'],
                                    'Predicted_label' : fp_results['Predicted_label'],
                                    'Predicted_proba' : fp_results['predicted_proba'][:,1], 
                                    'y_true':fp_results['y_true']})
    df_predicted_result= df_predicted_result[['Drug_id', 'Disease_id', 'y_true', 'Predicted_label', 'Predicted_proba']]
    return df_predicted_result

In [25]:
df_dic = dict()
for i in range(30):
    df_dic[i] = predicted_results(trials[i])

In [26]:
fp_dic = dict()
fp_lst = list()
for i in range(30):
    fp_dic[i] = df_dic[i][(df_dic[i].Predicted_label == 1)&(df_dic[i].y_true == 0)]
    fp_lst.append(df_dic[i][(df_dic[i].Predicted_label == 1)&(df_dic[i].y_true == 0)])

In [27]:
fp_final = reduce(lambda left, right: pd.merge(left, right, on=['Disease_id', 'Drug_id', 'y_true', 'Predicted_label']), fp_lst)

In [28]:
fp_final.head(1)

Unnamed: 0,Drug_id,Disease_id,y_true,Predicted_label,Predicted_proba_x,Predicted_proba_y,Predicted_proba_x.1,Predicted_proba_y.1,Predicted_proba_x.2,Predicted_proba_y.2,...,Predicted_proba_x.3,Predicted_proba_y.3,Predicted_proba_x.4,Predicted_proba_y.4,Predicted_proba_x.5,Predicted_proba_y.5,Predicted_proba_x.6,Predicted_proba_y.6,Predicted_proba_x.7,Predicted_proba_y.7
0,DB00176,103780,0,1,0.755833,0.729167,0.6575,0.7025,0.798333,0.835,...,0.7225,0.7325,0.815,0.5725,0.679167,0.5825,0.664167,0.660833,0.720833,0.708333


In [29]:
columns_names = ['Drug_id','Disease_id','y_true', 'Predicted_label' ] + ['trial'+str(i) for i in range(30)] 

In [30]:
fp_final.columns = columns_names

In [31]:
fp_final.head(2)

Unnamed: 0,Drug_id,Disease_id,y_true,Predicted_label,trial0,trial1,trial2,trial3,trial4,trial5,...,trial20,trial21,trial22,trial23,trial24,trial25,trial26,trial27,trial28,trial29
0,DB00176,103780,0,1,0.755833,0.729167,0.6575,0.7025,0.798333,0.835,...,0.7225,0.7325,0.815,0.5725,0.679167,0.5825,0.664167,0.660833,0.720833,0.708333
1,DB00193,103780,0,1,0.780833,0.6675,0.79,0.695,0.801667,0.675,...,0.7025,0.728333,0.584167,0.66,0.809167,0.754167,0.575833,0.745833,0.7375,0.7225


In [32]:
fp_final['avg'] = fp_final[fp_final.columns.values.tolist()[4:]].mean(axis=1)

In [34]:
fp_final = fp_final[['Drug_id','Disease_id','y_true', 'Predicted_label', 'avg' ] + ['trial'+str(i) for i in range(30)]]

In [35]:
sorted_fp = fp_final.sort_values(by='avg', ascending=False)

In [39]:
print len(sorted_fp.Drug_id.drop_duplicates()), len(sorted_fp.Disease_id.drop_duplicates()), len(sorted_fp), len(sorted_fp.drop_duplicates())

20 18 37 37


In [40]:
sorted_fp.to_csv("/home/share/aschoi/nas/users/asolchoi/data/Drug_Repositioning/8_new_training/7_non/100_indep_predicted_results_30_times.tsv", sep='\t', index=False)

In [None]:
##########이름 매칭

In [41]:
omim = pd.read_table("/DAS_Storage1/aschoi/data/Drug_Repositioning/6_new_disease/1_omim_umls_mapping_association.tsv")
drugbank = pd.read_csv("/DAS_Storage1/aschoi/data/Drug_Repositioning/7_new_association/drugbank vocabulary.csv")
drugbank_part = drugbank[['DrugBank ID', 'Common name', 'Synonyms']]
drugbank_part = drugbank_part.rename(columns={'DrugBank ID' : 'DrugBank_ID', 'Common name':'Common_name'})

In [42]:
fp_omim = pd.merge(right = sorted_fp, left = omim, right_on = 'Disease_id', left_on = 'OMIM_ID')
fp_omim.head(1)

Unnamed: 0,OMIM_ID,OMIM_disease_name,UMLS_concept_ID,UMLS_concept_name,Drug_id,Disease_id,y_true,Predicted_label,avg,trial0,...,trial20,trial21,trial22,trial23,trial24,trial25,trial26,trial27,trial28,trial29
0,102300,"Restless Legs Syndrome, Susceptibility To, 1; ...",C0035258,Restless Legs Syndrome,DB01182,102300,0,1,0.781472,0.861667,...,0.664167,0.674167,0.759167,0.8225,0.92,0.751667,0.85,0.8625,0.885833,0.83


In [43]:
fp_omim2 = fp_omim[['Drug_id','Disease_id','OMIM_disease_name', 'avg']]
print len(sorted_fp), len(fp_omim2), len(fp_omim2.drop_duplicates())
fp_omim2.head(2)

37 54 37


Unnamed: 0,Drug_id,Disease_id,OMIM_disease_name,avg
0,DB01182,102300,"Restless Legs Syndrome, Susceptibility To, 1; ...",0.781472
1,DB00376,102300,"Restless Legs Syndrome, Susceptibility To, 1; ...",0.77825


In [44]:
fp_omim_drug = pd.merge(right = fp_omim2, left = drugbank_part, right_on = 'Drug_id', left_on = 'DrugBank_ID')
fp_omim_drug.head(1)

Unnamed: 0,DrugBank_ID,Common_name,Synonyms,Drug_id,Disease_id,OMIM_disease_name,avg
0,DB00176,Fluvoxamine,Fluvoxamina | Fluvoxamine | Fluvoxaminum,DB00176,103780,Alcohol Dependence,0.713056


In [45]:
fp_omim_drug2 = fp_omim_drug[['Drug_id','Common_name','Synonyms','Disease_id','OMIM_disease_name', 'avg']]
print len(fp_omim2), len(fp_omim_drug2), len(fp_omim_drug2.drop_duplicates())
fp_omim_drug2.head(2)

54 54 37


Unnamed: 0,Drug_id,Common_name,Synonyms,Disease_id,OMIM_disease_name,avg
0,DB00176,Fluvoxamine,Fluvoxamina | Fluvoxamine | Fluvoxaminum,103780,Alcohol Dependence,0.713056
1,DB00193,Tramadol,(+)-Tramadol | (+)-trans-2-(Dimethylaminomethy...,103780,Alcohol Dependence,0.700222


In [46]:
fp_omim_drug2.sort_values('avg', ascending=False).drop_duplicates().to_csv(
    "/home/share/aschoi/nas/users/asolchoi/data/Drug_Repositioning/8_new_training/7_non/100_indep_predicted_results_30times_name.tsv",
    sep='\t', index=False)

In [50]:
fp_omim_drug2.sort_values('avg', ascending=False).drop_duplicates().head()

Unnamed: 0,Drug_id,Common_name,Synonyms,Disease_id,OMIM_disease_name,avg
50,DB01200,Bromocriptine,(5'alpha)-2-bromo-12'-hydroxy-2'-(1-methylethy...,168600,Parkinson Disease; Pd,0.987917
22,DB00810,Biperiden,1-Bicyclo[2.2.1]hept-5-en-2-yl-1-phenyl-3-pipe...,104300,Alzheimer Disease; Ad,0.959
25,DB00860,Prednisolone,"(11beta)-11,17,21-Trihydroxypregna-1,4-diene-3...",140600,Osteoarthritis Susceptibility 2; Os2,0.9475
14,DB00620,Triamcinolone,"11β,16α,17α,21-tetrahydroxy-9α-fluoro-1,4-preg...",212050,"Candidiasis, Familial, 2; Candf2",0.9475
38,DB00959,Methylprednisolone,"(6alpha,11beta)-11,17,21-Trihydroxy-6-methylpr...",607850,Osteoarthritis Susceptibility 3; Os3,0.940333


In [None]:
# Match omim id to umls id : to search in PubMed

In [52]:
mim_umls = pd.read_table("/DAS_Storage1/aschoi/data/Drug_Repositioning/6_new_disease/1_omim_umls_mapping_association.tsv", sep='\t')

In [57]:
raw = pd.merge(fp_omim_drug2, mim_umls, left_on = 'Disease_id', right_on = 'OMIM_ID', how='inner')

In [58]:
raw.head(1)

Unnamed: 0,Drug_id,Common_name,Synonyms,Disease_id,OMIM_disease_name_x,avg,OMIM_ID,OMIM_disease_name_y,UMLS_concept_ID,UMLS_concept_name
0,DB00176,Fluvoxamine,Fluvoxamina | Fluvoxamine | Fluvoxaminum,103780,Alcohol Dependence,0.713056,103780,Alcohol Dependence,C0001973,"Alcohol Dependence (Alcoholic Intoxication, Ch..."


In [59]:
raw.rename(columns = {'Drug_id':'drugID', 'Disease_id' : 'diseaseID', 'OMIM_ID':'mimID', 'OMIM_disease_name_x' : 'omim_disease_name', 'UMLS_concept_ID' : 'umls_id', 'UMLS_concept_name' :'umls_name'}, inplace=True)

In [60]:
raw.head(1)

Unnamed: 0,drugID,Common_name,Synonyms,diseaseID,omim_disease_name,avg,mimID,OMIM_disease_name_y,umls_id,umls_name
0,DB00176,Fluvoxamine,Fluvoxamina | Fluvoxamine | Fluvoxaminum,103780,Alcohol Dependence,0.713056,103780,Alcohol Dependence,C0001973,"Alcohol Dependence (Alcoholic Intoxication, Ch..."


In [56]:
print len(raw), len(raw.drugID.drop_duplicates()), len(raw.diseaseID)

92 20 92


In [61]:
df = raw[['drugID','Common_name','Synonyms','diseaseID', 'omim_disease_name', 'umls_id', 'umls_name', 'avg']].sort_values(by='avg', ascending=False)

In [64]:
print len(df), len(df.drop_duplicates())

92 54


In [153]:
print len(dr_umls), len(dr_umls.drop_duplicates('drugID')), len(dr_umls.drop_duplicates('diseaseID')), len(dr_umls.drop_duplicates(['drugID', 'diseaseID']))
print len(dr_umls.drop_duplicates(['drugID', 'umls_id'])),len(dr_umls.drop_duplicates('umls_id')), len(dr_umls.drop_duplicates('umls_name'))

90 20 18 37
41 22 35


In [65]:
df.drop_duplicates().to_csv("/home/share/aschoi/nas/users/asolchoi/data/Drug_Repositioning/8_new_training/7_non/100_indep_predicted_results_30times_name_umls.tsv",sep='\t', index=False)

In [None]:
# searched in PubMed

In [66]:
import urllib
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from datetime import datetime

In [69]:
dr_umls = df.drop_duplicates()
#dr_umls = pd.read_table("/home/share/aschoi/nas/users/asolchoi/data/Drug_Repositioning/8_new_training/7_non/100_indep_predicted_results_30times_name_umls.tsv"

In [70]:
print len(dr_umls)
dr_umls.head(1)

54


Unnamed: 0,drugID,Common_name,Synonyms,diseaseID,omim_disease_name,umls_id,umls_name,avg
89,DB01200,Bromocriptine,(5'alpha)-2-bromo-12'-hydroxy-2'-(1-methylethy...,168600,Parkinson Disease; Pd,C0030567,PD (Parkinson Disease),0.987917


In [71]:
# umls_name to multiple rows
s = dr_umls['umls_name'].str.split('\(').apply(pd.Series, 1).stack()
s.index = s.index.droplevel(-1)
s.name = 'umls_name'
del dr_umls['umls_name']
dr_umls = dr_umls.join(s)
dr_umls.umls_name = dr_umls.umls_name.str.rstrip('\)')

In [72]:
dr_umls.index = range(len(dr_umls))

In [79]:
row

drugID                                                         DB01586
Common_name                                       Ursodeoxycholic acid
Synonyms             (3alpha,5beta,7beta)-3,7-dihydroxycholan-24-oi...
diseaseID                                                       152700
omim_disease_name                    Systemic Lupus Erythematosus; Sle
umls_id                                                       C0024141
avg                                                           0.747722
umls_name                                Lupus Erythematosus, Systemic
Name: 89, dtype: object

In [104]:
df = pd.DataFrame()
print len(dr_umls)
cnt = 0
print "start : " + datetime.now().strftime('%Y-%m-%d %H:%M:%S')
for index, row in dr_umls.iterrows():
    dr = row['Common_name']
    di = row['umls_name']
    addr = 'https://www.ncbi.nlm.nih.gov/pubmed?term=(' + dr +'%5BTitle%2FAbstract%5D)%20AND%20' + di + '%5BTitle%2FAbstract%5D'
    
    html = urllib.urlopen(addr)
    soup = BeautifulSoup(html)
    
    a = soup.find_all("h3",attrs={'class':'result_count left'})

    if len(a) == 0: # 딱 하나만 검색 됐을 때
        searched = 1
    else :
        searched = int(a[0].text.split(" ")[-1])
    
    temp_df = pd.DataFrame({'drugID': row['drugID'], 'Common_name' : dr,
                            'diseaseID':row['diseaseID'], 'omim_disease_name' : row['omim_disease_name'],
                            'umls_id' : row['umls_id'],'umls_name' : di,
                            'searched_count' : searched, 'avg' : row['avg']}, index=[cnt])
    df = df.append(temp_df)
    
    cnt = cnt + 1
    if cnt % 100 ==0 :
        print "cnt : {} ".format(cnt) + datetime.now().strftime('%Y-%m-%d %H:%M:%S')
print "end : " + datetime.now().strftime('%Y-%m-%d %H:%M:%S')

90
start : 2017-06-11 16:47:53
end : 2017-06-11 16:50:38


In [107]:
df = df.drop_duplicates()

In [109]:
print len(df), len(df.drop_duplicates(['Common_name', 'umls_id'])), len(df.drop_duplicates(['Common_name', 'umls_name']))

81 41 72


In [142]:
df_gr = df[df['searched_count'] == df.groupby(['drugID','Common_name','diseaseID','umls_id'])['searched_count'].transform(max)]
#df_gr = df.groupby(['drugID','Common_name','diseaseID','umls_id'])['avg', 'umls_name'].max()

In [143]:
a = df_gr.reset_index(level=['drugID', 'Common_name', 'diseaseID', 'umls_id'])

In [147]:
df[df.searched_count != 0].sort_values(['Common_name', 'umls_name', 'searched_count'], ascending=False)

Unnamed: 0,Common_name,avg,diseaseID,drugID,omim_disease_name,searched_count,umls_id,umls_name
89,Ursodeoxycholic acid,0.747722,152700,DB01586,Systemic Lupus Erythematosus; Sle,9,C0024141,"Lupus Erythematosus, Systemic"
19,Trihexyphenidyl,0.923972,104300,DB00376,Alzheimer Disease; Ad,8,C0002395,Alzheimer's Disease
35,Triamcinolone,0.842056,140600,DB00620,Osteoarthritis Susceptibility 2; Os2,115,C0029408,Osteoarthritis
43,Triamcinolone,0.891389,601626,DB00620,"Leukemia, Acute Myeloid; Aml",4,C0023467,"Leukemia, Myelocytic, Acute"
41,Triamcinolone,0.9475,212050,DB00620,"Candidiasis, Familial, 2; Candf2",25,C0006840,Candidiasis
2,Tramadol,0.700222,103780,DB00193,Alcohol Dependence,2,C0001973,Alcohol Dependence
88,Salsalate,0.760222,152700,DB01399,Systemic Lupus Erythematosus; Sle,1,C0024141,"Lupus Erythematosus, Systemic"
37,Prednisone,0.9295,140600,DB00635,Osteoarthritis Susceptibility 2; Os2,47,C0029408,Osteoarthritis
58,Prednisone,0.809389,607850,DB00635,Osteoarthritis Susceptibility 3; Os3,47,C0029408,Osteoarthritis
47,Prednisone,0.899194,601626,DB00635,"Leukemia, Acute Myeloid; Aml",334,C0023467,"Leukemia, Myelocytic, Acute"


In [162]:
print len(df_gr.drop_duplicates(['drugID', 'umls_name'])), len(df_gr.drop_duplicates()), len(df_gr)
df_gr.drop_duplicates(['drugID', 'umls_name']).sort_values('searched_count', ascending=False)

52 57 57


Unnamed: 0,Common_name,avg,diseaseID,drugID,omim_disease_name,searched_count,umls_id,umls_name
47,Prednisone,0.899194,601626,DB00635,"Leukemia, Acute Myeloid; Aml",334,C0023467,"Leukemia, Myelocytic, Acute"
51,Prednisolone,0.789444,601626,DB00860,"Leukemia, Acute Myeloid; Aml",297,C0023467,"Leukemia, Myelocytic, Acute"
86,Bromocriptine,0.987917,168600,DB01200,Parkinson Disease; Pd,159,C0030567,PD
55,Methylprednisolone,0.821389,601626,DB00959,"Leukemia, Acute Myeloid; Aml",131,C0023467,"Leukemia, Myelocytic, Acute"
35,Triamcinolone,0.842056,140600,DB00620,Osteoarthritis Susceptibility 2; Os2,115,C0029408,Osteoarthritis
62,Methylprednisolone,0.940333,607850,DB00959,Osteoarthritis Susceptibility 3; Os3,71,C0029408,Osteoarthritis
39,Prednisolone,0.9475,140600,DB00860,Osteoarthritis Susceptibility 2; Os2,62,C0029408,Osteoarthritis
37,Prednisone,0.9295,140600,DB00635,Osteoarthritis Susceptibility 2; Os2,47,C0029408,Osteoarthritis
67,Methylprednisolone,0.932778,205700,DB00959,"Anemia, Autoimmune Hemolytic",41,C0002880,Autoimmune hemolytic anemia
41,Triamcinolone,0.9475,212050,DB00620,"Candidiasis, Familial, 2; Candf2",25,C0006840,Candidiasis


In [163]:
print len(a.drop_duplicates(['drugID', 'umls_name'])), len(a.drop_duplicates()), len(a)
a.sort_values('searched_count', ascending=False)

52 57 57


Unnamed: 0,index,Common_name,avg,diseaseID,drugID,omim_disease_name,searched_count,umls_id,umls_name
33,47,Prednisone,0.899194,601626,DB00635,"Leukemia, Acute Myeloid; Aml",334,C0023467,"Leukemia, Myelocytic, Acute"
34,51,Prednisolone,0.789444,601626,DB00860,"Leukemia, Acute Myeloid; Aml",297,C0023467,"Leukemia, Myelocytic, Acute"
54,86,Bromocriptine,0.987917,168600,DB01200,Parkinson Disease; Pd,159,C0030567,PD
35,55,Methylprednisolone,0.821389,601626,DB00959,"Leukemia, Acute Myeloid; Aml",131,C0023467,"Leukemia, Myelocytic, Acute"
28,35,Triamcinolone,0.842056,140600,DB00620,Osteoarthritis Susceptibility 2; Os2,115,C0029408,Osteoarthritis
38,62,Methylprednisolone,0.940333,607850,DB00959,Osteoarthritis Susceptibility 3; Os3,71,C0029408,Osteoarthritis
45,73,Methylprednisolone,0.914333,165720,DB00959,Osteoarthritis Susceptibility 1; Os1,71,C0029408,Osteoarthritis
37,60,Prednisolone,0.905528,607850,DB00860,Osteoarthritis Susceptibility 3; Os3,62,C0029408,Osteoarthritis
30,39,Prednisolone,0.9475,140600,DB00860,Osteoarthritis Susceptibility 2; Os2,62,C0029408,Osteoarthritis
44,71,Prednisolone,0.893278,165720,DB00860,Osteoarthritis Susceptibility 1; Os1,62,C0029408,Osteoarthritis


In [164]:
df_gr.drop_duplicates(['drugID', 'umls_name']).to_csv("/home/share/aschoi/nas/users/asolchoi/data/Drug_Repositioning/8_new_training/7_non/100_indep_predicted_results_30times_name_umls_tidy.tsv"
                                                 , sep='\t', index=False)

In [144]:
a[a.searched_count != 0].Common_name.drop_duplicates()

0              Fluvoxamine
1                 Tramadol
11              Amantadine
17         Trihexyphenidyl
18               Biperiden
24          Cyproheptadine
27           Bromocriptine
28           Triamcinolone
29              Prednisone
30            Prednisolone
35      Methylprednisolone
46          Etidronic acid
55               Salsalate
56    Ursodeoxycholic acid
Name: Common_name, dtype: object

In [121]:
a[a.searched_count != 0].Common_name.drop_duplicates()

6          Trihexyphenidyl
9           Cyproheptadine
12           Triamcinolone
14              Prednisone
18               Biperiden
20            Prednisolone
24              Amantadine
27      Methylprednisolone
31          Etidronic acid
41           Bromocriptine
43               Salsalate
44    Ursodeoxycholic acid
Name: Common_name, dtype: object

In [125]:
a[a.Common_name == 'Fluvoxamine']

Unnamed: 0,drugID,Common_name,diseaseID,umls_id,avg,omim_disease_name,searched_count,umls_name
0,DB00176,Fluvoxamine,103780,C0001973,0.713056,Alcohol Dependence,7,Alcohol Dependence


In [None]:
df_gr.reset_index(level=['drugID', 'umls_id'], inplace=True)

In [None]:
umls에 여러개의 umls name이 있을 경우, cnt가 큰 것을 기준으로 한다.

In [None]:
df_gr[(df_gr.Common_name == "Pyridoxal")&(df_gr.umls_id == 'C0085681')]

In [None]:
print len(a.drop_duplicates()), len(a), len(df_gr)

In [5]:
def Random_Forest_independent_graph(X, y, indep_X, indep_y, user_estimator):

    folds_results = dict()
    draw_results = {'fpr':[], 'tpr':[], 'precision_vec':[], 'recall_vec':[]}
    fp_results = dict()
    
    classifier = RandomForestClassifier(n_estimators=user_estimator, n_jobs=-1, class_weight='balanced')
    classifier.fit(X, y)
    indep_y_predicted_proba = classifier.predict_proba(indep_X)
    indep_y_predicted_label = classifier.predict(indep_X)
    fp_results['predicted_proba'] = indep_y_predicted_proba
    fp_results['Predicted_label'] = indep_y_predicted_label
    fp_results['y_true'] = indep_y

    # Accuracy
    indep_acc = classifier.score(indep_X, indep_y)
    folds_results['acc']=indep_acc
    
    # AUC
    indep_auc = roc_auc_score(indep_y, indep_y_predicted_proba[:,1])
    folds_results['auc']=indep_auc
    
    # Sensitivity, Specificity
    indep_sn, indep_sp, support = sensitivity_specificity_support(indep_y, indep_y_predicted_label)
    folds_results['sn']=indep_sn
    folds_results['sp']=indep_sp
    
    # AUPR
    indep_aupr = average_precision_score(indep_y, indep_y_predicted_proba[:,1])
    folds_results['aupr']=indep_aupr
    
    # Precision, Recall
    indep_precision, indep_recall, _, _ = precision_recall_fscore_support(indep_y, indep_y_predicted_label, average = 'binary')
    folds_results['precision']=indep_precision
    folds_results['recall']=indep_recall
    
    # Confusion Matirx
    indep_confusion_matirx = confusion_matrix(indep_y, indep_y_predicted_label)
    folds_results['confusion_matrix']=indep_confusion_matirx
    
    # draw graph
    indep_fpr, indep_tpr, thresholds = roc_curve(indep_y, indep_y_predicted_proba[:, 1], pos_label=1)
    draw_results['fpr'].append(indep_fpr)
    draw_results['tpr'].append(indep_tpr)
        
    indep_precision_vec, indep_recall_vec, _ = precision_recall_curve(indep_y, indep_y_predicted_proba[:, 1])
    draw_results['precision_vec'].append(indep_precision_vec)
    draw_results['recall_vec'].append(indep_recall_vec)

    return folds_results, draw_results, fp_results