In [191]:
import pandas as pd
import numpy as np
from mlxtend.preprocessing import OnehotTransactions
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

In [321]:
# Load train and test data
train_file_path = '/Users/zang/Downloads/drugsCom_raw/drugsComTrain_raw.tsv'
test_file_path = '/Users/zang/Downloads/drugsCom_raw/drugsComTest_raw.tsv'
train_data = pd.read_table(train_file_path, engine='python')
train_data = train_data.dropna()
train_data = train_data[~train_data['condition'].str.contains('users found this comment helpful.', na = False)]

In [322]:
#show the total number of each symptoms treated by the reviewed drugs
symptom = train_data.groupby(by = ['condition']).count().reset_index()
symptom.sort_values(by=symptom.columns[1], ascending= False)

Unnamed: 0.1,condition,Unnamed: 0,drugName,review,rating,date,usefulCount
92,Birth Control,28788,28788,28788,28788,28788,28788
189,Depression,9069,9069,9069,9069,9069,9069
521,Pain,6145,6145,6145,6145,6145,6145
52,Anxiety,5904,5904,5904,5904,5904,5904
7,Acne,5588,5588,5588,5588,5588,5588
91,Bipolar Disorde,4224,4224,4224,4224,4224,4224
368,Insomnia,3673,3673,3673,3673,3673,3673
766,Weight Loss,3609,3609,3609,3609,3609,3609
490,Obesity,3568,3568,3568,3568,3568,3568
0,ADHD,3383,3383,3383,3383,3383,3383


In [None]:
#show the total number of each kind of reviewed drug
drugs = train_data.groupby(by = ['drugName','condition']).count().reset_index()
drugs.sort_values(by=drugs.columns[2], ascending= False)

In [215]:
#create a new data frame just contains the positive reviews drugName and codition
medicine = train_data[train_data['rating'] > 5.0]
medicine = medicine[['drugName','condition']].reset_index()
medicine = medicine.drop(['index'], axis = 1)
m1 = medicine.head(500)
m1

Unnamed: 0,drugName,condition
0,Valsartan,Left Ventricular Dysfunction
1,Guanfacine,ADHD
2,Ortho Evra,Birth Control
3,Buprenorphine / naloxone,Opiate Dependence
4,Aripiprazole,Bipolar Disorde
5,Ethinyl estradiol / levonorgestrel,Birth Control
6,Topiramate,Migraine Prevention
7,L-methylfolate,Depression
8,Liraglutide,Obesity
9,Trimethoprim,Urinary Tract Infection


In [216]:
#turn every row into a itemset with two items, one is the drug, another is the symptom
med_set = m1.apply(lambda x: x.tolist(), axis=1)

In [217]:
len(med_set)

500

In [218]:
#Concatenate every itemset together as a candidate list
dataset = []
for x in range(len(med_set)):
    dataset.append(med_set[x])
dataset

[['Valsartan', 'Left Ventricular Dysfunction'],
 ['Guanfacine', 'ADHD'],
 ['Ortho Evra', 'Birth Control'],
 ['Buprenorphine / naloxone', 'Opiate Dependence'],
 ['Aripiprazole', 'Bipolar Disorde'],
 ['Ethinyl estradiol / levonorgestrel', 'Birth Control'],
 ['Topiramate', 'Migraine Prevention'],
 ['L-methylfolate', 'Depression'],
 ['Liraglutide', 'Obesity'],
 ['Trimethoprim', 'Urinary Tract Infection'],
 ['Amitriptyline', 'ibromyalgia'],
 ['Lamotrigine', 'Bipolar Disorde'],
 ['Nilotinib', 'Chronic Myelogenous Leukemia'],
 ['Atripla', 'HIV Infection'],
 ['Trazodone', 'Insomnia'],
 ['Etonogestrel', 'Birth Control'],
 ['Etanercept', 'Rheumatoid Arthritis'],
 ['Azithromycin', 'Chlamydia Infection'],
 ['Eflornithine', 'Hirsutism'],
 ['Daytrana', 'ADHD'],
 ['Ativan', 'Panic Disorde'],
 ['Imitrex', 'Migraine'],
 ['Sertraline', 'Depression'],
 ['Toradol', 'Pain'],
 ['Viberzi', 'Irritable Bowel Syndrome'],
 ['Mobic', 'Osteoarthritis'],
 ['Morphine', 'Pain'],
 ['Trilafon', 'Psychosis'],
 ['Flucona

In [219]:
oht = OnehotTransactions()
oht_ary = oht.fit(dataset).transform(dataset)
df_train = pd.DataFrame(oht_ary, columns=oht.columns_)



In [318]:
#Define the frequent itemsets
frequent_itemsets = apriori(df_train, min_support=0.001, use_colnames=True)
print (frequent_itemsets)

#Association Rule Mining
association_rules(frequent_itemsets, metric="confidence", min_threshold=0.7)
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1.0)
print (rules)


     support                                           itemsets
0      0.032                                             (ADHD)
1      0.002                                 (Abilify Discmelt)
2      0.014                        (Abnormal Uterine Bleeding)
3      0.008                                         (Accutane)
4      0.002               (Acetaminophen / aspirin / caffeine)
5      0.002            (Acetaminophen / butalbital / caffeine)
6      0.002  (Acetaminophen / dichloralphenazone / isomethe...
7      0.004                      (Acetaminophen / hydrocodone)
8      0.004                        (Acetaminophen / oxycodone)
9      0.002                                      (Acidophilus)
10     0.048                                             (Acne)
11     0.002                                        (Acyclovir)
12     0.006                                           (Aczone)
13     0.002                     (Adapalene / benzoyl peroxide)
14     0.002                            

In [314]:
frequent_itemsets.sort_values(by=frequent_itemsets.columns[0], ascending= False)

Unnamed: 0,support,itemsets
59,0.138,(Birth Control)
112,0.064,(Depression)
302,0.052,(Pain)
10,0.048,(Acne)
31,0.034,(Anxiety)
0,0.032,(ADHD)
283,0.032,(Obesity)
222,0.028,(Levonorgestrel)
58,0.028,(Bipolar Disorde)
139,0.026,(Emergency Contraception)


In [319]:
#sort the rules based on rule support
display(rules.sort_values(by = rules.columns[4], ascending=False))

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
230,(Etonogestrel),(Birth Control),0.018,0.138,0.018,1.000000,7.246377,0.015516,inf
231,(Birth Control),(Etonogestrel),0.138,0.018,0.018,0.130435,7.246377,0.015516,1.129300
222,(Birth Control),(Ethinyl estradiol / levonorgestrel),0.138,0.016,0.016,0.115942,7.246377,0.013792,1.113049
223,(Ethinyl estradiol / levonorgestrel),(Birth Control),0.016,0.138,0.016,1.000000,7.246377,0.013792,inf
449,(Levonorgestrel),(Emergency Contraception),0.028,0.026,0.014,0.500000,19.230769,0.013272,1.948000
448,(Emergency Contraception),(Levonorgestrel),0.026,0.028,0.014,0.538462,19.230769,0.013272,2.106000
236,(Levonorgestrel),(Birth Control),0.028,0.138,0.010,0.357143,2.587992,0.006136,1.340889
237,(Birth Control),(Levonorgestrel),0.138,0.028,0.010,0.072464,2.587992,0.006136,1.047937
250,(Nexplanon),(Birth Control),0.010,0.138,0.010,1.000000,7.246377,0.008620,inf
510,(Ledipasvir / sofosbuvir),(Hepatitis C),0.010,0.014,0.010,1.000000,71.428571,0.009860,inf
