In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import itertools #for plotting confusion matrix
import string
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
dataset = pd.read_csv('drugsComTrain_raw.csv')

In [3]:
dataset.head()

Unnamed: 0,uniqueID,drugName,condition,review,rating,date,usefulCount
0,206461,Valsartan,Left Ventricular Dysfunction,"""It has no side effect, I take it in combinati...",9,20-May-12,27
1,95260,Guanfacine,ADHD,"""My son is halfway through his fourth week of ...",8,27-Apr-10,192
2,92703,Lybrel,Birth Control,"""I used to take another oral contraceptive, wh...",5,14-Dec-09,17
3,138000,Ortho Evra,Birth Control,"""This is my first time using any form of birth...",8,3-Nov-15,10
4,35696,Buprenorphine / naloxone,Opiate Dependence,"""Suboxone has completely turned my life around...",9,27-Nov-16,37


In [4]:
dataset.tail()

Unnamed: 0,uniqueID,drugName,condition,review,rating,date,usefulCount
161292,191035,Campral,Alcohol Dependence,"""I wrote my first report in Mid-October of 201...",10,31-May-15,125
161293,127085,Metoclopramide,Nausea/Vomiting,"""I was given this in IV before surgey. I immed...",1,1-Nov-11,34
161294,187382,Orencia,Rheumatoid Arthritis,"""Limited improvement after 4 months, developed...",2,15-Mar-14,35
161295,47128,Thyroid desiccated,Underactive Thyroid,"""I&#039;ve been on thyroid medication 49 years...",10,19-Sep-15,79
161296,215220,Lubiprostone,"Constipation, Chronic","""I&#039;ve had chronic constipation all my adu...",9,13-Dec-14,116


In [5]:
dataset.condition.value_counts()

Birth Control                                              28788
Depression                                                  9069
Pain                                                        6145
Anxiety                                                     5904
Acne                                                        5588
                                                           ...  
Dissociative Identity Disorde                                  1
Hydrocephalus                                                  1
Hyperlipoproteinemia Type III, Elevated beta-VLDL   IDL        1
Q Feve                                                         1
Neutropenia                                                    1
Name: condition, Length: 884, dtype: int64

In [6]:
new_dataset = dataset[(dataset['condition'] == 'Birth Control') | (dataset['condition'] == 'Depression') | (dataset['condition'] == 'High Blood Pressure') | (dataset['condition'] == 'Diabetes, Type 2')]

In [7]:
dataset.shape,new_dataset.shape

((161297, 7), (42732, 7))

In [8]:
X = new_dataset.drop(['uniqueID','drugName','rating','date','usefulCount'],axis=1)

In [9]:
X.condition.value_counts()

Birth Control          28788
Depression              9069
Diabetes, Type 2        2554
High Blood Pressure     2321
Name: condition, dtype: int64

In [10]:
X.head()

Unnamed: 0,condition,review
2,Birth Control,"""I used to take another oral contraceptive, wh..."
3,Birth Control,"""This is my first time using any form of birth..."
9,Birth Control,"""I had been on the pill for many years. When m..."
11,Depression,"""I have taken anti-depressants for years, with..."
14,Birth Control,"""Started Nexplanon 2 months ago because I have..."


In [11]:
X['review'][2]

'"I used to take another oral contraceptive, which had 21 pill cycle, and was very happy- very light periods, max 5 days, no other side effects. But it contained hormone gestodene, which is not available in US, so I switched to Lybrel, because the ingredients are similar. When my other pills ended, I started Lybrel immediately, on my first day of period, as the instructions said. And the period lasted for two weeks. When taking the second pack- same two weeks. And now, with third pack things got even worse- my third period lasted for two weeks and now it&#039;s the end of the third week- I still have daily brown discharge.\r\nThe positive side is that I didn&#039;t have any other side effects. The idea of being period free was so tempting... Alas."'

In [12]:
X['review'][11]

'"I have taken anti-depressants for years, with some improvement but mostly moderate to severe side affects, which makes me go off them.\r\n\r\nI only take Cymbalta now mostly for pain.\r\n\r\nWhen I began Deplin, I noticed a major improvement overnight. More energy, better disposition, and no sinking to the low lows of major depression. I have been taking it for about 3 months now and feel like a normal person for the first time ever. Best thing, no side effects."'

In [13]:
for i,col in enumerate(X.columns):
    X.iloc[:,i] = X.iloc[:,i].str.replace('"','')

In [14]:
X['review'][2]

'I used to take another oral contraceptive, which had 21 pill cycle, and was very happy- very light periods, max 5 days, no other side effects. But it contained hormone gestodene, which is not available in US, so I switched to Lybrel, because the ingredients are similar. When my other pills ended, I started Lybrel immediately, on my first day of period, as the instructions said. And the period lasted for two weeks. When taking the second pack- same two weeks. And now, with third pack things got even worse- my third period lasted for two weeks and now it&#039;s the end of the third week- I still have daily brown discharge.\r\nThe positive side is that I didn&#039;t have any other side effects. The idea of being period free was so tempting... Alas.'

In [15]:
import re
from bs4 import BeautifulSoup
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

stop = stopwords.words('english')
lemmatizer = WordNetLemmatizer()

def review_to_words(raw_review):
    review_text = BeautifulSoup(raw_review, 'html.parser').get_text()
    letters_only = re.sub('[^a-zA-Z]', ' ', review_text)
    words = letters_only.lower().split()
    meaningful_words = [w for w in words if not w in stop]
    lemmitize_words = [lemmatizer.lemmatize(w) for w in meaningful_words]
    return( ' '.join(lemmitize_words))

In [16]:
X['review_clean'] = X['review'].apply(review_to_words)
X_feat=X['review_clean']
y = X['condition']

X.head()



Unnamed: 0,condition,review,review_clean
2,Birth Control,"I used to take another oral contraceptive, whi...",used take another oral contraceptive pill cycl...
3,Birth Control,This is my first time using any form of birth ...,first time using form birth control glad went ...
9,Birth Control,I had been on the pill for many years. When my...,pill many year doctor changed rx chateal effec...
11,Depression,"I have taken anti-depressants for years, with ...",taken anti depressant year improvement mostly ...
14,Birth Control,Started Nexplanon 2 months ago because I have ...,started nexplanon month ago minimal amount con...


In [17]:
cv = CountVectorizer(stop_words='english')
X_feat = cv.fit_transform(X_feat).toarray()

In [18]:
X_feat

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [19]:
X_train,X_test,y_train,y_test = train_test_split(X_feat,y,test_size=0.2,random_state=0)

In [20]:
X_train.shape,y_train.shape

((34185, 17015), (34185,))

In [21]:
X_test.shape,y_test.shape

((8547, 17015), (8547,))

## Naive Bayes

In [22]:
mnb_cv = MultinomialNB()
mnb_cv.fit(X_train, y_train)
y_pred = mnb_cv.predict(X_test)
score = metrics.accuracy_score(y_test, y_pred)
print("accuracy:   %0.3f" % score)
cm = metrics.confusion_matrix(y_test, y_pred)
print(cm)

accuracy:   0.973
[[5720   35    5   15]
 [  35 1710   11   18]
 [  16   18  466   13]
 [  12   49    6  418]]


## Passive Aggressive Classifier

In [23]:
pass_cv = PassiveAggressiveClassifier()
pass_cv.fit(X_train,y_train)
y_pred = pass_cv.predict(X_test)
score = metrics.accuracy_score(y_test, y_pred)
print("accuracy:   %0.3f" % score)
cm = metrics.confusion_matrix(y_test, y_pred)
print(cm)

accuracy:   0.982
[[5745   24    3    3]
 [  13 1736   12   13]
 [   6   20  477   10]
 [   6   36    8  435]]


## Most Informative Features For Each Class

In [24]:
def most_informative_feature_for_class(vectorizer, classifier, classlabel, n=10):
    labelid = list(classifier.classes_).index(classlabel)
    feature_names = vectorizer.get_feature_names()
    topn = sorted(zip(classifier.coef_[labelid], feature_names))[-n:]

    for coef, feat in topn:
        print (classlabel, feat, coef)

In [25]:
most_informative_feature_for_class(cv, pass_cv, 'Birth Control')

Birth Control bc 2.7377616661754622
Birth Control bleeding 2.7983987490681113
Birth Control pregnant 2.8475395393180607
Birth Control mirena 2.9564022208304093
Birth Control intended 2.9843408655607617
Birth Control spotting 3.0827667319992442
Birth Control period 3.1042222828382444
Birth Control aviane 3.2086529976578135
Birth Control acne 3.2998404152200105
Birth Control birth 3.574578055241945




In [26]:
most_informative_feature_for_class(cv, pass_cv, 'Diabetes, Type 2')

Diabetes, Type 2 diabetes 2.848238002295933
Diabetes, Type 2 trulicity 2.905873357737222
Diabetes, Type 2 unbelievable 2.9679219524647253
Diabetes, Type 2 metphomin 3.12911334083247
Diabetes, Type 2 byetta 3.2648180375609566
Diabetes, Type 2 januvia 3.2708842588877562
Diabetes, Type 2 sugar 3.743177690491514
Diabetes, Type 2 invokana 3.852763239869249
Diabetes, Type 2 metformin 3.8825397155457058
Diabetes, Type 2 victoza 4.068528250081219


In [27]:
most_informative_feature_for_class(cv, pass_cv, 'Depression')

Depression withdrawal 3.693549069485101
Depression effexor 3.8812744524434506
Depression lexapro 3.9148779899771413
Depression cymbalta 4.368940288242753
Depression zoloft 4.415715217687391
Depression fetzima 4.4727625792790775
Depression wellbutrin 4.532482990416989
Depression viibryd 4.554617860342436
Depression celexa 4.780855487675525
Depression pristiq 5.35858378608845


In [28]:
most_informative_feature_for_class(cv, pass_cv, 'High Blood Pressure')

High Blood Pressure researching 2.4878049079422437
High Blood Pressure grandma 2.641236145228348
High Blood Pressure overheating 2.6522287691205206
High Blood Pressure amlodipine 2.76953346114739
High Blood Pressure shoulder 2.829610251110788
High Blood Pressure bystolic 2.9781744281083253
High Blood Pressure pressure 3.0675868577072705
High Blood Pressure cough 3.199543900153137
High Blood Pressure bp 3.380749956520243
High Blood Pressure lisinopril 3.6233786455448005


In [29]:
text = ["I can't sleep at nights."]
test = cv.transform(text)
y_pred = pass_cv.predict(test)[0]
y_pred

'Depression'

In [30]:
text = ["I have fluctuating sugar levels. Recommend me some medicines."]
test = cv.transform(text)
y_pred = pass_cv.predict(test)[0]
y_pred

'Diabetes, Type 2'

In [31]:
X.tail()

Unnamed: 0,condition,review,review_clean
161273,Birth Control,"I have had the Nexplanon since Dec. 27, 2016 \...",nexplanon since dec got first period end janua...
161278,"Diabetes, Type 2",I just got diagnosed with type 2. My doctor pr...,got diagnosed type doctor prescribed invokana ...
161286,Depression,This is the third med I&#039;ve tried for anxi...,third med tried anxiety mild depression week h...
161290,High Blood Pressure,I have only been on Tekturna for 9 days. The e...,tekturna day effect immediate also calcium cha...
161291,Birth Control,This would be my second month on Junel. I&#039...,would second month junel birth control year ch...


In [32]:
text = [X['review'][161290]]
test = cv.transform(text)
y_pred = pass_cv.predict(test)[0]
y_pred

'High Blood Pressure'

In [33]:
text = [X['review'][161273]]
test = cv.transform(text)
y_pred = pass_cv.predict(test)[0]
y_pred

'Birth Control'

In [34]:
def predict_disease(lst_text):
    df_test = pd.DataFrame(lst_text, columns = ['test_sent'])
    df_test["test_sent"] = df_test["test_sent"].apply(review_to_words)
    vectors = cv.transform(lst_text)
    prediction = pass_cv.predict(vectors)
    df_test['prediction'] = prediction
    return df_test


def top_drugs_extractor(condition):
    df_top = dataset[(dataset['rating']>=9)&(dataset['usefulCount']>=100)].sort_values(by = ['rating', 'usefulCount'], ascending = [False, False])
    drug_lst = df_top[df_top['condition']==condition]['drugName'].head(3).tolist()
    return drug_lst

In [35]:
# sample sentences for recommending drugs
sentences = [
  "I have only been on Tekturna for 9 days. The effect was immediate. I am also on a calcium channel blocker (Tiazac) and hydrochlorothiazide. I was put on Tekturna because of palpitations experienced with Diovan (ugly drug in my opinion, same company produces both however). The palpitations were pretty bad on Diovan, 24 hour monitor by EKG etc. After a few days of substituting Tekturna for Diovan, there are no more palpitations.",
    "This is the third med I&#039;ve tried for anxiety and mild depression. Been on it for a week and I hate it so much. I am so dizzy, I have major diarrhea and feel worse than I started. Contacting my doc in the am and changing asap.",
    "I just got diagnosed with type 2. My doctor prescribed Invokana and metformin from the beginning. My sugars went down to normal by the second week. I am losing so much weight. No side effects yet. Miracle medicine for me",]


predictions = predict_disease(sentences)

for text, label in zip(sentences, predictions['prediction']):
    if label=="High Blood Pressure":
        target="High Blood Pressure"
        top_drugs = top_drugs_extractor(label)
        print("Text:", text, "\nCondition:", target)
        print("Top 3 Suggested Drugs:")
        print(top_drugs[0])
        print(top_drugs[1])
        print(top_drugs[2])
        print()
    elif label=="Depression":
        target="Depression"
        top_drugs = top_drugs_extractor(label)
        print("Text:", text, "\nCondition:", target)
        print("Top 3 Suggested Drugs:")
        print(top_drugs[0])
        print(top_drugs[1])
        print(top_drugs[2])
        print()
    elif label=="Diabetes, Type 2":
        target="Diabetes, Type 2"
        top_drugs = top_drugs_extractor(label)
        print("Text:", text, "\nCondition:", target)
        print("Top 3 Suggested Drugs:")
        print(top_drugs[0])
        print(top_drugs[1])
        print(top_drugs[2])
        print()
    else:
        target="Birth Control"
        print("text:", text, "\Condition:", target)
        top_drugs = top_drugs_extractor(label)
        print("text:", text, "\nCondition:", target)
        print("Top 3 Suggested Drugs:")
        print(top_drugs[0])
        print(top_drugs[1])
        print(top_drugs[2])
        print()

Text: I have only been on Tekturna for 9 days. The effect was immediate. I am also on a calcium channel blocker (Tiazac) and hydrochlorothiazide. I was put on Tekturna because of palpitations experienced with Diovan (ugly drug in my opinion, same company produces both however). The palpitations were pretty bad on Diovan, 24 hour monitor by EKG etc. After a few days of substituting Tekturna for Diovan, there are no more palpitations. 
Condition: High Blood Pressure
Top 3 Suggested Drugs:
Losartan
Aldactone
Spironolactone

Text: This is the third med I&#039;ve tried for anxiety and mild depression. Been on it for a week and I hate it so much. I am so dizzy, I have major diarrhea and feel worse than I started. Contacting my doc in the am and changing asap. 
Condition: Depression
Top 3 Suggested Drugs:
Sertraline
Zoloft
Viibryd

Text: I just got diagnosed with type 2. My doctor prescribed Invokana and metformin from the beginning. My sugars went down to normal by the second week. I am losi

In [36]:
import joblib
joblib.dump(cv,'count_vectorizer.pkl')
joblib.dump(pass_cv, 'pass_agg_model.pkl')

['pass_agg_model.pkl']

In [37]:
vectorizer = joblib.load('count_vectorizer.pkl')
model = joblib.load('pass_agg_model.pkl')

pred = model.predict(vectorizer.transform(['This is the third med I&#039;ve tried for anxiety and mild depression. Been on it for a week and I hate it so much. I am so dizzy, I have major diarrhea and feel worse than I started. Contacting my doc in the am and changing asap.']))
pred[0]

'Depression'