In [14]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
% matplotlib inline
plt.style.use("ggplot")


import string
from time import time

from bs4 import BeautifulSoup  
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords 
from nltk.stem.porter import PorterStemmer
from nltk.stem import SnowballStemmer, WordNetLemmatizer
from nltk import sent_tokenize, word_tokenize, pos_tag


import warnings
warnings.filterwarnings('ignore')

[nltk_data] Downloading package stopwords to /Users/qin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [20]:
# https://github.com/will-zw-wang/Yelp_Data_Challenge-NLP_Sentiment_Analysis_and_Review_Clustering_and_Recommender_System/blob/master/code/Yelp_Data_Challenge%20-%20Clustering.ipynb

### Read in the dataset

In [11]:
# path of the current directory
cur_dir = os.getcwd()
print("Current directory: {}".format(cur_dir))
# list files in the current directory
#print("Files in the current directory: {}".format(os.listdir(cur_dir)))
data_path = cur_dir+"/drugsCom"
train_path = data_path + "/drugsComTrain_raw.tsv"
test_path = data_path + "/drugsComTest_raw.tsv"
#rawdata_path = cur_dir+"/drugsCom"
#metadata_path = cur_dir+"/meta_Health_and_Personal_Care.json.gz"
print("Path of the raw data: {}".format(data_path))
print("Files in the drugCom directory: {}".format(os.listdir(data_path)))

Current directory: /Users/qin/Desktop/amazon
Path of the raw data: /Users/qin/Desktop/amazon/drugsCom
Files in the drugCom directory: ['drugsComTest_raw.tsv', 'drugsComTrain_raw.tsv']


In [12]:
train = pd.read_table(train_path, sep='\t')
test = pd.read_table(test_path, sep='\t')

In [13]:
train.head()

Unnamed: 0.1,Unnamed: 0,drugName,condition,review,rating,date,usefulCount
0,206461,Valsartan,Left Ventricular Dysfunction,"""It has no side effect, I take it in combinati...",9.0,"May 20, 2012",27
1,95260,Guanfacine,ADHD,"""My son is halfway through his fourth week of ...",8.0,"April 27, 2010",192
2,92703,Lybrel,Birth Control,"""I used to take another oral contraceptive, wh...",5.0,"December 14, 2009",17
3,138000,Ortho Evra,Birth Control,"""This is my first time using any form of birth...",8.0,"November 3, 2015",10
4,35696,Buprenorphine / naloxone,Opiate Dependence,"""Suboxone has completely turned my life around...",9.0,"November 27, 2016",37


In [36]:
train.shape

(161297, 7)

In [37]:
train['drugName'].nunique()

3436

## 1. Data preprocessing

#### 1.1 Define feature variables

In [15]:
# Take the values of the column that contains review text data, save to a variable named "documents"
documents = train['review'].values

In [16]:
documents

array(['"It has no side effect, I take it in combination of Bystolic 5 Mg and Fish Oil"',
       '"My son is halfway through his fourth week of Intuniv. We became concerned when he began this last week, when he started taking the highest dose he will be on. For two days, he could hardly get out of bed, was very cranky, and slept for nearly 8 hours on a drive home from school vacation (very unusual for him.) I called his doctor on Monday morning and she said to stick it out a few days. See how he did at school, and with getting up in the morning. The last two days have been problem free. He is MUCH more agreeable than ever. He is less emotional (a good thing), less cranky. He is remembering all the things he should. Overall his behavior is better. \r\nWe have tried many different medications and so far this is the most effective."',
       '"I used to take another oral contraceptive, which had 21 pill cycle, and was very happy- very light periods, max 5 days, no other side effects. But 

In [17]:
from sklearn.cross_validation import train_test_split

In [22]:
# X: documents
# Y: targets
# Now split the data to training set 80% and test set 20%
documents_train, documents_test = train_test_split(documents, test_size = 0.2, random_state = 42)

In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [21]:
# Create TfidfVectorizer, and name it vectorizer
# choose a reasonable max_features, e.g. 1000 to fast the computation speed
vectorizer = TfidfVectorizer(stop_words = 'english', max_features = 1000)

In [23]:
# Train the model with your training data
vectors_train = vectorizer.fit_transform(documents_train).toarray()

In [24]:
# Get the vocab of your tfidf
words = vectorizer.get_feature_names()

In [25]:
# Use the trained model to transform all the reviews
vectors_documents = vectorizer.transform(documents).toarray()

In [26]:
from sklearn.cluster import KMeans

kmeans = KMeans()

kmeans.fit(vectors_train)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=8, n_init=10, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [27]:
assigned_cluster = kmeans.predict(vectors_documents)

In [28]:
# Default of kmeans uses 8 clusters
print ('number of clusters:' + str(kmeans.cluster_centers_.shape))

number of clusters:(8, 1000)


In [29]:
# print top 10 words of each cluster centers
# step (1) Sort each centroid vector to find the top 10 features
top_centroids = kmeans.cluster_centers_.argsort()[:, -1:-11:-1]
print("top 10 features for each cluster:")
# step (2) Go back to our vectorizer object to find out what words each of these features corresponds to
for num, centroid in enumerate(top_centroids):
    print("%d: %s" % (num, ", ".join(words[i] for i in centroid)))

top 10 features for each cluster:
0: pain, 039, day, relief, severe, years, taking, medicine, medication, doctor
1: day, 039, sleep, night, took, taking, days, hours, like, felt
2: period, 039, pill, birth, control, bleeding, periods, months, month, weight
3: quot, 039, day, years, like, feel, effects, taking, just, time
4: anxiety, 039, panic, depression, attacks, life, feel, taking, medication, effects
5: effects, years, 039, works, medicine, medication, taking, great, drug, life
6: 039, ve, don, like, feel, just, taking, effects, years, weight
7: acne, skin, face, 039, clear, using, months, dry, month, cleared


In [30]:
# Find the top 10 features for each cluster.
kmeans = KMeans(n_clusters = 5)
kmeans.fit(vectors_train)
assigned_cluster = kmeans.predict(vectors_documents)

top_centroids = kmeans.cluster_centers_.argsort()[:, -1:-11:-1]
print("top 10 features for each cluster:")
for num, centroid in enumerate(top_centroids):
    print("%d: %s" % (num, ",".join(words[i] for i in centroid)))

top 10 features for each cluster:
0: anxiety,039,depression,panic,attacks,life,feel,taking,medication,day
1: 039,ve,don,like,day,feel,just,taking,effects,years
2: pain,039,day,relief,severe,taking,years,medicine,medication,doctor
3: 039,day,effects,taking,years,medication,medicine,works,started,days
4: 039,period,pill,birth,control,periods,months,bleeding,month,weight


<b>Print out the rating and review of a random sample of the reviews assigned to each cluster to get a sense of the cluster.</b>

In [35]:
for i in range(kmeans.n_clusters):
    cluster = np.arange(0, vectors_documents.shape[0])[assigned_cluster==i]
    sample_reviews = np.random.choice(cluster, 1, replace=False)
    print("cluster %d:" % i)
    for review in sample_reviews:
        print("    %s" % train.loc[review]['review'])

cluster 0:
    "I suffered with a pretty severe anxiety, stress and depression. Took Paxil 4 days ago and honestly I feel great. It already started working. Lot of it could be in my head but trust me all my symptoms disappeared. Side effects: from day one I had a light headache, day two -a little harder headache, day three unbearable headache, day four(now) no headache, feeling perfectly fine, great mood, even though its raining and dark outside.  GREAT medicine. Also its worth to add, it worked the same wonders for me about 8 years ago when I had it for the last time. I was on Paxil for 6 months and it healed me. I had 7 years of great life and suddenly 6-7 months ago this thing attacked me again. Paxil works again, same way."
cluster 1:
    "[History]- I had started taking melatonin (3mg) to help me sleep, shortly after my divorce. As my body &#039;adjusted&#039; to it, I began to increase the dose, eventually (over several years) working up to 30mg. Well, not too long after that, I 

In [38]:
# Find the drugname which got most reviews, get your filtered df, name it df_top_drug
df_top_drug = train['drugName'].value_counts().index[0]
df_top_drug


'Levonorgestrel'

In [49]:
train.shape

(161297, 7)

In [47]:
train['condition'].nunique()

884

In [50]:
train['condition'][:40]

0     Left Ventricular Dysfunction
1                             ADHD
2                    Birth Control
3                    Birth Control
4                Opiate Dependence
5     Benign Prostatic Hyperplasia
6          Emergency Contraception
7                  Bipolar Disorde
8                         Epilepsy
9                    Birth Control
10             Migraine Prevention
11                      Depression
12                 Crohn's Disease
13                           Cough
14                   Birth Control
15                         Obesity
16         Urinary Tract Infection
17                     ibromyalgia
18                 Bipolar Disorde
19    Chronic Myelogenous Leukemia
20                   HIV Infection
21                        Insomnia
22                   Birth Control
23            Rheumatoid Arthritis
24         Vaginal Yeast Infection
25             Chlamydia Infection
26                       Hirsutism
27                            ADHD
28                  

In [None]:
# dendrogram

In [45]:
train.groupby('drugName').size()

drugName
A + D Cracked Skin Relief                              1
A / B Otic                                             1
Abacavir / dolutegravir / lamivudine                  52
Abacavir / lamivudine / zidovudine                     1
Abatacept                                             19
Abilify                                              446
Abilify Discmelt                                       2
Abilify Maintena                                       4
Abiraterone                                            9
AbobotulinumtoxinA                                     3
Abraxane                                               4
Abreva                                               158
Absorbine Jr.                                          1
Absorica                                               2
Acamprosate                                          109
Acanya                                                45
Acarbose                                               1
Accolate              