https://archive.ics.uci.edu/dataset/462/drug+review+dataset+drugs+com

# Flatiron Phase 5 Project

## Aaron Galbraith

https://www.linkedin.com/in/aarongalbraith \
https://github.com/aarongalbraith

### Submitted: November 21, 2023

## working contents

- **[functions](#functions)<br>**
- **[rough overview](#rough-overview)<br>**
- **[missing values](#missing-values)<br>**
- **[duplicates](#duplicates)<br>**
- **[brand / generic pairs](#brand-/-generic-pairs)<br>**
- **[further exploration of duplicates (skip for now)](#further-exploration-of-duplicates-(skip-for-now))<br>**
- **[contractions](#contractions)<br>**
- **[dates](#dates)<br>**
- **[ratings](#ratings)<br>**
- **[focusing on birth control](#focusing-on-birth-control)<br>**
- **[save and reload preprocessed set](#save-and-reload-preprocessed-set)<br>**
- **[feature engineering ideas](#feature-engineering-ideas)<br>**
- **[rudimentary word cloud maker](#rudimentary-word-cloud-maker)<br>**
- **[end](#end)<br>**


## Contents

- **[Business Understanding](#Business-Understanding)<br>**
- **[Data Understanding](#Data-Understanding)**<br>
- **[Data Preparation](#Data-Preparation)**<br>
- **[Exploration](#Exploration)**<br>
- **[Modeling](#Modeling)**<br>
- **[Evaluation](#Evaluation)**<br>
- **[Recommendations](#Recommendations)<br>**
- **[Further Inquiry](#Further-Inquiry)**<br>

In [1]:
import pandas as pd
import matplotlib.pyplot as plt

import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk import FreqDist
from nltk.corpus import stopwords
import string
from wordcloud import WordCloud

import html
import contractions

import re

from IPython.display import display

from sklearn.model_selection import train_test_split

from sklearn.metrics import confusion_matrix, plot_confusion_matrix, accuracy_score, precision_score, f1_score
from sklearn.pipeline import Pipeline

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split, GridSearchCV

from pathlib import Path

SEED = 1979

do_grids = True

In [2]:
# Read the train and test splits with identical parser settings, then stack
# them into one frame that carries a fresh 0..n-1 index.
raw_frames = [
    pd.read_csv(path, delimiter='\t', encoding='latin-1')
    for path in ('../data/drugsComTrain_raw.tsv', '../data/drugsComTest_raw.tsv')
]
d1, d2 = raw_frames
df = pd.concat(raw_frames, ignore_index=True).drop(columns=['Unnamed: 0'])

# functions

In [3]:
def show_review(index):
    """Print the review stored at `index` and list every record with the same text.

    Reads the notebook-level `df`. After printing the review, displays the
    drugName, condition, rating, date and usefulCount of each record whose
    review text equals it.
    """
    review_text = df.review.loc[index]
    print(review_text)

    shown_columns = ['drugName', 'condition', 'rating', 'date', 'usefulCount']
    has_same_text = df.review == review_text
    display(df.loc[has_same_text, shown_columns])

In [4]:
def show_similar(index):
    """Report the reviews that share drug name and condition with record `index`.

    Reads the notebook-level `df` and prints, in order:
      * how many reviews of that drug/condition were posted on the same date,
        and how many of those carry the same rating;
      * the text of each same-date, same-rating review;
      * a per-date count of all reviews of that drug/condition with the same rating.
    """
    ref = df.loc[index]

    # Build each boolean filter once and reuse it below.
    same_drug_condition = (df.drugName == ref.drugName) & (df.condition == ref.condition)
    same_rating = same_drug_condition & (df.rating == ref.rating)
    same_day = same_drug_condition & (df.date == ref.date)
    same_day_same_rating = same_rating & (df.date == ref.date)

    count_total = df[same_day].review.count()
    count_similar = df[same_day_same_rating].review.count()

    print('On', ref.date, ref.drugName, 'was reviewed', count_total,
          'times and received a rating of', ref.rating, count_similar, 'times.\n')
    print('From that date, here are all', count_similar, 'reviews with the same rating:\n')
    for review_text in df[same_day_same_rating].review:
        print(review_text, '\n')

    print('Here is a breakdown of all the dates when reviewers gave the same drug name and condition THIS RATING:')
    display(df[same_rating].date.value_counts())

# rough overview

In [5]:
df.head()

Unnamed: 0,drugName,condition,review,rating,date,usefulCount
0,Valsartan,Left Ventricular Dysfunction,"""It has no side effect, I take it in combinati...",9.0,"May 20, 2012",27
1,Guanfacine,ADHD,"""My son is halfway through his fourth week of ...",8.0,"April 27, 2010",192
2,Lybrel,Birth Control,"""I used to take another oral contraceptive, wh...",5.0,"December 14, 2009",17
3,Ortho Evra,Birth Control,"""This is my first time using any form of birth...",8.0,"November 3, 2015",10
4,Buprenorphine / naloxone,Opiate Dependence,"""Suboxone has completely turned my life around...",9.0,"November 27, 2016",37


In [6]:
df.shape

(215063, 6)

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 215063 entries, 0 to 215062
Data columns (total 6 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   drugName     215063 non-null  object 
 1   condition    213869 non-null  object 
 2   review       215063 non-null  object 
 3   rating       215063 non-null  float64
 4   date         215063 non-null  object 
 5   usefulCount  215063 non-null  int64  
dtypes: float64(1), int64(1), object(4)
memory usage: 9.8+ MB


In [8]:
df.drugName.value_counts()

drugName
Levonorgestrel                       4930
Etonogestrel                         4421
Ethinyl estradiol / norethindrone    3753
Nexplanon                            2892
Ethinyl estradiol / norgestimate     2790
                                     ... 
Melpaque HP                             1
Cortisone                               1
Reyataz                                 1
Striant                                 1
Allergy DN PE                           1
Name: count, Length: 3671, dtype: int64

In [9]:
df.condition.value_counts()

condition
Birth Control                                   38436
Depression                                      12164
Pain                                             8245
Anxiety                                          7812
Acne                                             7435
                                                ...  
Systemic Candidiasis                                1
Wilson's Disease                                    1
unctional Gastric Disorde                           1
Sepsis                                              1
105</span> users found this comment helpful.        1
Name: count, Length: 916, dtype: int64

In [10]:
df.rating.value_counts()

rating
10.0    68005
9.0     36708
1.0     28918
8.0     25046
7.0     12547
5.0     10723
2.0      9265
3.0      8718
6.0      8462
4.0      6671
Name: count, dtype: int64

In [11]:
df.groupby('drugName').condition.nunique().value_counts()

condition
1     1869
2      782
3      334
4      195
5      122
6       83
7       52
8       51
9       38
11      20
10      19
12      19
14      18
13      18
15      11
18       8
16       6
17       5
0        4
20       3
23       3
19       3
25       2
22       2
39       1
24       1
31       1
21       1
Name: count, dtype: int64

This means that, for example, 1869 drugs treat one condition only, and 782 drugs treat two conditions, etc.

In [12]:
df.groupby('condition').drugName.nunique().value_counts()

drugName
2      188
1      166
4       78
3       72
5       44
      ... 
127      1
95       1
81       1
56       1
39       1
Name: count, Length: 71, dtype: int64

This means that 188 conditions are treatable by two drugs, etc.

In [13]:
pd.set_option("display.max_rows", None)
print(df.drugName.value_counts())
pd.set_option("display.max_rows", 10)

drugName
Levonorgestrel                                                                                      4930
Etonogestrel                                                                                        4421
Ethinyl estradiol / norethindrone                                                                   3753
Nexplanon                                                                                           2892
Ethinyl estradiol / norgestimate                                                                    2790
Ethinyl estradiol / levonorgestrel                                                                  2503
Phentermine                                                                                         2085
Sertraline                                                                                          1868
Escitalopram                                                                                        1747
Mirena                                        

A casual overview of the drug names indicates that they all seem valid.

In [14]:
pd.set_option("display.max_rows", None)
print(df.condition.value_counts())
pd.set_option("display.max_rows", 10)

condition
Birth Control                                                          38436
Depression                                                             12164
Pain                                                                    8245
Anxiety                                                                 7812
Acne                                                                    7435
Bipolar Disorde                                                         5604
Insomnia                                                                4904
Weight Loss                                                             4857
Obesity                                                                 4757
ADHD                                                                    4509
Diabetes, Type 2                                                        3362
Emergency Contraception                                                 3290
High Blood Pressure                                               

Oddly, the condition labels often (always?) omit initial 'F' and terminal 'r'. We can isolate instances of the former by searching for conditions that start with a lower case letter.

# missing values

In [15]:
len(df[df.condition.isna()])

1194

In [16]:
# Assign the filled column back to the frame. Calling fillna(inplace=True) on
# the `df.condition` accessor is a chained inplace operation: recent pandas
# versions warn about it, and with Copy-on-Write enabled it no longer updates `df`.
df['condition'] = df['condition'].fillna('missing')

In [17]:
len(df[df.condition == 'missing'])

1194

We noticed another condition label that was meant to indicate missing and should be accordingly changed.

In [18]:
# Any label containing 'Not Listed' is replaced by the 'missing' placeholder.
df.condition = ['missing' if 'Not Listed' in label else label for label in df.condition]

In [19]:
len(df[df.condition == 'missing'])

1786

We've identified some actual missing condition labels, but we noticed there are more condition labels that seem suspicious, particularly ones that start with something other than an upper case character. Let's look at all such condition labels.

In [20]:
set(df[(~df.condition.str[0].isin(list(string.ascii_uppercase))) &
   (df.condition != 'missing')
  ].condition)

{'0</span> users found this comment helpful.',
 '100</span> users found this comment helpful.',
 '105</span> users found this comment helpful.',
 '10</span> users found this comment helpful.',
 '110</span> users found this comment helpful.',
 '11</span> users found this comment helpful.',
 '121</span> users found this comment helpful.',
 '123</span> users found this comment helpful.',
 '12</span> users found this comment helpful.',
 '135</span> users found this comment helpful.',
 '13</span> users found this comment helpful.',
 '142</span> users found this comment helpful.',
 '145</span> users found this comment helpful.',
 '146</span> users found this comment helpful.',
 '14</span> users found this comment helpful.',
 '15</span> users found this comment helpful.',
 '16</span> users found this comment helpful.',
 '17</span> users found this comment helpful.',
 '18</span> users found this comment helpful.',
 '19</span> users found this comment helpful.',
 '1</span> users found this comm

These fall into three categories. Ones that include "users found this comment helpful" should be regarded as erroneous and therefore missing.

In [21]:
def _blank_helpful_count_text(label):
    # Labels containing 'users found' are replaced by the 'missing' placeholder.
    if 'users found' in label:
        return 'missing'
    return label


df.condition = df.condition.apply(_blank_helpful_count_text)

In [22]:
len(df[df.condition == 'missing'])

2957

 Ones that show a clipped copy of the drug name and end with a parenthesis should also be regarded as missing.

In [23]:
def _looks_like_clipped_drug_name(label):
    # True when the label does not open with an upper-case ASCII letter
    # and its last character is a parenthesis.
    if label[0] in string.ascii_uppercase:
        return False
    return label[-1] in ('(', ')')


df.condition = df.condition.apply(
    lambda label: 'missing' if _looks_like_clipped_drug_name(label) else label
)

In [24]:
len(df[df.condition == 'missing'])

3286

Most of the ones that show a clipped version of the condition label can possibly be restored.

In [25]:
# Final words that lost a trailing 'r'.
_CLIPPED_LAST_WORDS = frozenset(['Disorde', 'eve', 'Shoulde', 'Cance'])

# Clipped first word -> the characters to put back in front of the label.
_PREFIX_BY_FIRST_WORD = {
    **dict.fromkeys(['acial', 'ibrocystic', 'ungal', 'amilial', 'ailure', 'ever',
                     'emale', 'unctional', 'actor', 'ibromyalgia', 'atigue'], 'F'),
    **dict.fromkeys(['llicular', 'llicle', 'lic', 'cal'], 'Fo'),
    'mance': 'Perfor',
    'zen': 'Fro',
    'mis': 'Dermatitis Herpetifor',
}


def condition_restore(condition):
    """Restore a condition label whose leading or trailing characters were clipped.

    Parameters
    ----------
    condition : str
        The condition label as found in the data.

    Returns
    -------
    str
        The label with a trailing 'r' re-appended when its last word is a
        known clipped word, and a known prefix re-attached when its first
        word is a known clipped word. Any other label, including an empty or
        whitespace-only one, is returned unchanged.
    """
    words = condition.split()
    if not words:
        # Nothing to inspect; indexing the empty word list would raise IndexError.
        return condition

    if words[-1] in _CLIPPED_LAST_WORDS:
        condition = condition + 'r'

    # Look the first word up only after the suffix fix, so a one-word label
    # such as 'eve' becomes 'ever' and then still receives its leading 'F'.
    prefix = _PREFIX_BY_FIRST_WORD.get(condition.split()[0])
    if prefix is not None:
        condition = prefix + condition
    return condition

df.condition = df.condition.apply(lambda x: condition_restore(x))

Let's look at what we have left.

In [26]:
set(df[(~df.condition.str[0].isin(list(string.ascii_uppercase))) &
   (df.condition != 'missing')
  ].condition)

{'m Pain Disorder', 'me', 't Care', "von Willebrand's Disease"}

"von Willebrand's Disease" appears to be a naturally uncapitalized condition. The others have been impossible to restore and will also be regarded as missing.

In [27]:
def _missing_unless_capitalised(label):
    # Keep labels that open with an upper-case ASCII letter, and labels whose
    # first word is 'von'; everything else becomes the 'missing' placeholder.
    if label[0] in string.ascii_uppercase or label.split()[0] == 'von':
        return label
    return 'missing'


df.condition = df.condition.apply(_missing_unless_capitalised)

In [28]:
len(df[df.condition == 'missing'])

3293

We will be able to restore more of these missing condition labels after we do some work with duplicates.

# duplicates

In [29]:
df.duplicated().value_counts()

False    215061
True          2
Name: count, dtype: int64

In [30]:
df[df.duplicated()]

Unnamed: 0,drugName,condition,review,rating,date,usefulCount
178703,Levonorgestrel,Emergency Contraception,"""I had a quickie n he decided to finish it off...",1.0,"September 23, 2016",10
191001,Plan B,Emergency Contraception,"""I had a quickie n he decided to finish it off...",1.0,"September 23, 2016",10


In [31]:
show_review(178703)

"I had a quickie n he decided to finish it off in me... Well IMMEDIATELY we went 2 our local pharmacy n bought this plan b 1 step pill.I took it immediately.2 weeks later,took a pregnancy test n got the world&#039;s BIGGEST POSITIVE. The small pill was $50.That was the 1st time in a year n a half that I had intercourse n the last after I had my first son. I honestly believe this pill is ineffective because they just want u to think it works when n reality, it would never work. Alot of women don&#039;t know their bodies when they ovulate so if your not fertile and he ejaculates n u and u take the pill n dont get preg., The pill is supposed to make u think it worked. DO NOT buy. Was NEVER effective. Thank u!"


Unnamed: 0,drugName,condition,rating,date,usefulCount
131531,Levonorgestrel,Emergency Contraception,1.0,"September 23, 2016",10
143768,Plan B,Emergency Contraception,1.0,"September 23, 2016",10
178703,Levonorgestrel,Emergency Contraception,1.0,"September 23, 2016",10
191001,Plan B,Emergency Contraception,1.0,"September 23, 2016",10


This is curious. The same review is recorded four times. There are two identical pairs, where the difference between the pairs is the drug name. We can drop one from each pair, but this will need to be revisited.

In [32]:
df.drop_duplicates(inplace=True)

# brand / generic pairs

The main type of duplicate we should look out for is records with duplicate reviews, as those likely indicate some kind of actual erroneous duplication. Let's see how many of those there are.

In [33]:
len(df[df.duplicated(subset=['review'])])

86583

That's a lot!

Let's explore some facets of these duplicates.

In [34]:
len(df[df.duplicated(subset=df.columns.difference(['drugName']))])

85876

The vast majority of duplicate reviews are accounted for by different drug names. Let's explore some examples.

In [35]:
df[df.duplicated(subset=df.columns.difference(['drugName']))].head()

Unnamed: 0,drugName,condition,review,rating,date,usefulCount
524,Nexplanon,Birth Control,"""First had implanon then got Nexplanon, had a ...",9.0,"April 21, 2017",5
574,Cymbalta,Anxiety,"""Prescribed via a Psychiatrist for severe Pani...",1.0,"September 5, 2010",27
726,Orsythia,Birth Control,"""I have only been on orsythia for about 1 mont...",2.0,"October 8, 2015",7
1070,Desvenlafaxine,Depression,"""I have suffered from severe anxiety (GAD) and...",8.0,"November 1, 2013",81
1375,Ethinyl estradiol / norethindrone,Birth Control,"""I have been taking my first pack of Lo Loestr...",8.0,"February 1, 2012",7


In [36]:
show_review(524)

"First had implanon then got Nexplanon, had a period first month and I have not had one since. I&#039;m due to remove it next year.  I do notice spotting  sometimes for a day but it honestly  usually coincides with when I&#039;m stressed. 
Had some weight gain also.

So far the best BC I&#039;ve  had in all my years.  I plan on trying for a baby next year then I will be back on it."


Unnamed: 0,drugName,condition,rating,date,usefulCount
374,Etonogestrel,Birth Control,9.0,"April 21, 2017",5
524,Nexplanon,Birth Control,9.0,"April 21, 2017",5


In [37]:
show_review(574)

"Prescribed via a Psychiatrist for severe Panic attacks for 2 years.
If I take dosage late or forget to take it the withdrawal symptoms kick in.  
Gnawing physical pain, breathlessness, disorientation to time, difficulties in word finding while speaking, severe muscle pain and stiffness, nausea, labile emotions and panic.

"


Unnamed: 0,drugName,condition,rating,date,usefulCount
321,Duloxetine,Anxiety,1.0,"September 5, 2010",27
574,Cymbalta,Anxiety,1.0,"September 5, 2010",27


In [38]:
show_review(726)

"I have only been on orsythia for about 1 month and I just started my second week of my second month.  I guess I didn&#039;t notice earlier but I started to get slight headaches and I didn&#039;t feel very well physically and mentally.  It do help with my cramps and my period, not so much my acne but it&#039;s better.  The worst part of orsythia has to be the mood swings and the sweating!  I sweat a lot even if it&#039;s cold I&#039;ll start a light sweat, it&#039;s gross.  But the mood swings are the worst I just started feeling this and it happened while I was talking to one of my friends I just blew up on him, for no reason.  Then later on I started to feel really bad(mentally) and I cried for a while and I couldn&#039;t figure out why I was crying! I don&#039;t recommend!"


Unnamed: 0,drugName,condition,rating,date,usefulCount
378,Ethinyl estradiol / levonorgestrel,Birth Control,2.0,"October 8, 2015",7
726,Orsythia,Birth Control,2.0,"October 8, 2015",7


In [39]:
show_review(1070)

"I have suffered from severe anxiety (GAD) and was taking more and more Klonopin as time went on.  I am very sensitive to medication and have tried many different SSRI/SNRI&#039;s through the year with horrible side effects.  Finally, I had DNA testing to see what I would respond to and the result was Pristiq.  I started it several months ago in a small dose (I split the pills even though they say don&#039;t do this) and within a few days my anxiety literally went away.  I was able to cut my Klonopin in 1/2 over a two month period.  The first week or two I was extremely tired but that passed.  The only side effect I get from time to time is migraines.  It still amazes me that my anxiety has disappeared.  I no longer keep Klonopin in my  pocket!"


Unnamed: 0,drugName,condition,rating,date,usefulCount
855,Pristiq,Depression,8.0,"November 1, 2013",81
1070,Desvenlafaxine,Depression,8.0,"November 1, 2013",81


In [40]:
show_review(1375)

"I have been taking my first pack of Lo Loestrin Fe and I must say it really works for me. I was a little nervous at first because this is my first time taking birth control and I&#039;ve heard all the negative side effects of taking birth control. I have had spotting [brown-ish color] for three weeks after my period, but that&#039;s normal for the first month. I have breast tenderness and mood swings every now and then, then again it&#039;s expected for the first few months. I have not yet experienced any weight gain. So far I am satisfied, but I wish it wasn&#039;t so expensive."


Unnamed: 0,drugName,condition,rating,date,usefulCount
609,Lo Loestrin Fe,Birth Control,8.0,"February 1, 2012",7
1375,Ethinyl estradiol / norethindrone,Birth Control,8.0,"February 1, 2012",7


These five examples make clear that the vast majority of duplicates are due to double-entry; (nearly) every review is entered once with its generic name and once with its brand name.

We can use this phenomenon to restore some of the missing condition labels. If a missing condition label is part of such a unique pair, then we can confidently assign it the condition of its pair-mate.

Let's broaden our search to records that duplicate every feature other than drug name and condition.

In [41]:
len(df[df.duplicated(subset=df.columns.difference(['drugName', 'condition']))])

86221

This is how many records are duplicates of other records in all values EXCEPT (POSSIBLY) drug name and condition. If a record is duplicated in this manner, the second (and third, fourth, etc.) instance will be captured in this bucket of dupes.

If we check only this bucket for dupes, we can see whether there are any triplets, etc.

In [42]:
df_dupes = df[df.duplicated(subset=df.columns.difference(['drugName', 'condition']))]

In [43]:
len(df_dupes[df_dupes.duplicated(subset=df_dupes.columns.difference(['drugName', 'condition']))])

1

There is only one.

In [44]:
df_dupes[df_dupes.duplicated(subset=df_dupes.columns.difference(['drugName', 'condition']))]

Unnamed: 0,drugName,condition,review,rating,date,usefulCount
140144,Octreotide,"Diabetes, Type 1","""Great medicine. No side effects.""",9.0,"October 31, 2011",2


In [45]:
show_review(140144)

"Great medicine. No side effects."


Unnamed: 0,drugName,condition,rating,date,usefulCount
39512,Reprexain,Pain,7.0,"October 5, 2012",10
60998,Insulin regular,"Diabetes, Type 1",9.0,"October 31, 2011",2
119972,Insulin glulisine,"Diabetes, Type 1",9.0,"October 31, 2011",3
133212,Sandostatin,"Diabetes, Type 1",9.0,"October 31, 2011",2
140144,Octreotide,"Diabetes, Type 1",9.0,"October 31, 2011",2
141100,Insulin isophane / insulin regular,"Diabetes, Type 1",9.0,"October 31, 2011",10
148049,ReliOn / Novolin 70 / 30,"Diabetes, Type 1",9.0,"October 31, 2011",10
184262,Hydrocodone / ibuprofen,Pain,7.0,"October 5, 2012",10


There are 6 records with the same review, date, rating, and condition. Because they're on the *same day*, it seems likely that these reviews were entered repeatedly by the same person. The two with a useful count of 10 are likely a brand/generic pair. As for the other 4, a possible explanation is that Sandostatin and Octreotide are brand names for the two types of insulin, and one of them somehow acquired an erroneous useful count. Let's reassign the useful count of Octreotide to 3 and let them pair off that way.

In [46]:
df.at[140144, 'usefulCount'] = 3

In [47]:
%%time
# ⏰ record the time for this cell -- 11.1 s

# create stripped down dataframe that does not have drug names or conditions
# we don't need these features for this operation because we're checking for matches on all other features
df_pairs = df.drop(columns=['drugName', 'condition']).copy()

# create a list of indices of records that duplicate everything other than drug name and condition
df_dupes = df_pairs[df_pairs.duplicated()].index.tolist().copy()

# create and populate a dictionary whose keys are dates and values are indices
dates_bucket = {}
# populate dictionary with keys that are dates belonging to the duplicates
for date_ in list(set(df[df.index.isin(df_dupes)].date.tolist())):
    dates_bucket[date_] = []
# populate dictionary with values that are indices that are NOT from the duplicate list but DO share that date
for i in df[~df.index.isin(df_dupes)].index:
    dates_bucket[df.loc[i].date].append(i)

CPU times: user 11.1 s, sys: 87.3 ms, total: 11.2 s
Wall time: 11.4 s


In [48]:
%%time
# ⏰ record the time for this cell -- 2min 31s

# create a list of record pairs where each entry is a list of two indices
pairs = []

# iterate over the indices from the dupes list
for i in df_dupes:
    # set the date to the date from index i
    date_i = df.loc[i].date
    # iterate over OTHER indices who share that date
    for j in dates_bucket[date_i]:
        # check for a match
        if df_pairs.loc[i].equals(df_pairs.loc[j]):
            # remove this index from the dates dictionary so we have fewere to search through in later iterations
            dates_bucket[date_i].remove(j)
            # add this pair to the pairs list
            pairs.append([i,j])
            break

CPU times: user 2min 38s, sys: 675 ms, total: 2min 38s
Wall time: 2min 40s


In [49]:
pairs[:10]

[[524, 374],
 [574, 321],
 [726, 378],
 [1070, 855],
 [1375, 609],
 [1397, 1281],
 [1735, 1043],
 [1965, 299],
 [2014, 1844],
 [2091, 1609]]

In [50]:
# Two-way lookup: each member of a pair maps to its pair-mate.
pairs_dict = {}

for first, second in pairs:
    pairs_dict[first] = second
    pairs_dict[second] = first

In [51]:
len(df[df.condition == 'missing'])

3293

In [52]:
%%time
# ⏰ record the time for this cell -- 15.4 s

# iterate over each record pair
for pair in pairs:
    # iterate over each member of the pair
    for i in range(2):
        # identify a pair member whose condition is missing
        if df.loc[pair[i]].condition == 'missing':
            # assign to the pair member the condition of its pair-mate
            df.at[pair[i], 'condition'] = df.loc[pairs_dict[pair[i]]].condition

CPU times: user 14.2 s, sys: 62.4 ms, total: 14.2 s
Wall time: 14.7 s


In [53]:
len(df[df.condition == 'missing'])

2968

We'll make a feature that names the indicated drug and, if applicable, the paired drug.

In [60]:
%%time
# ⏰ record the time for this cell -- 21.2 s

df['ind'] = df.index

def drugSet_fix(index, drugName_):
    drugSet = {drugName_}
    if index in pairs_dict:
        drugSet.add(df.loc[pairs_dict[index]].drugName)
    return drugSet

df['drugSet'] = df.apply(lambda x: drugSet_fix(x.ind, x.drugName), axis=1)

df.drop(columns='ind', inplace=True)

CPU times: user 19.7 s, sys: 169 ms, total: 19.9 s
Wall time: 20.2 s


In [61]:
%%time
# ⏰ record the time for this cell -- 21.2 s

def drugList_fix(index, drugName_):
    """Return the list of drug names attached to one record.

    index     -- the record's index label in df
    drugName_ -- the record's own drug name

    The list always contains drugName_. When the record has a pair-mate in
    pairs_dict, the mate's drug name is appended and the two names are
    sorted, so both members of a pair end up with the same list.
    """
    drugList = [drugName_]
    if index in pairs_dict:
        # scalar lookup of the pair-mate's drug name
        drugList.append(df.at[pairs_dict[index], 'drugName'])
        drugList.sort()
    return drugList

# x.name is the row's index label, so no temporary index column is needed
df['drugList'] = df.apply(lambda x: drugList_fix(x.name, x.drugName), axis=1)

CPU times: user 20.6 s, sys: 174 ms, total: 20.8 s
Wall time: 21.6 s


In [74]:
# one plain string per record: the two names separated by a space when the
# record's drugList has exactly two entries, otherwise just the first name
df['drugSetString'] = [
    names[0] + ' ' + names[1] if len(names) == 2 else names[0]
    for names in df.drugList
]

In [79]:
len(df[df.duplicated(subset=df.columns.difference(['drugName', 'drugSet', 'drugList']))])

86201

In [83]:
df_bookmark = df.copy()

In [84]:
df.drop_duplicates(subset=df.columns.difference(['drugName', 'drugSet', 'drugList']), inplace=True)

## proposed solutions for missing values

1. For every record with a missing condition, we will assign it the condition that is most common for the drug indicated by that record.

2. Before executing solution 1, find each record's twin and use the condition label from the twin where applicable.

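Solution 2 has already been carried out above using the matched pairs; below we apply solution 1 to the records whose condition is still missing.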

In [87]:
drugs_w_missing_condition = list(set(df[df.condition == 'missing'].drugName))

In [88]:
len(drugs_w_missing_condition)

637

This applies to about a quarter of the drugs. We'll create a dictionary that reports the most common condition for these drugs.

In [89]:
%%time
# record the time for this cell -- 22.2 s

# For each drug that has at least one 'missing' condition, record
# [most common condition, that condition's share of the drug's records].
# A real label is preferred over 'missing' whenever the drug has one;
# drugs whose every record is 'missing' keep 'missing'.
most_common_condition = {}

for drug in drugs_w_missing_condition:
    # filter the frame once per drug and reuse the result
    drug_conditions = df.loc[df.drugName == drug, 'condition']
    counts = drug_conditions.value_counts()
    condition = counts.idxmax()
    if condition == 'missing' and len(counts) > 1:
        # 'missing' is the mode but a real label exists: take the most common real one
        condition = drug_conditions[drug_conditions != 'missing'].value_counts().idxmax()
    # share of the *chosen* label, looked up by label rather than by position
    proportion = round(counts[condition] / counts.sum(), 2)
    most_common_condition[drug] = [condition, proportion]

CPU times: user 11.3 s, sys: 43 ms, total: 11.4 s
Wall time: 11.5 s


In [90]:
most_common_condition['Viagra']

['Erectile Dysfunction', 0.91]

For example, if a review with an unlisted condition is about Viagra, we will assume the condition is Erectile Dysfunction.

In [91]:
# replace each 'missing' condition with the most common condition recorded
# for that record's drug; every other condition is kept as it is
df['condition'] = [
    most_common_condition[drug][0] if condition == 'missing' else condition
    for drug, condition in zip(df.drugName, df.condition)
]

In [92]:
len(df[df.condition == 'missing'])

25

This is how many records still have no condition label. The drugs named in these records appear *only* in records with no listed condition, so there is nothing from which to infer a label, and we may as well drop them.

In [93]:
df.drop(df[df.condition == 'missing'].index, inplace=True)

# further exploration of duplicates (skip for now)

In [None]:
len(df[df.duplicated(subset=['drugName', 'condition', 'rating', 'date'])])

That also seems like a lot. Let's explore these now.

In [None]:
df[df.duplicated(subset=['drugName', 'condition', 'rating', 'date'])].head()

We'll use the "show_similar" function to explore these reviews that duplicate the drug name, condition, rating, and date of another record.

In [None]:
show_similar(2450)

In [None]:
show_similar(3597)

In [None]:
show_similar(4892)

In [None]:
df[df.duplicated(subset=['drugName', 'condition', 'rating', 'date'])].rating.value_counts()

In [None]:
df[
    (df.drugName == df.loc[8576].drugName) & \
    (df.condition == df.loc[8576].condition) & \
    (df.date == df.loc[8576].date)
    
]

In [None]:
df[(df.drugName == 'Miconazole') & \
   (df.condition == 'Vaginal Yeast Infection') & \
   (df.rating == 1.0) & \
   (df.date == 'May 25, 2016') & \
   (df.usefulCount == 6) \
  ]

In [None]:
show_review(8737)

In [None]:
len(df[df.duplicated(subset=['review'])])

An enormous number of records have duplicated reviews.

In [None]:
show_review(524)

In [None]:
show_review(574)

In [None]:
show_review(726)

In [None]:
show_review(1070)

In [None]:
show_review(1375)

In all of the instances we checked, the duplicated record occurs because it is listed once under its chemical name and once under its brand name. We'll assume this accounts for the vast majority of review duplications and deal with them after we address other types of review duplications.

In [None]:
# Count records whose review text repeats an earlier record's, but which are
# NOT a full repeat of an earlier record once the drug name is ignored,
# i.e. they differ in some field other than the drug name.
# drugSet / drugList are left out of the comparison as in the de-duplication
# step; they hold sets / lists, which pandas presumably cannot hash here.
len(df[(df.duplicated(subset=['review'])) &
   ~df.duplicated(subset=df.columns.difference(['drugName', 'drugSet', 'drugList']))
  ])

This is how many records have identical reviews but differences *other than the drug name*. Let's explore a few of these.

In [None]:
df[(df.duplicated(subset=['review'])) &
   ~df.duplicated(subset=df.columns.difference(['drugName']))
  ].head(15)

In [None]:
show_review(2664)

In [None]:
show_review(6465)

In [None]:
show_review(9735)

In [None]:
show_review(13125)

Some of these are just common, short reviews, e.g. "Great". But others seem to have issues with the condition label as well.

We found earlier that many duplicate reviews come in pairs where the drug name is generic and brand name in the two records. It seems that more of these pairs exist in instances where the condition is "missing" for some reason. Where this specific phenomenon occurs, we'll relabel the condition to match its partner in the pair. This will reduce the number of "missing" conditions but increase the number of duplicate pairs.

In [None]:
len(df[df.condition == 'missing'])

In [None]:
len(df[df.duplicated(subset=df.columns.difference(['drugName']))])

In [None]:
len(df[df.duplicated(subset=df.columns.difference(['condition']))])

In [None]:
len(df[df.duplicated(subset=df.columns.difference(['rating']))])

In [None]:
len(df[df.duplicated(subset=df.columns.difference(['date']))])

In [None]:
len(df[df.duplicated(subset=df.columns.difference(['usefulCount']))])

In [None]:
df[df.duplicated(subset=df.columns.difference(['usefulCount']))].head()

In [None]:
show_review(33451)

In [None]:
show_review(42728)

In [None]:
show_review(61617)

In [None]:
show_review(69518)

In [None]:
show_review(72794)

This appears to be an instance of someone re-posting a review multiple times. It seems that we should drop the duplicates in this case, but possibly we should tally up the useful count?

# contractions

Here is an example of a contraction.

In [94]:
df.review[3][56:69]

'I&#039;m glad'

Here is how the html function fixes it.

In [95]:
# look the review up by column name rather than by column position
html.unescape(df.review.loc[3])[56:64]

"I'm glad"

Here is how the contractions function fixes (the html function's fix of) it.

In [96]:
# look the review up by column name rather than by column position
contractions.fix(html.unescape(df.review.loc[3]))[56:65]

'I am glad'

Here is an instance of "ain't" with the same functions applied.

In [97]:
df.review.loc[507][75:99]

'I ain&#039;t complaining'

In [98]:
html.unescape(df.review.loc[507])[75:94]

"I ain't complaining"

In [99]:
contractions.fix(html.unescape(df.review.loc[507]))[75:96]

'I are not complaining'

In [100]:
len(df[df.review.str.contains('ain&#039;t')])

31

There are 31 records containing "ain't".

I'm currently having difficulty downloading the package that appropriately fixes "ain't" into "is not" or "are not" etc. This shouldn't matter after I remove stop words. I think it will be helpful to exclude negatives like "no" and "not" from the stop words. It could certainly be of help to look for bigrams like "not good".

In [101]:
# turn HTML character references (e.g. &#039;) back into plain characters in every review
df['review'] = df['review'].map(html.unescape)

# dates

In [None]:
sample = df.date.loc[0]

In [None]:
sample

In [None]:
re.split(r'\W+', sample)

There's probably a datetime method for this, but the following will produce month // day // year, and then we can figure out the earliest and latest dates.

In [None]:
# split each date string once on runs of non-word characters, then pick out
# the pieces: first the month (kept as text), then day and year as integers
date_parts = df.date.map(lambda text: re.split(r'\W+', text))
df['month'] = date_parts.map(lambda parts: parts[0])
df['day'] = date_parts.map(lambda parts: int(parts[1]))
df['year'] = date_parts.map(lambda parts: int(parts[2]))

In [None]:
df.year.min()

In [None]:
df[df.year == 2008].month.value_counts()

In [None]:
df[(df.year == 2008) &
   (df.month == 'February')
  ].day.min()

In [None]:
df.year.max()

In [None]:
df[df.year == 2017].month.value_counts()

In [None]:
df[(df.year == 2017) &
   (df.month == 'November')
  ].day.max()

The reviews span from February 24, 2008 to November 30, 2017.

# ratings

In [None]:
len(df)/2

In [None]:
df.rating.value_counts()

In [None]:
len(df[df.rating > 8.5])

In [None]:
len(df[df.rating < 8.5])

To split the reviews roughly in half we would split between 8 and 9.

To split the ratings roughly in half we would make the splits 1-8 and 9-10.

In [None]:
len(df)/3

In [None]:
len(df[df.rating > 9.5])

In [None]:
len(df[df.rating < 6.5])

To split the ratings roughly in thirds we would make the splits 1-6, 7-9, and 10.

# focusing on birth control

In [None]:
len(df[df.condition == 'Birth Control'])

This many records pertain to the condition of birth control.

In [None]:
birth_control_drugs = set(df[df.condition == 'Birth Control'].drugName)

In [None]:
len(birth_control_drugs)

This many drugs treat birth control.

In [None]:
df[df.condition == 'Birth Control'].drugName.value_counts()

These are the most frequent drug names that treat birth control.

In [None]:
list(set(df[(df.condition != 'Birth Control') &
   (df.drugName.isin(birth_control_drugs))
  ].condition))

These are other conditions that are (at least sometimes) treated by drugs that (also) treat birth control.

# save and reload preprocessed set

At this stage we will save and reload the preprocessed set in order to avoid taking the time to repeat earlier work every time we open the notebook.

The saved version has restored or deleted all records with missing condition labels.

We have established pairs in the list `twins` but we have NOT yet deleted either member of any pair or dealt with the confusion between brand and generic drug names.

The size of the dataframe is nearly the same as its original version, roughly 215,000 records.

In [None]:
# Path is not brought in by the notebook's opening import cell, so import it here
from pathlib import Path

# write the preprocessed frame to ../data, creating the folder first if needed
filepath = Path('../data/preprocessed.csv')
filepath.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(filepath)

In [None]:
%store twins

In [None]:
# to_csv stored the frame's index as the first (unnamed) column. Read it back
# as the index rather than dropping it, so records keep the labels they had
# before saving; label-based lookups such as the stored record pairs depend on them.
df = pd.read_csv('../data/preprocessed.csv', index_col=0)

In [None]:
%store -r twins

# feature engineering ideas

- word count
- character count
- words in all caps
- average word length
- whether words are in English (spelled correctly)
- whether it includes characters such as exclamation points, question marks, (especially repeatedly), and emoticons
- whether it mentions the brand or generic name in the review

In [108]:
# number of whitespace-separated words in each review
df['word_count'] = df.review.str.split().str.len()

In [110]:
# number of characters in each review
df['char_count'] = df['review'].map(len)

In [124]:
'!' in df.loc[5].review

True

In [125]:
df_bookmark_2 = df.copy()

# truncate to just birth control

In [126]:
df.drop(df[df.condition != 'Birth Control'].index, inplace=True)

In [127]:
df.usefulCount.value_counts()

usefulCount
1      2157
2      2142
0      2109
3      1920
4      1830
       ... 
140       1
730       1
77        1
505       1
190       1
Name: count, Length: 145, dtype: int64

In [144]:
df.usefulCount.quantile(.99)

51.279999999998836

In [159]:
df[df.rating > 8].usefulCount.quantile(.95)

32.0

In [160]:
df[df.rating < 2].usefulCount.quantile(.95)

16.0

In [153]:
show_review(17598)

"I'm 22 years old and have two little boys. My husband says he hopes to one day have a girl so I figured 5 years should be enough time to decide. I got the Mirena last year and the only side effect I had was hair loss but I'm not quite sure if it was the Mirena or from just having another baby. But other than that I ABSOLUTELY LOVE IT! I have been reading all these reviews and they were almost all negative I was wondering if something was wrong with me. Since I got the Mirena last year I lost weight, no more cramps, lightened periods, and a better sex drive (probably because I don't have to worry if I'm going to get pregnant), I feel safe. So would I recommend the Mirena, YES."


Unnamed: 0,drugName,condition,rating,date,usefulCount
17598,Mirena,Birth Control,10.0,"June 21, 2009",397


# rudimentary word cloud maker

In [None]:
df['review'] = df['review'].str.lower()

In [None]:
# birth-control records; .copy() makes dfbc an independent frame, so adding a
# column below does not write into a filtered slice of df
dfbc = df[df.condition == 'Birth Control'].copy()

# binary sentiment: 1 for ratings above 5, otherwise 0
dfbc['sentiment'] = dfbc.rating.apply(lambda x: 1 if x > 5 else 0)

# birth-control reviews with a rating above 9.5
dfbcpos = df[
    (df.condition == 'Birth Control') &
    (df.rating > 9.5)
]

# birth-control reviews with a rating below 6.5
dfbcneg = df[
    (df.condition == 'Birth Control') &
    (df.rating < 6.5)
]

In [None]:
# make list of all reviews
reviews_pos = dfbcpos.review.to_list()
reviews_neg = dfbcneg.review.to_list()

In [None]:
# # make tokenizer
# tokenizer = TweetTokenizer(
#     preserve_case=False,
#     strip_handles=True
# )

# create list of tokens from data set
# tokenize one review at a time: gluing reviews together with a bare ','
# lets the end of one review run into the start of the next
tokens_pos = [token for review in reviews_pos for token in word_tokenize(review)]
tokens_neg = [token for review in reviews_neg for token in word_tokenize(review)]


# tokens = [word for word in tokens]

In [None]:
# one shared lemmatizer instance for both token lists
lemmatizer = WordNetLemmatizer()

# map every token to its lemma, keeping the original token order
tokens_lemmatized_pos = list(map(lemmatizer.lemmatize, tokens_pos))
tokens_lemmatized_neg = list(map(lemmatizer.lemmatize, tokens_neg))

In [None]:
# show the most frequently occurring tokens
FreqDist(tokens_lemmatized_pos).most_common(25)

In [None]:
# show the most frequently occurring tokens
FreqDist(tokens_lemmatized_neg).most_common(25)

In [None]:
negatives = ['no', 'not', "don't", "aren't", "couldn't", "didn't", "doesn't", "hadn't", "hasn't", "haven't", \
             "isn't", "wasn't", "weren't", "won't", "wouldn't"]

In [None]:
# make sure the stopword corpus is available locally
nltk.download('stopwords', quiet=True)

# English stopwords minus the negation words we want to keep, followed by
# every punctuation character, the empty string, and 'ha' / 'wa'
# (presumably what the lemmatizer leaves of "has" / "was" -- confirm)
stop_list = (
    [word for word in stopwords.words('english') if word not in negatives]
    + list(string.punctuation)
    + ['', 'ha', 'wa']
)

In [None]:
stop_list

In [None]:
# make stopped list of tokens
# test membership against a set: scanning the whole stop list for every
# token is needlessly slow on a large token list
stop_set = set(stop_list)
tokens_stopped_pos = [word for word in tokens_lemmatized_pos if word not in stop_set]
tokens_stopped_neg = [word for word in tokens_lemmatized_neg if word not in stop_set]

In [None]:
# show the most frequently occurring tokens
FreqDist(tokens_stopped_pos).most_common(25)

In [None]:
# show the most frequently occurring tokens
FreqDist(tokens_stopped_neg).most_common(25)

In [None]:
# a function that generates a word cloud of a given list of words
def make_wordcloud(wordlist, colormap='Greens', title=None):
    """Build a word cloud from a list of words.

    wordlist -- strings to include; they are joined with commas into one text
    colormap -- colormap name handed to WordCloud
    title    -- accepted but not used by this function

    Returns the result of WordCloud.generate on the joined text.
    """
    settings = dict(width=600, height=400, colormap=colormap, collocations=True)
    cloud = WordCloud(**settings)
    text = ','.join(wordlist)
    return cloud.generate(text)

def plot_wordcloud(wordcloud):
    """Show wordcloud as an image on a new 12 x 15 figure with the axes hidden."""
    figure_size = (12, 15)
    plt.figure(figsize=figure_size)
    plt.imshow(wordcloud)
    # hide ticks and frame so only the cloud is visible
    plt.axis('off')

In [None]:
# word cloud of stopped words
plot_wordcloud(make_wordcloud(tokens_stopped_pos))

In [None]:
# word cloud of stopped words
plot_wordcloud(make_wordcloud(tokens_stopped_neg))

# end