In [2]:
import html
import string

import numpy as np
import pandas as pd

In [3]:
# Read the train and test review dumps (tab-separated, latin-1) and stack them into one frame.
frames = [
    pd.read_csv(path, delimiter='\t', encoding='latin-1')
    for path in ('../data/drugsComTrain_raw.tsv', '../data/drugsComTest_raw.tsv')
]
d1, d2 = frames
# Renumber the stacked rows 0..n-1 and discard the 'Unnamed: 0' column that came from the files.
df = pd.concat(frames).reset_index(drop=True).drop(columns=['Unnamed: 0'])

In [13]:
# Preview the first 61 rows of the combined frame.
df.iloc[:61]

Unnamed: 0,drugName,condition,review,rating,date,usefulCount
0,Valsartan,Left Ventricular Dysfunction,"""It has no side effect, I take it in combinati...",9.0,"May 20, 2012",27
1,Guanfacine,ADHD,"""My son is halfway through his fourth week of ...",8.0,"April 27, 2010",192
2,Lybrel,Birth Control,"""I used to take another oral contraceptive, wh...",5.0,"December 14, 2009",17
3,Ortho Evra,Birth Control,"""This is my first time using any form of birth...",8.0,"November 3, 2015",10
4,Buprenorphine / naloxone,Opiate Dependence,"""Suboxone has completely turned my life around...",9.0,"November 27, 2016",37
...,...,...,...,...,...,...
56,Pyridium,Dysuria,"""I&#039;ve been having UTIs for 7 years, my mo...",1.0,"October 13, 2016",8
57,Latuda,Bipolar Disorde,"""I have had great experience so far with Latud...",8.0,"February 20, 2012",39
58,Bupropion,Smoking Cessation,"""Love this, no mouth sores, or ulcers like Wel...",10.0,"August 22, 2014",15
59,Implanon,Birth Control,"""Never again! After being on depo I was suppos...",2.0,"August 20, 2015",1


# functions

In [None]:
def show_review(index):
    """Print the review text stored at label `index` and display every row sharing that text.

    Reads the notebook-level frame `df`. The displayed table shows the drugName, condition,
    rating, date and usefulCount of each row whose `review` equals the printed text.
    """
    text = df.loc[index, 'review']
    print(text)
    shown_columns = ['drugName', 'condition', 'rating', 'date', 'usefulCount']
    display(df.loc[df.review == text, shown_columns])

In [None]:
def show_similar(index):
    """Summarise the reviews that look like the one at label `index`.

    Reads the notebook-level frame `df`. Prints how many reviews share the row's drug name,
    condition and date, how many of those also share its rating, prints each of those review
    texts, and finally displays per-date counts of reviews with the same drug name, condition
    and rating.
    """
    target = df.loc[index]

    # Build each boolean mask once; the printed and displayed results all derive from these.
    same_drug = (df.drugName == target.drugName) & (df.condition == target.condition)
    same_day = same_drug & (df.date == target.date)
    same_rating = same_drug & (df.rating == target.rating)
    same_day_and_rating = same_rating & (df.date == target.date)

    # Series.count() ignores NaN review texts.
    count_total = df.loc[same_day, 'review'].count()
    count_similar = df.loc[same_day_and_rating, 'review'].count()

    print('On', target.date, target.drugName, 'was reviewed', count_total,
          'times and received a rating of', target.rating, count_similar, 'times.\n')
    print('From that date, here are all', count_similar, 'reviews with the same rating:\n')
    for text in df.loc[same_day_and_rating, 'review']:
        print(text,'\n')

    print('Here is a breakdown of all the dates when reviewers gave the same drug name and condition THIS RATING:')
    display(df.loc[same_rating, 'date'].value_counts())

# missing values

In [None]:
# Number of records whose condition is NaN.
int(df.condition.isna().sum())

In [None]:
# Label records without a condition as 'missing'. The result is assigned back to the column:
# `df.condition.fillna(..., inplace=True)` fills an intermediate Series and, under pandas
# Copy-on-Write, leaves `df` itself unchanged.
df['condition'] = df['condition'].fillna('missing')

In [None]:
# Number of records currently labelled 'missing'.
int((df.condition == 'missing').sum())

We noticed another condition label that was meant to indicate missing and should be accordingly changed.

In [None]:
# Any label containing 'Not Listed' is treated as a missing condition.
df.condition = df.condition.map(
    lambda label: label if 'Not Listed' not in label else 'missing'
)

In [None]:
# Number of records currently labelled 'missing'.
int((df.condition == 'missing').sum())

We've identified some actual missing condition labels, but we noticed there are more condition labels that seem suspicious, particularly ones that start with something other than an upper case character. Let's look at all such condition labels.

In [None]:
# Distinct condition labels, other than 'missing', whose first character is not A-Z.
set(
    df.loc[
        (df.condition != 'missing')
        & ~df.condition.str[0].isin(list(string.ascii_uppercase)),
        'condition',
    ]
)

These fall into three categories. Ones that include "users found this comment helpful" should be regarded as erroneous and therefore missing.

In [None]:
# Labels containing 'users found' are scraped page text, not conditions: mark them missing.
df.condition = df.condition.map(
    lambda label: label if 'users found' not in label else 'missing'
)

In [None]:
# Number of records currently labelled 'missing'.
int((df.condition == 'missing').sum())

 Ones that show a clipped copy of the drug name and end with a parenthesis should also be regarded as missing.

In [None]:
# A label is kept when it starts with A-Z or does not end in a parenthesis; the remaining
# labels (non-capital start AND trailing '(' or ')') are marked missing.
df.condition = df.condition.map(
    lambda label: label
    if label[0] in string.ascii_uppercase or label[-1] not in '()'
    else 'missing'
)

In [None]:
# Number of records currently labelled 'missing'.
int((df.condition == 'missing').sum())

Most of the ones that show a clipped version of the condition label can possibly be restored.

In [None]:
def condition_restore(condition):
    """Repair a condition label whose leading or trailing characters were clipped.

    A trailing 'r' is appended when the last word is a known clipped word; then each prefix
    rule is tried in order and its prefix is prepended when the first word is a known
    clipped fragment. The first word is re-read after every change, so a label such as
    'eve' becomes 'ever' and then 'Fever'.

    Parameters
    ----------
    condition : str
        The label to repair.

    Returns
    -------
    str
        The repaired label, or `condition` unchanged when no rule applies or when the
        label has no words (empty or whitespace-only), which previously raised IndexError.
    """
    clipped_last_words = ('Disorde', 'eve', 'Shoulde', 'Cance')
    prefix_rules = (
        ('F', ('acial', 'ibrocystic', 'ungal', 'amilial', 'ailure', 'ever',
               'emale', 'unctional', 'actor', 'ibromyalgia', 'atigue')),
        ('Fo', ('llicular', 'llicle', 'lic', 'cal')),
        ('Perfor', ('mance',)),
        ('Fro', ('zen',)),
        ('Dermatitis Herpetifor', ('mis',)),
    )

    words = condition.split()
    if not words:
        # Nothing to inspect; return the label as given instead of indexing an empty list.
        return condition

    if words[-1] in clipped_last_words:
        condition = condition + 'r'

    for prefix, clipped_first_words in prefix_rules:
        # Re-split each time: an earlier rule may have changed the first word.
        if condition.split()[0] in clipped_first_words:
            condition = prefix + condition
    return condition

# Run every label through the clipped-label repair.
df.condition = df.condition.map(condition_restore)

Let's look at what we have left.

In [None]:
# Distinct condition labels, other than 'missing', whose first character is still not A-Z.
set(
    df.loc[
        (df.condition != 'missing')
        & ~df.condition.str[0].isin(list(string.ascii_uppercase)),
        'condition',
    ]
)

"von Willebrand's Disease" appears to be a naturally uncapitalized condition. The others have been impossible to restore and will also be regarded as missing.

In [None]:
# Keep labels that start with A-Z or whose first word is 'von'; everything else is marked missing.
df.condition = df.condition.map(
    lambda label: label
    if label[0] in string.ascii_uppercase or label.split()[0] == 'von'
    else 'missing'
)

In [None]:
# Number of records currently labelled 'missing'.
int((df.condition == 'missing').sum())

## proposed solutions for missing values

1. For every record with a missing condition, we will assign it the condition that is most common for the drug indicated by that record.

2. Before executing solution 1, find each record's twin and use the condition label from the twin where applicable.

For now, we'll just execute solution 1 and skip the twin-matching step described in solution 2.

In [None]:
# Distinct drug names that appear on at least one record labelled 'missing'.
drugs_w_missing_condition = list(set(df.loc[df.condition == 'missing', 'drugName']))

In [None]:
# How many distinct drugs have at least one record with a 'missing' condition.
len(drugs_w_missing_condition)

This applies to about a quarter of the drugs. We'll create a dictionary that reports the most common condition for these drugs.

In [None]:
# Maps each drug with a 'missing' record to [best-guess condition, share of that condition
# among all of the drug's records].
most_common_condition = {}

for drug in drugs_w_missing_condition:
    # Filter the frame once per drug instead of once per statistic.
    labels = df.loc[df.drugName == drug, 'condition']
    shares = labels.value_counts(normalize=True)
    condition = shares.idxmax()
    if condition == 'missing' and labels.nunique(dropna=False) > 1:
        # 'missing' is not a usable guess: fall back to the most frequent real label.
        condition = labels[labels != 'missing'].value_counts().idxmax()
    # Look the share up by label. The original `value_counts(...)[0]` used the deprecated
    # positional fallback and always reported the top label's share, which belongs to
    # 'missing' whenever the fallback above was taken.
    proportion = round(shares[condition], 2)
    most_common_condition[drug] = [condition, proportion]

In [None]:
# Spot-check one entry: the [condition, proportion] pair stored for 'Viagra'.
most_common_condition['Viagra']

For example, if a review with an unlisted condition is about Viagra, we will assume the condition is Erectile Dysfunction.

In [None]:
# Replace each 'missing' label with the condition stored for that record's drug; all other
# labels are kept as they are.
df['condition'] = [
    most_common_condition[drug][0] if label == 'missing' else label
    for drug, label in zip(df.drugName, df.condition)
]

In [None]:
# Number of records still labelled 'missing' after the fill.
int((df.condition == 'missing').sum())

This is how many records still have no condition label. The drugs named in these records appear *only* in records without a listed condition. They may still have "twin" records that we could match them to, but while we're skipping that solution step, there's not really anything we can do with these records, and we may as well drop them.

In [None]:
# Remove, in place, the records whose condition is still 'missing'.
df.drop(index=df.loc[df.condition == 'missing'].index, inplace=True)

# duplicates

In [None]:
# Count rows that do (True) and do not (False) repeat an earlier row in every column.
df.duplicated(keep='first').value_counts()

In [None]:
# Show the rows flagged as repeats of an earlier row.
df.loc[df.duplicated()]

In [None]:
# Inspect one review and every row carrying the same text.
# NOTE(review): 178703 is presumably a label taken from the duplicate listing above — confirm it
# still exists when the notebook is re-run from a fresh kernel.
show_review(178703)

This is curious. The same review is recorded four times. There are two identical pairs, where the difference between the pairs is the drug name. We can drop one from each pair, but this will need to be revisited.

In [None]:
# Drop fully identical rows in place, keeping the first occurrence of each.
df.drop_duplicates(keep='first', inplace=True)

# contractions

Here is an example of a contraction.

In [None]:
# Raw excerpt of the review at label 3 (characters 56-68).
df.loc[3, 'review'][56:69]

Here is how the html function fixes it.

In [None]:
# Select the review by column label. `df.loc[3][2]` relied on the deprecated positional
# fallback for an integer key on a row Series that is indexed by column names.
html.unescape(df.loc[3, 'review'])[56:64]

Here is how the contractions function fixes (the html function's fix of) it.

In [None]:
# Select the review by column label. `df.loc[3][2]` relied on the deprecated positional
# fallback for an integer key on a row Series that is indexed by column names.
contractions.fix(html.unescape(df.loc[3, 'review']))[56:65]

Here is an instance of "ain't" with the same functions applied.

In [None]:
# Raw excerpt of the review at label 507 (characters 75-98).
df.loc[507, 'review'][75:99]

In [None]:
# The same review after HTML entities are unescaped.
html.unescape(df.loc[507, 'review'])[75:94]

In [None]:
# The same review after unescaping and running contractions.fix on it.
contractions.fix(html.unescape(df.loc[507, 'review']))[75:96]

In [None]:
# Number of reviews whose text contains the HTML-escaped form of "ain't".
len(df.loc[df.review.str.contains('ain&#039;t')])

There are 53 reviews that contain "ain't".

I'm currently having difficulty downloading the package that appropriately fixes "ain't" into "is not" or "are not" etc. This shouldn't matter after I remove stop words. I think it will be helpful to exclude negatives like "no" and "not" from the stop words. It could certainly be of help to look for bigrams like "not good".

In [None]:
# Unescape HTML entities in every review text.
df.review = df.review.map(html.unescape)

# make some dummy dfs to use

In [None]:
# Snapshot of the cleaned frame, then the number of rows that repeat an earlier row in every
# column except drugName.
df_old = df.copy()
int(df_old.duplicated(subset=df_old.columns.difference(['drugName'])).sum())

In [None]:
# Keep only the 'Birth Control' records, then count rows that repeat an earlier row in every
# column except drugName.
df_bc = df_old.loc[df_old.condition == 'Birth Control'].copy()
int(df_bc.duplicated(subset=df_bc.columns.difference(['drugName'])).sum())

In [None]:
# Seeded 20,000-row subsample of df_bc, so the duplicate count below is the same on every run
# (the original unseeded np.random.choice drew different rows each time).
rng = np.random.default_rng(0)
df_20000 = df_bc[df_bc.index.isin(rng.choice(df_bc.index.to_numpy(), 20000, replace=False))]
len(df_20000[df_20000.duplicated(subset = df_20000.columns.difference(['drugName']))])

In [None]:
# Seeded 10,000-row subsample of df_bc, so the duplicate count below is the same on every run
# (the original unseeded np.random.choice drew different rows each time).
rng = np.random.default_rng(0)
df_10000 = df_bc[df_bc.index.isin(rng.choice(df_bc.index.to_numpy(), 10000, replace=False))]
len(df_10000[df_10000.duplicated(subset = df_10000.columns.difference(['drugName']))])

In [None]:
# Seeded 5,000-row subsample of df_bc, so the duplicate count below is the same on every run
# (the original unseeded np.random.choice drew different rows each time).
rng = np.random.default_rng(0)
df_5000 = df_bc[df_bc.index.isin(rng.choice(df_bc.index.to_numpy(), 5000, replace=False))]
len(df_5000[df_5000.duplicated(subset = df_5000.columns.difference(['drugName']))])

In [None]:
# Seeded 2,000-row subsample of df_bc, so the duplicate count below is the same on every run
# (the original unseeded np.random.choice drew different rows each time).
rng = np.random.default_rng(0)
df_2000 = df_bc[df_bc.index.isin(rng.choice(df_bc.index.to_numpy(), 2000, replace=False))]
len(df_2000[df_2000.duplicated(subset = df_2000.columns.difference(['drugName']))])

# date buckets with df_bc

In [None]:
# Work on a copy of df_bc so the in-place edits that follow leave df_bc untouched.
# NOTE(review): show_review / show_similar read the global `df`, so they now see this frame.
df = df_bc.copy()

In [None]:
# Remove drugName in place, so rows that differ only by drug name become identical.
df.drop(columns='drugName', inplace=True)

In [None]:
%%time
dates_bucket = {}
for date in list(set(df[~df.index.isin(bucket_A)].date.tolist())):
    dates_bucket[date] = []
for i in df.index:
    dates_bucket[df.loc[i].date].append(i)

In [None]:
%%time
found_pairs = 0
twins = []

for i in bucket_A:
    date_i = df.loc[i].date
    for j in dates_bucket[date_i]:
        if df.loc[i].equals(df.loc[j]):
            found_pairs += 1
            dates_bucket[date_i].remove(j)
            twins.append([i,j])
            break
print(found_pairs)

# date buckets with 20000

In [None]:
# Work on a copy of df_20000 so the in-place edits that follow leave df_20000 untouched.
df = df_20000.copy()

In [None]:
# Remove drugName in place, so rows that differ only by drug name become identical.
df.drop(columns='drugName', inplace=True)

In [None]:
%%time
bucket_A = df[df.duplicated].index.tolist()
dates_bucket = {}
for date in list(set(df[~df.index.isin(bucket_A)].date.tolist())):
    dates_bucket[date] = []
for i in df.index:
    dates_bucket[df.loc[i].date].append(i)

In [None]:
%%time
found_pairs = 0
twins = []

for i in bucket_A:
    date_i = df.loc[i].date
    for j in dates_bucket[date_i]:
        if df.loc[i].equals(df.loc[j]):
            found_pairs += 1
            dates_bucket[date_i].remove(j)
            twins.append([i,j])
            break
print(found_pairs)

In [None]:
# Number of [duplicate, twin] index pairs recorded by the search above.
len(twins)

# NEXT:

experiment with the code below to see if separating into date buckets makes it faster.

# DON'T FORGET:

run tests to see whether there are triples??

# experiment: 2,000 records // 50 duplicates

In [None]:
# Work on a copy of df_2000 so the in-place edits that follow leave df_2000 untouched.
df = df_2000.copy()

In [None]:
# Remove drugName in place, so rows that differ only by drug name become identical.
df.drop(columns='drugName', inplace=True)

In [None]:
%%time
bucket_A = df[df.duplicated].index.tolist()
bucket_B = df[~df.index.isin(bucket_A)].index.tolist()

In [None]:
%%time
found_pairs = 0
for i in bucket_A:
    for j in bucket_B:
        if df.loc[i].equals(df.loc[j]):
            found_pairs += 1
print(found_pairs)

In [None]:
%%time
found_pairs = 0
for i in bucket_A:
    for j in bucket_B:
        if df.loc[i].equals(df.loc[j]):
            found_pairs += 1
            bucket_B.remove(j)
            break
print(found_pairs)

In [None]:
%%time
found_pairs = 0
twins = []
bucket_B = df[~df.index.isin(bucket_A)].index.tolist()
for i in bucket_A:
    for j in bucket_B:
        if df.loc[i].equals(df.loc[j]):
            found_pairs += 1
            bucket_B.remove(j)
            twins.append([i,j])
            break
print(found_pairs)

# experiment: 5,000 records // 310 duplicates

In [None]:
# Work on a copy of df_5000 so the in-place edits that follow leave df_5000 untouched.
df = df_5000.copy()

In [None]:
# Remove drugName in place, so rows that differ only by drug name become identical.
df.drop(columns='drugName', inplace=True)

In [None]:
%%time
bucket_A = df[df.duplicated].index.tolist()
bucket_B = df[~df.index.isin(bucket_A)].index.tolist()

In [None]:
%%time
found_pairs = 0
for i in bucket_A:
    for j in bucket_B:
        if df.loc[i].equals(df.loc[j]):
            found_pairs += 1
print(found_pairs)

In [None]:
%%time
found_pairs = 0
for i in bucket_A:
    for j in bucket_B:
        if df.loc[i].equals(df.loc[j]):
            found_pairs += 1
            bucket_B.remove(j)
            break
print(found_pairs)

In [None]:
%%time
found_pairs = 0
twins = []
bucket_B = df[~df.index.isin(bucket_A)].index.tolist()
for i in bucket_A:
    for j in bucket_B:
        if df.loc[i].equals(df.loc[j]):
            found_pairs += 1
            bucket_B.remove(j)
            twins.append([i,j])
            break
print(found_pairs)

# experiment: 5,000 records // 310 duplicates // extra dates

In [None]:
# Work on a copy of df_5000 so the in-place edits that follow leave df_5000 untouched.
df = df_5000.copy()

In [None]:
# Remove drugName in place, so rows that differ only by drug name become identical.
df.drop(columns='drugName', inplace=True)

In [None]:
%%time
bucket_A = df[df.duplicated].index.tolist()
bucket_B = df[~df.index.isin(bucket_A)].index.tolist()

In [None]:
%%time
found_pairs = 0
for i in bucket_A:
    for j in bucket_B:
        if df.loc[i].equals(df.loc[j]):
            found_pairs += 1
print(found_pairs)

In [None]:
%%time
found_pairs = 0
for i in bucket_A:
    for j in bucket_B:
        if df.loc[i].equals(df.loc[j]):
            found_pairs += 1
            bucket_B.remove(j)
            break
print(found_pairs)

In [None]:
%%time
found_pairs = 0
twins = []
bucket_B = df[~df.index.isin(bucket_A)].index.tolist()
for i in bucket_A:
    for j in bucket_B:
        if df.loc[i].equals(df.loc[j]):
            found_pairs += 1
            bucket_B.remove(j)
            twins.append([i,j])
            break
print(found_pairs)

# experiment: 39,499 records // 19,420 duplicates

In [None]:
# Work on a copy of df_bc so the in-place edits that follow leave df_bc untouched.
df = df_bc.copy()

In [None]:
# Remove drugName in place, so rows that differ only by drug name become identical.
df.drop(columns='drugName', inplace=True)

In [None]:
%%time
bucket_A = df[df.duplicated].index.tolist()
bucket_B = df[~df.index.isin(bucket_A)].index.tolist()

In [None]:
%%time
found_pairs = 0
twins = []
for i in bucket_A:
    for j in bucket_B:
        if df.loc[i].equals(df.loc[j]):
            found_pairs += 1
            bucket_B.remove(j)
            twins.append([i,j])
            break
print(found_pairs)

In [None]:
# Number of [duplicate, twin] index pairs recorded by the search above.
len(twins)

# end