In [1]:
import os
import pandas as pd
import numpy as np

import dateparser

from bs4 import BeautifulSoup  
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords 
from nltk.stem.porter import PorterStemmer
from nltk.stem import SnowballStemmer, WordNetLemmatizer
from nltk import sent_tokenize, word_tokenize, pos_tag

[nltk_data] Downloading package stopwords to /Users/qin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


#### Load data

In [2]:
# Resolve every input location relative to the current working directory.
cur_dir = os.getcwd()
print("Current directory: {}".format(cur_dir))

# os.path.join uses the platform's separator instead of a hard-coded "/".
data_path = os.path.join(cur_dir, "drugsCom")
train_path = os.path.join(data_path, "drugsComTrain_raw.tsv")
test_path = os.path.join(data_path, "drugsComTest_raw.tsv")
fda_path = os.path.join(data_path, "drugs_recalls_fda.tsv")

print("Path of the raw data: {}".format(data_path))
# Listing the directory also fails fast if the data folder is missing.
print("Files in the drugCom directory: {}".format(os.listdir(data_path)))

Current directory: /Users/qin/Desktop/DrugCom/CS6250_Project
Path of the raw data: /Users/qin/Desktop/DrugCom/CS6250_Project/drugsCom
Files in the drugCom directory: ['whole.csv', 'test.csv', 'drugs_recalls_fda.tsv', 'readme.md', 'drugsComTest_raw.tsv', 'fda.csv', 'drugsComTrain_raw.tsv', 'train.csv']


In [3]:
# All three inputs are tab-separated files; read them with the same reader.
train = pd.read_csv(train_path, sep='\t')
test = pd.read_csv(test_path, sep='\t')
fda = pd.read_csv(fda_path, sep='\t')

#### Helper functions

In [4]:
def parse_condition(string):
    """Normalise a raw ``condition`` value into a clean, lower-case label.

    Hard-coded handling of the artefacts seen in ``drugsCom['condition']``:

    * missing value (NaN / None)                      -> ''
    * '23</span> users found this comment helpful.'   -> ''
    * 'not listed / othe'                             -> ''
    * very short fragments such as 'pe', 'me'         -> ''
    * 'mist ('                                        -> 'mist'
    * 'min / sitagliptin)'                            -> 'min / sitagliptin'
    * anything else                                   -> lower-cased

    Parameters
    ----------
    string : str or float or None
        Raw condition value; missing entries are expected as NaN/None.

    Returns
    -------
    str
        The cleaned condition, or '' when the value is missing or is one of
        the known junk patterns.
    """
    # np.isnan() raises TypeError on str input, so detect missing values by
    # type instead: anything that is not a string is treated as missing.
    if not isinstance(string, str):
        return ""

    # Match on the lower-cased text so the junk patterns are caught
    # regardless of the capitalisation used in the raw data.
    lowered = string.lower()

    if "</span>" in lowered:
        return ""
    if "not listed" in lowered:
        return ""
    if len(lowered) < 4:
        return ""

    has_open = '(' in lowered
    has_close = ')' in lowered
    # An unbalanced parenthesis is a truncation artefact: trim it together
    # with the surrounding spaces.
    if has_open and not has_close:
        return lowered.strip('( ')
    if has_close and not has_open:
        return lowered.strip(') ')
    return lowered

def cleanText(raw_text, remove_stopwords=True, stemming=False, split_text=True):
    """Convert a raw review into a cleaned review.

    Steps, in order: strip HTML markup, replace every non-letter character
    with a space, lower-case, split on whitespace, optionally drop English
    stopwords, optionally stem each word with the Snowball stemmer.

    Returns the list of words when ``split_text`` is True, otherwise the
    words joined by single spaces.
    """
    # Remove HTML, then keep letters only.
    plain = BeautifulSoup(raw_text, 'lxml').get_text()
    tokens = re.sub("[^a-zA-Z]", " ", plain).lower().split()

    if remove_stopwords:
        stop_set = set(stopwords.words("english"))
        tokens = [tok for tok in tokens if tok not in stop_set]

    if stemming == True:
        # Stemmers remove morphological affixes from words, leaving only the word stem.
        # http://www.nltk.org/howto/stem.html
        snowball = SnowballStemmer('english')
        tokens = list(map(snowball.stem, tokens))

    return tokens if split_text == True else " ".join(tokens)

def average_word_length(list_of_words, is_list = True):
    """Return the mean length of the given words.

    When ``is_list`` is True, ``list_of_words`` is an iterable of words
    (anything that is not exactly a ``list`` is converted with ``list()``).
    Otherwise it is a string that is first split on single spaces.
    An empty word list yields 0.
    """
    if is_list != True:
        # Plain text: split on single spaces, then average as a list.
        return average_word_length(list_of_words.split(' '), is_list = True)

    words = list_of_words if type(list_of_words) == list else list(list_of_words)
    if words == []:
        return 0
    return sum(len(word) for word in words) / len(words)

In [5]:
def construct_df(drugsCom):
    """Build the feature columns for a drugs.com review frame.

    Reads the columns ``date``, ``drugName``, ``condition`` and ``review``
    and relies on the module-level ``fda`` frame (column
    ``'Product Description'``) plus the helpers ``cleanText`` and
    ``average_word_length``.

    Columns written:

    * ``date``                            parsed timestamp
    * ``partial_name``                    text before the first '/' of
                                          ``drugName``, stripped, lower-cased
    * ``is_recalled``                     True when ``partial_name`` occurs
                                          (case-insensitively) in any FDA
                                          product description
    * ``condition``                       lower-cased, stripped, '' if missing
    * ``review_length``                   number of characters in ``review``
    * ``review_word_count``               number of space-separated pieces
    * ``cleaned_words``                   output of ``cleanText``
    * ``review_cleaned_word_count``       ``len(cleaned_words)``
    * ``review_avg_word_length``          mean length of space-separated pieces
    * ``review_avg_cleaned_word_length``  mean length of the cleaned words

    Returns a new DataFrame; the frame passed in is left untouched.
    """
    # Work on a copy so the caller's frame is not modified as a side effect.
    drugsCom = drugsCom.copy()

    # Parse each distinct date string once instead of once per row.
    date_lookup = {
        raw: pd.Timestamp(dateparser.parse(raw), unit = 'D')
        for raw in drugsCom['date'].unique()
    }
    drugsCom['date'] = drugsCom['date'].map(date_lookup)

    drugsCom['partial_name'] = drugsCom['drugName'].apply(lambda x: x.split('/')[0].strip().lower())

    # Scan the FDA descriptions once per distinct name rather than once per
    # row. An empty name is a substring of every description, so it is
    # never treated as recalled; na=False makes missing descriptions count
    # as "no match".
    descriptions = fda['Product Description']
    recall_lookup = {
        name: bool(name) and bool(
            descriptions.str.contains(name, case=False, regex=False, na=False).any()
        )
        for name in drugsCom['partial_name'].unique()
    }
    drugsCom['is_recalled'] = drugsCom['partial_name'].map(recall_lookup).astype(bool)

    # Missing conditions become '' after the strip.
    drugsCom['condition'] = drugsCom['condition'].fillna(' ')
    drugsCom['condition'] = drugsCom['condition'].apply(lambda x: x.lower().strip())

    drugsCom['review_length'] = drugsCom['review'].apply(len)
    drugsCom['review_word_count'] = drugsCom['review'].apply(lambda x: len(x.split(' ')))

    drugsCom['cleaned_words'] = drugsCom['review'].apply(
        lambda x: cleanText(x, remove_stopwords=True, stemming=False, split_text=True)
    )
    drugsCom['review_cleaned_word_count'] = drugsCom['cleaned_words'].apply(len)

    drugsCom['review_avg_word_length'] = drugsCom['review'].apply(
        lambda x: average_word_length(x, is_list = False)
    )
    drugsCom['review_avg_cleaned_word_length'] = drugsCom['cleaned_words'].apply(
        lambda x: average_word_length(x, is_list = True)
    )
    return drugsCom
    

In [6]:
# Preview the first rows of the training data as loaded, before construct_df is applied.
train.head()

Unnamed: 0.1,Unnamed: 0,drugName,condition,review,rating,date,usefulCount
0,206461,Valsartan,Left Ventricular Dysfunction,"""It has no side effect, I take it in combinati...",9.0,"May 20, 2012",27
1,95260,Guanfacine,ADHD,"""My son is halfway through his fourth week of ...",8.0,"April 27, 2010",192
2,92703,Lybrel,Birth Control,"""I used to take another oral contraceptive, wh...",5.0,"December 14, 2009",17
3,138000,Ortho Evra,Birth Control,"""This is my first time using any form of birth...",8.0,"November 3, 2015",10
4,35696,Buprenorphine / naloxone,Opiate Dependence,"""Suboxone has completely turned my life around...",9.0,"November 27, 2016",37


In [7]:
# Add the derived feature columns to both splits (train first, then test).
train, test = [construct_df(split) for split in (train, test)]

In [8]:
# Inspect the training frame after construct_df has added the derived columns.
train.head()

Unnamed: 0.1,Unnamed: 0,drugName,condition,review,rating,date,usefulCount,partial_name,is_recalled,review_length,review_word_count,cleaned_words,review_cleaned_word_count,review_avg_word_length,review_avg_cleaned_word_length
0,206461,Valsartan,left ventricular dysfunction,"""It has no side effect, I take it in combinati...",9.0,2012-05-20,27,valsartan,True,79,17,"[side, effect, take, combination, bystolic, mg...",8,3.705882,5.25
1,95260,Guanfacine,adhd,"""My son is halfway through his fourth week of ...",8.0,2010-04-27,192,guanfacine,False,741,141,"[son, halfway, fourth, week, intuniv, became, ...",65,4.262411,5.630769
2,92703,Lybrel,birth control,"""I used to take another oral contraceptive, wh...",5.0,2009-12-14,17,lybrel,False,755,133,"[used, take, another, oral, contraceptive, pil...",69,4.684211,5.637681
3,138000,Ortho Evra,birth control,"""This is my first time using any form of birth...",8.0,2015-11-03,10,ortho evra,False,448,89,"[first, time, using, form, birth, control, gla...",39,4.044944,5.282051
4,35696,Buprenorphine / naloxone,opiate dependence,"""Suboxone has completely turned my life around...",9.0,2016-11-27,37,buprenorphine,False,719,134,"[suboxone, completely, turned, life, around, f...",59,4.373134,6.355932


In [9]:
# Inspect the test frame after construct_df has added the derived columns.
test.head()

Unnamed: 0.1,Unnamed: 0,drugName,condition,review,rating,date,usefulCount,partial_name,is_recalled,review_length,review_word_count,cleaned_words,review_cleaned_word_count,review_avg_word_length,review_avg_cleaned_word_length
0,163740,Mirtazapine,depression,"""I&#039;ve tried a few antidepressants over th...",10.0,2012-02-28,22,mirtazapine,False,439,68,"[tried, antidepressants, years, citalopram, fl...",36,5.470588,7.083333
1,206473,Mesalamine,"crohn's disease, maintenance","""My son has Crohn&#039;s disease and has done ...",8.0,2009-05-17,17,mesalamine,False,268,51,"[son, crohn, disease, done, well, asacol, comp...",24,4.27451,5.5
2,159672,Bactrim,urinary tract infection,"""Quick reduction of symptoms""",9.0,2017-09-29,3,bactrim,False,29,4,"[quick, reduction, symptoms]",3,6.5,7.333333
3,39293,Contrave,weight loss,"""Contrave combines drugs that were used for al...",9.0,2017-03-05,35,contrave,False,782,143,"[contrave, combines, drugs, used, alcohol, smo...",64,4.475524,5.625
4,97768,Cyclafem 1 / 35,birth control,"""I have been on this birth control for one cyc...",9.0,2015-10-22,4,cyclafem 1,False,762,149,"[birth, control, one, cycle, reading, reviews,...",63,4.120805,5.603175


In [10]:
# 'Unnamed: 0' holds the id of each review: check that the first column has
# one distinct value per row in each split, then give it a proper name.
assert train.iloc[:, 0].nunique() == train.shape[0]
assert test.iloc[:, 0].nunique() == test.shape[0]
train, test = [split.rename(columns = {'Unnamed: 0':'id'}) for split in (train, test)]

In [11]:
# Persist both splits as comma-separated files with a header row and no index
# column (separator, NaN representation and header are the to_csv defaults).
train.to_csv('train.csv', index=False)
test.to_csv('test.csv', index=False)

In [12]:
# Stack train on top of test with a fresh 0..n-1 index, then save the result
# as a comma-separated file without the index column.
whole = pd.concat([train, test], ignore_index=True)
whole.to_csv('whole.csv', index=False)