In [1]:
import pandas as pd
data_test = 'https://raw.githubusercontent.com/aryan-bu/BA820/main/drug_reviews_dataset/drugsComTest_raw.tsv'
data_train = 'https://raw.githubusercontent.com/aryan-bu/BA820/main/drug_reviews_dataset/drugsComTrain_raw.tsv'

df_test = pd.read_csv(data_test, delimiter='\t')
df_train = pd.read_csv(data_train, delimiter='\t')

In [2]:
df_test.head()

Unnamed: 0.1,Unnamed: 0,drugName,condition,review,rating,date,usefulCount
0,163740,Mirtazapine,Depression,"""I&#039;ve tried a few antidepressants over th...",10.0,"February 28, 2012",22
1,206473,Mesalamine,"Crohn's Disease, Maintenance","""My son has Crohn&#039;s disease and has done ...",8.0,"May 17, 2009",17
2,159672,Bactrim,Urinary Tract Infection,"""Quick reduction of symptoms""",9.0,"September 29, 2017",3
3,39293,Contrave,Weight Loss,"""Contrave combines drugs that were used for al...",9.0,"March 5, 2017",35
4,97768,Cyclafem 1 / 35,Birth Control,"""I have been on this birth control for one cyc...",9.0,"October 22, 2015",4


In [3]:
df_train.head()

Unnamed: 0.1,Unnamed: 0,drugName,condition,review,rating,date,usefulCount
0,206461,Valsartan,Left Ventricular Dysfunction,"""It has no side effect, I take it in combinati...",9.0,"May 20, 2012",27
1,95260,Guanfacine,ADHD,"""My son is halfway through his fourth week of ...",8.0,"April 27, 2010",192
2,92703,Lybrel,Birth Control,"""I used to take another oral contraceptive, wh...",5.0,"December 14, 2009",17
3,138000,Ortho Evra,Birth Control,"""This is my first time using any form of birth...",8.0,"November 3, 2015",10
4,35696,Buprenorphine / naloxone,Opiate Dependence,"""Suboxone has completely turned my life around...",9.0,"November 27, 2016",37


In [4]:
df_test.shape

(53766, 7)

In [5]:
df_train.shape

(161297, 7)

In [6]:
df = pd.concat([df_test, df_train])
df.shape

(215063, 7)

In [7]:
df = df.rename(columns={'Unnamed: 0': 'index'})
df = df.sort_values(by='index', ascending=True)
df.head()

Unnamed: 0,index,drugName,condition,review,rating,date,usefulCount
47805,0,Medroxyprogesterone,Abnormal Uterine Bleeding,"""Been on the depo injection since January 2015...",3.0,"October 28, 2015",4
93135,2,Medroxyprogesterone,Amenorrhea,"""I&#039;m 21 years old and recently found out ...",10.0,"October 27, 2015",11
143331,3,Medroxyprogesterone,Abnormal Uterine Bleeding,"""I have been on the shot 11 years and until a ...",8.0,"October 27, 2015",7
57030,4,Medroxyprogesterone,Birth Control,"""Ive had four shots at this point. I was on bi...",9.0,"October 26, 2015",12
106347,5,Medroxyprogesterone,Abnormal Uterine Bleeding,"""I had a total of 3 shots. I got my first one ...",1.0,"October 25, 2015",4


In place of the ' symbol, we have some unwanted characters. We can remove them for our analysis. 

In [12]:
df['review'] = df['review'].str.replace('&#039;', '')
df.head()

Unnamed: 0,index,drugName,condition,review,rating,date,usefulCount
47805,0,Medroxyprogesterone,Abnormal Uterine Bleeding,"""Been on the depo injection since January 2015...",3.0,"October 28, 2015",4
93135,2,Medroxyprogesterone,Amenorrhea,"""Im 21 years old and recently found out I migh...",10.0,"October 27, 2015",11
143331,3,Medroxyprogesterone,Abnormal Uterine Bleeding,"""I have been on the shot 11 years and until a ...",8.0,"October 27, 2015",7
57030,4,Medroxyprogesterone,Birth Control,"""Ive had four shots at this point. I was on bi...",9.0,"October 26, 2015",12
106347,5,Medroxyprogesterone,Abnormal Uterine Bleeding,"""I had a total of 3 shots. I got my first one ...",1.0,"October 25, 2015",4


In [13]:
df.isnull().sum()

index          0
drugName       0
condition      0
review         0
rating         0
date           0
usefulCount    0
dtype: int64

In [14]:
df = df.dropna()
df.isnull().sum()

index          0
drugName       0
condition      0
review         0
rating         0
date           0
usefulCount    0
dtype: int64

In [10]:
duplicates = df[df.duplicated()]
duplicates

Unnamed: 0,index,drugName,condition,review,rating,date,usefulCount


There are no duplicate values as well and we can move ahead with our analysis. Since the dataset is large, we can randomly select 20,000 rows for initial analysis.

## Sentiment Categorization

In [16]:
df.rating.unique()

array([ 3., 10.,  8.,  9.,  1.,  5.,  2.,  7.,  4.,  6.])

In [17]:
df['sentiment'] = df['rating'].apply(lambda x: 'negative' if x <= 3 else ('neutral' if x <= 6 else 'positive'))

##Sampling

In [18]:
sample_df = df.sample(n=20000, random_state=42)
sample_df.shape

(20000, 8)

##Tokenization

In [23]:
from nltk.tokenize import word_tokenize, wordpunct_tokenize, sent_tokenize, WhitespaceTokenizer
from nltk.tokenize.casual import TweetTokenizer
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [21]:
review_len = len(df['review'])
review_len

213869

###Sentence Tokenization

In [27]:
tokenized = [[sent_tokenize(review)] for review in df['review']]
#tokenized

###Word Tokenization

In [28]:
tokenized_word = [word_tokenize(review) for review in df['review']]
#tokenized_word.head()

###Punctuation Tokenization

In [29]:
tokenized_punct = [WhitespaceTokenizer().tokenize(review) for review in df['review']]
#tokenized_punct