# Sentiment Analysis of Drug Reviews Using Natural Language Processing Techniques

## Problem Statement:

This study applies sentiment analysis and textual pattern extraction on drug reviews to identify not only user sentiment but also key factors such as side effects, ineffectiveness, or condition mismatch that contribute to negative medication experiences.

In [None]:
import pandas as pd
import re
import nltk
import string
import html
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split

# download NLTK resources (do only once)
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


True

## Basic Data Cleaning

1. Have basic understanding and cleaning of the data.

In [None]:
#df = pd.read_csv('drugsComTrain_raw.tsv' , sep='\t')
df = pd.read_csv('drugsComTrain_raw.tsv', sep='\t', on_bad_lines='skip')

df.head()

Unnamed: 0.1,Unnamed: 0,drugName,condition,review,rating,date,usefulCount
0,206461,Valsartan,Left Ventricular Dysfunction,"""It has no side effect, I take it in combinati...",9.0,"May 20, 2012",27
1,95260,Guanfacine,ADHD,"""My son is halfway through his fourth week of ...",8.0,"April 27, 2010",192
2,92703,Lybrel,Birth Control,"""I used to take another oral contraceptive, wh...",5.0,"December 14, 2009",17
3,138000,Ortho Evra,Birth Control,"""This is my first time using any form of birth...",8.0,"November 3, 2015",10
4,35696,Buprenorphine / naloxone,Opiate Dependence,"""Suboxone has completely turned my life around...",9.0,"November 27, 2016",37


In [None]:
df.tail()

Unnamed: 0.1,Unnamed: 0,drugName,condition,review,rating,date,usefulCount
161292,191035,Campral,Alcohol Dependence,"""I wrote my first report in Mid-October of 201...",10.0,"May 31, 2015",125
161293,127085,Metoclopramide,Nausea/Vomiting,"""I was given this in IV before surgey. I immed...",1.0,"November 1, 2011",34
161294,187382,Orencia,Rheumatoid Arthritis,"""Limited improvement after 4 months, developed...",2.0,"March 15, 2014",35
161295,47128,Thyroid desiccated,Underactive Thyroid,"""I&#039;ve been on thyroid medication 49 years...",10.0,"September 19, 2015",79
161296,215220,Lubiprostone,"Constipation, Chronic","""I&#039;ve had chronic constipation all my adu...",9.0,"December 13, 2014",116


In [None]:
#Basic checks
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 161297 entries, 0 to 161296
Data columns (total 7 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   Unnamed: 0   161297 non-null  int64  
 1   drugName     161297 non-null  object 
 2   condition    160398 non-null  object 
 3   review       161297 non-null  object 
 4   rating       161297 non-null  float64
 5   date         161297 non-null  object 
 6   usefulCount  161297 non-null  int64  
dtypes: float64(1), int64(2), object(4)
memory usage: 8.6+ MB


In [None]:
#Check for missing values
df.isna().sum()


Unnamed: 0,0
Unnamed: 0,0
drugName,0
condition,899
review,0
rating,0
date,0
usefulCount,0


In [None]:
#Remove missing values as it only contributes to ~0.9% of the data
df.dropna(subset=['condition'], inplace=True)
#Double check for missing values
df.isna().sum()

Unnamed: 0,0
Unnamed: 0,0
drugName,0
condition,0
review,0
rating,0
date,0
usefulCount,0


In [None]:
#Rename columns
df.rename(columns={'Unnamed: 0': 'patient_id'}, inplace=True)

In [None]:
#Check for duplicates
df.duplicated().sum()

np.int64(0)

In [None]:
#Change date to proper format
df['date'] = pd.to_datetime(df['date'], format='%B %d, %Y')


In [None]:
#Final check
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 160398 entries, 0 to 161296
Data columns (total 7 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   patient_id   160398 non-null  int64         
 1   drugName     160398 non-null  object        
 2   condition    160398 non-null  object        
 3   review       160398 non-null  object        
 4   rating       160398 non-null  float64       
 5   date         160398 non-null  datetime64[ns]
 6   usefulCount  160398 non-null  int64         
dtypes: datetime64[ns](1), float64(1), int64(2), object(3)
memory usage: 9.8+ MB


In [None]:
#Create copy of cleaned data
df2=df.copy()

In [None]:
df2.head()

Unnamed: 0,patient_id,drugName,condition,review,rating,date,usefulCount
0,206461,Valsartan,Left Ventricular Dysfunction,"""It has no side effect, I take it in combinati...",9.0,2012-05-20,27
1,95260,Guanfacine,ADHD,"""My son is halfway through his fourth week of ...",8.0,2010-04-27,192
2,92703,Lybrel,Birth Control,"""I used to take another oral contraceptive, wh...",5.0,2009-12-14,17
3,138000,Ortho Evra,Birth Control,"""This is my first time using any form of birth...",8.0,2015-11-03,10
4,35696,Buprenorphine / naloxone,Opiate Dependence,"""Suboxone has completely turned my life around...",9.0,2016-11-27,37


## Text Preprocessing (from lab 3 & 4)

1. Sentence Segmentation

In [None]:
from nltk.tokenize import sent_tokenize

pd.set_option('display.max_colwidth', None)

df2['sentences'] = df2['review'].apply(sent_tokenize)
df2[['sentences']].head()

Unnamed: 0,sentences
0,"[""It has no side effect, I take it in combination of Bystolic 5 Mg and Fish Oil""]"
1,"[""My son is halfway through his fourth week of Intuniv., We became concerned when he began this last week, when he started taking the highest dose he will be on., For two days, he could hardly get out of bed, was very cranky, and slept for nearly 8 hours on a drive home from school vacation (very unusual for him.), I called his doctor on Monday morning and she said to stick it out a few days., See how he did at school, and with getting up in the morning., The last two days have been problem free., He is MUCH more agreeable than ever., He is less emotional (a good thing), less cranky., He is remembering all the things he should., Overall his behavior is better., We have tried many different medications and so far this is the most effective.""]"
2,"[""I used to take another oral contraceptive, which had 21 pill cycle, and was very happy- very light periods, max 5 days, no other side effects., But it contained hormone gestodene, which is not available in US, so I switched to Lybrel, because the ingredients are similar., When my other pills ended, I started Lybrel immediately, on my first day of period, as the instructions said., And the period lasted for two weeks., When taking the second pack- same two weeks., And now, with third pack things got even worse- my third period lasted for two weeks and now it&#039;s the end of the third week- I still have daily brown discharge., The positive side is that I didn&#039;t have any other side effects., The idea of being period free was so tempting..., Alas.""]"
3,"[""This is my first time using any form of birth control., I&#039;m glad I went with the patch, I have been on it for 8 months., At first It decreased my libido but that subsided., The only downside is that it made my periods longer (5-6 days to be exact) I used to only have periods for 3-4 days max also made my cramps intense for the first two days of my period, I never had cramps before using birth control., Other than that in happy with the patch""]"
4,"[""Suboxone has completely turned my life around., I feel healthier, I&#039;m excelling at my job and I always have money in my pocket and my savings account., I had none of those before Suboxone and spent years abusing oxycontin., My paycheck was already spent by the time I got it and I started resorting to scheming and stealing to fund my addiction., All that is history., If you&#039;re ready to stop, there&#039;s a good chance that suboxone will put you on the path of great life again., I have found the side-effects to be minimal compared to oxycontin., I&#039;m actually sleeping better., Slight constipation is about it for me., It truly is amazing., The cost pales in comparison to what I spent on oxycontin.""]"


2. Tokenization

In [None]:
df2['tokens'] = df2['review'].apply(word_tokenize)
df2[['tokens']].head()

Unnamed: 0,tokens
0,"[``, It, has, no, side, effect, ,, I, take, it, in, combination, of, Bystolic, 5, Mg, and, Fish, Oil, '']"
1,"[``, My, son, is, halfway, through, his, fourth, week, of, Intuniv, ., We, became, concerned, when, he, began, this, last, week, ,, when, he, started, taking, the, highest, dose, he, will, be, on, ., For, two, days, ,, he, could, hardly, get, out, of, bed, ,, was, very, cranky, ,, and, slept, for, nearly, 8, hours, on, a, drive, home, from, school, vacation, (, very, unusual, for, him, ., ), I, called, his, doctor, on, Monday, morning, and, she, said, to, stick, it, out, a, few, days, ., See, how, he, did, at, school, ,, and, with, getting, up, in, ...]"
2,"[``, I, used, to, take, another, oral, contraceptive, ,, which, had, 21, pill, cycle, ,, and, was, very, happy-, very, light, periods, ,, max, 5, days, ,, no, other, side, effects, ., But, it, contained, hormone, gestodene, ,, which, is, not, available, in, US, ,, so, I, switched, to, Lybrel, ,, because, the, ingredients, are, similar, ., When, my, other, pills, ended, ,, I, started, Lybrel, immediately, ,, on, my, first, day, of, period, ,, as, the, instructions, said, ., And, the, period, lasted, for, two, weeks, ., When, taking, the, second, pack-, same, two, weeks, ., And, now, ,, ...]"
3,"[``, This, is, my, first, time, using, any, form, of, birth, control, ., I, &, #, 039, ;, m, glad, I, went, with, the, patch, ,, I, have, been, on, it, for, 8, months, ., At, first, It, decreased, my, libido, but, that, subsided, ., The, only, downside, is, that, it, made, my, periods, longer, (, 5-6, days, to, be, exact, ), I, used, to, only, have, periods, for, 3-4, days, max, also, made, my, cramps, intense, for, the, first, two, days, of, my, period, ,, I, never, had, cramps, before, using, birth, control, ., Other, than, that, in, happy, ...]"
4,"[``, Suboxone, has, completely, turned, my, life, around, ., I, feel, healthier, ,, I, &, #, 039, ;, m, excelling, at, my, job, and, I, always, have, money, in, my, pocket, and, my, savings, account, ., I, had, none, of, those, before, Suboxone, and, spent, years, abusing, oxycontin, ., My, paycheck, was, already, spent, by, the, time, I, got, it, and, I, started, resorting, to, scheming, and, stealing, to, fund, my, addiction, ., All, that, is, history, ., If, you, &, #, 039, ;, re, ready, to, stop, ,, there, &, #, 039, ;, s, a, good, chance, that, suboxone, ...]"


3. Case folding

In [None]:
#lowercasing
df2['tokens_lower'] = df2['tokens'].apply(lambda x: [word.lower() for word in x])
print(df2[['tokens_lower']].head())

#uppercasing
df2['tokens_upper'] = df2['tokens'].apply(lambda x: [word.upper() for word in x])
print(df2[['tokens_upper']].head())



                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           tokens_lower
0                                                                                                                                                                                                                                                                                                                                                                                                               

4. Punctuation Removal

In [None]:
df2['tokens_no_punct'] = df2['tokens_lower'].apply(lambda x: [word for word in x if word not in string.punctuation])
df2 [['tokens_no_punct']].head()

Unnamed: 0,tokens_no_punct
0,"[``, it, has, no, side, effect, i, take, it, in, combination, of, bystolic, 5, mg, and, fish, oil, '']"
1,"[``, my, son, is, halfway, through, his, fourth, week, of, intuniv, we, became, concerned, when, he, began, this, last, week, when, he, started, taking, the, highest, dose, he, will, be, on, for, two, days, he, could, hardly, get, out, of, bed, was, very, cranky, and, slept, for, nearly, 8, hours, on, a, drive, home, from, school, vacation, very, unusual, for, him, i, called, his, doctor, on, monday, morning, and, she, said, to, stick, it, out, a, few, days, see, how, he, did, at, school, and, with, getting, up, in, the, morning, the, last, two, days, have, been, problem, free, he, ...]"
2,"[``, i, used, to, take, another, oral, contraceptive, which, had, 21, pill, cycle, and, was, very, happy-, very, light, periods, max, 5, days, no, other, side, effects, but, it, contained, hormone, gestodene, which, is, not, available, in, us, so, i, switched, to, lybrel, because, the, ingredients, are, similar, when, my, other, pills, ended, i, started, lybrel, immediately, on, my, first, day, of, period, as, the, instructions, said, and, the, period, lasted, for, two, weeks, when, taking, the, second, pack-, same, two, weeks, and, now, with, third, pack, things, got, even, worse-, my, third, period, lasted, for, two, weeks, and, now, ...]"
3,"[``, this, is, my, first, time, using, any, form, of, birth, control, i, 039, m, glad, i, went, with, the, patch, i, have, been, on, it, for, 8, months, at, first, it, decreased, my, libido, but, that, subsided, the, only, downside, is, that, it, made, my, periods, longer, 5-6, days, to, be, exact, i, used, to, only, have, periods, for, 3-4, days, max, also, made, my, cramps, intense, for, the, first, two, days, of, my, period, i, never, had, cramps, before, using, birth, control, other, than, that, in, happy, with, the, patch, '']"
4,"[``, suboxone, has, completely, turned, my, life, around, i, feel, healthier, i, 039, m, excelling, at, my, job, and, i, always, have, money, in, my, pocket, and, my, savings, account, i, had, none, of, those, before, suboxone, and, spent, years, abusing, oxycontin, my, paycheck, was, already, spent, by, the, time, i, got, it, and, i, started, resorting, to, scheming, and, stealing, to, fund, my, addiction, all, that, is, history, if, you, 039, re, ready, to, stop, there, 039, s, a, good, chance, that, suboxone, will, put, you, on, the, path, of, great, life, again, i, have, found, the, side-effects, to, ...]"


5. Stopword Removal

In [None]:
stop_words = set(stopwords.words('english'))
df2['tokens_no_stop'] = df2['tokens'].apply(lambda x: [word for word in x if word not in stop_words])
df2[['tokens_no_stop']].head()

Unnamed: 0,tokens_no_stop
0,"[``, It, side, effect, ,, I, take, combination, Bystolic, 5, Mg, Fish, Oil, '']"
1,"[``, My, son, halfway, fourth, week, Intuniv, ., We, became, concerned, began, last, week, ,, started, taking, highest, dose, ., For, two, days, ,, could, hardly, get, bed, ,, cranky, ,, slept, nearly, 8, hours, drive, home, school, vacation, (, unusual, ., ), I, called, doctor, Monday, morning, said, stick, days, ., See, school, ,, getting, morning, ., The, last, two, days, problem, free, ., He, MUCH, agreeable, ever, ., He, less, emotional, (, good, thing, ), ,, less, cranky, ., He, remembering, things, ., Overall, behavior, better, ., We, tried, many, different, medications, far, effective, ., '']"
2,"[``, I, used, take, another, oral, contraceptive, ,, 21, pill, cycle, ,, happy-, light, periods, ,, max, 5, days, ,, side, effects, ., But, contained, hormone, gestodene, ,, available, US, ,, I, switched, Lybrel, ,, ingredients, similar, ., When, pills, ended, ,, I, started, Lybrel, immediately, ,, first, day, period, ,, instructions, said, ., And, period, lasted, two, weeks, ., When, taking, second, pack-, two, weeks, ., And, ,, third, pack, things, got, even, worse-, third, period, lasted, two, weeks, &, #, 039, ;, end, third, week-, I, still, daily, brown, discharge, ., The, positive, side, I, &, #, 039, ...]"
3,"[``, This, first, time, using, form, birth, control, ., I, &, #, 039, ;, glad, I, went, patch, ,, I, 8, months, ., At, first, It, decreased, libido, subsided, ., The, downside, made, periods, longer, (, 5-6, days, exact, ), I, used, periods, 3-4, days, max, also, made, cramps, intense, first, two, days, period, ,, I, never, cramps, using, birth, control, ., Other, happy, patch, '']"
4,"[``, Suboxone, completely, turned, life, around, ., I, feel, healthier, ,, I, &, #, 039, ;, excelling, job, I, always, money, pocket, savings, account, ., I, none, Suboxone, spent, years, abusing, oxycontin, ., My, paycheck, already, spent, time, I, got, I, started, resorting, scheming, stealing, fund, addiction, ., All, history, ., If, &, #, 039, ;, ready, stop, ,, &, #, 039, ;, good, chance, suboxone, put, path, great, life, ., I, found, side-effects, minimal, compared, oxycontin, ., I, &, #, 039, ;, actually, sleeping, better, ., Slight, constipation, ., It, truly, amazing, ., The, cost, pales, comparison, I, spent, ...]"


6. Abbreviation Handling

In [None]:
def clean_abbreviations(tokens):
    #Remove dots from technical abbreviations
    text = ' '.join(tokens)
    text = re.sub(r'\b(Dr|Mr|Ms|M\.D|U\.S|U\.K|M\.I\.T)\.', lambda x: x.group(0).replace('.', ''), text)
    #Remove dots from single-letter abbreviations (e.g., M.I.T. → MIT)
    text = re.sub(r'\b([A-Z])\.', r'\1', text)
    return text.split()

df2['tokens_no_abbrev'] = df2['tokens_no_stop'].apply(clean_abbreviations)
df2['tokens_no_abbrev'].head()

Unnamed: 0,tokens_no_abbrev
0,"[``, It, side, effect, ,, I, take, combination, Bystolic, 5, Mg, Fish, Oil, '']"
1,"[``, My, son, halfway, fourth, week, Intuniv, ., We, became, concerned, began, last, week, ,, started, taking, highest, dose, ., For, two, days, ,, could, hardly, get, bed, ,, cranky, ,, slept, nearly, 8, hours, drive, home, school, vacation, (, unusual, ., ), I, called, doctor, Monday, morning, said, stick, days, ., See, school, ,, getting, morning, ., The, last, two, days, problem, free, ., He, MUCH, agreeable, ever, ., He, less, emotional, (, good, thing, ), ,, less, cranky, ., He, remembering, things, ., Overall, behavior, better, ., We, tried, many, different, medications, far, effective, ., '']"
2,"[``, I, used, take, another, oral, contraceptive, ,, 21, pill, cycle, ,, happy-, light, periods, ,, max, 5, days, ,, side, effects, ., But, contained, hormone, gestodene, ,, available, US, ,, I, switched, Lybrel, ,, ingredients, similar, ., When, pills, ended, ,, I, started, Lybrel, immediately, ,, first, day, period, ,, instructions, said, ., And, period, lasted, two, weeks, ., When, taking, second, pack-, two, weeks, ., And, ,, third, pack, things, got, even, worse-, third, period, lasted, two, weeks, &, #, 039, ;, end, third, week-, I, still, daily, brown, discharge, ., The, positive, side, I, &, #, 039, ...]"
3,"[``, This, first, time, using, form, birth, control, ., I, &, #, 039, ;, glad, I, went, patch, ,, I, 8, months, ., At, first, It, decreased, libido, subsided, ., The, downside, made, periods, longer, (, 5-6, days, exact, ), I, used, periods, 3-4, days, max, also, made, cramps, intense, first, two, days, period, ,, I, never, cramps, using, birth, control, ., Other, happy, patch, '']"
4,"[``, Suboxone, completely, turned, life, around, ., I, feel, healthier, ,, I, &, #, 039, ;, excelling, job, I, always, money, pocket, savings, account, ., I, none, Suboxone, spent, years, abusing, oxycontin, ., My, paycheck, already, spent, time, I, got, I, started, resorting, scheming, stealing, fund, addiction, ., All, history, ., If, &, #, 039, ;, ready, stop, ,, &, #, 039, ;, good, chance, suboxone, put, path, great, life, ., I, found, side-effects, minimal, compared, oxycontin, ., I, &, #, 039, ;, actually, sleeping, better, ., Slight, constipation, ., It, truly, amazing, ., The, cost, pales, comparison, I, spent, ...]"


7. Stemming

In [None]:
stemmer = PorterStemmer()
df2['stemmed_tokens'] = df2['tokens_no_abbrev'].apply(lambda x: [stemmer.stem(word) for word in x])
df2[['stemmed_tokens']].head()


Unnamed: 0,stemmed_tokens
0,"[``, it, side, effect, ,, i, take, combin, bystol, 5, mg, fish, oil, '']"
1,"[``, my, son, halfway, fourth, week, intuniv, ., we, becam, concern, began, last, week, ,, start, take, highest, dose, ., for, two, day, ,, could, hardli, get, bed, ,, cranki, ,, slept, nearli, 8, hour, drive, home, school, vacat, (, unusu, ., ), i, call, doctor, monday, morn, said, stick, day, ., see, school, ,, get, morn, ., the, last, two, day, problem, free, ., he, much, agreeabl, ever, ., he, less, emot, (, good, thing, ), ,, less, cranki, ., he, rememb, thing, ., overal, behavior, better, ., we, tri, mani, differ, medic, far, effect, ., '']"
2,"[``, i, use, take, anoth, oral, contracept, ,, 21, pill, cycl, ,, happy-, light, period, ,, max, 5, day, ,, side, effect, ., but, contain, hormon, gestoden, ,, avail, us, ,, i, switch, lybrel, ,, ingredi, similar, ., when, pill, end, ,, i, start, lybrel, immedi, ,, first, day, period, ,, instruct, said, ., and, period, last, two, week, ., when, take, second, pack-, two, week, ., and, ,, third, pack, thing, got, even, worse-, third, period, last, two, week, &, #, 039, ;, end, third, week-, i, still, daili, brown, discharg, ., the, posit, side, i, &, #, 039, ...]"
3,"[``, thi, first, time, use, form, birth, control, ., i, &, #, 039, ;, glad, i, went, patch, ,, i, 8, month, ., at, first, it, decreas, libido, subsid, ., the, downsid, made, period, longer, (, 5-6, day, exact, ), i, use, period, 3-4, day, max, also, made, cramp, intens, first, two, day, period, ,, i, never, cramp, use, birth, control, ., other, happi, patch, '']"
4,"[``, suboxon, complet, turn, life, around, ., i, feel, healthier, ,, i, &, #, 039, ;, excel, job, i, alway, money, pocket, save, account, ., i, none, suboxon, spent, year, abus, oxycontin, ., my, paycheck, alreadi, spent, time, i, got, i, start, resort, scheme, steal, fund, addict, ., all, histori, ., if, &, #, 039, ;, readi, stop, ,, &, #, 039, ;, good, chanc, suboxon, put, path, great, life, ., i, found, side-effect, minim, compar, oxycontin, ., i, &, #, 039, ;, actual, sleep, better, ., slight, constip, ., it, truli, amaz, ., the, cost, pale, comparison, i, spent, ...]"


8. POS Tagging

In [None]:
from nltk import pos_tag

df2['pos_tags'] = df2['stemmed_tokens'].apply(pos_tag)
df2[['pos_tags']].head()


Unnamed: 0,pos_tags
0,"[(``, ``), (it, PRP), (side, JJ), (effect, NN), (,, ,), (i, JJ), (take, VBP), (combin, NN), (bystol, NN), (5, CD), (mg, NN), (fish, JJ), (oil, NN), ('', '')]"
1,"[(``, ``), (my, PRP$), (son, NN), (halfway, RB), (fourth, JJ), (week, NN), (intuniv, NN), (., .), (we, PRP), (becam, VBP), (concern, NN), (began, VBD), (last, JJ), (week, NN), (,, ,), (start, VBP), (take, VB), (highest, JJS), (dose, NN), (., .), (for, IN), (two, CD), (day, NN), (,, ,), (could, MD), (hardli, VB), (get, VB), (bed, VBN), (,, ,), (cranki, NN), (,, ,), (slept, VBD), (nearli, RB), (8, CD), (hour, NN), (drive, NN), (home, NN), (school, NN), (vacat, NN), ((, (), (unusu, JJ), (., .), (), )), (i, NN), (call, NN), (doctor, NN), (monday, NN), (morn, VBN), (said, VBD), (stick, JJ), (day, NN), (., .), (see, VB), (school, NN), (,, ,), (get, VB), (morn, VBN), (., .), (the, DT), (last, JJ), (two, CD), (day, NN), (problem, NN), (free, JJ), (., .), (he, PRP), (much, RB), (agreeabl, IN), (ever, RB), (., .), (he, PRP), (less, JJR), (emot, JJ), ((, (), (good, JJ), (thing, NN), (), )), (,, ,), (less, JJR), (cranki, NN), (., .), (he, PRP), (rememb, VBD), (thing, NN), (., .), (overal, JJ), (behavior, NN), (better, RBR), (., .), (we, PRP), (tri, VBP), (mani, JJ), (differ, NN), (medic, NN), (far, RB), (effect, NN), (., .), ('', '')]"
2,"[(``, ``), (i, NN), (use, VBP), (take, VB), (anoth, DT), (oral, JJ), (contracept, NN), (,, ,), (21, CD), (pill, NN), (cycl, NN), (,, ,), (happy-, JJ), (light, JJ), (period, NN), (,, ,), (max, JJ), (5, CD), (day, NN), (,, ,), (side, NN), (effect, NN), (., .), (but, CC), (contain, JJ), (hormon, JJ), (gestoden, NN), (,, ,), (avail, VBP), (us, PRP), (,, ,), (i, JJ), (switch, VBP), (lybrel, NN), (,, ,), (ingredi, JJ), (similar, JJ), (., .), (when, WRB), (pill, NN), (end, NN), (,, ,), (i, JJ), (start, VBP), (lybrel, NN), (immedi, NN), (,, ,), (first, JJ), (day, NN), (period, NN), (,, ,), (instruct, NN), (said, VBD), (., .), (and, CC), (period, NN), (last, JJ), (two, CD), (week, NN), (., .), (when, WRB), (take, VB), (second, JJ), (pack-, JJ), (two, CD), (week, NN), (., .), (and, CC), (,, ,), (third, JJ), (pack, JJ), (thing, NN), (got, VBD), (even, RB), (worse-, JJ), (third, JJ), (period, NN), (last, JJ), (two, CD), (week, NN), (&, CC), (#, #), (039, CD), (;, :), (end, JJ), (third, JJ), (week-, NN), (i, NN), (still, RB), (daili, VBZ), (brown, JJ), (discharg, NN), (., .), (the, DT), (posit, JJ), (side, NN), (i, NN), (&, CC), (#, #), (039, CD), ...]"
3,"[(``, ``), (thi, VB), (first, JJ), (time, NN), (use, NN), (form, JJ), (birth, NN), (control, NN), (., .), (i, NN), (&, CC), (#, #), (039, CD), (;, :), (glad, NN), (i, NN), (went, VBD), (patch, NN), (,, ,), (i, RB), (8, CD), (month, NN), (., .), (at, IN), (first, JJ), (it, PRP), (decreas, VBZ), (libido, JJ), (subsid, NN), (., .), (the, DT), (downsid, NN), (made, VBD), (period, NN), (longer, RBR), ((, (), (5-6, JJ), (day, NN), (exact, VB), (), )), (i, NN), (use, VBP), (period, NN), (3-4, JJ), (day, NN), (max, NN), (also, RB), (made, VBD), (cramp, NN), (intens, NNS), (first, JJ), (two, CD), (day, NN), (period, NN), (,, ,), (i, JJ), (never, RB), (cramp, VBP), (use, JJ), (birth, NN), (control, NN), (., .), (other, JJ), (happi, JJ), (patch, NN), ('', '')]"
4,"[(``, ``), (suboxon, JJ), (complet, NN), (turn, VB), (life, NN), (around, IN), (., .), (i, NN), (feel, VBP), (healthier, NN), (,, ,), (i, NN), (&, CC), (#, #), (039, CD), (;, :), (excel, JJ), (job, NN), (i, JJ), (alway, RB), (money, NN), (pocket, NN), (save, VBP), (account, NN), (., .), (i, VB), (none, NN), (suboxon, NN), (spent, VBN), (year, NN), (abus, NN), (oxycontin, NN), (., .), (my, PRP$), (paycheck, NN), (alreadi, IN), (spent, JJ), (time, NN), (i, JJ), (got, VBD), (i, JJ), (start, VBP), (resort, NN), (scheme, NN), (steal, NN), (fund, NN), (addict, NN), (., .), (all, DT), (histori, NN), (., .), (if, IN), (&, CC), (#, #), (039, CD), (;, :), (readi, NN), (stop, NN), (,, ,), (&, CC), (#, #), (039, CD), (;, :), (good, JJ), (chanc, NN), (suboxon, NN), (put, VBD), (path, NN), (great, JJ), (life, NN), (., .), (i, VB), (found, VBD), (side-effect, JJ), (minim, NN), (compar, NN), (oxycontin, NN), (., .), (i, NN), (&, CC), (#, #), (039, CD), (;, :), (actual, JJ), (sleep, NN), (better, RBR), (., .), (slight, JJ), (constip, NN), (., .), (it, PRP), (truli, VBD), (amaz, RB), (., .), (the, DT), (cost, NN), (pale, JJ), (comparison, NN), (i, JJ), (spent, VBD), ...]"


9. Word Sense Disambiguation

In [None]:
from nltk.wsd import lesk

df2['wsd'] = df2['review'].apply(lambda x: lesk(word_tokenize(x), 'depression', 'n'))
df2['wsd_def'] = df2['wsd'].apply(lambda x: x.definition() if x else "No definition found")

df2[['wsd', 'wsd_def']].head()


Unnamed: 0,wsd,wsd_def
0,Synset('depressive_disorder.n.01'),a state of depression and anhedonia so severe as to require clinical intervention
1,Synset('depressive_disorder.n.01'),a state of depression and anhedonia so severe as to require clinical intervention
2,Synset('depressive_disorder.n.01'),a state of depression and anhedonia so severe as to require clinical intervention
3,Synset('depressive_disorder.n.01'),a state of depression and anhedonia so severe as to require clinical intervention
4,Synset('depression.n.05'),a period during the 1930s when there was a worldwide economic depression and mass unemployment


10. Translation

In [None]:
pip install deep_translator




In [None]:
from deep_translator import GoogleTranslator

for i, text in enumerate(df2['review'].head(5)):
    try:
        translated = GoogleTranslator(source='auto', target='ms').translate(text)
        print(f"\nReview {i+1} — Original:\n{text}")
        print(f"\nReview {i+1} — Translated (Malay):\n{translated}")
    except Exception as e:
        print(f"\nReview {i+1} — Translation failed: {e}")


Review 1 — Original:
"It has no side effect, I take it in combination of Bystolic 5 Mg and Fish Oil"

Review 1 — Translated (Malay):
"Ia tidak mempunyai kesan sampingan, saya mengambilnya dengan kombinasi 5 mg dan minyak ikan"

Review 2 — Original:
"My son is halfway through his fourth week of Intuniv. We became concerned when he began this last week, when he started taking the highest dose he will be on. For two days, he could hardly get out of bed, was very cranky, and slept for nearly 8 hours on a drive home from school vacation (very unusual for him.) I called his doctor on Monday morning and she said to stick it out a few days. See how he did at school, and with getting up in the morning. The last two days have been problem free. He is MUCH more agreeable than ever. He is less emotional (a good thing), less cranky. He is remembering all the things he should. Overall his behavior is better. 
We have tried many different medications and so far this is the most effective."

Review 

11. Named Entity Recognition

In [None]:
import nltk
nltk.download('punkt')
nltk.download('words')
nltk.download('maxent_ne_chunker')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker_tab')
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker_tab to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package maxent_ne_chunker_tab is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is alre

True

In [None]:
!pip install spacy
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m53.0 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
from nltk import word_tokenize, pos_tag, ne_chunk
from nltk.tree import Tree

def safe_ner(text):
    try:
        tokens = word_tokenize(text)
        tagged = pos_tag(tokens)
        tree = ne_chunk(tagged)
        return tree
    except Exception as e:
        return f"NER failed: {e}"

# Apply safely
df2['ner_tree'] = df2['review'].apply(safe_ner)

# Print first few trees
for i, tree in enumerate(df2['ner_tree'].head(5)):
    print(f"\nReview {i+1} — NER Tree:")
    print(tree)


12. Summarization

In [None]:
from collections import Counter

def generate_summary(text, num_sentences=3):
    # Break text into sentences
    sentence_list = sent_tokenize(text)
    if len(sentence_list) <= num_sentences:
        return text  # not enough to summarize

    #Count word frequencies (excluding stopwords and punctuation)
    stop_words = set(stopwords.words('english'))
    word_scores = Counter()

    for sentence in sentence_list:
        for word in word_tokenize(sentence.lower()):
            if word.isalpha() and word not in stop_words:
                word_scores[word] += 1

    #Score each sentence by summing word scores
    ranked_sentences = []
    for sentence in sentence_list:
        score = sum(word_scores.get(word.lower(), 0) for word in word_tokenize(sentence) if word.isalpha())
        ranked_sentences.append((sentence, score))

    #Sort sentences by score and return the top ones
    ranked_sentences.sort(key=lambda x: x[1], reverse=True)
    top_sentences = [sent for sent, _ in ranked_sentences[:num_sentences]]
    return ' '.join(top_sentences)

In [None]:
sample = df2['review'].iloc[1]
print("Original:\n", sample)
print("\nSummary:\n", generate_summary(sample, 2))

13. Word cloud generation

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt

text_blob = ' '.join(df['review'].tolist())
wordcloud = WordCloud(width=800, height=400, background_color='white').generate(text_blob)

plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.title("Word Cloud from Reviews")
plt.show()