In [8]:
import pandas as pd
import csv , cv2
import numpy as np
import tensorflow, keras
from keras.layers import Input, Conv2D, Activation, MaxPool2D,  Dense, Dropout, Flatten, BatchNormalization
from keras.models import Sequential, Model
from tensorflow.keras.utils import to_categorical
import matplotlib
import matplotlib.pyplot as plt 
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import itertools
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

In [9]:
dru_train = pd.read_csv(r"C:\Users\XTREME\Desktop\selected\Selected_Projects\drugsComTrain_raw.tsv", sep = "\t")
dru_test = pd.read_csv(r"C:\Users\XTREME\Desktop\selected\Selected_Projects\drugsComTest_raw.tsv", sep = "\t")
#type(dru_train)
len(dru_train)
print('Training length: ',dru_train.shape)
len(dru_test)
print('Testing length:',dru_test.shape)

Training length:  (161297, 7)
Testing length: (53766, 7)


In [10]:
dru = pd.concat([dru_train,dru_test])
dru.shape

(215063, 7)

In [11]:
dru.head()

Unnamed: 0,ID,drugName,condition,review,rating,date,usefulCount
0,206461,Valsartan,Left Ventricular Dysfunction,"""It has no side effect, I take it in combinati...",9.0,"May 20, 2012",27
1,95260,Guanfacine,ADHD,"""My son is halfway through his fourth week of ...",8.0,"April 27, 2010",192
2,92703,Lybrel,Birth Control,"""I used to take another oral contraceptive, wh...",5.0,"December 14, 2009",17
3,138000,Ortho Evra,Birth Control,"""This is my first time using any form of birth...",8.0,"November 3, 2015",10
4,35696,Buprenorphine / naloxone,Opiate Dependence,"""Suboxone has completely turned my life around...",9.0,"November 27, 2016",37


In [12]:
# dru['date'] = pd.to_datetime(dru['date'])
# dru['date'].head()
dru_train = dru[['ID', 'review','rating']].copy()
dru_train.head()

Unnamed: 0,ID,review,rating
0,206461,"""It has no side effect, I take it in combinati...",9.0
1,95260,"""My son is halfway through his fourth week of ...",8.0
2,92703,"""I used to take another oral contraceptive, wh...",5.0
3,138000,"""This is my first time using any form of birth...",8.0
4,35696,"""Suboxone has completely turned my life around...",9.0


In [13]:
dru_train.info(null_counts= True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 215063 entries, 0 to 53765
Data columns (total 3 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   ID      215063 non-null  int64  
 1   review  215063 non-null  object 
 2   rating  215063 non-null  float64
dtypes: float64(1), int64(1), object(1)
memory usage: 6.6+ MB


In [14]:
dru_train['ID'].unique()
dru_train['ID'].count()
dru_train['ID'].nunique()

215063

In [15]:
dru['review'][1]

1    "My son is halfway through his fourth week of ...
1    "My son has Crohn&#039;s disease and has done ...
Name: review, dtype: object

In [16]:
!pip install vaderSentiment



In [17]:
import nltk
nltk.download(['punkt','stopwords'])

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\XTREME\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\XTREME\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [18]:
from nltk.corpus import stopwords
stopwords = stopwords.words('english')

In [19]:
dru_train.head()

Unnamed: 0,ID,review,rating
0,206461,"""It has no side effect, I take it in combinati...",9.0
1,95260,"""My son is halfway through his fourth week of ...",8.0
2,92703,"""I used to take another oral contraceptive, wh...",5.0
3,138000,"""This is my first time using any form of birth...",8.0
4,35696,"""Suboxone has completely turned my life around...",9.0


In [20]:
 # remove stopwords from review
dru_train['cleanReview'] = dru_train['review'].apply(lambda x: ' '.join([item for item in x.split() if item not in stopwords]))    
dru_train.head()

Unnamed: 0,ID,review,rating,cleanReview
0,206461,"""It has no side effect, I take it in combinati...",9.0,"""It side effect, I take combination Bystolic 5..."
1,95260,"""My son is halfway through his fourth week of ...",8.0,"""My son halfway fourth week Intuniv. We became..."
2,92703,"""I used to take another oral contraceptive, wh...",5.0,"""I used take another oral contraceptive, 21 pi..."
3,138000,"""This is my first time using any form of birth...",8.0,"""This first time using form birth control. I&#..."
4,35696,"""Suboxone has completely turned my life around...",9.0,"""Suboxone completely turned life around. I fee..."


In [21]:
import vaderSentiment
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer()

In [22]:
dru_train['ReviewScore'] = dru_train['cleanReview'].apply(lambda x: analyzer.polarity_scores(x)['compound'])
dru_train.head()

Unnamed: 0,ID,review,rating,cleanReview,ReviewScore
0,206461,"""It has no side effect, I take it in combinati...",9.0,"""It side effect, I take combination Bystolic 5...",0.0
1,95260,"""My son is halfway through his fourth week of ...",8.0,"""My son halfway fourth week Intuniv. We became...",0.907
2,92703,"""I used to take another oral contraceptive, wh...",5.0,"""I used take another oral contraceptive, 21 pi...",0.7096
3,138000,"""This is my first time using any form of birth...",8.0,"""This first time using form birth control. I&#...",0.7184
4,35696,"""Suboxone has completely turned my life around...",9.0,"""Suboxone completely turned life around. I fee...",0.9403


In [24]:
positive_num = len(dru_train[dru_train['ReviewScore'] >=0.05])
neutral_num = len(dru_train[(dru_train['ReviewScore'] >-0.05) & (dru_train['ReviewScore']<0.05)])
negative_num = len(dru_train[dru_train['ReviewScore']<=-0.05])

positive_num,neutral_num,negative_num

(106198, 9035, 99830)

In [25]:
dru_train['vaderSentiment']= dru_train['ReviewScore'].map(lambda x:int(2) if x>=0.05 else int(1) if x<=-0.05 else int(0))
dru_train['vaderSentiment'].value_counts()

Total_vaderSentiment = positive_num + neutral_num + negative_num
Total_vaderSentiment

215063

In [27]:
dru_train.loc[dru_train['ReviewScore'] >=0.05,"vaderSentimentLabel"] ="positive"
dru_train.loc[(dru_train['ReviewScore'] >-0.05) & (dru_train['ReviewScore']<0.05),"vaderSentimentLabel"]= "neutral"
dru_train.loc[dru_train['ReviewScore']<=-0.05,"vaderSentimentLabel"] = "negative"
dru_train.shape
dru_train.head()

Unnamed: 0,ID,review,rating,cleanReview,ReviewScore,vaderSentiment,vaderSentimentLabel
0,206461,"""It has no side effect, I take it in combinati...",9.0,"""It side effect, I take combination Bystolic 5...",0.0,0,neutral
1,95260,"""My son is halfway through his fourth week of ...",8.0,"""My son halfway fourth week Intuniv. We became...",0.907,2,positive
2,92703,"""I used to take another oral contraceptive, wh...",5.0,"""I used take another oral contraceptive, 21 pi...",0.7096,2,positive
3,138000,"""This is my first time using any form of birth...",8.0,"""This first time using form birth control. I&#...",0.7184,2,positive
4,35696,"""Suboxone has completely turned my life around...",9.0,"""Suboxone completely turned life around. I fee...",0.9403,2,positive


In [28]:
positive_rating = len(dru_train[dru_train['rating'] >=7.0])
neutral_rating = len(dru_train[(dru_train['rating'] >=4) & (dru_train['rating']<7)])
negative_rating = len(dru_train[dru_train['rating']<=3])
positive_rating,neutral_rating,negative_rating

Total_rating = positive_rating+neutral_rating+negative_rating
Total_rating


215063

In [29]:
dru_train['ratingSentiment']= dru_train['rating'].map(lambda x:int(2) if x>=7 else int(1) if x<=3 else int(0))
dru_train['ratingSentiment'].value_counts()

dru_train.head()

Unnamed: 0,ID,review,rating,cleanReview,ReviewScore,vaderSentiment,vaderSentimentLabel,ratingSentiment
0,206461,"""It has no side effect, I take it in combinati...",9.0,"""It side effect, I take combination Bystolic 5...",0.0,0,neutral,2
1,95260,"""My son is halfway through his fourth week of ...",8.0,"""My son halfway fourth week Intuniv. We became...",0.907,2,positive,2
2,92703,"""I used to take another oral contraceptive, wh...",5.0,"""I used take another oral contraceptive, 21 pi...",0.7096,2,positive,0
3,138000,"""This is my first time using any form of birth...",8.0,"""This first time using form birth control. I&#...",0.7184,2,positive,2
4,35696,"""Suboxone has completely turned my life around...",9.0,"""Suboxone completely turned life around. I fee...",0.9403,2,positive,2


In [30]:
dru_train.loc[dru_train['rating'] >=7.0,"ratingSentimentLabel"] ="positive"
dru_train.loc[(dru_train['rating'] >=4.0) & (dru_train['rating']<7.0),"ratingSentimentLabel"]= "neutral"
dru_train.loc[dru_train['rating']<=3.0,"ratingSentimentLabel"] = "negative"

dru_train.head()

Unnamed: 0,ID,review,rating,cleanReview,ReviewScore,vaderSentiment,vaderSentimentLabel,ratingSentiment,ratingSentimentLabel
0,206461,"""It has no side effect, I take it in combinati...",9.0,"""It side effect, I take combination Bystolic 5...",0.0,0,neutral,2,positive
1,95260,"""My son is halfway through his fourth week of ...",8.0,"""My son halfway fourth week Intuniv. We became...",0.907,2,positive,2,positive
2,92703,"""I used to take another oral contraceptive, wh...",5.0,"""I used take another oral contraceptive, 21 pi...",0.7096,2,positive,0,neutral
3,138000,"""This is my first time using any form of birth...",8.0,"""This first time using form birth control. I&#...",0.7184,2,positive,2,positive
4,35696,"""Suboxone has completely turned my life around...",9.0,"""Suboxone completely turned life around. I fee...",0.9403,2,positive,2,positive


In [31]:
dru_train = dru_train[['ID','review','cleanReview','rating','ratingSentiment','ratingSentimentLabel','ReviewScore','vaderSentiment','vaderSentimentLabel']]
dru_train.head()

Unnamed: 0,ID,review,cleanReview,rating,ratingSentiment,ratingSentimentLabel,ReviewScore,vaderSentiment,vaderSentimentLabel
0,206461,"""It has no side effect, I take it in combinati...","""It side effect, I take combination Bystolic 5...",9.0,2,positive,0.0,0,neutral
1,95260,"""My son is halfway through his fourth week of ...","""My son halfway fourth week Intuniv. We became...",8.0,2,positive,0.907,2,positive
2,92703,"""I used to take another oral contraceptive, wh...","""I used take another oral contraceptive, 21 pi...",5.0,0,neutral,0.7096,2,positive
3,138000,"""This is my first time using any form of birth...","""This first time using form birth control. I&#...",8.0,2,positive,0.7184,2,positive
4,35696,"""Suboxone has completely turned my life around...","""Suboxone completely turned life around. I fee...",9.0,2,positive,0.9403,2,positive


In [32]:
dru_train.to_csv('newDrug.csv')    # To save preprocessed dataset to csv
dru_train.head(50)
import os
os.stat('newDrug.csv').st_size         # Check size of csv file About 181MB


181826795

In [33]:
dru_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 215063 entries, 0 to 53765
Data columns (total 9 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   ID                    215063 non-null  int64  
 1   review                215063 non-null  object 
 2   cleanReview           215063 non-null  object 
 3   rating                215063 non-null  float64
 4   ratingSentiment       215063 non-null  int64  
 5   ratingSentimentLabel  215063 non-null  object 
 6   ReviewScore           215063 non-null  float64
 7   vaderSentiment        215063 non-null  int64  
 8   vaderSentimentLabel   215063 non-null  object 
dtypes: float64(2), int64(3), object(4)
memory usage: 20.4+ MB


In [34]:
dru_train.to_csv('newDrug.csv.gz',compression='gzip')
os.stat('newDrug.csv.gz').st_size

54014518

In [35]:
dru_train.head()

Unnamed: 0,ID,review,cleanReview,rating,ratingSentiment,ratingSentimentLabel,ReviewScore,vaderSentiment,vaderSentimentLabel
0,206461,"""It has no side effect, I take it in combinati...","""It side effect, I take combination Bystolic 5...",9.0,2,positive,0.0,0,neutral
1,95260,"""My son is halfway through his fourth week of ...","""My son halfway fourth week Intuniv. We became...",8.0,2,positive,0.907,2,positive
2,92703,"""I used to take another oral contraceptive, wh...","""I used take another oral contraceptive, 21 pi...",5.0,0,neutral,0.7096,2,positive
3,138000,"""This is my first time using any form of birth...","""This first time using form birth control. I&#...",8.0,2,positive,0.7184,2,positive
4,35696,"""Suboxone has completely turned my life around...","""Suboxone completely turned life around. I fee...",9.0,2,positive,0.9403,2,positive
