In [38]:
import pandas as pd
import numpy as np

import nltk
import re

from ucimlrepo import fetch_ucirepo 

from sklearn.model_selection import train_test_split

from sklearn.feature_extraction.text import TfidfVectorizer
from string import punctuation
from nltk.corpus import stopwords
from nltk import word_tokenize

# nltk.download('stopwords')
# nltk.download("punkt")
# pip install ucimlrepo

# Introduction

The dataset contains patient reviews on specific drugs and conditions (e.g., acne, ADHD, Birth Control, etc), as well as the ratings for the drugs. In this exploration, we categorize ratings as 'negative' sentiment if they are 5 or lower, and 'positive' if they are higher than 5. The assumption is that the patients tend to give lower rating if they feel displeasure with the drugs. Moreover, we will only focus on the "acne" problems.

The goal of this analysis is to build classification model to predict patient sentiment based on their reviews to the drugs. The sentiment will be the response variable, and the review will be the feature.

# Dataset

In [2]:
# fetch dataset 
drug_reviews_drugs_com = fetch_ucirepo(id=462) 
  
# data (as pandas dataframes) 
X = drug_reviews_drugs_com.data.features 

In [3]:
X.head()

Unnamed: 0,drugName,condition,review,rating,date,usefulCount
0,Valsartan,Left Ventricular Dysfunction,"""It has no side effect, I take it in combinati...",9,20-May-12,27
1,Guanfacine,ADHD,"""My son is halfway through his fourth week of ...",8,27-Apr-10,192
2,Lybrel,Birth Control,"""I used to take another oral contraceptive, wh...",5,14-Dec-09,17
3,Ortho Evra,Birth Control,"""This is my first time using any form of birth...",8,3-Nov-15,10
4,Buprenorphine / naloxone,Opiate Dependence,"""Suboxone has completely turned my life around...",9,27-Nov-16,37


In [4]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 215063 entries, 0 to 215062
Data columns (total 6 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   drugName     215063 non-null  object
 1   condition    213869 non-null  object
 2   review       215063 non-null  object
 3   rating       215063 non-null  int64 
 4   date         215063 non-null  object
 5   usefulCount  215063 non-null  int64 
dtypes: int64(2), object(4)
memory usage: 9.8+ MB


# Data Preprocessing

In [5]:
# Filter rows with acne

X_acne = X[X.condition == 'Acne']

In [6]:
X_acne.shape

(7435, 6)

In [7]:
# Number of observations per rating

X_acne.rating.value_counts()

rating
10    2350
9     1665
8      861
1      719
7      445
5      327
3      295
2      292
6      291
4      190
Name: count, dtype: int64

In [8]:
# Create sentiment based on ratings, and this will be the response variable

X_acne.loc[:, 'sentiment'] = ["positive" if rate > 5 else "negative" for rate in X_acne['rating']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_acne.loc[:, 'sentiment'] = ["positive" if rate > 5 else "negative" for rate in X_acne['rating']]


In [9]:
X_acne.head()

Unnamed: 0,drugName,condition,review,rating,date,usefulCount,sentiment
66,Tretinoin,Acne,"""I just hit my three month point on tretinoin ...",10,4-Nov-15,13,positive
69,Ethinyl estradiol / norgestimate,Acne,"""Best treatment for acne I have used! I&#039;v...",9,3-Nov-15,7,positive
98,Spironolactone,Acne,"""I&#039;m 30 years old. I started having real...",9,21-Aug-13,31,positive
102,Adapalene / benzoyl peroxide,Acne,"""I&#039;ve seriously only been using Epiduo fo...",8,31-Oct-11,12,positive
109,Isotretinoin,Acne,"""I just started this medication on April 1st a...",7,6-Apr-09,10,positive


In [10]:
# Number of observations per sentiment

X_acne.sentiment.value_counts()

sentiment
positive    5612
negative    1823
Name: count, dtype: int64

In [11]:
# Training the model using review as the feature

X_acne_review = X_acne['review']
y_acne = X_acne['sentiment']

In [12]:
X_acne_review.head()

66     "I just hit my three month point on tretinoin ...
69     "Best treatment for acne I have used! I&#039;v...
98     "I&#039;m 30 years old.  I started having real...
102    "I&#039;ve seriously only been using Epiduo fo...
109    "I just started this medication on April 1st a...
Name: review, dtype: object

## Text Preprocessing

In [13]:
# Tokenization
X_tokenized = X_acne_review.apply(lambda row: word_tokenize(row))

In [14]:
X_tokenized.head()

66     [``, I, just, hit, my, three, month, point, on...
69     [``, Best, treatment, for, acne, I, have, used...
98     [``, I, &, #, 039, ;, m, 30, years, old, ., I,...
102    [``, I, &, #, 039, ;, ve, seriously, only, bee...
109    [``, I, just, started, this, medication, on, A...
Name: review, dtype: object

In [15]:
# Case folding
X_case_folded = X_tokenized.apply(lambda row: [word.lower() for word in row])

In [16]:
X_case_folded.head()

66     [``, i, just, hit, my, three, month, point, on...
69     [``, best, treatment, for, acne, i, have, used...
98     [``, i, &, #, 039, ;, m, 30, years, old, ., i,...
102    [``, i, &, #, 039, ;, ve, seriously, only, bee...
109    [``, i, just, started, this, medication, on, a...
Name: review, dtype: object

In [17]:
# Punctuation removal
X_no_punctuation = X_case_folded.apply(lambda row: [word for word in row if word not in punctuation])

In [18]:
X_no_punctuation.head()

66     [``, i, just, hit, my, three, month, point, on...
69     [``, best, treatment, for, acne, i, have, used...
98     [``, i, 039, m, 30, years, old, i, started, ha...
102    [``, i, 039, ve, seriously, only, been, using,...
109    [``, i, just, started, this, medication, on, a...
Name: review, dtype: object

In [19]:
# Digit removal
X_no_digit = X_no_punctuation.apply(lambda row: [word for word in row if not word.isdigit()])

In [20]:
X_no_digit.head()

66     [``, i, just, hit, my, three, month, point, on...
69     [``, best, treatment, for, acne, i, have, used...
98     [``, i, m, years, old, i, started, having, rea...
102    [``, i, ve, seriously, only, been, using, epid...
109    [``, i, just, started, this, medication, on, a...
Name: review, dtype: object

In [21]:
# Stopwords removal
sw = stopwords.words('english')
X_no_stopword = X_no_digit.apply(lambda row: [word for word in row if word not in sw])

In [22]:
X_no_stopword.head()

66     [``, hit, three, month, point, tretinoin, .05,...
69     [``, best, treatment, acne, used, gone, accuta...
98     [``, years, old, started, really, bad, skin, y...
102    [``, seriously, using, epiduo, four, days, see...
109    [``, started, medication, april, 1st, days, st...
Name: review, dtype: object

In [23]:
# Join them together again as a sentence
X_cleaned = X_no_stopword.apply(lambda row: ' '.join(row))

In [24]:
X_cleaned.head()

66     `` hit three month point tretinoin .05 happy r...
69     `` best treatment acne used gone accutane many...
98     `` years old started really bad skin years ago...
102    `` seriously using epiduo four days seen huge ...
109    `` started medication april 1st days started s...
Name: review, dtype: object

In [25]:
# Split the data into train and test set

X_train, X_test, y_train, y_test = train_test_split(X_cleaned, y_acne, test_size=0.25, random_state=42)

In [26]:
# Convert text to TF-IDF features

# Custom function to remove numbers
def remove_numbers(text):
    return re.sub(r'\d+', '', text)

# Initiation for vectorizer
vectorizer = TfidfVectorizer(binary=False,
                             use_idf=True,
                             stop_words='english',
                             preprocessor=remove_numbers,
                             max_features=None)

vectorizer.fit(X_train)

In [27]:
# Transform X_train and X_test

X_train_vector = vectorizer.transform(X_train)
X_test_vector = vectorizer.transform(X_test)

In [28]:
X_train_vector.shape

(5576, 7416)

In [29]:
X_test_vector.shape

(1859, 7416)

In [31]:
# Get the words in the vectors

words_array = vectorizer.get_feature_names_out()

# for value in words_array:
#     print(value)

words_array

array(['aa', 'abandon', 'abating', ..., 'zones', 'zooley', 'zyneret'],
      dtype=object)

# EDA

What? 
- Making a list of most "meaningful" words in each sentiment based on TF-IDF scores.

How?
- For negative sentiment, the most noticeable words that represent displeasure are "worse", "bad", and "red". However, it is not super clear for positive sentiment.

In [72]:
# Convert the TF-IDF matrix to a pandas DataFrame
tfidf_df = pd.DataFrame(X_train_vector.toarray(), columns=vectorizer.get_feature_names_out())

In [73]:
# Add the sentiment labels to the DataFrame
tfidf_df['sentiment'] = y_train.reset_index(drop=True)

In [85]:
# Top 20 words based TF-IDF
tfidf_class_grouped = tfidf_df.groupby('sentiment').sum()
top_20_words = tfidf_class_grouped.apply(lambda row: row.nlargest(20).index.tolist(), axis=1)

In [88]:
top_20_words['negative']

['acne',
 'skin',
 'face',
 'worse',
 'pill',
 'months',
 'using',
 'started',
 'month',
 'taking',
 'week',
 'bad',
 'weeks',
 'like',
 'control',
 'got',
 'red',
 'day',
 'birth',
 'pimples']

In [89]:
top_20_words['positive']

['acne',
 'skin',
 'face',
 'months',
 'using',
 'started',
 'years',
 'month',
 'clear',
 'really',
 'weeks',
 'use',
 'accutane',
 'dry',
 'week',
 'epiduo',
 'day',
 'tried',
 'product',
 'like']

In [74]:
tfidf_df.head()

Unnamed: 0,aa,abandon,abating,abdomen,abdominal,abilities,abit,ablaze,able,abnominal,...,zineryt,zip,zit,zits,zoderm,zone,zones,zooley,zyneret,sentiment
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,positive
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,negative
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,positive
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,positive
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,positive


# Modeling (SVM)

In [48]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV

In [51]:
# Fit classification model

# Create an SVC object
svc = SVC(kernel='rbf', random_state=42)

# Define the hyperparameter grid
param_grid = {
    'C': np.logspace(-3, 3, 10),
    'gamma': np.logspace(-3, 3, 10)
}

# Create a GridSearchCV object
grid_search = GridSearchCV(svc, param_grid=param_grid, cv=5, n_jobs=-1)

# Fit the GridSearchCV object to the data
grid_search.fit(X_train_vector, y_train)

# Get the best parameters and model
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_ 

In [52]:
best_params

{'C': 10.0, 'gamma': 0.46415888336127775}

In [53]:
best_model

In [54]:
# Predict X_test

y_pred = grid_search.predict(X_test_vector)

In [55]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

    negative       0.89      0.79      0.84       480
    positive       0.93      0.97      0.95      1379

    accuracy                           0.92      1859
   macro avg       0.91      0.88      0.89      1859
weighted avg       0.92      0.92      0.92      1859



In [56]:
# Predict X_train

y_pred_train = grid_search.predict(X_train_vector)

In [57]:
print(classification_report(y_train, y_pred_train))

              precision    recall  f1-score   support

    negative       1.00      1.00      1.00      1343
    positive       1.00      1.00      1.00      4233

    accuracy                           1.00      5576
   macro avg       1.00      1.00      1.00      5576
weighted avg       1.00      1.00      1.00      5576



There is indication of overfitting issue. Perfect model for training, but worse performance for test set.
- When the values for gamma and C are too large, the model tends to be overfit. 