# NLP For Drugs.com Data Set

### Packages Import

In [1]:
### Visualization
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
sns.color_palette("Blues", as_cmap=True)

### Standard Packages
import numpy as np
import warnings
import nltk
import re
import pandas as pd
pd.set_option('display.max_colwidth', None)
warnings.filterwarnings("ignore")

### NLTK
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords, wordnet
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.tokenize import RegexpTokenizer
nltk.download('wordnet')
nltk.download('vader_lexicon')
import contractions

### Scikit-Learn
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.svm import SVC
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix, classification_report, \
                            accuracy_score, f1_score, recall_score, precision_score

### ImbLearn
from imblearn.pipeline import Pipeline as imbpipeline
from imblearn.over_sampling import SMOTE

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/albertcc/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/albertcc/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


### Bringing in two .tsv files as test and train

In [2]:
# Load the test and train datasets provided in the data folder.
# Both raw files are read from one directory defined in a single place; the
# previous version pointed the two reads at different folders ('dsc/data' vs 'data').
# NOTE(review): 'dsc/data' was the most recently edited path - confirm it matches the repo layout.
DATA_DIR = 'dsc/data'
data_test = pd.read_csv(f'{DATA_DIR}/drugsComTest_raw.tsv', sep='\t')
data_train = pd.read_csv(f'{DATA_DIR}/drugsComTrain_raw.tsv', sep='\t')

FileNotFoundError: [Errno 2] No such file or directory: 'data/drugsComTest_raw.tsv'

In [None]:
# Column dtypes and non-null counts for the raw test file
data_test.info()

In [None]:
# Column dtypes and non-null counts for the raw train file
data_train.info()

### Merge Test and Train dataframes

- The data provided is already split into test and train .tsv files. I would like to combine them, both to have more data to work with and so that any cleaning can be applied once to the merged dataset before it is split into training and testing sets again.

In [None]:
# Stack the two frames row-wise (test rows first, then train rows).
# ignore_index is not set, so each frame keeps its own index labels;
# if both files use a default 0..n-1 index, labels will repeat in merged_df.
merged_df = pd.concat([data_test, data_train], axis=0)

In [None]:
# Column dtypes and non-null counts for the combined frame
merged_df.info()

In [None]:
# The leading column appears to hold entry numbers only, so remove it by name
entry_number_col = merged_df.columns[0]
merged_df = merged_df.drop(columns=[entry_number_col])

In [None]:
# Sanity check: preview the first rows to confirm the leading entry-number column has been dropped
merged_df.head()

- Noticed how 'condition' has some missing values, but other columns are fine

In [None]:
# Keep only the rows whose 'condition' is present

has_condition = merged_df['condition'].notna()
merged_df = merged_df[has_condition]

In [None]:
# Re-check non-null counts after dropping rows with a missing 'condition'
merged_df.info()

- Dropping the rows with a missing 'condition' reduces the dataset to 213,869 rows

In [None]:
# Number of reviews per condition, most frequent first
merged_df['condition'].value_counts()

### Looking at unique drugs under 'Birth Control' condition

In [None]:
# How many distinct drugs were reviewed under the 'Birth Control' condition
merged_df.loc[merged_df['condition'] == 'Birth Control', 'drugName'].nunique()

In [None]:
# Review count per drug within the 'Birth Control' condition
merged_df.loc[merged_df['condition'] == 'Birth Control', 'drugName'].value_counts()

In [None]:
# The seven birth control drugs this analysis focuses on
bc_drugs = [
    'Etonogestrel',
    'Ethinyl estradiol / norethindrone',
    'Nexplanon',
    'Levonorgestrel',
    'Ethinyl estradiol / levonorgestrel',
    'Ethinyl estradiol / norgestimate',
    'Implanon',
]

# Keep every review written about one of those drugs (whatever the condition)
is_bc_drug = merged_df['drugName'].isin(bc_drugs)
bc_data = merged_df[is_bc_drug]

In [None]:
# Preview the first rows of the drug-filtered frame
bc_data.head()

In [None]:
# Review count for each of the selected drugs
bc_data['drugName'].value_counts()

- Wanted to only include in our dataset the top 7 drugs with condition = birth control, however when we filtered for these drugs we see additional conditions were selected

In [None]:
# Which conditions the selected drugs were reviewed for
bc_data['condition'].value_counts()

In [None]:
# Remove rows whose 'condition' contains the word 'comment'; these appear to be
# the 'Useful' rating text that ended up in the 'condition' field.
# .copy() makes bc_data an independent frame: later cells add columns to it, and
# assigning into a filtered slice triggers pandas' SettingWithCopy behaviour
# (the warning itself is hidden by warnings.filterwarnings in the import cell).
bc_data = bc_data[~bc_data['condition'].str.contains('comment')].copy()

In [None]:
# Confirm the 'comment' entries no longer appear among the conditions
bc_data['condition'].value_counts()

In [None]:
# Review count per drug after removing the 'comment' rows
bc_data['drugName'].value_counts()

In [None]:
# Column dtypes and non-null counts for the filtered frame
bc_data.info()

In [None]:
# Target column: ratings above 7 are labelled 'Positive', everything else 'Negative'
is_positive = bc_data['rating'] > 7.0
bc_data['sentiment'] = np.where(is_positive, 'Positive', 'Negative')

### Analyze the median rating based on condition

In [None]:
# Median rating for every condition in the filtered data
median_values = bc_data.groupby('condition')['rating'].median()

# One bar per condition, with slanted x tick labels
fig, ax = plt.subplots(figsize=(25, 10))
ax.bar(x=median_values.index, height=median_values.to_numpy())
ax.tick_params(axis='x', labelrotation=45)

ax.set(xlabel='Condition', ylabel='Median Values', title='Median Values by Group')

plt.show()

In [None]:
# Rating distribution for reviews filed under 'Emergency Contraception'
bc_data.loc[bc_data['condition'] == 'Emergency Contraception', 'rating'].value_counts()

### Create columns that count emphasis and capital letters in text, as this could express sentiment

In [None]:
### 'punc_emphasis': number of exclamation points and question marks in each review

bc_data['punc_emphasis'] = bc_data['review'].apply(
    lambda review: review.count('!') + review.count('?')
)

### 'capt_emphasis': number of whitespace-separated words written fully in capitals

bc_data['capt_emphasis'] = bc_data['review'].apply(
    lambda review: sum(word.isupper() for word in review.split())
)

In [None]:
# Preview the frame with the new 'punc_emphasis' and 'capt_emphasis' columns
bc_data.head()

In [None]:
# Share of each sentiment label (normalize=True returns proportions rather than counts)
bc_data['sentiment'].value_counts(normalize=True)

### Sentiment Analysis Against Condition
- Within the conditions we have selected, how do the reviews look pertaining to each condition?

In [None]:
# Review counts per condition, split by sentiment, most-reviewed condition first

hue_order = ['Positive', 'Negative']  # reused by the per-drug time-period plots
condition_order = bc_data['condition'].value_counts().index

fig = plt.figure(figsize=(35, 10))
ax = sns.countplot(data=bc_data, x='condition', hue='sentiment',
                   order=condition_order, hue_order=hue_order, palette='tab10')

ax.set_xlabel('Condition')
ax.set_ylabel('Count')
ax.set_title('Distribution of Conditions by Review Sentiment');

In [None]:
# Review counts per drug, split by sentiment, most-reviewed drug first

drug_order = bc_data['drugName'].value_counts().index

fig = plt.figure(figsize=(25, 10))
ax = sns.countplot(data=bc_data, x='drugName', hue='sentiment', order=drug_order)

ax.set_xlabel('Drug Names')
ax.set_ylabel('Count')
ax.set_title('Distribution of Birth Control Drugs by Review Sentiment');

### Can we do anything with 'UsefulCount'?

In [None]:
# Count reviews falling into three equal-width bins of 'usefulCount'
bc_data['usefulCount'].value_counts(bins=3)

In [None]:
# Smallest 'usefulCount' value in the filtered data
bc_data['usefulCount'].min()

In [None]:
# Summary statistics (count, mean, spread, quartiles) for 'usefulCount'
bc_data['usefulCount'].describe()

- Not sure if this is too useful of a feature, maybe we could filter the reviews that were found useful above a certain threshold to take in user input.

### After research on birth controls, wanted to read what reviews are saying

In [None]:
# Show the Levonorgestrel rows so the review text can be read directly
# (the cell evaluates to the whole filtered frame, not a .head() preview)
bc_data[bc_data['drugName'] == 'Levonorgestrel']

### Binning the years these reviews were written into two groups to see if there's a difference over time

In [None]:
# Convert 'date' to pandas datetimes; the year-based binning below reads `.year` from each value
bc_data['date'] = pd.to_datetime(bc_data['date'])

In [None]:
# Summary of the review dates
bc_data['date'].describe()

In [None]:
# Split the date range into two equal-width bins and count the reviews in each
bc_data['date'].value_counts(bins=2)

In [None]:
# Create new column called 'date_column' that groups reviews into 2008-2012 and 2013-2017.
# The boundary year 2013 must fall in the later bucket to match its label; the
# previous `> 2013` comparison filed every 2013 review under '2008-2012'.
bc_data['date_column'] = ['2013-2017' if x.year >= 2013 else '2008-2012' for x in bc_data['date']]

### Can we see the difference in reviews of these drugs over time?

In [None]:
# Levonorgestrel: positive vs negative review counts in each of the two time periods

levonorgestrel_reviews = bc_data.loc[bc_data['drugName'] == 'Levonorgestrel']

fig = plt.figure(figsize=(25, 10))
ax = sns.countplot(data=levonorgestrel_reviews, x='date_column', hue='sentiment',
                   hue_order=hue_order, palette='tab10')

ax.set(xlabel='Time Periods', ylabel='Count',
       title='Distribution of Levonorgestrel Sentiment by Two Time Periods');

In [None]:
# Etonogestrel: positive vs negative review counts in each of the two time periods

etonogestrel_reviews = bc_data.loc[bc_data['drugName'] == 'Etonogestrel']

fig = plt.figure(figsize=(25, 10))
ax = sns.countplot(data=etonogestrel_reviews, x='date_column', hue='sentiment',
                   hue_order=hue_order, palette='tab10')

ax.set(xlabel='Time Periods', ylabel='Count',
       title='Distribution of Etonogestrel Sentiment by Two Time Periods');

In [None]:
# Ethinyl estradiol / norethindrone: positive vs negative review counts in each time period

norethindrone_reviews = bc_data.loc[bc_data['drugName'] == 'Ethinyl estradiol / norethindrone']

fig = plt.figure(figsize=(25, 10))
ax = sns.countplot(data=norethindrone_reviews, x='date_column', hue='sentiment',
                   hue_order=hue_order, palette='tab10')

ax.set(xlabel='Time Periods', ylabel='Count',
       title='Distribution of Ethinyl estradiol / norethindrone Sentiment by Two Time Periods');

### Cleaning Text Reviews

In [None]:
# Lowercasing step of the cleaning pipeline

def lower_case(text):
    """Return ``text`` with every character converted to lowercase."""
    return text.lower()

# Decode the HTML-escaped apostrophes found in the raw review text

def apostrophe(text):
    """Return ``text`` with every ``&#039;`` entity swapped for a plain apostrophe."""
    pieces = text.split('&#039;')
    return '\''.join(pieces)

# Expand contractions so the individual words can be weighed on their own

def fixcontractions(text):
    """Return ``text`` after passing it through ``contractions.fix``."""
    return contractions.fix(text)

# Create a function that removes punctuation but keeps apostrophes inside words

def remove_punctuation(text):
    """Strip punctuation from ``text`` and return the remaining tokens joined by spaces.

    A token is a run of word characters, optionally followed by one apostrophe
    and more word characters (so "didn't" stays intact). The earlier pattern
    ``\\w+'?\\w+`` needed at least two word characters per token and therefore
    silently discarded one-character tokens such as "5" or "a"; those are now kept.

    Parameters
    ----------
    text : str
        Text to strip.

    Returns
    -------
    str
        The tokens separated by single spaces ('' when nothing matches).
    """
    tokens = re.findall(r"\w+(?:'\w+)?", text)
    return ' '.join(tokens)

# Drop stop words from a review

def remove_stopwords(text, stop_words_list = set(stopwords.words('english'))):
    """Return ``text`` without the words listed in ``stop_words_list``.

    ``text`` is split on whitespace; words not in ``stop_words_list`` are kept
    in their original order and re-joined with single spaces. The default list
    is NLTK's English stop words, built once when the function is defined.
    """
    kept = filter(lambda word: word not in stop_words_list, text.split())
    return ' '.join(kept)

# Reduce every word of a review to its WordNet lemma

def lemmatize(text):
    """Return ``text`` with each whitespace-separated word lemmatized.

    Words are passed one at a time to ``WordNetLemmatizer.lemmatize`` with its
    default arguments, then re-joined with single spaces.
    """
    lemmatizer = WordNetLemmatizer()
    return ' '.join(map(lemmatizer.lemmatize, text.split()))

def clean_text(text):
    """Run ``text`` through the full cleaning pipeline and return the result.

    Steps, in order: lowercase, decode HTML apostrophes, expand contractions,
    strip punctuation, drop stop words, lemmatize.
    """
    pipeline = (
        lower_case,
        apostrophe,
        fixcontractions,
        remove_punctuation,
        remove_stopwords,
        lemmatize,
    )
    for step in pipeline:
        text = step(text)
    return text

In [None]:
# Manually testing the contractions.fix function
contractions.fix("I've aren't Tim's got a lovely bunch of coconuts")

In [None]:
# Original review text location 6
bc_data['review'][14]

In [None]:
# Testing one of the reviews to see what it is doing to the text, as above
clean_text(bc_data['review'][14])

In [None]:
### Bar chart of the 10 most common words in the 'review' column after applying clean_text

# Clean all reviews as one long string, then break it into individual words
text = clean_text(' '.join(bc_data['review'])).split()

# Ten most frequent words as a two-column frame
freq = (
    pd.Series(text)
    .value_counts()
    .head(10)
    .to_frame()
    .reset_index()
)
freq.columns = ['word', 'count']
freq = freq.sort_values(by='count', ascending=False)

fig = plt.figure(figsize=(6,4))
ax = sns.barplot(data=freq, x='count', y='word', palette='tab10')
ax.set_xlabel('Count')
ax.set_ylabel('Word')
ax.set_title('Top 10 Most Common Words in Reviews')
plt.show()

# Modeling

In [None]:
# Final look at the columns available for modeling
bc_data.info()

### First Simple Model - Count Vectorizer / Logistic Regression / No Features

In [None]:
# Features are the raw review text; the target is the sentiment label
X1 = bc_data['review']
y1 = bc_data['sentiment']

X_train_1, X_test_1, y_train_1, y_test_1 = train_test_split(
    X1, y1, test_size=0.2, random_state=1337
)

### Train - clean each review, then split and re-join its words to prepare for vectorization

X_train_1 = X_train_1.apply(lambda review: ' '.join(clean_text(review).split()))

### Train - bag-of-words counts via CountVectorizer

cv = CountVectorizer()
X_train_1 = cv.fit_transform(X_train_1)

### Train - fit the Logistic Regression model on the count matrix

logit = LogisticRegression().fit(X_train_1, y_train_1)

### VALIDATION - 5-fold cross validation of the logistic regression model

scores = cross_val_score(logit, X_train_1, y_train_1, cv=5)
print('Cross Validation Scores: ', scores)
print('Mean Cross Validation Score: ', scores.mean())

In [None]:
# Logistic Regression Test Set Preprocessing
# Each step builds on the result of the previous one so the test reviews get the
# same cleaning as the training reviews. Previously every line restarted from
# X_test_1, so the raw, uncleaned text was what reached the vectorizer.

X_test_1_logit = X_test_1.apply(clean_text)
X_test_1_logit = X_test_1_logit.apply(lambda x: x.split())
X_test_1_logit = X_test_1_logit.map(' '.join)
X_test_1_logit = cv.transform(X_test_1_logit)

In [None]:
# Predict sentiment labels for the vectorized test reviews
logit_pred = logit.predict(X_test_1_logit)

In [None]:
# Accuracy plus class-weighted F1 / precision / recall on the test split
print('Logistic Regression Accuracy: ', accuracy_score(y_test_1, logit_pred))
for metric_name, metric_fn in (('F1', f1_score), ('Precision', precision_score), ('Recall', recall_score)):
    print(f'Logistic Regression {metric_name} Score: ', metric_fn(y_test_1, logit_pred, average='weighted'))

In [None]:
### Logistic Regression Confusion Matrix (first model)

# Rows are the actual labels, columns the predicted labels
class_names = ['Negative', 'Positive']
cm = confusion_matrix(y_test_1, logit_pred)
cm_df = pd.DataFrame(cm, index=class_names, columns=class_names)

fig_cm1, ax_cm = plt.subplots(figsize=(5,4))
sns.heatmap(cm_df, annot=True, fmt='g', cmap='Blues', ax=ax_cm)
ax_cm.set_title('Logistic Regression Confusion Matrix')
ax_cm.set_ylabel('Actual Label')
ax_cm.set_xlabel('Predicted Label')
plt.show()

In [None]:
# Math check: the four hand-entered counts (presumably read off the confusion matrix above)
# should add up to the size of the test split, since that matrix is built from y_test_1.
1748 + 1975 + 362 + 391

### Second Model - TF-IDF Vectorizer / Logistic Regression / No Features

- Want to test if TFIDF Vectorizer makes a difference compared to Count Vectorizer

In [None]:
X1_tfidf = bc_data['review']
y1_tfidf = bc_data['sentiment']

# Split this model's own X/y (previously the first model's X1/y1 were passed in,
# leaving X1_tfidf/y1_tfidf unused and tying this cell to an earlier one).
X_train_1_tfidf, X_test_1_tfidf, y_train_1_tfidf, y_test_1_tfidf = train_test_split(
    X1_tfidf, y1_tfidf, test_size=0.2, random_state=1337
)

# For Train Set, apply clean_text function

X_train_1_tfidf = X_train_1_tfidf.apply(clean_text)

### Train - Tokenize the training data with a simple split of words, and then flattening to prepare for vectorization

X_train_1_tfidf = X_train_1_tfidf.apply(lambda x: x.split())
X_train_1_tfidf = X_train_1_tfidf.map(' '.join)

### Train - Vectorize the training data using TfidfVectorizer

tfidf = TfidfVectorizer()
X_train_1_tfidf = tfidf.fit_transform(X_train_1_tfidf)

### Train - Fit training data to Logistic Regression Model

logit_tfidf = LogisticRegression()
logit_tfidf.fit(X_train_1_tfidf, y_train_1_tfidf)

### VALIDATION - Perform a cross validation on the logistic regression model

scores = cross_val_score(logit_tfidf, X_train_1_tfidf, y_train_1_tfidf, cv=5)
print('Cross Validation Scores: ', scores)
print('Mean Cross Validation Score: ', scores.mean())

In [None]:
# Logistic Regression Test (TFIDF) Set Preprocessing

X_test_1_tfidf = X_test_1_tfidf.apply(clean_text)
X_test_1_tfidf = X_test_1_tfidf.apply(lambda x: x.split())
X_test_1_tfidf = X_test_1_tfidf.map(' '.join)
# Transform with the fitted TF-IDF vectorizer this model was trained on.
# Using `cv` (the CountVectorizer) here fed raw counts to a model fitted on
# TF-IDF weights, which makes its test metrics meaningless.
X_test_1_tfidf = tfidf.transform(X_test_1_tfidf)

In [None]:
# Predict sentiment labels for the test reviews with the TF-IDF logistic regression
logit_pred_tfidf = logit_tfidf.predict(X_test_1_tfidf)

In [None]:
### Logistic Regression Confusion Matrix w/ TFIDF

# Rows are the actual labels, columns the predicted labels
class_names = ['Negative', 'Positive']
cm = confusion_matrix(y_test_1_tfidf, logit_pred_tfidf)
cm_df = pd.DataFrame(cm, index=class_names, columns=class_names)

fig_cm1, ax_cm = plt.subplots(figsize=(5,4))
sns.heatmap(cm_df, annot=True, fmt='g', cmap='Blues', ax=ax_cm)
ax_cm.set_title('Logistic Regression Confusion Matrix')
ax_cm.set_ylabel('Actual Label')
ax_cm.set_xlabel('Predicted Label')
plt.show()

In [None]:
# Accuracy plus class-weighted F1 / precision / recall for the TF-IDF model
print('Logistic Regression (TFIDF) Accuracy: ', accuracy_score(y_test_1_tfidf, logit_pred_tfidf))
for metric_name, metric_fn in (('F1', f1_score), ('Precision', precision_score), ('Recall', recall_score)):
    print(f'Logistic Regression (TFIDF) {metric_name} Score: ',
          metric_fn(y_test_1_tfidf, logit_pred_tfidf, average='weighted'))

- We see here that our Logistic Regression model using TF-IDF has a lower accuracy than the model using the Count Vectorizer. This could possibly be attributed to class imbalance: the words the model weights most heavily may be a result of the imbalanced classes. We will look at sampling differently in the next model.

### Grid Search for best Logistic Regression parameters

In [None]:
# Grid over regularisation strength and penalty type.
# Each penalty is paired with a solver that implements it: the default lbfgs
# solver only supports l2, so the 'l1' and 'elasticnet' candidates used to fail
# to fit and were scored as NaN (with the warnings silenced in the import cell).
# The unpenalised option is left out because its spelling differs between
# scikit-learn releases ('none' vs None); the largest C values approximate it.
C_VALUES = [1, 10, 100, 1000, 10000]
param_grid = [
    {'solver': ['lbfgs'], 'penalty': ['l2'], 'C': C_VALUES},
    {'solver': ['liblinear'], 'penalty': ['l1'], 'C': C_VALUES},
    # elasticnet is only available with saga and needs an explicit l1_ratio
    {'solver': ['saga'], 'penalty': ['elasticnet'], 'l1_ratio': [0.5], 'C': C_VALUES, 'max_iter': [1000]},
]
grid = GridSearchCV(logit, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid.fit(X_train_1, y_train_1)
print('Best Parameters: ', grid.best_params_)

- It looks like the default parameters are the best parameters for this model

### Third Model: Count Vectorizer / Multinomial Bayes / No Features

In [None]:
# Same reviews, target and split settings as the first and second models,
# but held in this model's own variables so the cell does not depend on theirs.

X1_nb = bc_data['review']
y1_nb = bc_data['sentiment']

X_train_1_nb, X_test_1_nb, y_train_1_nb, y_test_1_nb = train_test_split(
    X1_nb, y1_nb, test_size=0.2, random_state=1337
)

# For Train Set, apply clean_text function

X_train_1_nb = X_train_1_nb.apply(clean_text)

### Train - Tokenize the training data with a simple split of words, and then flattening to prepare for vectorization

X_train_1_nb = X_train_1_nb.apply(lambda x: x.split())
X_train_1_nb = X_train_1_nb.map(' '.join)

### Train - Vectorize the training data using CountVectorizer

cv = CountVectorizer()
X_train_1_nb_cv = cv.fit_transform(X_train_1_nb)

# Train - fitting the training data to a Naive Bayes Classifier.
# Fit on the matrix produced by the vectorizer above; the classifier used to be
# fitted on the first model's X_train_1 / y_train_1, leaving X_train_1_nb_cv unused.

nb = MultinomialNB()
nb.fit(X_train_1_nb_cv, y_train_1_nb)

# Validation - Performing cross validation on the Naive Bayes Classifier

scores = cross_val_score(nb, X_train_1_nb_cv, y_train_1_nb, cv=5)
print('Cross Validation Scores: ', scores)
print('Mean Cross Validation Score: ', scores.mean())

#### Grid Search for best conditions with NB

In [None]:
# Search over the Naive Bayes smoothing parameter alpha.
# Tuned on the Naive Bayes training matrix and labels (previously the first
# model's X_train_1 / y_train_1 were passed in).
param_grid = {'alpha': [0.1, 0.5, 1, 2, 5, 20, 50]}
grid = GridSearchCV(nb, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid.fit(X_train_1_nb_cv, y_train_1_nb)
print('Best Parameters: ', grid.best_params_)

In [None]:
# Naive Bayes Test Set Preprocessing
# Clean each review, split and re-join its words, then vectorize with the fitted CountVectorizer

X_test_1_nb = X_test_1_nb.apply(lambda review: ' '.join(clean_text(review).split()))
X_test_1_nb = cv.transform(X_test_1_nb)

In [None]:
# Predict sentiment labels for the vectorized test reviews with Naive Bayes
nb_pred_cv = nb.predict(X_test_1_nb)

In [None]:
### Naive Bayes Confusion Matrix w/ CountVectorizer

# Rows are the actual labels, columns the predicted labels
class_names = ['Negative', 'Positive']
cm = confusion_matrix(y_test_1_nb, nb_pred_cv)
cm_df = pd.DataFrame(cm, index=class_names, columns=class_names)

fig_cm1, ax_cm = plt.subplots(figsize=(5,4))
sns.heatmap(cm_df, annot=True, fmt='g', cmap='Blues', ax=ax_cm)
ax_cm.set_title('Naive Bayes Confusion Matrix')
ax_cm.set_ylabel('Actual Label')
ax_cm.set_xlabel('Predicted Label')
plt.show()

In [None]:
# Accuracy plus class-weighted F1 / precision / recall for Naive Bayes
print('Naive Bayes Accuracy: ', accuracy_score(y_test_1_nb, nb_pred_cv))
for metric_name, metric_fn in (('F1', f1_score), ('Precision', precision_score), ('Recall', recall_score)):
    print(f'Naive Bayes {metric_name} Score: ', metric_fn(y_test_1_nb, nb_pred_cv, average='weighted'))

### 4th Model: Count Vectorizer / Logistic Regression / Review, Drug Name, condition, punc emphasis, capt emphasis vs. Sentiment

In [None]:
X2 = bc_data[['review', 'drugName', 'condition', 'punc_emphasis', 'capt_emphasis']]
y2 = bc_data['sentiment']

X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(X2, y2, test_size=0.2, random_state=1337)

# For Train Set, apply clean_text function.
# .assign returns a new frame, so the slice produced by the split is not mutated in place.

X_train_2 = X_train_2.assign(review=X_train_2['review'].apply(clean_text))

### Train - Build ONE feature matrix from all five columns.
# Previously only 'review' was vectorized, so drugName, condition and the two
# emphasis scores never reached the model despite being selected above.
#   - review:                       bag-of-words counts (a single column name hands the vectorizer a 1-D column)
#   - drugName / condition:         one-hot encoded; unseen categories are ignored at transform time
#   - punc_emphasis / capt_emphasis: passed through unchanged

features_2 = ColumnTransformer([
    ('review', CountVectorizer(), 'review'),
    ('category', OneHotEncoder(handle_unknown='ignore'), ['drugName', 'condition']),
    ('emphasis', 'passthrough', ['punc_emphasis', 'capt_emphasis']),
])
X_train_2 = features_2.fit_transform(X_train_2)

### Train - Fit training data to Logistic Regression Model

logit = LogisticRegression()
logit.fit(X_train_2, y_train_2)

### VALIDATION - Perform a cross validation on the logistic regression model

scores = cross_val_score(logit, X_train_2, y_train_2, cv=5)
print('Cross Validation Scores: ', scores)
print('Mean Cross Validation Score: ', scores.mean())

In [None]:
# Coefficients learned by the most recently fitted `logit` model, one per feature column
logit.coef_

### 5th Model: Want to see training on Levonorgestrel on its own

In [None]:
# Subset containing only the Levonorgestrel reviews
is_levonorgestrel = bc_data['drugName'] == 'Levonorgestrel'
data_lev = bc_data[is_levonorgestrel]
data_lev.head()

In [None]:
# Class balance of the sentiment labels within the Levonorgestrel subset
data_lev['sentiment'].value_counts()

In [None]:
# Column dtypes and non-null counts for the Levonorgestrel subset
data_lev.info()

In [None]:
# Levonorgestrel-only model: review text as the feature, sentiment as the target
X3 = data_lev['review']
y3 = data_lev['sentiment']

X_train_3, X_test_3, y_train_3, y_test_3 = train_test_split(
    X3, y3, test_size=0.2, random_state=1337
)

### Train - clean each review, then split and re-join its words to prepare for vectorization

X_train_3 = X_train_3.apply(lambda review: ' '.join(clean_text(review).split()))

### Train - bag-of-words counts via CountVectorizer

cv = CountVectorizer()
X_train_3 = cv.fit_transform(X_train_3)

### Train - fit the Logistic Regression model on the count matrix

logit_lev = LogisticRegression().fit(X_train_3, y_train_3)

### VALIDATION - 5-fold cross validation of the logistic regression model

scores = cross_val_score(logit_lev, X_train_3, y_train_3, cv=5)
print('Cross Validation Scores: ', scores)
print('Mean Cross Validation Score: ', scores.mean())

In [None]:
# Logistic Regression Test Set Preprocessing
# Each step builds on the result of the previous one so the test reviews get the
# same cleaning as the training reviews. Previously every line restarted from
# X_test_3, so the raw, uncleaned text was what reached the vectorizer.

X_test_3_logit = X_test_3.apply(clean_text)
X_test_3_logit = X_test_3_logit.apply(lambda x: x.split())
X_test_3_logit = X_test_3_logit.map(' '.join)
X_test_3_logit = cv.transform(X_test_3_logit)

In [None]:
# Predict sentiment labels for the Levonorgestrel test reviews
logit_pred_3 = logit_lev.predict(X_test_3_logit)

In [None]:
# Accuracy plus class-weighted F1 / precision / recall for the Levonorgestrel model
print('Logistic Regression Accuracy: ', accuracy_score(y_test_3, logit_pred_3))
for metric_name, metric_fn in (('F1', f1_score), ('Precision', precision_score), ('Recall', recall_score)):
    print(f'Logistic Regression {metric_name} Score: ', metric_fn(y_test_3, logit_pred_3, average='weighted'))

In [None]:
### Logistic Regression Confusion Matrix (fifth model, Levonorgestrel only)

# Rows are the actual labels, columns the predicted labels
class_names = ['Negative', 'Positive']
cm = confusion_matrix(y_test_3, logit_pred_3)
cm_df = pd.DataFrame(cm, index=class_names, columns=class_names)

fig_cm3, ax_cm = plt.subplots(figsize=(5,4))
sns.heatmap(cm_df, annot=True, fmt='g', cmap='Blues', ax=ax_cm)
ax_cm.set_title('Logistic Regression Confusion Matrix')
ax_cm.set_ylabel('Actual Label')
ax_cm.set_xlabel('Predicted Label')
plt.show()