In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score, accuracy_score

In [2]:
# Load the predefined train and test splits; both files are tab-separated.
train = pd.read_csv('drugsComTrain_raw.tsv', sep='\t')
test = pd.read_csv('drugsComTest_raw.tsv', sep='\t')

In [None]:
# test['rating'].value_counts().sort_index()

**Note**: We originally combined the two datasets, reshuffled them, and created our own train/test split. We have since decided to use the original train and test datasets as provided for training and evaluation.

In [None]:
# all_reviews = pd.concat([train,test])

In [3]:
def get_sentiment(rating):
  """Bucket a numeric rating into a sentiment label.

  Returns 'neg' for ratings below 4.0, 'neutral' for ratings from 4.0
  through 7.0 inclusive, and 'pos' for everything else. A value that fails
  both comparisons (for example NaN) therefore also ends up as 'pos'.
  """
  # Guard clauses: once the lower bucket is ruled out, only the upper bound
  # of the neutral range still needs checking.
  if rating < 4.0:
    return 'neg'
  if rating <= 7.0:
    return 'neutral'
  return 'pos'

# Add a 'sentiment' label to both splits, derived from the numeric rating:
# ratings 1-3 -> negative, 4-7 -> neutral, 8-10 -> positive.
for split in (train, test):
    split['sentiment'] = split['rating'].map(get_sentiment)

In [4]:
# Preview the first rows of the training frame, including the 'sentiment' column added above.
train.head()

Unnamed: 0.1,Unnamed: 0,drugName,condition,review,rating,date,usefulCount,sentiment
0,206461,Valsartan,Left Ventricular Dysfunction,"""It has no side effect, I take it in combinati...",9.0,"May 20, 2012",27,pos
1,95260,Guanfacine,ADHD,"""My son is halfway through his fourth week of ...",8.0,"April 27, 2010",192,pos
2,92703,Lybrel,Birth Control,"""I used to take another oral contraceptive, wh...",5.0,"December 14, 2009",17,neutral
3,138000,Ortho Evra,Birth Control,"""This is my first time using any form of birth...",8.0,"November 3, 2015",10,pos
4,35696,Buprenorphine / naloxone,Opiate Dependence,"""Suboxone has completely turned my life around...",9.0,"November 27, 2016",37,pos


### Tune CountVectorizer features

In [4]:
# Use the dataset's own train/test files directly: the review text is the
# model input and the derived sentiment label is the target.
reviews_train, labels_train = train['review'], train['sentiment']
reviews_test, labels_test = test['review'], test['sentiment']

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import metrics
from sklearn.pipeline import Pipeline
import os

# Compare three n-gram feature sets, each with and without English stop-word
# removal, using a fixed MultinomialNB (alpha=0.5) and reporting accuracy on
# the test reviews.
#
# The label -> n-gram range lookup is done by value through this dict. The
# earlier `classifier is 'uni'` comparisons tested object identity, which only
# works when the interpreter happens to intern the strings and emits a
# SyntaxWarning on Python 3.8+.
ngram_ranges = {
    'uni': (1, 1),      # unigrams only
    'bi': (2, 2),       # bigrams only
    'uni_bi': (1, 2),   # unigrams and bigrams
}

for classifier, ngram_range in ngram_ranges.items():
    print(f'------------------ CLASSIFIER: {classifier} ------------------')

    # Iterate over datasets with and without stopwords
    for without_stopwords_bool in [True, False]:
        # 'english' selects CountVectorizer's built-in list; None keeps every token.
        stop_words = 'english' if without_stopwords_bool else None
        print(f'REMOVE STOPWORDS? : {without_stopwords_bool}')

        # ============================
        # TRAINING
        # ============================
        clf = Pipeline(steps=[
            ('vectorizer', CountVectorizer(ngram_range=ngram_range, stop_words=stop_words)),
            ('model', MultinomialNB(alpha=0.5)),
        ])

        clf = clf.fit(reviews_train, labels_train)

        # ============================
        # RESULTS FROM TUNING
        # ============================
        predictions = clf.predict(reviews_test)
        score = metrics.accuracy_score(labels_test, predictions, normalize=True)
        print(f'accuracy: {score}')

------------------ CLASSIFIER: uni ------------------
REMOVE STOPWORDS? : True
accuracy: 0.6923892422720679
REMOVE STOPWORDS? : False
accuracy: 0.7010192314845813
------------------ CLASSIFIER: bi ------------------
REMOVE STOPWORDS? : True
accuracy: 0.8634638991184019
REMOVE STOPWORDS? : False
accuracy: 0.8638916787560912
------------------ CLASSIFIER: uni_bi ------------------
REMOVE STOPWORDS? : True
accuracy: 0.8579771602871703
REMOVE STOPWORDS? : False
accuracy: 0.8466874976751106


In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import metrics
from sklearn.pipeline import Pipeline
import os

# Quick version of the n-gram / stop-word comparison that fits on the first
# TRAIN_SUBSET_SIZE training reviews and scores on the first TEST_SUBSET_SIZE
# test reviews, so the sweep runs fast while iterating.
#
# The label -> n-gram range lookup is done by value through the dict below.
# The earlier `classifier is 'uni'` comparisons tested object identity, which
# only works when the interpreter happens to intern the strings and emits a
# SyntaxWarning on Python 3.8+.
TRAIN_SUBSET_SIZE = 10000
TEST_SUBSET_SIZE = 2000

ngram_ranges = {
    'uni': (1, 1),      # unigrams only
    'bi': (2, 2),       # bigrams only
    'uni_bi': (1, 2),   # unigrams and bigrams
}

for classifier, ngram_range in ngram_ranges.items():
    print(f'------------------ CLASSIFIER: {classifier} ------------------')

    # Iterate over datasets with and without stopwords
    for without_stopwords_bool in [True, False]:
        # 'english' selects CountVectorizer's built-in list; None keeps every token.
        stop_words = 'english' if without_stopwords_bool else None
        print(f'REMOVE STOPWORDS? : {without_stopwords_bool}')

        # ============================
        # TRAINING
        # ============================
        clf = Pipeline(steps=[
            ('vectorizer', CountVectorizer(ngram_range=ngram_range, stop_words=stop_words)),
            ('model', MultinomialNB(alpha=0.5)),
        ])

        clf = clf.fit(reviews_train[:TRAIN_SUBSET_SIZE], labels_train[:TRAIN_SUBSET_SIZE])

        # ============================
        # RESULTS FROM TUNING
        # ============================
        predictions = clf.predict(reviews_test[:TEST_SUBSET_SIZE])
        score = metrics.accuracy_score(labels_test[:TEST_SUBSET_SIZE], predictions, normalize=True)
        print(f'accuracy: {score}')

We will proceed with bigrams as the text features for our baseline model, since they achieve the best accuracy in the results shown above. Next, we tune the MultinomialNB classifier:

### Tune Alpha Hyperparameter for MultinomialNB classifier

In [None]:
# Fit a bigram count vectorizer (English stop words removed) on the training
# reviews. The fitted vectorizer is kept so other text can be transformed with
# the same vocabulary.
vectorizer = CountVectorizer(stop_words='english', ngram_range=(2, 2))
X_train = vectorizer.fit_transform(reviews_train)

In [None]:

def tune_alpha(X_train, y_train, X_val, y_val, alphas=None):
    """Print the accuracy of MultinomialNB for each candidate alpha.

    A fresh MultinomialNB is fit on (X_train, y_train) for every alpha and
    scored with accuracy_score on (X_val, y_val).

    Parameters
    ----------
    X_train, y_train
        Features and labels passed to ``MultinomialNB.fit``.
    X_val, y_val
        Features and labels used to score each fitted model.
    alphas : iterable of float, optional
        Smoothing values to try. When omitted, the grid
        ``np.arange(0.25, 0.85, 0.05)`` is used.

    Returns
    -------
    None
        Results are printed, one line per alpha value.
    """
    if alphas is None:
        # Default grid; pass `alphas` explicitly to search a different range.
        alphas = np.arange(0.25, 0.85, 0.05)

    print('------------ CLASSIFIER: bi_sw_removed ------------')

    for a in alphas:
        mnb = MultinomialNB(alpha=a)
        mnb.fit(X_train, y_train)

        y_pred = mnb.predict(X_val)
        accuracy = accuracy_score(y_val, y_pred)
        print(f'accuracy for alpha value {a:.2f}: {accuracy}')
    

In [None]:
# Test features are still built here because the final evaluation needs them.
X_test = vectorizer.transform(reviews_test)

# Select alpha on a validation split carved out of the training data rather
# than on the test set, so the test set remains an unbiased estimate for the
# final model. The split is stratified to keep the class balance and seeded
# for reproducibility.
# NOTE: the vectorizer vocabulary was fit on all training reviews, so the
# validation rows are not fully unseen at the feature-extraction stage.
X_tune, X_val, y_tune, y_val = train_test_split(
    X_train, labels_train, test_size=0.2, random_state=1, stratify=labels_train
)

tune_alpha(X_tune, y_tune, X_val, y_val)

From the results, we see that an alpha value of 0.5 gives the best performance. Therefore, we will use a Multinomial Naive Bayes classifier with alpha = 0.5, trained on text transformed into bigrams with stopwords removed. To evaluate the models, we can use accuracy, precision, recall, and/or F1-scores as metrics to quantify the classification of sentiment from the drug reviews.

*Update*: When trained on the full training dataset, an alpha value of 0.4 gives the best performance, so the final model below uses alpha = 0.4.

In [10]:
# Final baseline: MultinomialNB with alpha=0.4 on the bigram count features.
# fit() returns the estimator itself, so construction and training are chained.
nb = MultinomialNB(alpha=0.4).fit(X_train, labels_train)
preds = nb.predict(X_test)

# Overall accuracy on the test reviews.
accuracy = accuracy_score(labels_test, preds)
print(accuracy)

0.8634638991184019


In [11]:
# Per-class precision, recall and F1 for the test predictions; print() keeps the report's text layout.
print(classification_report(labels_test, preds))

              precision    recall  f1-score   support

         neg       0.86      0.78      0.82     11838
     neutral       0.83      0.65      0.73      9579
         pos       0.87      0.96      0.91     32349

    accuracy                           0.86     53766
   macro avg       0.85      0.80      0.82     53766
weighted avg       0.86      0.86      0.86     53766

