<a href="https://colab.research.google.com/github/boskya/aai501-drug-reviews/blob/main/DrugReviewsBERTBased.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

AAI-501 Final Project: Predicting Patient Sentiment in Drug Reviews

Some portions of this code were generated with ChatGPT (OpenAI, 2023) 

References:
OpenAI. (2023). ChatGPT (Mar 14 version) [Large language model]. https://chat.openai.com/chat

In [6]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

# load the training file and examine content
drugs_training_data_path = "sample_data/drugsComTrain_raw.tsv"
drugs_training_data = pd.read_csv(drugs_training_data_path , sep="\t")
print(drugs_training_data.head())
print(drugs_training_data.shape)

# laod the test file and examine content
drugs_test_data_path = "sample_data/drugsComTest_raw.tsv"
drugs_test_data = pd.read_csv(drugs_test_data_path , sep="\t")
drugs_test_data.head()
print(drugs_test_data.shape)

   Unnamed: 0                  drugName                     condition  \
0      206461                 Valsartan  Left Ventricular Dysfunction   
1       95260                Guanfacine                          ADHD   
2       92703                    Lybrel                 Birth Control   
3      138000                Ortho Evra                 Birth Control   
4       35696  Buprenorphine / naloxone             Opiate Dependence   

                                              review  rating  \
0  "It has no side effect, I take it in combinati...     9.0   
1  "My son is halfway through his fourth week of ...     8.0   
2  "I used to take another oral contraceptive, wh...     5.0   
3  "This is my first time using any form of birth...     8.0   
4  "Suboxone has completely turned my life around...     9.0   

                date  usefulCount  
0       May 20, 2012           27  
1     April 27, 2010          192  
2  December 14, 2009           17  
3   November 3, 2015           1

In [7]:
def label_sentiment(rating):
    if rating >= 7:
        return "positive"
    elif rating > 4 and rating < 7:
        return "neutral"
    else:
        return "negative"

# Apply the labeling function to the 'rating' column
drugs_training_data['sentiment'] = drugs_training_data['rating'].apply(label_sentiment)
drugs_test_data['sentiment'] = drugs_test_data['rating'].apply(label_sentiment)

# Preview the labeled data
print("\nSentiment distribution in training data:")
print(drugs_training_data['sentiment'].value_counts())

print("\nSentiment distribution in test data:")
print(drugs_test_data['sentiment'].value_counts())

# View a sample of reviews with sentiment
print("\nSample labeled reviews (training data):")
print(drugs_training_data[['review', 'rating', 'sentiment']].head())


Sentiment distribution in training data:
sentiment
positive    106866
negative     40075
neutral      14356
Name: count, dtype: int64

Sentiment distribution in test data:
sentiment
positive    35440
negative    13497
neutral      4829
Name: count, dtype: int64

Sample labeled reviews (training data):
                                              review  rating sentiment
0  "It has no side effect, I take it in combinati...     9.0  positive
1  "My son is halfway through his fourth week of ...     8.0  positive
2  "I used to take another oral contraceptive, wh...     5.0   neutral
3  "This is my first time using any form of birth...     8.0  positive
4  "Suboxone has completely turned my life around...     9.0  positive


In [8]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Raw text data
X_raw_bert = drugs_training_data['review']
y_raw_bert = drugs_training_data['sentiment']

X_test_raw_bert = drugs_test_data['review']
y_test_raw_bert  = drugs_test_data['sentiment']

# Encode labels
label_encoder = LabelEncoder()
y_encoded_bert = label_encoder.fit_transform(y_raw_bert)
y_test_encoded_bert = label_encoder.transform(y_test_raw_bert)

# Split into train and validation sets
X_train_raw_bert, X_val_raw_bert, y_train_encoded_bert, y_val_encoded_bert = train_test_split(
    X_raw_bert, y_encoded_bert, test_size=0.2, random_state=42
)

print("Train data size:", len(X_train_raw_bert))
print("Validation data size:", len(X_val_raw_bert))
print("Test data size:", len(X_test_raw_bert))

Train data size: 129037
Validation data size: 32260
Test data size: 53766


In [9]:
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import Trainer, TrainingArguments
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import accuracy_score, f1_score
from torch.nn import CrossEntropyLoss


import torch

# Step 1: Preprocessing
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')



print(len(X_train_raw_bert), len(y_train_encoded_bert))  # Should match
print(len(X_val_raw_bert), len(y_val_encoded_bert))           # Should match
print(len(X_test_raw_bert), len(y_test_encoded_bert))         # Should match
# Define a custom dataset class for PyTorch
class DrugReviewDataset(Dataset):
    def __init__(self, reviews, labels, tokenizer, max_len=128):
        self.reviews = reviews
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.reviews)

    def __getitem__(self, idx):
        review = self.reviews[idx]
        label = self.labels[idx]
        encoding = self.tokenizer.encode_plus(
            review,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors="pt"
        )
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# Prepare datasets
train_dataset = DrugReviewDataset(X_train_raw_bert.tolist(), y_train_encoded_bert, tokenizer)
val_dataset = DrugReviewDataset(X_val_raw_bert.tolist(), y_val_encoded_bert, tokenizer)
test_dataset = DrugReviewDataset(X_test_raw_bert.tolist(), y_test_encoded_bert, tokenizer)

# Step 2: Model Initialization
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)

# class weights for imbalance
class_counts = torch.tensor([list(y_train_encoded_bert).count(i) for i in range(3)])
class_weights = 1. / class_counts.float()
class_weights = class_weights / class_weights.sum()  # Normalize weights
class_weights_dict = {i: class_weights[i] for i in range(3)}

# Step 3: Define a custom loss function with class weights
criterion = torch.nn.CrossEntropyLoss(weight=torch.tensor(class_weights).to('cuda'))

# Define Training Arguments
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    save_strategy="epoch",

    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=100,
    load_best_model_at_end=True,
    label_smoothing_factor=0.1
)

# Trainer API
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=lambda p: {
        "accuracy": accuracy_score(p.label_ids, p.predictions.argmax(-1)),
        "f1": f1_score(p.label_ids, p.predictions.argmax(-1), average="weighted")
    }
)

# Train the Model
trainer.train()

# Evaluate on Test Set
predictions = trainer.predict(test_dataset)
test_accuracy = accuracy_score(y_test_encoded_bert, predictions.predictions.argmax(-1))
test_f1 = f1_score(y_test_encoded_bert, predictions.predictions.argmax(-1), average="weighted")

print("Test Accuracy:", test_accuracy)
print("Test F1 Score:", test_f1)

129037 129037
32260 32260
53766 53766


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  criterion = torch.nn.CrossEntropyLoss(weight=torch.tensor(class_weights).to('cuda'))
  trainer = Trainer(
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.5506,0.563876,0.847861,0.836457
2,0.4942,0.539342,0.865282,0.864803
3,0.4449,0.551973,0.875418,0.873798


Test Accuracy: 0.8700293866011978
Test F1 Score: 0.8692972095660425


In [10]:
from sklearn.metrics import confusion_matrix, classification_report
y_pred = predictions.predictions.argmax(-1)
report = classification_report(y_test_encoded_bert, y_pred, target_names=label_encoder.classes_)
print(report)

              precision    recall  f1-score   support

    negative       0.85      0.83      0.84     13497
     neutral       0.45      0.44      0.45      4829
    positive       0.93      0.94      0.94     35440

    accuracy                           0.87     53766
   macro avg       0.75      0.74      0.74     53766
weighted avg       0.87      0.87      0.87     53766

