In [24]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

# load the training file and examine content
drugs_training_data_path = "data/drugsComTrain_raw.tsv"
drugs_training_data = pd.read_csv(drugs_training_data_path , sep="\t")
print(drugs_training_data.head())
print(drugs_training_data.shape)

# laod the test file and examine content
drugs_test_data_path = "data/drugsComTest_raw.tsv"
drugs_test_data = pd.read_csv(drugs_test_data_path , sep="\t")
drugs_test_data.head() 
print(drugs_test_data.shape)

   Unnamed: 0                  drugName                     condition  \
0      206461                 Valsartan  Left Ventricular Dysfunction   
1       95260                Guanfacine                          ADHD   
2       92703                    Lybrel                 Birth Control   
3      138000                Ortho Evra                 Birth Control   
4       35696  Buprenorphine / naloxone             Opiate Dependence   

                                              review  rating  \
0  "It has no side effect, I take it in combinati...     9.0   
1  "My son is halfway through his fourth week of ...     8.0   
2  "I used to take another oral contraceptive, wh...     5.0   
3  "This is my first time using any form of birth...     8.0   
4  "Suboxone has completely turned my life around...     9.0   

                date  usefulCount  
0       May 20, 2012           27  
1     April 27, 2010          192  
2  December 14, 2009           17  
3   November 3, 2015           1

In [15]:
# Check for missing values
print("\nMissing values in training data:")
print(drugs_training_data.isnull().sum())

print("\nMissing values in test data:")
print(drugs_test_data.isnull().sum())


Missing values in training data:
Unnamed: 0       0
drugName         0
condition      899
review           0
rating           0
date             0
usefulCount      0
dtype: int64

Missing values in test data:
Unnamed: 0       0
drugName         0
condition      295
review           0
rating           0
date             0
usefulCount      0
dtype: int64


In [23]:
# There is small portion, drop rows with missing condition
drugs_training_data = drugs_training_data.dropna(subset=['condition'])
drugs_test_data = drugs_test_data.dropna(subset=['condition'])


In [16]:
# Check for duplicates
print(f"\nDuplicates in training data: {drugs_training_data.duplicated().sum()}")
print(f"Duplicates in test data: {drugs_test_data.duplicated().sum()}")


Duplicates in training data: 0
Duplicates in test data: 0


In [36]:
# Summarize conditions
print("\nTop conditions in training data:")
print(drugs_training_data['condition'].value_counts().head(10))


Top conditions in training data:
condition
Birth Control      28788
Depression          9069
Pain                6145
Anxiety             5904
Acne                5588
Bipolar Disorde     4224
Insomnia            3673
Weight Loss         3609
Obesity             3568
ADHD                3383
Name: count, dtype: int64


In [37]:
print("\nTop conditions in test data:")
print(drugs_test_data['condition'].value_counts().head(10))


Top conditions in test data:
condition
Birth Control      9648
Depression         3095
Pain               2100
Anxiety            1908
Acne               1847
Bipolar Disorde    1380
Weight Loss        1248
Insomnia           1231
Obesity            1189
ADHD               1126
Name: count, dtype: int64


In [38]:
print("\nTop drugs in training data:")
print(drugs_training_data['drugName'].value_counts())


Top drugs in training data:
drugName
Levonorgestrel                       3657
Etonogestrel                         3336
Ethinyl estradiol / norethindrone    2850
Nexplanon                            2156
Ethinyl estradiol / norgestimate     2117
                                     ... 
Omnipaque 350                           1
Vontrol                                 1
Ivabradine                              1
Neo-Poly-Dex                            1
Grifulvin V                             1
Name: count, Length: 3436, dtype: int64


Of the top ten drugs, 7 of them are contraceptives. There are 3436 unqiue drug counts

In [40]:
# Summarize ratings distribution
print("\nRatings distribution in training data:")
print(drugs_training_data['rating'].value_counts())

print("\nRatings distribution in test data:")
print(drugs_test_data['rating'].value_counts())


Ratings distribution in training data:
rating
10.0    50989
9.0     27531
1.0     21619
8.0     18890
7.0      9456
5.0      8013
2.0      6931
3.0      6513
6.0      6343
4.0      5012
Name: count, dtype: int64

Ratings distribution in test data:
rating
10.0    17016
9.0      9177
1.0      7299
8.0      6156
7.0      3091
5.0      2710
2.0      2334
3.0      2205
6.0      2119
4.0      1659
Name: count, dtype: int64


In [44]:
# preprocess the reviews
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Download NLTK resources
nltk.download('all')

# Define stop words
stop_words = set(stopwords.words('english'))

def preprocess_review(review):
    # Remove special characters and numbers
    review = re.sub(r"[^a-zA-Z\s]", "", review)
    # Convert to lowercase
    review = review.lower()
    # Tokenize
    words = word_tokenize(review)
    # Remove stopwords
    words = [word for word in words if word not in stop_words]
    # Join back into a single string
    return " ".join(words)

# Apply preprocessing to the 'review' column
drugs_training_data['cleaned_review'] = drugs_training_data['review'].apply(preprocess_review)
drugs_test_data['cleaned_review'] = drugs_test_data['review'].apply(preprocess_review)

# Preview the cleaned reviews
print("\nSample cleaned reviews (training data):")
print(drugs_training_data[['review', 'cleaned_review']].head())

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /Users/bosky/nltk_data...
[nltk_data]    |   Unzipping corpora/abc.zip.
[nltk_data]    | Downloading package alpino to
[nltk_data]    |     /Users/bosky/nltk_data...
[nltk_data]    |   Unzipping corpora/alpino.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /Users/bosky/nltk_data...
[nltk_data]    |   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_eng to
[nltk_data]    |     /Users/bosky/nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers/averaged_perceptron_tagger_eng.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /Users/bosky/nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers/averaged_perceptron_tagger_ru.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_rus to
[n


Sample cleaned reviews (training data):
                                              review  \
0  "It has no side effect, I take it in combinati...   
1  "My son is halfway through his fourth week of ...   
2  "I used to take another oral contraceptive, wh...   
3  "This is my first time using any form of birth...   
4  "Suboxone has completely turned my life around...   

                                      cleaned_review  
0  side effect take combination bystolic mg fish oil  
1  son halfway fourth week intuniv became concern...  
2  used take another oral contraceptive pill cycl...  
3  first time using form birth control im glad we...  
4  suboxone completely turned life around feel he...  


In [45]:
# Define function to map ratings to sentiment labels
def label_sentiment(rating):
    if rating >= 4:
        return "positive"
    elif rating == 3:
        return "neutral"
    else:
        return "negative"

# Apply the labeling function to the 'rating' column
drugs_training_data['sentiment'] = drugs_training_data['rating'].apply(label_sentiment)
drugs_test_data['sentiment'] = drugs_test_data['rating'].apply(label_sentiment)

# Preview the labeled data
print("\nSentiment distribution in training data:")
print(drugs_training_data['sentiment'].value_counts())

print("\nSentiment distribution in test data:")
print(drugs_test_data['sentiment'].value_counts())

# View a sample of reviews with sentiment
print("\nSample labeled reviews (training data):")
print(drugs_training_data[['cleaned_review', 'rating', 'sentiment']].head())


Sentiment distribution in training data:
sentiment
positive    126234
negative     28550
neutral       6513
Name: count, dtype: int64

Sentiment distribution in test data:
sentiment
positive    41928
negative     9633
neutral      2205
Name: count, dtype: int64

Sample labeled reviews (training data):
                                      cleaned_review  rating sentiment
0  side effect take combination bystolic mg fish oil     9.0  positive
1  son halfway fourth week intuniv became concern...     8.0  positive
2  used take another oral contraceptive pill cycl...     5.0  positive
3  first time using form birth control im glad we...     8.0  positive
4  suboxone completely turned life around feel he...     9.0  positive


In [46]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Initialize TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=5000)  # Use top 5000 features

# Transform reviews into TF-IDF features
X_train = tfidf_vectorizer.fit_transform(drugs_training_data['cleaned_review'])
X_test = tfidf_vectorizer.transform(drugs_test_data['cleaned_review'])

# Encode sentiment labels
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(drugs_training_data['sentiment'])
y_test = label_encoder.transform(drugs_test_data['sentiment'])

# Split the training data into train and validation sets
X_train_final, X_val, y_train_final, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Print shapes of datasets
print("Training set shape:", X_train_final.shape)
print("Validation set shape:", X_val.shape)
print("Test set shape:", X_test.shape)


Training set shape: (129037, 5000)
Validation set shape: (32260, 5000)
Test set shape: (53766, 5000)


In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# Initialize models
nb_model = MultinomialNB()
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

# Train Naïve Bayes model
nb_model.fit(X_train_final, y_train_final)
nb_val_preds = nb_model.predict(X_val)

# Train Random Forest model
rf_model.fit(X_train_final, y_train_final)
rf_val_preds = rf_model.predict(X_val)

# Evaluate Naive Bayes
print("\nNaive Bayes Classification Report (Validation Set):")
print(classification_report(y_val, nb_val_preds, target_names=label_encoder.classes_))
print("Accuracy:", accuracy_score(y_val, nb_val_preds))

# Evaluate Random Forest
print("\nRandom Forest Classification Report (Validation Set):")
print(classification_report(y_val, rf_val_preds, target_names=label_encoder.classes_))
print("Accuracy:", accuracy_score(y_val, rf_val_preds))

In [39]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# Load pre-trained tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")

# Generate text
text = "Once upon a time,"
input_ids = tokenizer.encode(text, return_tensors="pt")
output = model.generate(input_ids, max_length=100)
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
print(generated_text)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Once upon a time, the world was a place of great beauty and great danger. The world was a place of great danger, and the world was a place of great danger. The world was a place of great danger, and the world was a place of great danger. The world was a place of great danger, and the world was a place of great danger. The world was a place of great danger, and the world was a place of great danger. The world was a place of great
