### **Project Description:** This project compares the **accuracy of sentiment classification** of user food reviews using **VADER** (*Valence Aware Dictionary and sEntiment Reasoner*) and **BERT** (*Bidirectional Encoder Representations from Transformers*). The project uses the Kaggle Food Dataset (Link: https://www.kaggle.com/datasets/toygarr/datasets-for-natural-language-processing).

### **Install libraries**

In [None]:
%%capture
!pip install accelerate -U

### **Import libraries**

In [None]:
import nltk
import torch
import gdown
import re
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments

# Download the 'vader_lexicon' NLTK resource so the VADER analyzer
# instantiated later in this notebook can be used.
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

### **Load and observe data**

In [None]:
# Fetch the reviews CSV from Google Drive into the working directory.
url = 'https://drive.google.com/uc?id=1WjpirD4dZ0jBSUeF2-pvN9EODLS5tTHi'
output = 'reviews.csv'
downloaded = gdown.download(url, output, quiet=True)
if downloaded is None:
    # Some gdown versions report a failed download by returning None instead
    # of raising; stop here rather than reading a missing or stale file.
    raise RuntimeError(f'Failed to download {url}')

# Read the file that was just downloaded (the path is defined once, in `output`).
df = pd.read_csv(output)
df.info()   # prints column names, dtypes and non-null counts
df.head()   # last expression of the cell: shows the first rows

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 145293 entries, 0 to 145292
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   text    145293 non-null  object
 1   Y       145293 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 2.2+ MB


Unnamed: 0,text,Y
0,this is the perfect blend of spice and sweet i...,1
1,the only coffee shop in town that carried big ...,1
2,this was a great price but amazon is not going...,1
3,while i love this product i feel disappointed ...,1
4,i have color treated hair and wanted to try th...,0


###  **Split data into train (80%) and test (20%)**

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Later cells overwrite/add columns on both frames, so detach them from `df`
# explicitly. This avoids pandas' SettingWithCopyWarning on those assignments.
train_df = train_df.copy()
test_df = test_df.copy()

### **Preprocess the reviews**

In [None]:
def preprocess_text(text):
    """Return *text* lowercased, keeping only ASCII letters and whitespace.

    Every other character (digits, punctuation, accented letters, ...) is
    deleted outright; it is not replaced by a space.
    """
    lowered = text.lower()
    non_letter = re.compile(r'[^a-zA-Z\s]')
    return non_letter.sub('', lowered)

# Clean the review text of both splits in place with preprocess_text
for split_df in (train_df, test_df):
    split_df['text'] = split_df['text'].apply(preprocess_text)

### **Analyse sentiment of reviews using VADER**

In [None]:
# Shared VADER analyzer, created once and reused for every review
vader = SentimentIntensityAnalyzer()


def vader_sentiment(text):
    """Classify *text* as 1 or 0 from VADER's compound score.

    Returns 1 when the compound score is greater than or equal to zero
    (so a score of exactly 0 counts as 1) and 0 otherwise.
    """
    compound = vader.polarity_scores(text)['compound']
    return int(compound >= 0)

# Score every test review with VADER and store the binary label in a new column
test_reviews = test_df['text']
test_df['vader_sentiment'] = test_reviews.apply(vader_sentiment)

### **Load pre-trained BERT model and create torch datasets**

In [None]:
%%capture
# Tokenizer and classifier are loaded from the same pre-trained checkpoint;
# the classification head is configured with two output labels.
bert_checkpoint = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(bert_checkpoint)
model = BertForSequenceClassification.from_pretrained(bert_checkpoint, num_labels=2)

def tokenize_function(examples):
    """Tokenize ``examples['text']`` with the module-level BERT tokenizer.

    Every sequence is truncated and padded to a fixed length of 512 tokens.
    """
    texts = examples['text']
    return tokenizer(
        texts,
        truncation=True,
        max_length=512,
        padding='max_length',
    )

# Training split: labels come from the 'Y' column, encodings from the review text
train_labels = train_df['Y'].tolist()
train_encodings = tokenize_function(dict(text=train_df['text'].tolist()))

# Test split: only the encodings are built here
test_encodings = tokenize_function(dict(text=test_df['text'].tolist()))

# Create torch datasets
class SentimentDataset(torch.utils.data.Dataset):
    """Map-style dataset serving tokenizer encodings (and optional labels) as tensors.

    Args:
        encodings: Mapping from feature name (e.g. ``input_ids``) to a
            per-example sequence of values, as returned by the tokenizer.
        labels: Optional sequence holding one class label per example. When
            omitted, items carry no ``labels`` entry, so the dataset can be
            used for inference without fabricating dummy labels.
    """

    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return the features (and label, if available) of example *idx* as tensors."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self.labels is not None:
            item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Return the number of examples in the dataset."""
        if self.labels is not None:
            return len(self.labels)
        # Without labels, take the size from any one of the encoded features.
        first_feature = next(iter(self.encodings.values()), None)
        return 0 if first_feature is None else len(first_feature)

train_dataset = SentimentDataset(train_encodings, train_labels)
# The test set is also passed to the Trainer as eval_dataset, so it must carry
# the real labels: with all-zero placeholders the reported validation loss is
# computed against wrong targets and is meaningless. Predictions are unaffected.
test_dataset = SentimentDataset(test_encodings, test_df['Y'].tolist())

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### **Fine-tune the BERT model**

In [None]:
# Training arguments
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=1,              # total number of training epochs
    per_device_train_batch_size=32,  # batch size for training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    # Mixed precision needs a CUDA device; enable it only when one is present
    # so the notebook still runs (slowly) on a CPU-only runtime.
    fp16=torch.cuda.is_available(),
    gradient_accumulation_steps=2,   # accumulate gradients over 2 steps
    dataloader_num_workers=2,        # number of subprocesses to use for data loading
    evaluation_strategy="steps",     # evaluation strategy during training
    eval_steps=1000,                 # evaluate every 1000 steps
    logging_steps=500                # log every 500 steps
)

# Wire the model, its training configuration and both datasets into a Trainer
trainer = Trainer(
    args=training_args,           # configuration built in the previous step
    model=model,                  # BERT classifier to fine-tune
    eval_dataset=test_dataset,    # evaluated periodically during training
    train_dataset=train_dataset,  # examples used for the weight updates
)

# Run fine-tuning; as the cell's last expression, the TrainOutput is displayed
trainer.train()


  self.pid = os.fork()


Step,Training Loss,Validation Loss
1000,0.1262,5.238802


  self.pid = os.fork()


TrainOutput(global_step=1816, training_loss=0.13724871143895623, metrics={'train_runtime': 2787.3673, 'train_samples_per_second': 41.7, 'train_steps_per_second': 0.652, 'total_flos': 3.057981929816064e+16, 'train_loss': 0.13724871143895623, 'epoch': 0.9997247453894853})

### **Make predictions using the fine-tuned BERT model**

In [None]:
# Run the fine-tuned model over the test set, then take the highest-scoring
# class per review as the predicted sentiment
predictions = trainer.predict(test_dataset)
class_scores = predictions.predictions
test_df['bert_sentiment'] = class_scores.argmax(axis=1)

  self.pid = os.fork()


  self.pid = os.fork()


### **Compare accuracy and F1 score for VADER and BERT**

In [None]:
# Ground truth and the two sets of predictions being compared
y_true = test_df['Y']
y_vader = test_df['vader_sentiment']
y_bert = test_df['bert_sentiment']

# Accuracy of each approach
vader_accuracy = accuracy_score(y_true, y_vader)
bert_accuracy = accuracy_score(y_true, y_bert)

# Weighted F1 score of each approach
vader_f1 = f1_score(y_true, y_vader, average='weighted')
bert_f1 = f1_score(y_true, y_bert, average='weighted')

# Report both models side by side
print(f'VADER Accuracy: {vader_accuracy:.4f}, F1 Score: {vader_f1:.4f}')
print(f'BERT Accuracy: {bert_accuracy:.4f}, F1 Score: {bert_f1:.4f}')



VADER Accuracy: 0.8719, F1 Score: 0.8608
BERT Accuracy: 0.9669, F1 Score: 0.9668
