<a href="https://colab.research.google.com/github/amanullahshah32/CSE498R/blob/main/SentimentAnlysisWithXAI%2BBert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1. Install Required Libraries


In [None]:
!pip install transformers torch torch-optimizer imbalanced-learn scikit-learn matplotlib --quiet


# 2. Load and Preprocess the Data

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the scraped reviews from the project's GitHub repository
url = "https://raw.githubusercontent.com/amanullahshah32/Review-Scraping/refs/heads/main/Dataset/cleaned_dataset.csv"
df = pd.read_csv(url)

# Keep only rows that have both a review text and a rating.
# Reassigning (instead of inplace=True) keeps the cell safe to re-run.
df = df.dropna(subset=['review_description', 'rating'])

# Shuffle the rows with a fixed seed so the run is reproducible
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Derive the sentiment label from the rating (assuming a 1-5 rating scale):
#   rating <= 2 -> 0 (negative), rating == 3 -> 1 (neutral), anything else -> 2 (positive)
df['sentiment'] = df['rating'].apply(lambda x: 0 if x <= 2 else (1 if x == 3 else 2))

# Split into training and validation sets. The classes are heavily imbalanced,
# so the split is stratified on the label to give both sets the same class
# proportions.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['review_description'],
    df['sentiment'],
    test_size=0.2,
    random_state=42,
    stratify=df['sentiment'],
)

# Labels are used as plain Python lists downstream
train_labels = train_labels.tolist()
val_labels = val_labels.tolist()

# Display the first few rows
df.head()



# 3. Handle Class Imbalance

In [None]:
from imblearn.over_sampling import RandomOverSampler

# Random oversampler with a fixed seed for reproducibility
ros = RandomOverSampler(random_state=42)

# fit_resample is given a 2-D input, so the text Series is turned into a
# single-column DataFrame first
train_texts_df = train_texts.to_frame()

# Resample the training texts together with their labels
train_texts_resampled, train_labels_resampled = ros.fit_resample(
    train_texts_df,
    train_labels,
)

# Collapse the single-column DataFrame back into a flat Python list of strings
train_texts_resampled = train_texts_resampled.squeeze().tolist()


# 4. Tokenization with BERT

In [None]:
from transformers import BertTokenizer

# Tokenizer matching the 'bert-base-uncased' checkpoint
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Both splits are encoded with the same settings: truncate to at most
# 128 tokens and pad the sequences of each call to a common length
_tokenizer_options = dict(truncation=True, padding=True, max_length=128)

train_encodings = tokenizer(train_texts_resampled, **_tokenizer_options)
val_encodings = tokenizer(val_texts.tolist(), **_tokenizer_options)


# 5. Create a Dataset Class for PyTorch

In [None]:
import torch
from torch.utils.data import Dataset

class ReviewDataset(Dataset):
    """Map-style dataset pairing tokenizer encodings with integer labels.

    Args:
        encodings: mapping from field name (e.g. ``input_ids``) to a sequence
            holding one entry per example.
        labels: sequence with one label per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The number of examples is defined by the number of labels
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, with the label under 'labels'."""
        item = {}
        for field, rows in self.encodings.items():
            item[field] = torch.tensor(rows[idx])
        item['labels'] = torch.tensor(self.labels[idx])
        return item

# Wrap the encodings and labels of each split in a ReviewDataset
train_dataset, val_dataset = (
    ReviewDataset(train_encodings, train_labels_resampled),
    ReviewDataset(val_encodings, val_labels),
)


# 6. Load Pre-trained BERT Model

In [None]:
import torch
from transformers import BertForSequenceClassification

# Run on the GPU when PyTorch can see one, otherwise on the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# 'bert-base-uncased' with a sequence-classification head sized for the
# three sentiment classes
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=3,
)

# Place the model's parameters on the selected device
model.to(device)


# 7. Set Up DataLoader, Optimizer, and Scheduler

In [None]:
from torch.optim import AdamW
from torch.utils.data import DataLoader
from transformers import get_linear_schedule_with_warmup

# Create DataLoaders: shuffle only the training data so validation predictions
# stay aligned with val_labels
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True, num_workers=4)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False, num_workers=4)

# Optimizer: AdamW with weight decay and a small fine-tuning learning rate.
# PyTorch's own AdamW is used because the AdamW class that transformers used
# to export is deprecated and no longer importable from recent releases.
learning_rate = 3e-5
optimizer = AdamW(model.parameters(), lr=learning_rate, weight_decay=0.01)

# Linear learning-rate decay over the whole run (no warmup). The scheduler is
# stepped once per batch, so the total is batches-per-epoch times epochs.
epochs = 10
total_steps = len(train_loader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)




# 8. Class Weights for Imbalance

In [None]:
import numpy as np
from sklearn.utils.class_weight import compute_class_weight

# Pick the GPU when PyTorch can see one, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# The three sentiment class ids as a NumPy array
classes = np.array([0, 1, 2])

# 'balanced' class weights, turned into a float tensor on the target device.
# NOTE(review): the labels passed here are the oversampled ones, so the
# resulting weights are expected to be (near) equal -- confirm that combining
# oversampling with class weights is intended.
class_weights = torch.tensor(
    compute_class_weight('balanced', classes=classes, y=train_labels_resampled),
    dtype=torch.float,
).to(device)

# Cross-entropy loss that applies the per-class weights
loss_fn = torch.nn.CrossEntropyLoss(weight=class_weights)


# 9. Training Loop

In [None]:
import time
import torch
from sklearn.metrics import accuracy_score, classification_report
import matplotlib.pyplot as plt

# Per-epoch metrics, appended to as training progresses
train_accuracies = []
val_accuracies = []
epoch_durations = []

# Train for `epochs` epochs: the same value that sized the LR scheduler, so the
# linear decay reaches zero exactly at the end of the run
for epoch in range(epochs):
    start_time = time.time()  # Start time for the epoch

    # ---- Training pass ----
    model.train()
    train_preds = []
    train_labels_epoch = []  # Labels in the (shuffled) order they were seen
    for batch in train_loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        # Use the class-weighted loss defined in section 8 instead of the
        # model's built-in loss
        loss = loss_fn(outputs.logits, batch['labels'])
        loss.backward()
        optimizer.step()
        scheduler.step()  # the schedule is defined per optimisation step
        optimizer.zero_grad()

        # Collect predictions and the matching labels for the accuracy below
        train_preds.extend(torch.argmax(outputs.logits, dim=-1).cpu().numpy())
        train_labels_epoch.extend(batch['labels'].cpu().numpy())

    # The recorded duration covers the training pass only, not validation
    end_time = time.time()
    epoch_duration = end_time - start_time
    epoch_durations.append(epoch_duration)

    # Training accuracy is measured on the batches as they were trained on
    train_acc = accuracy_score(train_labels_epoch, train_preds)
    train_accuracies.append(train_acc)

    # ---- Validation pass ----
    model.eval()
    val_preds = []
    with torch.no_grad():
        for batch in val_loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            logits = outputs.logits
            predictions = torch.argmax(logits, dim=-1)
            val_preds.extend(predictions.cpu().numpy())

    # val_loader is not shuffled, so val_preds lines up with val_labels
    val_acc = accuracy_score(val_labels, val_preds)
    val_accuracies.append(val_acc)

    print(f'Epoch {epoch+1} completed in {epoch_duration:.2f} seconds')
    print(f'Training Accuracy: {train_acc:.4f}')
    print(f'Validation Accuracy: {val_acc:.4f}')

    # Classification report
    print(f'Classification Report (Validation):\n {classification_report(val_labels, val_preds)}')

# Visualize the results. The x-axis is built from the number of epochs that
# actually finished, so the plot also works after an interrupted run.
completed_epochs = range(1, len(train_accuracies) + 1)
plt.figure(figsize=(10, 5))
plt.plot(completed_epochs, train_accuracies, label='Training Accuracy')
plt.plot(completed_epochs, val_accuracies, label='Validation Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.title('Training and Validation Accuracy')
plt.legend()
plt.show()




Epoch 1 completed in 2661.51 seconds
Training Accuracy: 0.8774
Validation Accuracy: 0.8876
Classification Report (Validation):
               precision    recall  f1-score   support

           0       0.73      0.75      0.74      1312
           1       0.14      0.25      0.18       449
           2       0.96      0.93      0.94     11293

    accuracy                           0.89     13054
   macro avg       0.61      0.64      0.62     13054
weighted avg       0.91      0.89      0.90     13054





Epoch 2 completed in 2659.65 seconds
Training Accuracy: 0.9529
Validation Accuracy: 0.8867
Classification Report (Validation):
               precision    recall  f1-score   support

           0       0.70      0.77      0.73      1312
           1       0.12      0.18      0.15       449
           2       0.96      0.93      0.94     11293

    accuracy                           0.89     13054
   macro avg       0.59      0.62      0.61     13054
weighted avg       0.90      0.89      0.89     13054





Epoch 3 completed in 2658.48 seconds
Training Accuracy: 0.9614
Validation Accuracy: 0.8697
Classification Report (Validation):
               precision    recall  f1-score   support

           0       0.69      0.73      0.71      1312
           1       0.10      0.20      0.14       449
           2       0.95      0.91      0.93     11293

    accuracy                           0.87     13054
   macro avg       0.58      0.61      0.59     13054
weighted avg       0.90      0.87      0.88     13054





Epoch 4 completed in 2658.67 seconds
Training Accuracy: 0.9655
Validation Accuracy: 0.8867
Classification Report (Validation):
               precision    recall  f1-score   support

           0       0.74      0.68      0.71      1312
           1       0.11      0.15      0.13       449
           2       0.94      0.94      0.94     11293

    accuracy                           0.89     13054
   macro avg       0.60      0.59      0.59     13054
weighted avg       0.89      0.89      0.89     13054





# 10. Make New Predictions

In [None]:
# Example reviews to classify with the fine-tuned model
new_reviews = [
    "The app is very helpful for tracking my health.",
    "I had a bad experience, it kept crashing.",
    "Great app, I would definitely recommend it to others!"
]

# Encode the reviews as PyTorch tensors, using the same tokenizer settings as
# for the training data
new_encodings = tokenizer(
    new_reviews,
    truncation=True,
    padding=True,
    max_length=128,
    return_tensors='pt',
)

# The inputs have to live on the same device as the model
new_encodings = {key: val.to(device) for key, val in new_encodings.items()}

# Inference only: evaluation mode, no gradient tracking
model.eval()
with torch.no_grad():
    outputs = model(**new_encodings)
    predictions = outputs.logits.argmax(dim=-1).cpu().numpy()

# Report each review with its predicted class
# (0 = Negative, 1 = Neutral, 2 = Positive)
for review, pred in zip(new_reviews, predictions):
    sentiment = ('Negative', 'Neutral', 'Positive')[pred]
    print(f"Review: {review}\nPredicted Sentiment: {sentiment}\n")


# 11. SHAP for Model Explanation

In [None]:
import shap
import torch


def shap_predict(texts):
    """Return class probabilities for a batch of raw review strings.

    SHAP's text explainer calls the wrapped model with (masked) strings, so the
    BERT model is wrapped in this function, which tokenizes the strings, runs
    the model and returns one row of probabilities per input text.

    Args:
        texts: iterable of strings (SHAP may pass a NumPy array of strings).

    Returns:
        NumPy array of shape (len(texts), number of classes).
    """
    # The tokenizer expects a plain list of Python strings
    batch = [str(text) for text in texts]
    encodings = tokenizer(batch, truncation=True, padding=True, max_length=128, return_tensors='pt')
    # Inputs must be on the same device as the model
    encodings = {key: val.to(device) for key, val in encodings.items()}
    model.eval()
    with torch.no_grad():
        logits = model(**encodings).logits
    return torch.nn.functional.softmax(logits, dim=-1).cpu().numpy()


# Build the explainer from the prediction function; the tokenizer is passed as
# the masker so that SHAP perturbs the text token by token
explainer = shap.Explainer(shap_predict, tokenizer, output_names=['Negative', 'Neutral', 'Positive'])

# Pick a sample from the validation set to explain. val_texts keeps the
# shuffled index of the original DataFrame, so positional access (.iloc) is
# required: val_texts[0] would look up the index *label* 0.
sample_review = val_texts.iloc[0]  # Change the position to explain another review

# The explainer takes a list of raw strings, not tokenizer output
shap_values = explainer([sample_review])

# Token-level visualisation of the contributions to each class
shap.plots.text(shap_values)



# 12. LIME for Model Explanation

In [None]:
import lime
import torch
from lime.lime_text import LimeTextExplainer

# Initialize a LIME text explainer; class_names follow the label ids 0, 1, 2
explainer = LimeTextExplainer(class_names=['Negative', 'Neutral', 'Positive'])


def predict_proba(texts, batch_size=64):
    """Return class probabilities for a list of texts using the BERT model.

    LIME calls this function with many perturbed copies of the review at once,
    so the texts are pushed through the model in mini-batches instead of one
    large forward pass.

    Args:
        texts: iterable of strings.
        batch_size: number of texts per forward pass.

    Returns:
        NumPy array of shape (len(texts), number of classes).
    """
    texts = [str(text) for text in texts]
    if not texts:
        # Nothing to score: return an empty array with the right number of columns
        return torch.empty((0, model.config.num_labels)).numpy()

    model.eval()
    prob_chunks = []
    with torch.no_grad():
        for start in range(0, len(texts), batch_size):
            chunk = texts[start:start + batch_size]
            encodings = tokenizer(chunk, truncation=True, padding=True, max_length=128, return_tensors='pt')
            # Inputs must be on the same device as the model
            encodings = {key: val.to(device) for key, val in encodings.items()}
            logits = model(**encodings).logits
            prob_chunks.append(torch.nn.functional.softmax(logits, dim=-1).cpu())
    return torch.cat(prob_chunks).numpy()


# Choose a sample review from the validation set. val_texts keeps the shuffled
# index of the original DataFrame, so positional access (.iloc) is required:
# val_texts[0] would look up the index *label* 0.
sample_review = val_texts.iloc[0]

# Explain the prediction for the sample review using LIME
lime_explanation = explainer.explain_instance(sample_review, predict_proba, num_features=10)

# Visualize the LIME explanation
lime_explanation.show_in_notebook(text=True)
