<a href="https://colab.research.google.com/github/amanullahshah32/CSE498R/blob/main/Sentiment%20Analysis/%20Nigar/%20SentimentAnalysis(Bert2).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## 1. Install Required Libraries

## 2. Load and Preprocess Data

In [1]:
!nvidia-smi

Fri Oct 25 15:05:56 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 565.90                 Driver Version: 565.90         CUDA Version: 12.7     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                  Driver-Model | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 3060      WDDM  |   00000000:10:00.0  On |                  N/A |
|  0%   58C    P0             37W /  170W |     715MiB /  12288MiB |      2%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [2]:
import torch

print("Number of GPU: ", torch.cuda.device_count())
print("GPU Name: ", torch.cuda.get_device_name())


device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)

Number of GPU:  1
GPU Name:  NVIDIA GeForce RTX 3060
Using device: cuda


In [3]:
import pandas as pd
import time
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight

# Load your scraped data from the provided URL
url = "all_app_reviews.csv"
df = pd.read_csv(url)

# Check the column names to find the appropriate columns
print(df.columns)

# Drop rows where 'review_description' or 'rating' are missing
df.dropna(subset=['review_description', 'rating'], inplace=True)

# Create a sentiment column based on rating (assuming rating scale is 1-5)
# Mapping: 1-2 -> Negative, 3 -> Neutral, 4-5 -> Positive
df['sentiment'] = df['rating'].apply(lambda x: 0 if x <= 2 else (1 if x == 3 else 2))

# Split the data into training and validation sets
train_texts, val_texts, train_labels, val_labels = train_test_split(df['review_description'], df['sentiment'], test_size=0.2, random_state=42)

# Convert labels to list
train_labels = train_labels.tolist()
val_labels = val_labels.tolist()



Index(['source', 'review_id', 'user_name', 'review_title',
       'review_description', 'rating', 'thumbs_up', 'review_date',
       'developer_response', 'developer_response_date', 'appVersion',
       'language_code', 'country_code', 'app_name'],
      dtype='object')


## 3. Handle Class Imbalance

In [4]:
from imblearn.over_sampling import RandomOverSampler

# Initialize RandomOverSampler
ros = RandomOverSampler(random_state=42)

# Since train_texts is a pandas Series, we need to reshape it to a DataFrame
train_texts_df = pd.DataFrame(train_texts)

# Apply oversampling to balance the classes
train_texts_resampled, train_labels_resampled = ros.fit_resample(train_texts_df, train_labels)

# Convert the DataFrame of resampled texts back to a list
train_texts_resampled = train_texts_resampled.squeeze().tolist()  # .squeeze() ensures a flat list



## 4. Tokenization with BERT

In [5]:
from transformers import BertTokenizer

# Load the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize the text data
train_encodings = tokenizer(train_texts_resampled, truncation=True, padding=True, max_length=128)
val_encodings = tokenizer(val_texts.tolist(), truncation=True, padding=True, max_length=128)


  from .autonotebook import tqdm as notebook_tqdm


## 5. Create a Dataset Class for PyTorch

In [6]:
import torch
from torch.utils.data import Dataset

class ReviewDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

# Create the PyTorch datasets
train_dataset = ReviewDataset(train_encodings, train_labels_resampled)
val_dataset = ReviewDataset(val_encodings, val_labels)


## 6. Load Pre-trained BERT Model

In [7]:
from transformers import BertForSequenceClassification

# Load the pre-trained BERT model for sequence classification (3 classes)
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## 7. Set Up DataLoader, Optimizer, and Scheduler

In [8]:
from torch.utils.data import DataLoader
from transformers import AdamW, get_linear_schedule_with_warmup

# Create DataLoaders
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)

# Optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=5e-5)
total_steps = len(train_loader) * 3  # 3 epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)




## 8. Class Weights for Imbalance

In [9]:
import numpy as np
import torch
from sklearn.utils.class_weight import compute_class_weight

# Define the device
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Convert the class labels to a NumPy array
classes = np.array([0, 1, 2])

# Calculate class weights
class_weights = compute_class_weight('balanced', classes=classes, y=train_labels_resampled)

# Convert to a PyTorch tensor and move it to the appropriate device
class_weights = torch.tensor(class_weights, dtype=torch.float).to(device)

# Use the weights in the loss function
loss_fn = torch.nn.CrossEntropyLoss(weight=class_weights)

## 9. Training Loop

In [10]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)
device

device(type='cuda')

In [11]:
import time
from sklearn.metrics import accuracy_score, classification_report



# Loop for training
for epoch in range(3):  # Training for 3 epochs
    start_time = time.time()  # Start time for the epoch

    # Training loop
    model.train()
    for batch in train_loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

    end_time = time.time()  # End time for the epoch
    epoch_duration = end_time - start_time  # Time taken for the epoch

    print(f'Epoch {epoch+1} completed in {epoch_duration:.2f} seconds')

    # Validation loop
    model.eval()
    val_preds = []
    with torch.no_grad():
        for batch in val_loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            logits = outputs.logits
            predictions = torch.argmax(logits, dim=-1)
            val_preds.extend(predictions.cpu().numpy())

    # Calculate validation accuracy
    print(f'Validation Accuracy: {accuracy_score(val_labels, val_preds)}')
    print(f'Classification Report:\n {classification_report(val_labels, val_preds)}')


Epoch 1 completed in 2802.17 seconds
Validation Accuracy: 0.8046497584541062
Classification Report:
               precision    recall  f1-score   support

           0       0.67      0.68      0.68      1983
           1       0.08      0.32      0.13       670
           2       0.95      0.84      0.89     17219

    accuracy                           0.80     19872
   macro avg       0.57      0.61      0.56     19872
weighted avg       0.89      0.80      0.84     19872

Epoch 2 completed in 2762.39 seconds
Validation Accuracy: 0.811292270531401
Classification Report:
               precision    recall  f1-score   support

           0       0.66      0.66      0.66      1983
           1       0.08      0.28      0.12       670
           2       0.95      0.85      0.90     17219

    accuracy                           0.81     19872
   macro avg       0.56      0.60      0.56     19872
weighted avg       0.89      0.81      0.85     19872

Epoch 3 completed in 3014.94 seconds


## 10. Save and Load the Model

In [12]:
model.save_pretrained('sentiment_model')
tokenizer.save_pretrained('sentiment_model')


('sentiment_model\\tokenizer_config.json',
 'sentiment_model\\special_tokens_map.json',
 'sentiment_model\\vocab.txt',
 'sentiment_model\\added_tokens.json')

To load the model later:

In [13]:
from transformers import BertForSequenceClassification, BertTokenizer

model = BertForSequenceClassification.from_pretrained('sentiment_model')
tokenizer = BertTokenizer.from_pretrained('sentiment_model')


## 11. Make Predictions on New Data

In [15]:
def predict_sentiment(review_text):
    inputs = tokenizer(review_text, return_tensors="pt", truncation=True, padding=True, max_length=128)
    inputs = {key: value.to(device) for key, value in inputs.items()}  # Move tensors to the GPU
    outputs = model(**inputs)
    logits = outputs.logits
    predicted_class = torch.argmax(logits, dim=-1).item()
    return predicted_class

# Example
new_review = "This app is very helpful and easy to use."
predicted_class = predict_sentiment(new_review)
print(f"Predicted sentiment: {predicted_class}")


RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument index in method wrapper_CUDA__index_select)

In [17]:
pip install blis

Note: you may need to restart the kernel to use updated packages.


In [18]:
import spacy
spacy.require_gpu()


ValueError: BLIS support requires blis: pip install blis