<a href="https://colab.research.google.com/github/abdulrahman-riyad/real-time-reviews-analysis/blob/Approach2/aspect_sentiment_analysis_classifier_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import re
import string
import torch
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from google.colab import drive
from transformers import Trainer, TrainingArguments, AutoTokenizer, RobertaModel
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm

In [2]:
drive.mount('/content/drive')
data_path = '/content/drive/My Drive/restaurants_reviews_dataset'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Data Preparation


## Loading the Data

In [3]:
df1 = pd.read_csv(data_path + '/Restaurants_Train_v2.csv')
df1.head()

Unnamed: 0,id,Sentence,Aspect Term,polarity,from,to
0,3121,But the staff was so horrible to us.,staff,negative,8,13
1,2777,"To be completely fair, the only redeeming fact...",food,positive,57,61
2,1634,"The food is uniformly exceptional, with a very...",food,positive,4,8
3,1634,"The food is uniformly exceptional, with a very...",kitchen,positive,55,62
4,1634,"The food is uniformly exceptional, with a very...",menu,neutral,141,145


In [4]:
df2 = pd.read_csv(data_path + '/Laptop_Train_v2.csv')
df2.head()

Unnamed: 0,id,Sentence,Aspect Term,polarity,from,to
0,2339,I charge it at night and skip taking the cord ...,cord,neutral,41,45
1,2339,I charge it at night and skip taking the cord ...,battery life,positive,74,86
2,1316,The tech guy then said the service center does...,service center,negative,27,41
3,1316,The tech guy then said the service center does...,"""sales"" team",negative,109,121
4,1316,The tech guy then said the service center does...,tech guy,neutral,4,12


In [5]:
df = pd.concat([df1, df2])

In [6]:
df.isna().sum()

Unnamed: 0,0
id,0
Sentence,0
Aspect Term,0
polarity,0
from,0
to,0


In [7]:
df.shape

(6051, 6)

In [8]:
df["polarity"].value_counts()

Unnamed: 0_level_0,count
polarity,Unnamed: 1_level_1
positive,3151
negative,1671
neutral,1093
conflict,136


## Dropping Unimportant Columns & Rows

In [9]:
df.drop(['id', 'from', 'to'], axis=1, inplace=True)

In [10]:
df.head()

Unnamed: 0,Sentence,Aspect Term,polarity
0,But the staff was so horrible to us.,staff,negative
1,"To be completely fair, the only redeeming fact...",food,positive
2,"The food is uniformly exceptional, with a very...",food,positive
3,"The food is uniformly exceptional, with a very...",kitchen,positive
4,"The food is uniformly exceptional, with a very...",menu,neutral


In [11]:
df.drop(df[df.polarity == 'conflict'].index, inplace = True)

In [12]:
df = df.rename(columns={"Sentence": "text", "Aspect Term": "aspect", "polarity": "label"})

In [13]:
df.shape

(5818, 3)

## Cleaning Reviews Text

In [14]:
def clean_text(text):
  """Clean and preprocess text data
  Parameters:
  -----------
  text : str
  The text to clean

  Returns:
  --------
  str
  Cleaned text
  """

  # Convert to lowercase
  text = text.lower()

  # Handle contractions
  contractions = {
      "isn't": "is not", "aren't": "are not", "wasn't": "was not", "weren't": "were not","haven't": "have not",
      "hasn't": "has not", "hadn't": "had not", "doesn't": "does not", "don't": "do not", "didn't": "did not",
      "won't": "will not", "wouldn't": "would not", "can't": "cannot", "couldn't": "could not", "shouldn't": "should not",
      "mightn't": "might not", "mustn't": "must not", "i'm": "i am", "you're": "you are", "he's": "he is", "she's": "she is",
      "it's": "it is", "we're": "we are", "they're": "they are", "i've": "i have", "you've": "you have", "we've": "we have",
      "they've": "they have", "i'd": "i would", "you'd": "you would", "he'd": "he would", "she'd": "she would", "it'd": "it would",
      "we'd": "we would", "they'd": "they would", "i'll": "i will", "you'll": "you will", "he'll": "he will", "she'll": "she will",
      "it'll": "it will", "we'll": "we will", "they'll": "they will", "didnt": "did not", "dont": "do not", "cant": "cannot", "wont": "will not",
      }

  for contraction, expansion in contractions.items():
    text = text.replace(contraction, expansion)

  # Preserve emotions
  emoticons = {
      ':)': ' HAPPY_FACE ',
      ':(': ' SAD_FACE ',
      ':D': ' LAUGH_FACE ',
      ':/': ' CONFUSED_FACE '
      }
  for emoticon, replacement in emoticons.items():
    text = text.replace(emoticon, replacement)


  # Remove punctuation but preserve sentence structure
  text = re.sub(f'[{re.escape(string.punctuation)}]', ' ', text)

  # Remove extra whitespace
  text = re.sub(r'\s+', ' ', text).strip()

  # Restore emotions
  for placeholder, emoticon in {v: k for k, v in emoticons.items()}.items():
    text = text.replace(placeholder, emoticon)

  return text

## Encoding Polarity Column

In [15]:
sentiment_encoder = LabelEncoder()

df['label'] = sentiment_encoder.fit_transform(df['label'])

label_mapping = dict(zip(sentiment_encoder.classes_, range(len(sentiment_encoder.classes_))))
print(label_mapping)

{'negative': 0, 'neutral': 1, 'positive': 2}


In [16]:
df['label'].dtype

dtype('int64')

In [17]:
df["label"] = df["label"].apply(
    lambda x: x.iloc[0] if isinstance(x, pd.Series) else (x[0] if isinstance(x, list) else x)
)


In [18]:
print(type(df.loc[0, "label"]))   # Should print: <class 'int'>


<class 'pandas.core.series.Series'>


In [19]:
print(df["label"].iloc[0])
print(type(df["label"].iloc[0]))

0
<class 'numpy.int64'>


# Fine-Tuning roBERTa

In [20]:
train_df, tst_df = train_test_split(df, test_size=0.2, random_state=42)
val_df, test_df = train_test_split(tst_df, test_size=0.3, random_state=42)

## Trial 1


### Tokenizer

In [21]:
tokenizer = AutoTokenizer.from_pretrained("roberta-base")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


### Dataset Class


In [22]:
class AspectSentimentDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_len=128):
        self.df = dataframe.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
      if isinstance(idx, (list, np.ndarray, pd.Series)):
            if len(idx) != 1:
                raise ValueError(f"Expected scalar idx, but got batch: {idx}")
            idx = int(idx[0])
      elif isinstance(idx, torch.Tensor):
            idx = idx.item()
      else:
            idx = int(idx)

      # Safe scalar access
      review = str(self.df.at[idx, "text"])
      aspect = str(self.df.at[idx, "aspect"])
      label = int(self.df.at[idx, "label"])

      encoded = self.tokenizer(
          review,
          aspect,
          truncation=True,
          padding="max_length",
          max_length=self.max_len,
          return_tensors="pt"
      )
      item = {
          "input_ids": encoded["input_ids"].squeeze(0),
          "attention_mask": encoded["attention_mask"][0].squeeze(0),
          "label": torch.tensor(label, dtype=torch.long)
          }
      return item

In [23]:
train_dataset = AspectSentimentDataset(train_df, tokenizer)
val_dataset = AspectSentimentDataset(val_df, tokenizer)
test_dataset = AspectSentimentDataset(test_df, tokenizer)

# DataLoaders
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16)

### Model Architecture

In [24]:
class AspectSentimentClassifier(nn.Module):
    def __init__(self, model_name='roberta-base', num_labels=3):
        super().__init__()
        self.roberta = RobertaModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(0.2)

        hidden_size = self.roberta.config.hidden_size

        self.classifier = nn.Sequential(
            nn.Linear(hidden_size, 512),
            nn.ReLU(),
            nn.LayerNorm(512),
            nn.Linear(512, 128),
            nn.GELU(),
            nn.Linear(128, num_labels)
        )

    def forward(self, input_ids, attention_mask):
        outputs = self.roberta(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        cls_embedding = outputs.last_hidden_state[:, 0, :]
        cls_embedding = self.dropout(cls_embedding)
        logits = self.classifier(cls_embedding)
        return logits

### Training

In [31]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

model = AspectSentimentClassifier()
model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=2e-5)

epochs = 5

def compute_accuracy(preds, labels):
    preds = torch.argmax(preds, dim=1)
    return (preds == labels).float().mean().item()

# Training
best_val_acc = 0.0
best_model_state_dict = None  # Will hold the state_dict of the best model

for epoch in range(epochs):
    model.train()
    total_train_loss = 0
    total_train_acc = 0

    loop = tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}")

    for batch in loop:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = criterion(outputs, labels)
        acc = compute_accuracy(outputs, labels)

        loss.backward()
        optimizer.step()

        total_train_loss += loss.item()
        total_train_acc += acc

        loop.set_postfix(loss=loss.item(), acc=acc)

    avg_train_loss = total_train_loss / len(train_loader)
    avg_train_acc = total_train_acc / len(train_loader)
    print(f"\nTrain Loss: {avg_train_loss:.4f} | Train Acc: {avg_train_acc:.4f}")

    # Validation
    model.eval()
    total_val_loss = 0
    total_val_acc = 0

    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["label"].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            loss = criterion(outputs, labels)
            acc = compute_accuracy(outputs, labels)

            total_val_loss += loss.item()
            total_val_acc += acc

    avg_val_loss = total_val_loss / len(val_loader)
    avg_val_acc = total_val_acc / len(val_loader)
    print(f"Val Loss: {avg_val_loss:.4f} | Val Acc: {avg_val_acc:.4f}\n")

    # Save best model state dict in variable
    if avg_val_acc > best_val_acc:
        best_val_acc = avg_val_acc
        best_model_state_dict = model.state_dict()
        print(f"✅ New best model saved with acc: {best_val_acc:.4f}\n")


HTTP Error 429 thrown while requesting HEAD https://huggingface.co/roberta-base/resolve/main/config.json
Retrying in 1s [Retry 1/5].


Using device: cuda


HTTP Error 429 thrown while requesting HEAD https://huggingface.co/roberta-base/resolve/main/config.json
Retrying in 2s [Retry 2/5].
HTTP Error 429 thrown while requesting HEAD https://huggingface.co/roberta-base/resolve/main/config.json
Retrying in 4s [Retry 3/5].
HTTP Error 429 thrown while requesting HEAD https://huggingface.co/roberta-base/resolve/main/config.json
Retrying in 8s [Retry 4/5].
HTTP Error 429 thrown while requesting HEAD https://huggingface.co/roberta-base/resolve/main/config.json
Retrying in 8s [Retry 5/5].
HTTP Error 429 thrown while requesting HEAD https://huggingface.co/roberta-base/resolve/main/config.json
HTTP Error 429 thrown while requesting HEAD https://huggingface.co/roberta-base/resolve/main/config.json
Retrying in 1s [Retry 1/5].
HTTP Error 429 thrown while requesting HEAD https://huggingface.co/roberta-base/resolve/main/config.json
Retrying in 2s [Retry 2/5].
HTTP Error 429 thrown while requesting HEAD https://huggingface.co/roberta-base/resolve/main/conf


Train Loss: 0.7056 | Train Acc: 0.7123
Val Loss: 0.5244 | Val Acc: 0.7885

✅ New best model saved with acc: 0.7885



Epoch 2/5: 100%|██████████| 291/291 [01:45<00:00,  2.77it/s, acc=1, loss=0.213]



Train Loss: 0.4653 | Train Acc: 0.8232
Val Loss: 0.4247 | Val Acc: 0.8330

✅ New best model saved with acc: 0.8330



Epoch 3/5: 100%|██████████| 291/291 [01:44<00:00,  2.79it/s, acc=0.857, loss=0.3]



Train Loss: 0.3060 | Train Acc: 0.8859
Val Loss: 0.4865 | Val Acc: 0.8265



Epoch 4/5: 100%|██████████| 291/291 [01:44<00:00,  2.78it/s, acc=1, loss=0.0968]



Train Loss: 0.2283 | Train Acc: 0.9199
Val Loss: 0.5236 | Val Acc: 0.8438

✅ New best model saved with acc: 0.8438



Epoch 5/5: 100%|██████████| 291/291 [01:44<00:00,  2.79it/s, acc=0.857, loss=0.421]



Train Loss: 0.1564 | Train Acc: 0.9499
Val Loss: 0.4748 | Val Acc: 0.8477

✅ New best model saved with acc: 0.8477



### Saving Model

In [34]:
torch.save(best_model_state_dict, data_path+"/roBERTa_model_v1")

In [35]:
tokenizer.save_pretrained(data_path+"/roBERTa_model1_tokenizer")

('/content/drive/My Drive/restaurants_reviews_dataset/roBERTa_model1_tokenizer/tokenizer_config.json',
 '/content/drive/My Drive/restaurants_reviews_dataset/roBERTa_model1_tokenizer/special_tokens_map.json',
 '/content/drive/My Drive/restaurants_reviews_dataset/roBERTa_model1_tokenizer/vocab.json',
 '/content/drive/My Drive/restaurants_reviews_dataset/roBERTa_model1_tokenizer/merges.txt',
 '/content/drive/My Drive/restaurants_reviews_dataset/roBERTa_model1_tokenizer/added_tokens.json',
 '/content/drive/My Drive/restaurants_reviews_dataset/roBERTa_model1_tokenizer/tokenizer.json')