<a href="https://colab.research.google.com/github/abdulrahman-riyad/real-time-reviews-analysis/blob/Approach2/aspect_sentiment_analysis_classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import re
import string

import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from google.colab import drive
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from transformers import Trainer, TrainingArguments, AutoTokenizer, RobertaModel

# Keep this import after the torch.utils.data one: it rebinds the bare name `Dataset`
# to the Hugging Face class. Use `torch.utils.data.Dataset` explicitly for the PyTorch base class.
from datasets import Dataset

In [None]:
# Mount Google Drive into the Colab filesystem so the dataset folder is reachable.
drive.mount('/content/drive')
# Base folder that later cells read the CSV files from and save model artifacts to.
data_path = '/content/drive/My Drive/restaurants_reviews_dataset'

Mounted at /content/drive


# Data Preparation


## Loading the Data

In [None]:
# Restaurant reviews: the preview shows one row per (sentence, aspect term) pair with its polarity.
df1 = pd.read_csv(data_path + '/Restaurants_Train_v2.csv')
df1.head()

Unnamed: 0,id,Sentence,Aspect Term,polarity,from,to
0,3121,But the staff was so horrible to us.,staff,negative,8,13
1,2777,"To be completely fair, the only redeeming fact...",food,positive,57,61
2,1634,"The food is uniformly exceptional, with a very...",food,positive,4,8
3,1634,"The food is uniformly exceptional, with a very...",kitchen,positive,55,62
4,1634,"The food is uniformly exceptional, with a very...",menu,neutral,141,145


In [None]:
# Laptop reviews: same column layout as the restaurant file.
df2 = pd.read_csv(data_path + '/Laptop_Train_v2.csv')
df2.head()

Unnamed: 0,id,Sentence,Aspect Term,polarity,from,to
0,2339,I charge it at night and skip taking the cord ...,cord,neutral,41,45
1,2339,I charge it at night and skip taking the cord ...,battery life,positive,74,86
2,1316,The tech guy then said the service center does...,service center,negative,27,41
3,1316,The tech guy then said the service center does...,"""sales"" team",negative,109,121
4,1316,The tech guy then said the service center does...,tech guy,neutral,4,12


In [None]:
# Stack both domains into one frame. ignore_index=True gives every row a unique label;
# without it both frames keep their own 0..n-1 labels, and any later label-based
# operation (e.g. df.drop(<index>)) would hit rows from both files at once.
df = pd.concat([df1, df2], ignore_index=True)

In [None]:
# Count missing values per column.
df.isna().sum()

Unnamed: 0,0
id,0
Sentence,0
Aspect Term,0
polarity,0
from,0
to,0


In [None]:
# (rows, columns) of the combined frame.
df.shape

(6051, 6)

In [None]:
# Number of rows per polarity class.
df["polarity"].value_counts()

Unnamed: 0_level_0,count
polarity,Unnamed: 1_level_1
positive,3151
negative,1671
neutral,1093
conflict,136


## Dropping Unimportant Columns & Rows

In [None]:
# The sentence id and the character offsets of the aspect term are not used downstream.
df.drop(columns=['id', 'from', 'to'], inplace=True)

In [None]:
# Preview the remaining columns.
df.head()

Unnamed: 0,Sentence,Aspect Term,polarity
0,But the staff was so horrible to us.,staff,negative
1,"To be completely fair, the only redeeming fact...",food,positive
2,"The food is uniformly exceptional, with a very...",food,positive
3,"The food is uniformly exceptional, with a very...",kitchen,positive
4,"The food is uniformly exceptional, with a very...",menu,neutral


In [None]:
# Remove the 'conflict' class with a boolean mask rather than df.drop(<index labels>):
# the concatenated frame can carry duplicate index labels, and a label-based drop would
# then also delete non-conflict rows that happen to share a label with a conflict row.
df = df[df.polarity != 'conflict'].reset_index(drop=True)

In [None]:
# Short, model-oriented column names used by the rest of the notebook.
df = df.rename(
    columns={
        "Sentence": "text",
        "Aspect Term": "aspect",
        "polarity": "label",
    }
)

In [None]:
# (rows, columns) after removing the unused columns and the 'conflict' rows.
df.shape

(5818, 3)

## Cleaning Reviews Text

In [None]:
# Contraction -> expansion table (keys are lowercase; matching happens on lowercased text).
_CONTRACTIONS = {
    "isn't": "is not", "aren't": "are not", "wasn't": "was not", "weren't": "were not", "haven't": "have not",
    "hasn't": "has not", "hadn't": "had not", "doesn't": "does not", "don't": "do not", "didn't": "did not",
    "won't": "will not", "wouldn't": "would not", "can't": "cannot", "couldn't": "could not", "shouldn't": "should not",
    "mightn't": "might not", "mustn't": "must not", "i'm": "i am", "you're": "you are", "he's": "he is", "she's": "she is",
    "it's": "it is", "we're": "we are", "they're": "they are", "i've": "i have", "you've": "you have", "we've": "we have",
    "they've": "they have", "i'd": "i would", "you'd": "you would", "he'd": "he would", "she'd": "she would", "it'd": "it would",
    "we'd": "we would", "they'd": "they would", "i'll": "i will", "you'll": "you will", "he'll": "he will", "she'll": "she will",
    "it'll": "it will", "we'll": "we will", "they'll": "they will", "didnt": "did not", "dont": "do not", "cant": "cannot", "wont": "will not",
}

# Whole-word matcher for the contractions: the \b anchors keep "cant" from firing inside
# "significant" or "wont" inside "wonton", which plain substring replacement would corrupt.
_CONTRACTION_RE = re.compile(
    r"\b(?:" + "|".join(re.escape(key) for key in sorted(_CONTRACTIONS, key=len, reverse=True)) + r")\b"
)

# Emoticon -> placeholder token. Placeholders are lowercase letters only so they pass
# unchanged through lowercasing and punctuation stripping.
_EMOTICON_PLACEHOLDERS = {
    ':)': 'emoticonhappyface',
    ':(': 'emoticonsadface',
    ':D': 'emoticonlaughface',
    ':/': 'emoticonconfusedface',
}

_PUNCTUATION_RE = re.compile(f'[{re.escape(string.punctuation)}]')


def clean_text(text):
  """Clean and preprocess text data

  Lowercases the text, expands common English contractions (whole words only),
  replaces punctuation with spaces, collapses whitespace, and keeps the
  emoticons ``:)``, ``:(``, ``:D`` and ``:/`` intact in the result.

  Parameters:
  -----------
  text : str
  The text to clean

  Returns:
  --------
  str
  Cleaned text
  """

  # Protect emoticons first: this must happen before lowercasing (':D' would become ':d')
  # and before punctuation removal (which would delete the emoticon characters).
  for emoticon, placeholder in _EMOTICON_PLACEHOLDERS.items():
    text = text.replace(emoticon, f' {placeholder} ')

  # Convert to lowercase
  text = text.lower()

  # Expand contractions, matching whole words only
  text = _CONTRACTION_RE.sub(lambda match: _CONTRACTIONS[match.group(0)], text)

  # Replace punctuation with spaces so adjacent words are not glued together
  text = _PUNCTUATION_RE.sub(' ', text)

  # Remove extra whitespace
  text = re.sub(r'\s+', ' ', text).strip()

  # Restore emoticons: placeholders are standalone tokens at this point
  restore = {placeholder: emoticon for emoticon, placeholder in _EMOTICON_PLACEHOLDERS.items()}
  return ' '.join(restore.get(token, token) for token in text.split(' ')) if text else text

## Encoding Polarity Column

In [None]:
# Fit an encoder on the polarity strings and overwrite the column with integer class ids.
sentiment_encoder = LabelEncoder()
df['label'] = sentiment_encoder.fit_transform(df['label'])

# Show which integer each class name was assigned.
label_mapping = {class_name: class_id for class_id, class_name in enumerate(sentiment_encoder.classes_)}
print(label_mapping)

{'negative': 0, 'neutral': 1, 'positive': 2}


# Fine-Tuning RoBERTa

In [None]:
# Hold out 20% of the rows; the remaining 80% is the training set.
train_df, tst_df = train_test_split(df, test_size=0.2, random_state=42)
# Split the held-out 20% again: 70% of it becomes validation, 30% becomes test
# (roughly 14% / 6% of the full data).
# NOTE(review): neither split passes `stratify`, although the classes are imbalanced — confirm this is intended.
val_df, test_df = train_test_split(tst_df, test_size=0.3, random_state=42)

## Trial 1


### Dataset Class


In [None]:
class AspectSentimentDataset(torch.utils.data.Dataset):
    """Map-style PyTorch dataset that tokenizes (review, aspect) pairs on access.

    The base class is spelled out as ``torch.utils.data.Dataset`` because the bare
    name ``Dataset`` is rebound by ``from datasets import Dataset`` in the import
    cell, which would otherwise make this class a Hugging Face dataset subclass.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain the columns "text", "aspect" and "label" (integer class id).
    tokenizer : callable
        Called as ``tokenizer(review, aspect, truncation=..., padding=...,
        max_length=..., return_tensors="pt")`` and expected to return a mapping
        with "input_ids" and "attention_mask" tensors whose first dimension is 1.
    max_len : int, default 128
        Length every encoded pair is padded / truncated to.
    """

    def __init__(self, dataframe, tokenizer, max_len=128):
        # Positional 0..n-1 index so that __getitem__(idx) can address rows by label.
        self.data = dataframe.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Number of (review, aspect) rows."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return a dict with "input_ids", "attention_mask" and "label" tensors for row ``idx``."""
        review = str(self.data.loc[idx, "text"])
        aspect = str(self.data.loc[idx, "aspect"])
        label = int(self.data.loc[idx, "label"])

        # Review and aspect are passed as a sentence pair to the tokenizer.
        encoded = self.tokenizer(
            review,
            aspect,
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors="pt"
        )

        # Drop the leading batch dimension added by return_tensors="pt";
        # the DataLoader adds its own batch dimension when collating.
        item = {
            "input_ids": encoded["input_ids"].squeeze(0),
            "attention_mask": encoded["attention_mask"].squeeze(0),
            "label": torch.tensor(label, dtype=torch.long)
        }

        return item

In [None]:
# The datasets below need a tokenizer. Load it here so this cell also works on a fresh
# kernel run top to bottom (the dedicated "Tokenizer" cell appears further down).
tokenizer = AutoTokenizer.from_pretrained("roberta-base")

train_dataset = AspectSentimentDataset(train_df, tokenizer)
val_dataset = AspectSentimentDataset(val_df, tokenizer)
test_dataset = AspectSentimentDataset(test_df, tokenizer)

# DataLoaders: only the training data is shuffled; evaluation order stays fixed.
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)
test_loader = DataLoader(test_dataset, batch_size=16)

### Model Architecture

In [None]:
class AspectSentimentClassifier(nn.Module):
    """Pretrained RoBERTa encoder with a feed-forward head on the first-token embedding.

    Parameters
    ----------
    model_name : str, default 'roberta-base'
        Checkpoint name passed to ``RobertaModel.from_pretrained``.
    num_labels : int, default 3
        Size of the output (logit) dimension.
    """

    def __init__(self, model_name='roberta-base', num_labels=3):
        super().__init__()
        self.roberta = RobertaModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(0.2)

        # Head: encoder width -> 512 -> 128 -> num_labels.
        encoder_width = self.roberta.config.hidden_size
        head_layers = [
            nn.Linear(encoder_width, 512),
            nn.ReLU(),
            nn.LayerNorm(512),
            nn.Linear(512, 128),
            nn.GELU(),
            nn.Linear(128, num_labels),
        ]
        self.classifier = nn.Sequential(*head_layers)

    def forward(self, input_ids, attention_mask):
        """Return the logits produced from the hidden state at sequence position 0."""
        encoded = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        first_token_state = encoded.last_hidden_state[:, 0, :]
        return self.classifier(self.dropout(first_token_state))

### Tokenizer

In [None]:
# Tokenizer for the "roberta-base" checkpoint, the same checkpoint name the classifier uses by default.
tokenizer = AutoTokenizer.from_pretrained("roberta-base")

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

### Training

In [None]:
# Use the GPU when one is available; the model and every batch are moved to this device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

model = AspectSentimentClassifier()
model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=2e-5)

epochs = 4

def compute_accuracy(preds, labels):
    """Return the fraction of rows in `preds` (logits) whose argmax equals `labels`."""
    preds = torch.argmax(preds, dim=1)
    return (preds == labels).float().mean().item()

# Epoch metrics are weighted by batch size: `criterion` and `compute_accuracy` both
# return per-batch means, and the last batch of a loader can be smaller than the rest,
# so a plain average over batches would over-weight its samples.
for epoch in range(epochs):
    # ---- Training ----
    model.train()
    total_train_loss = 0.0
    total_train_acc = 0.0
    train_samples = 0

    loop = tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}")

    for batch in loop:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device)
        batch_size = labels.size(0)

        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = criterion(outputs, labels)
        acc = compute_accuracy(outputs, labels)

        loss.backward()
        optimizer.step()

        # Convert the batch means back to sums so the epoch average is per sample.
        total_train_loss += loss.item() * batch_size
        total_train_acc += acc * batch_size
        train_samples += batch_size

        loop.set_postfix(loss=loss.item(), acc=acc)

    # max(..., 1) only guards against an empty loader.
    avg_train_loss = total_train_loss / max(train_samples, 1)
    avg_train_acc = total_train_acc / max(train_samples, 1)

    print(f"\nTrain Loss: {avg_train_loss:.4f} | Train Acc: {avg_train_acc:.4f}")

    # ---- Validation (runs once per epoch, without gradient tracking) ----
    model.eval()
    total_val_loss = 0.0
    total_val_acc = 0.0
    val_samples = 0

    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["label"].to(device)
            batch_size = labels.size(0)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            loss = criterion(outputs, labels)
            acc = compute_accuracy(outputs, labels)

            total_val_loss += loss.item() * batch_size
            total_val_acc += acc * batch_size
            val_samples += batch_size

    avg_val_loss = total_val_loss / max(val_samples, 1)
    avg_val_acc = total_val_acc / max(val_samples, 1)
    print(f"Val Loss: {avg_val_loss:.4f} | Val Acc: {avg_val_acc:.4f}\n")


Using device: cuda


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1/4: 100%|██████████| 291/291 [01:40<00:00,  2.89it/s, acc=0.714, loss=0.6]



Train Loss: 0.6605 | Train Acc: 0.7252
Val Loss: 0.4710 | Val Acc: 0.8109



Epoch 2/4: 100%|██████████| 291/291 [01:46<00:00,  2.74it/s, acc=1, loss=0.167]



Train Loss: 0.4123 | Train Acc: 0.8497
Val Loss: 0.4456 | Val Acc: 0.8064



Epoch 3/4: 100%|██████████| 291/291 [01:46<00:00,  2.74it/s, acc=1, loss=0.106]



Train Loss: 0.2813 | Train Acc: 0.9014
Val Loss: 0.4412 | Val Acc: 0.8344



Epoch 4/4: 100%|██████████| 291/291 [01:46<00:00,  2.74it/s, acc=0.786, loss=0.382]



Train Loss: 0.2011 | Train Acc: 0.9348
Val Loss: 0.4606 | Val Acc: 0.8431



### Saving Model

In [None]:
version = "v1"
model_path = f"aspect_sentiment_model_{version}.pt"

# torch.save does not create missing parent folders, so make sure the target
# directory exists before writing the weights.
model_dir = data_path + "/Roberta_1/"
os.makedirs(model_dir, exist_ok=True)

# Only the state dict (parameters and buffers) is stored; loading it later requires
# constructing AspectSentimentClassifier first.
torch.save(model.state_dict(), model_dir + model_path)

In [None]:
# Store the tokenizer files next to the model weights so both can be reloaded together.
tokenizer.save_pretrained(data_path+"/Roberta_1/"+"Roberta_1_tokenizer")

('/content/drive/My Drive/restaurants_reviews_dataset/Roberta_1/Roberta_1_tokenizer/tokenizer_config.json',
 '/content/drive/My Drive/restaurants_reviews_dataset/Roberta_1/Roberta_1_tokenizer/special_tokens_map.json',
 '/content/drive/My Drive/restaurants_reviews_dataset/Roberta_1/Roberta_1_tokenizer/vocab.json',
 '/content/drive/My Drive/restaurants_reviews_dataset/Roberta_1/Roberta_1_tokenizer/merges.txt',
 '/content/drive/My Drive/restaurants_reviews_dataset/Roberta_1/Roberta_1_tokenizer/added_tokens.json',
 '/content/drive/My Drive/restaurants_reviews_dataset/Roberta_1/Roberta_1_tokenizer/tokenizer.json')