In [1]:
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer
from torch.utils.data import Dataset, DataLoader
import torch
import pandas as pd
# Load the tab-separated reviews and discard rows that have no review text
df = (
    pd.read_csv("data/Restaurant_Reviews.tsv", delimiter='\t')
    .dropna(subset=['Review'])
)

# Make sure to split the data first, before balancing the dataset.
# Otherwise, resampled copies of a review could land in both splits,
# which would be data leakage.
train_data, test_data = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
)



  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Count the training rows per value of the 'Liked' label column.
train_data.value_counts('Liked')

Liked
0    404
1    396
Name: count, dtype: int64

In [3]:
# NOTE(review): this is the same expression as the previous cell and shows the
# same counts; possibly `test_data` was meant to be inspected here — confirm.
train_data.value_counts('Liked')

Liked
0    404
1    396
Name: count, dtype: int64

## Using an upsampling technique to address imbalanced data

In [4]:
# Fixed seed so the resampling below yields the same balanced frame on every
# Restart & Run All (the train/test split is already seeded the same way).
RESAMPLE_SEED = 42

# Separate the majority and minority classes
df_majority = train_data[train_data['Liked'] == 0]  # Majority class
df_minority = train_data[train_data['Liked'] == 1]  # Minority class

# Oversample the minority class: draw with replacement until it has as many
# rows as the majority class
df_minority_over = df_minority.sample(
    len(df_majority), replace=True, random_state=RESAMPLE_SEED
)

# Combine the majority class with the oversampled minority class
df_balanced = pd.concat([df_majority, df_minority_over])

# Shuffle the dataset so the two classes are interleaved, then renumber the rows
df_balanced = (
    df_balanced
    .sample(frac=1, random_state=RESAMPLE_SEED)
    .reset_index(drop=True)
)


In [5]:
# Count the rows per 'Liked' value in the oversampled frame.
df_balanced.value_counts('Liked')

Liked
0    404
1    404
Name: count, dtype: int64

In [6]:
# Display the oversampled, shuffled frame.
df_balanced

Unnamed: 0,Review,Liked
0,The selection was probably the worst I've seen...,0
1,I miss it and wish they had one in Philadelphia!,1
2,AN HOUR... seriously?,0
3,What I really like there is the crepe station.,1
4,This place has it!,1
...,...,...
803,Food was really good and I got full petty fast.,1
804,"I mean really, how do you get so famous for yo...",0
805,We ordered the duck rare and it was pink and t...,1
806,"The fries were not hot, and neither was my bur...",0


In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer
from torch.utils.data import Dataset, DataLoader
from transformers import BertModel, AdamW, get_linear_schedule_with_warmup
import torch.nn as nn
import torch
import numpy as np

# Initialize BERT tokenizer
# Uses the 'bert-base-uncased' checkpoint name, the same one the classifier's
# encoder is loaded from.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

class ReviewDataset(Dataset):
    """Map-style dataset that tokenizes one review per item.

    Args:
        reviews: Indexable collection of review texts (items are passed
            through ``str`` before tokenization).
        labels: Indexable collection of labels, aligned with ``reviews``.
        tokenizer: Callable tokenizer; it is invoked with the review text and
            must return a mapping containing ``'input_ids'`` and
            ``'attention_mask'`` tensors.
        max_len: Length every encoded sequence is padded or truncated to.
    """

    def __init__(self, reviews, labels, tokenizer, max_len):
        self.reviews = reviews
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of reviews in the dataset."""
        return len(self.reviews)

    def __getitem__(self, index):
        """Return the encoded review and label stored at ``index``.

        Returns:
            dict with keys ``'review_text'`` (str), ``'input_ids'`` and
            ``'attention_mask'`` (flattened tensors) and ``'label'``
            (``torch.long`` tensor).
        """
        review = str(self.reviews[index])
        label = self.labels[index]

        # Call the tokenizer directly: `encode_plus` is deprecated in
        # transformers in favour of `__call__`, which accepts the same options.
        encoding = self.tokenizer(
            review,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'review_text': review,
            # return_tensors='pt' adds a leading batch axis; flatten removes it
            # so the DataLoader can stack items into a batch itself.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }

# Create Dataloaders
def create_data_loader(data, tokenizer, max_len, batch_size, shuffle=False):
    """Wrap a reviews frame in a ``ReviewDataset`` and return a ``DataLoader``.

    Args:
        data: Object exposing ``Review`` (texts) and ``Liked`` (labels)
            attributes that support ``.to_numpy()``, e.g. a DataFrame.
        tokenizer: Tokenizer forwarded to ``ReviewDataset``.
        max_len: Sequence length forwarded to ``ReviewDataset``.
        batch_size: Number of examples per batch.
        shuffle: Whether the loader reshuffles the examples each epoch.
            Defaults to ``False``, which keeps the original sequential order
            for existing callers.

    Returns:
        A ``DataLoader`` over the dataset that loads in the main process
        (``num_workers=0``).
    """
    ds = ReviewDataset(
        reviews=data.Review.to_numpy(),
        labels=data.Liked.to_numpy(),
        tokenizer=tokenizer,
        max_len=max_len
    )
    return DataLoader(ds, batch_size=batch_size, shuffle=shuffle, num_workers=0)

MAX_LEN = 128
BATCH_SIZE = 16
# Training batches are reshuffled every epoch so the model does not see the
# examples in the same order each time; evaluation keeps a fixed order.
# NOTE(review): the training loader is built from `train_data`, so the
# oversampled `df_balanced` frame created earlier is never used — confirm which
# frame was intended.
train_data_loader = create_data_loader(train_data, tokenizer, MAX_LEN, BATCH_SIZE, shuffle=True)
val_data_loader = create_data_loader(test_data, tokenizer, MAX_LEN, BATCH_SIZE)
  


In [8]:
import torch
import torch.nn as nn
from transformers import BertModel, get_linear_schedule_with_warmup
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import numpy as np

# The model, loss function and every batch are moved to this device (CPU).
device = torch.device("cpu")
# Number of passes over the training loader; also used to size the LR schedule.
EPOCHS = 3

import torch.nn as nn
from torch.optim import AdamW

class SentimentClassifier(nn.Module):
    """Pretrained BERT encoder followed by dropout and a linear output layer.

    Args:
        n_classes: Number of scores (logits) produced per example.
    """

    def __init__(self, n_classes):
        super().__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.drop = nn.Dropout(p=0.3)
        hidden_size = self.bert.config.hidden_size
        self.out = nn.Linear(hidden_size, n_classes)

    def forward(self, input_ids, attention_mask):
        """Return the unnormalised class scores for a batch.

        Args:
            input_ids: Token ids, forwarded to the encoder unchanged.
            attention_mask: Attention mask, forwarded to the encoder unchanged.

        Returns:
            Output of the linear layer applied to the dropped-out pooled
            encoder output.
        """
        encoder_outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
        )
        # Element 1 of the encoder output is the pooled, sequence-level
        # representation; element 0 (per-token hidden states) is not used.
        pooled = encoder_outputs[1]
        return self.out(self.drop(pooled))


# Build the two-class classifier and place it on the target device
model = SentimentClassifier(n_classes=2).to(device)

# Loss function and optimizer
loss_fn = nn.CrossEntropyLoss().to(device)
optimizer = AdamW(model.parameters(), lr=2e-5)

# Scheduler: one scheduler step per training batch across all epochs,
# with no warm-up steps
total_steps = EPOCHS * len(train_data_loader)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

# Training loop
def train_epoch(model, data_loader, loss_fn, optimizer, device, scheduler, n_examples):
    """Run one optimisation pass over ``data_loader``.

    Args:
        model: Module called with ``input_ids`` and ``attention_mask`` keywords.
        data_loader: Iterable of batches, each a mapping with ``'input_ids'``,
            ``'attention_mask'`` and ``'label'`` tensors.
        loss_fn: Callable taking ``(outputs, labels)`` and returning the loss.
        optimizer: Optimizer stepped once per batch.
        device: Device every batch tensor is moved to.
        scheduler: Learning-rate scheduler stepped once per batch.
        n_examples: Denominator used for the accuracy.

    Returns:
        Tuple ``(accuracy, mean_loss)``: the number of correct predictions
        divided by ``n_examples`` (a double tensor) and the NumPy mean of the
        per-batch losses.
    """
    model.train()
    batch_losses = []
    n_correct = 0

    for batch in data_loader:
        ids = batch['input_ids'].to(device)
        mask = batch['attention_mask'].to(device)
        targets = batch['label'].to(device)

        logits = model(input_ids=ids, attention_mask=mask)
        predicted = torch.max(logits, dim=1).indices
        loss = loss_fn(logits, targets)

        n_correct += (predicted == targets).sum()
        batch_losses.append(loss.item())

        # Backpropagate, update the weights and the learning rate, then clear
        # the gradients ready for the next batch.
        loss.backward()
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

    accuracy = n_correct.double() / n_examples
    return accuracy, np.mean(batch_losses)

# Validation loop
def eval_model(model, data_loader, loss_fn, device, n_examples):
    """Evaluate ``model`` on ``data_loader`` without tracking gradients.

    Args:
        model: Module called with ``input_ids`` and ``attention_mask`` keywords.
        data_loader: Iterable of batches, each a mapping with ``'input_ids'``,
            ``'attention_mask'`` and ``'label'`` tensors.
        loss_fn: Callable taking ``(outputs, labels)`` and returning the loss.
        device: Device every batch tensor is moved to.
        n_examples: Denominator used for the accuracy.

    Returns:
        Tuple ``(accuracy, avg_loss, precision, recall, f1)``. Accuracy is the
        number of correct predictions divided by ``n_examples``; the loss is the
        mean of the per-batch losses; precision, recall and F1 are computed
        with ``average='weighted'``.
    """
    model.eval()  # evaluation mode
    batch_losses = []
    predicted_all = []
    target_all = []
    n_correct = 0

    with torch.no_grad():
        for batch in data_loader:
            ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            targets = batch['label'].to(device)

            logits = model(input_ids=ids, attention_mask=mask)
            predicted = torch.max(logits, dim=1).indices

            batch_losses.append(loss_fn(logits, targets).item())
            n_correct += (predicted == targets).sum().item()

            # Keep every prediction and label for the metrics computed below
            predicted_all.extend(predicted.cpu().numpy())
            target_all.extend(targets.cpu().numpy())

    avg_loss = sum(batch_losses) / len(batch_losses)
    accuracy = n_correct / n_examples

    metric_options = {'average': 'weighted'}
    precision = precision_score(target_all, predicted_all, **metric_options)
    recall = recall_score(target_all, predicted_all, **metric_options)
    f1 = f1_score(target_all, predicted_all, **metric_options)

    return accuracy, avg_loss, precision, recall, f1

# Training the model

# Per-epoch metrics: each key maps to a list with one entry per finished epoch.
history = {
    'train_acc': [], 'train_loss': [],
    'val_acc': [], 'val_loss': [],
    'precision': [], 'recall': [], 'f1': [],
}

# Training Loop
for epoch in range(EPOCHS):
    # Take the example counts from the loaders' own datasets so the accuracy
    # denominators always match the data that is actually iterated.
    train_acc, train_loss = train_epoch(
        model, train_data_loader, loss_fn, optimizer, device, scheduler,
        len(train_data_loader.dataset)
    )
    val_acc, val_loss, precision, recall, f1 = eval_model(
        model, val_data_loader, loss_fn, device, len(val_data_loader.dataset)
    )

    # Store plain Python floats (train_epoch returns a tensor / NumPy scalar).
    history['train_acc'].append(float(train_acc))
    history['train_loss'].append(float(train_loss))
    history['val_acc'].append(float(val_acc))
    history['val_loss'].append(float(val_loss))
    history['precision'].append(float(precision))
    history['recall'].append(float(recall))
    history['f1'].append(float(f1))

    print(f'Epoch {epoch + 1}/{EPOCHS}')
    print(f'Train loss: {train_loss:.4f}, Train accuracy: {train_acc:.4f}')
    print(f'Val loss: {val_loss:.4f}, Val accuracy: {val_acc:.4f}')
    print(f'Precision: {precision:.4f}, Recall: {recall:.4f}, F1-Score: {f1:.4f}')


Epoch 1/3
Train loss: 0.5265, Train accuracy: 0.7488
Val loss: 0.3381, Val accuracy: 0.8900
Precision: 0.8900, Recall: 0.8900, F1-Score: 0.8900
Epoch 2/3
Train loss: 0.2547, Train accuracy: 0.9213
Val loss: 0.2630, Val accuracy: 0.8950
Precision: 0.8962, Recall: 0.8950, F1-Score: 0.8950
Epoch 3/3
Train loss: 0.1770, Train accuracy: 0.9513
Val loss: 0.2558, Val accuracy: 0.9000
Precision: 0.9008, Recall: 0.9000, F1-Score: 0.9000


In [9]:
import torch
# Save the model
# NOTE(review): this serializes the whole `model` object to 'model.pth', not
# just its weights; loading it back presumably needs the SentimentClassifier
# class to be importable. Saving `model.state_dict()` is the usual alternative —
# confirm what the consumer of this file expects before changing it.

torch.save(model, 'model.pth')
