# Load Read Dataset

In [2]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

In [3]:
data = pd.read_csv("data/cleaned_reviews.csv", index_col=False)

In [9]:
data.columns

Index(['Unnamed: 0', 'Clothing ID', 'Age', 'Title', 'Review Text', 'Rating',
       'Recommended IND', 'Positive Feedback Count', 'Division Name',
       'Department Name', 'Class Name', 'Sentiment'],
      dtype='object')

In [15]:
data.groupby("Recommended IND").size()

# the outcome is imbalanced
# most of the customers would recommend the product to their friends thus reviewed
# the outcome is already binary

Recommended IND
0     4101
1    18540
dtype: int64

# Tokenization

In [4]:
train_texts, val_texts, train_labels, val_labels = train_test_split(
    data["Review Text"], data["Recommended IND"], test_size=0.1, random_state=42
)

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

train_encodings = tokenizer(
    list(train_texts),
    truncation=True,
    padding=True,
    max_length=128,  # You can adjust max_length as needed
)

val_encodings = tokenizer(
    list(val_texts), truncation=True, padding=True, max_length=128
)

# Prepare Torch Data

In [5]:
class ReviewDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels.reset_index(drop=True)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels.iloc[idx])
        return item

    def __len__(self):
        return len(self.labels)


train_dataset = ReviewDataset(train_encodings, train_labels)
val_dataset = ReviewDataset(val_encodings, val_labels)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)

# Initialize BERT and train

In [20]:
model = BertForSequenceClassification.from_pretrained("bert-base-uncased")

# training
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

from transformers import AdamW, get_linear_schedule_with_warmup

optimizer = AdamW(model.parameters(), lr=5e-5)

total_steps = len(train_loader) * 3  # Number of epochs (e.g., 3)

scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=0, num_training_steps=total_steps
)

from torch.nn import CrossEntropyLoss

loss_fn = CrossEntropyLoss()

epochs = 3

for epoch in range(epochs):
    # Training
    model.train()
    total_train_loss = 0

    for batch in train_loader:
        optimizer.zero_grad()

        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(
            input_ids=input_ids, attention_mask=attention_mask, labels=labels
        )

        loss = outputs.loss
        total_train_loss += loss.item()

        loss.backward()
        optimizer.step()
        scheduler.step()

    avg_train_loss = total_train_loss / len(train_loader)
    print(f"Epoch {epoch + 1}/{epochs}, Training Loss: {avg_train_loss}")

    # Validation
    model.eval()
    total_val_loss = 0
    predictions, true_labels = [], []

    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            outputs = model(
                input_ids=input_ids, attention_mask=attention_mask, labels=labels
            )

            loss = outputs.loss
            total_val_loss += loss.item()

            logits = outputs.logits
            predictions.extend(torch.argmax(logits, dim=1).cpu().numpy())
            true_labels.extend(labels.cpu().numpy())

    avg_val_loss = total_val_loss / len(val_loader)
    val_accuracy = accuracy_score(true_labels, predictions)
    print(f"Validation Loss: {avg_val_loss}, Accuracy: {val_accuracy}")

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3, Training Loss: 0.23412329296974738
Validation Loss: 0.22981121512392247, Accuracy: 0.8931567328918323
Epoch 2/3, Training Loss: 0.13668682773486424
Validation Loss: 0.21485648568208054, Accuracy: 0.909933774834437
Epoch 3/3, Training Loss: 0.05331472265469225
Validation Loss: 0.28579984186750074, Accuracy: 0.9130242825607064


# Evaluation

In [21]:
print(
    classification_report(
        true_labels, predictions, target_names=["Not Recommend", "Recommend"]
    )
)

model.save_pretrained("bert-recommendation-model")
tokenizer.save_pretrained("bert-recommendation-model")

               precision    recall  f1-score   support

Not Recommend       0.79      0.73      0.76       419
    Recommend       0.94      0.96      0.95      1846

     accuracy                           0.91      2265
    macro avg       0.86      0.84      0.85      2265
 weighted avg       0.91      0.91      0.91      2265



('bert-recommendation-model/tokenizer_config.json',
 'bert-recommendation-model/special_tokens_map.json',
 'bert-recommendation-model/vocab.txt',
 'bert-recommendation-model/added_tokens.json')

# Good Response and Bad Response

In [7]:
model = BertForSequenceClassification.from_pretrained("bert-recommendation-model")
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [8]:
# After validation loop
predictions = []
true_labels = []
val_texts_list = []

with torch.no_grad():
    for batch in val_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(
            input_ids=input_ids, attention_mask=attention_mask, labels=labels
        )

        logits = outputs.logits
        predictions.extend(torch.argmax(logits, dim=1).cpu().numpy())
        true_labels.extend(labels.cpu().numpy())

        # Decode the input_ids back to text
        batch_texts = tokenizer.batch_decode(input_ids, skip_special_tokens=True)
        val_texts_list.extend(batch_texts)

# Create a DataFrame
val_results = pd.DataFrame(
    {
        "Review Text": val_texts_list,
        "True Label": true_labels,
        "Predicted Label": predictions,
    }
)

# Map labels to readable format
label_map = {0: "Not Recommend", 1: "Recommend"}
val_results["True Label"] = val_results["True Label"].map(label_map)
val_results["Predicted Label"] = val_results["Predicted Label"].map(label_map)

# Extract correct and incorrect predictions
correct_predictions = val_results[
    val_results["True Label"] == val_results["Predicted Label"]
]
incorrect_predictions = val_results[
    val_results["True Label"] != val_results["Predicted Label"]
]

# Save to CSV
correct_predictions.to_csv("correct_predictions.csv", index=False)
incorrect_predictions.to_csv("incorrect_predictions.csv", index=False)