In [1]:
import pandas as pd
import torch
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, get_scheduler
from torch.optim import AdamW
from tqdm import tqdm
import os
import json

# 1. Load your dataset
df = pd.read_csv('review_category_dataset_new.csv')  # Updated filename
df['category'] = df['category'].apply(eval)  # Convert stringified list to Python list

# 2. Encode labels into binary format
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(df['category'])
category_list = mlb.classes_  # List of all categories

# 3. Tokenizer setup
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# 4. Dataset class
class ReviewDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=256):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = self.labels[idx]
        encoding = self.tokenizer(text, padding='max_length', truncation=True, max_length=self.max_len, return_tensors='pt')
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': torch.FloatTensor(label)
        }

# 5. Split into train/val
X_train, X_val, y_train, y_val = train_test_split(df['reviews'], y, test_size=0.2, random_state=42)

train_dataset = ReviewDataset(X_train.tolist(), y_train, tokenizer)
val_dataset = ReviewDataset(X_val.tolist(), y_val, tokenizer)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True, pin_memory=True)
val_loader = DataLoader(val_dataset, batch_size=16, pin_memory=True)

# 6. Model setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=len(category_list),
    problem_type="multi_label_classification"
)
model.to(device)

# 7. Optimizer and scheduler
optimizer = AdamW(model.parameters(), lr=2e-5)
num_epochs = 12
num_training_steps = num_epochs * len(train_loader)
lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

# 8. Loss function
loss_fn = torch.nn.BCEWithLogitsLoss()

# 9. Training loop
model.train()
for epoch in range(num_epochs):
    loop = tqdm(train_loader, leave=True)
    total_loss = 0
    for batch in loop:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        loss = loss_fn(logits, labels)

        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

        total_loss += loss.item()
        loop.set_description(f"Epoch {epoch + 1}")
        loop.set_postfix(loss=loss.item())

print(f"Training finished. Average loss: {total_loss / len(train_loader):.4f}")

# 10. Save model, tokenizer, and label classes
save_dir = "saved_model_new"
os.makedirs(save_dir, exist_ok=True)
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

with open(f"{save_dir}/label_classes.json", "w") as f:
    json.dump(category_list.tolist(), f)

print(f"âœ… Model saved in '{save_dir}/'")


  from .autonotebook import tqdm as notebook_tqdm
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 140/140 [05:00<00:00,  2.15s/it, loss=0.211]
Epoch 2: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 140/140 [05:02<00:00,  2.16s/it, loss=0.196]
Epoch 3: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 140/140 [05:02<00:00,  2.16s/it, loss=0.14]  
Epoch 4: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 140/140 [03:55<00:00,  1.68s/it, loss=0.0917]
Epoch 5: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 140/140 [03:47<00:00,  1.63s/it, loss=0.0731]
Epoch 6: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 140/140 [03:47<00:00,  1.62s/it, loss=0.0946]
Epoch 7: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 140/140 [03:46<00:00,  1.62s/it, loss=0.0483]
Epoch 8: 100%|â

Training finished. Average loss: 0.0525
âœ… Model saved in 'saved_model_new/'


SyntaxError: invalid syntax. Perhaps you forgot a comma? (2059920983.py, line 1)

In [1]:
import pandas as pd
import torch
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, get_scheduler
from torch.optim import AdamW
from tqdm import tqdm
import os
import json

# 1. Load your dataset
df = pd.read_csv('review_category_dataset_new.csv')
df['category'] = df['category'].apply(eval)

# 2. Encode labels into binary format
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(df['category'])
category_list = mlb.classes_

# 3. Tokenizer and Dataset setup
tokenizer = BertTokenizer.from_pretrained('saved_model_new')
class ReviewDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=256):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = self.labels[idx]
        encoding = self.tokenizer(text, padding='max_length', truncation=True, max_length=self.max_len, return_tensors='pt')
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': torch.FloatTensor(label)
        }

# 4. Train/val split
X_train, X_val, y_train, y_val = train_test_split(df['reviews'], y, test_size=0.2, random_state=42)
train_dataset = ReviewDataset(X_train.tolist(), y_train, tokenizer)
val_dataset = ReviewDataset(X_val.tolist(), y_val, tokenizer)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)

# 5. Load previously trained model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BertForSequenceClassification.from_pretrained("saved_model_new")
model.to(device)

# 6. Optimizer and scheduler
optimizer = AdamW(model.parameters(), lr=2e-5)
total_epochs = 30
starting_epoch = 12
loss_fn = torch.nn.BCEWithLogitsLoss()

num_training_steps = (total_epochs - starting_epoch) * len(train_loader)
lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

# 7. Evaluation
def evaluate_model(val_loader):
    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].cpu().numpy()

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            probs = torch.sigmoid(outputs.logits).cpu().numpy()
            preds = (probs >= 0.5).astype(int)

            all_preds.extend(preds)
            all_labels.extend(labels)

    return f1_score(all_labels, all_preds, average='macro')

# 8. Continue training from epoch 13 with early stopping
best_f1 = 0
patience = 3
wait = 0

for epoch in range(starting_epoch, total_epochs):
    model.train()
    loop = tqdm(train_loader, leave=True)
    total_loss = 0

    for batch in loop:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = loss_fn(outputs.logits, labels)

        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

        total_loss += loss.item()
        loop.set_description(f"Epoch {epoch + 1}")
        loop.set_postfix(loss=loss.item())

    avg_loss = total_loss / len(train_loader)
    val_f1 = evaluate_model(val_loader)
    print(f"âœ… Epoch {epoch+1} | Train Loss: {avg_loss:.4f} | Val F1: {val_f1:.4f}")

    if val_f1 > best_f1:
        best_f1 = val_f1
        wait = 0
        print("ðŸ”¥ New best model. Saving...")
        model.save_pretrained("saved_model_new")
        tokenizer.save_pretrained("saved_model_new")
        with open("saved_model_new/label_classes.json", "w") as f:
            json.dump(category_list.tolist(), f)
    else:
        wait += 1
        print(f"ðŸ“‰ No improvement in F1. Patience: {wait}/{patience}")
        if wait >= patience:
            print("â›” Early stopping triggered.")
            break

print("ðŸŽ‰ Training complete.")


  from .autonotebook import tqdm as notebook_tqdm
Epoch 13: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 140/140 [04:08<00:00,  1.78s/it, loss=0.0524]


âœ… Epoch 13 | Train Loss: 0.0465 | Val F1: 0.6544
ðŸ”¥ New best model. Saving...


Epoch 14: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 140/140 [03:45<00:00,  1.61s/it, loss=0.0365]


âœ… Epoch 14 | Train Loss: 0.0365 | Val F1: 0.6431
ðŸ“‰ No improvement in F1. Patience: 1/3


Epoch 15: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 140/140 [03:45<00:00,  1.61s/it, loss=0.0357]


âœ… Epoch 15 | Train Loss: 0.0300 | Val F1: 0.6862
ðŸ”¥ New best model. Saving...


Epoch 16: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 140/140 [03:44<00:00,  1.61s/it, loss=0.0282]


âœ… Epoch 16 | Train Loss: 0.0248 | Val F1: 0.6883
ðŸ”¥ New best model. Saving...


Epoch 17: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 140/140 [03:44<00:00,  1.60s/it, loss=0.0199] 


âœ… Epoch 17 | Train Loss: 0.0217 | Val F1: 0.7083
ðŸ”¥ New best model. Saving...


Epoch 18: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 140/140 [03:45<00:00,  1.61s/it, loss=0.0153] 


âœ… Epoch 18 | Train Loss: 0.0189 | Val F1: 0.7232
ðŸ”¥ New best model. Saving...


Epoch 19: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 140/140 [03:47<00:00,  1.62s/it, loss=0.0275] 


âœ… Epoch 19 | Train Loss: 0.0167 | Val F1: 0.7427
ðŸ”¥ New best model. Saving...


Epoch 20: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 140/140 [03:44<00:00,  1.60s/it, loss=0.0235] 


âœ… Epoch 20 | Train Loss: 0.0151 | Val F1: 0.7819
ðŸ”¥ New best model. Saving...


Epoch 21: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 140/140 [03:45<00:00,  1.61s/it, loss=0.0168] 


âœ… Epoch 21 | Train Loss: 0.0139 | Val F1: 0.7561
ðŸ“‰ No improvement in F1. Patience: 1/3


Epoch 22: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 140/140 [03:45<00:00,  1.61s/it, loss=0.00933]


âœ… Epoch 22 | Train Loss: 0.0127 | Val F1: 0.8138
ðŸ”¥ New best model. Saving...


Epoch 23: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 140/140 [03:45<00:00,  1.61s/it, loss=0.0179] 


âœ… Epoch 23 | Train Loss: 0.0116 | Val F1: 0.8141
ðŸ”¥ New best model. Saving...


Epoch 24: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 140/140 [03:44<00:00,  1.61s/it, loss=0.0141] 


âœ… Epoch 24 | Train Loss: 0.0110 | Val F1: 0.8149
ðŸ”¥ New best model. Saving...


Epoch 25: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 140/140 [03:44<00:00,  1.61s/it, loss=0.0075] 


âœ… Epoch 25 | Train Loss: 0.0104 | Val F1: 0.8165
ðŸ”¥ New best model. Saving...


Epoch 26: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 140/140 [03:45<00:00,  1.61s/it, loss=0.0141] 


âœ… Epoch 26 | Train Loss: 0.0098 | Val F1: 0.8395
ðŸ”¥ New best model. Saving...


Epoch 27: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 140/140 [03:45<00:00,  1.61s/it, loss=0.0118] 


âœ… Epoch 27 | Train Loss: 0.0094 | Val F1: 0.8351
ðŸ“‰ No improvement in F1. Patience: 1/3


Epoch 28: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 140/140 [03:45<00:00,  1.61s/it, loss=0.0111] 


âœ… Epoch 28 | Train Loss: 0.0092 | Val F1: 0.8214
ðŸ“‰ No improvement in F1. Patience: 2/3


Epoch 29: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 140/140 [03:45<00:00,  1.61s/it, loss=0.00862]


âœ… Epoch 29 | Train Loss: 0.0089 | Val F1: 0.8326
ðŸ“‰ No improvement in F1. Patience: 3/3
â›” Early stopping triggered.
ðŸŽ‰ Training complete.
