In [None]:
import optuna
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader, random_split
from transformers import BertTokenizer, BertForSequenceClassification
from torch.optim import AdamW
from sklearn.metrics import accuracy_score
from tqdm import tqdm

# Runtime settings: use the GPU when torch can see one, otherwise the CPU.
_device_name = "cuda" if torch.cuda.is_available() else "cpu"
DEVICE = torch.device(_device_name)
EPOCHS, BATCH_SIZE = 5, 16

# Read the spreadsheet that holds the questions and their labels.
file_path = r'/content/balanced_data2.xlsx'
data = pd.read_excel(file_path)

# Questions are coerced to text. One is subtracted from every label so the
# classes become 0-indexed (presumably the sheet numbers them from 1 -- verify
# against the data).
questions = data['Question'].astype(str).tolist()
_shifted_labels = data['Label'] - 1
labels = _shifted_labels.tolist()

# Tokenise all questions up front with the uncased BERT vocabulary, capping
# each sequence at 120 tokens.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
encodings = tokenizer(
    questions,
    truncation=True,
    padding=True,
    max_length=120,
)

# Dataset class
class QuestionDataset(Dataset):
    """Map-style dataset pairing tokenizer output with one label per example.

    ``encodings`` must expose ``items()`` yielding ``(name, values)`` pairs
    where ``values`` can be indexed by example position. ``labels`` is an
    indexable, sized sequence with one entry per example.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # The label sequence defines how many examples the dataset holds.
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors.

        Every field of ``encodings`` is converted to a tensor under its own
        key, and the example's label is added under ``'labels'``.
        """
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Wrap the tokenised questions and their labels in a torch dataset.
dataset = QuestionDataset(encodings, labels)

# 90% train / 5% validation. The test split receives whatever remains, so
# integer truncation of the first two sizes never drops an example.
total_size = len(dataset)
train_size, val_size = int(0.9 * total_size), int(0.05 * total_size)
test_size = total_size - (train_size + val_size)

train_dataset, val_dataset, test_dataset = random_split(
    dataset,
    [train_size, val_size, test_size],
)

# Only the training loader shuffles; validation and test keep a fixed order.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=BATCH_SIZE)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

# Highest validation accuracy observed so far; objective() raises it.
best_accuracy = 0.0

# Objective function for Optuna
def _train_one_epoch(model, optimizer, epoch):
    """Run one optimisation pass over ``train_loader``.

    Returns ``(mean_loss, accuracy)`` computed from the batches as they were
    trained on, i.e. with the model in training mode and its weights changing
    between batches.
    """
    model.train()
    losses, preds, targets = [], [], []

    # The description is constant for the whole epoch, so set it once here.
    loop = tqdm(train_loader, desc=f'Epoch {epoch + 1}', leave=False)
    for batch in loop:
        batch = {k: v.to(DEVICE) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        batch_loss = loss.item()
        losses.append(batch_loss)
        preds.extend(outputs.logits.argmax(dim=1).detach().cpu().numpy())
        targets.extend(batch['labels'].cpu().numpy())
        loop.set_postfix(loss=batch_loss)

    return sum(losses) / len(losses), accuracy_score(targets, preds)


def _evaluate(model, loader):
    """Return ``(mean_loss, accuracy)`` of ``model`` over ``loader``.

    The model is switched to eval mode and gradients are not tracked.
    """
    model.eval()
    losses, preds, targets = [], [], []

    with torch.no_grad():
        for batch in loader:
            batch = {k: v.to(DEVICE) for k, v in batch.items()}
            outputs = model(**batch)
            losses.append(outputs.loss.item())
            preds.extend(outputs.logits.argmax(dim=1).cpu().numpy())
            targets.extend(batch['labels'].cpu().numpy())

    return sum(losses) / len(losses), accuracy_score(targets, preds)


def objective(trial):
    """Fine-tune BERT with a sampled learning rate and score the trial.

    Args:
        trial: Optuna trial used to sample ``learning_rate`` on a log scale
            between 1e-5 and 5e-5.

    Returns:
        The highest validation accuracy reached by *this* trial across its
        ``EPOCHS`` epochs.

    Side effects:
        Whenever an epoch's validation accuracy exceeds the module-level
        ``best_accuracy`` (the best seen across all trials so far), that
        global is raised and the model weights are written to
        ``best_model2.pth``.
    """
    global best_accuracy

    # suggest_float(..., log=True) is the replacement for the deprecated
    # suggest_loguniform; the parameter name and range are unchanged.
    learning_rate = trial.suggest_float('learning_rate', 1e-5, 5e-5, log=True)

    model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=6).to(DEVICE)
    optimizer = AdamW(model.parameters(), lr=learning_rate)

    # Score of this trial only. Returning the cross-trial global instead would
    # make every later trial report at least the earlier best, so the study
    # could not tell a good learning rate from a bad one.
    trial_best_accuracy = 0.0

    for epoch in range(EPOCHS):
        avg_train_loss, train_accuracy = _train_one_epoch(model, optimizer, epoch)
        avg_val_loss, val_accuracy = _evaluate(model, val_loader)

        print(f'Epoch {epoch + 1} | Train Loss: {avg_train_loss:.4f} | Train Acc: {train_accuracy:.4f} | Val Loss: {avg_val_loss:.4f} | Val Acc: {val_accuracy:.4f}')

        trial_best_accuracy = max(trial_best_accuracy, val_accuracy)

        # Checkpoint only when this epoch beats every epoch of every trial so
        # far, so the file always holds the overall best weights.
        if val_accuracy > best_accuracy:
            best_accuracy = val_accuracy
            torch.save(model.state_dict(), 'best_model2.pth')

    return trial_best_accuracy

# Run the Optuna study: five trials, maximising the value objective() returns.
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=5)

print(f'Best hyperparameters: {study.best_params}')
print(f'Best validation accuracy: {study.best_value:.4f}')

# Rebuild the architecture and restore the checkpoint written by objective().
# map_location remaps the stored tensors onto the current DEVICE, so a
# checkpoint saved from a GPU session still loads on a CPU-only runtime.
best_model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=6).to(DEVICE)
best_model.load_state_dict(torch.load('best_model2.pth', map_location=DEVICE))
best_model.eval()

test_preds, test_labels = [], []

# Score the held-out test split without tracking gradients.
with torch.no_grad():
    for batch in test_loader:
        batch = {k: v.to(DEVICE) for k, v in batch.items()}
        outputs = best_model(**batch)
        preds = outputs.logits.argmax(dim=1).detach().cpu().numpy()
        labels_batch = batch['labels'].cpu().numpy()
        test_preds.extend(preds)
        test_labels.extend(labels_batch)

test_accuracy = accuracy_score(test_labels, test_preds)
print(f'Final Test Accuracy: {test_accuracy:.4f}')
