In [None]:
import os
import numpy as np
import pandas as pd
from collections import Counter

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Subset

from transformers import AutoTokenizer, AutoModel

from datasets import Dataset

from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold

from tqdm import tqdm
import matplotlib.pyplot as plt


# Submission 9

## Fine-tuning

In [None]:
# --- Run configuration -------------------------------------------------
# Tag for this submission; used to build the output folder paths below.
model_version = "subm9"

# Hugging Face checkpoint name, used here to load the tokenizer.
model_name = "distilbert-base-uncased"

# Folder that receives fold checkpoints and loss-history CSVs.
output_dir = f"./{model_version}/checkpoints"

# Create the run folder and its checkpoint sub-folder (no error if present).
for directory in (model_version, output_dir):
    os.makedirs(directory, exist_ok=True)

# --- Training data -----------------------------------------------------
file_path = "/kaggle/input/win25-stat-528-kaggle-competition-1/train.csv"
df = pd.read_csv(file_path)

# --- Tokenizer ---------------------------------------------------------
tokenizer = AutoTokenizer.from_pretrained(model_name)


def make_dataset(df, train=True):
    """Build a ``datasets.Dataset`` holding one text prompt per row of ``df``.

    Each row becomes
    ``"Name: <name> <sep> Description: <description> <sep> Ingredients: <ingredients>"``
    where ``<sep>`` is ``tokenizer.sep_token`` and the ingredients are the
    parsed list joined with that same separator.

    Args:
        df: Frame with ``name``, ``description`` and ``ingredients`` columns
            (plus ``vegetarian`` when ``train`` is true). ``ingredients``
            cells are expected to be the string form of a Python list;
            non-string cells (e.g. NaN) are treated as "no ingredients".
        train: When true, the ``vegetarian`` column is cast to ``int`` and
            kept next to ``text``; otherwise only ``text`` is returned.

    Returns:
        A ``Dataset`` with a ``text`` column and, for training data, an
        integer ``vegetarian`` column.

    Raises:
        ValueError / SyntaxError: if an ``ingredients`` string is not a
            valid Python literal.

    The input frame is left untouched: all derived columns are written to a
    copy, so the function can be called repeatedly on the same frame.
    """
    import ast  # local import: only needed to parse the ingredient lists

    sep_token = tokenizer.sep_token

    def join_ingredients(raw):
        # Non-string cells (e.g. NaN) carry no ingredient information.
        if not isinstance(raw, str):
            return ""
        # literal_eval only accepts Python literals, so a malformed or
        # malicious CSV cell cannot execute arbitrary code (unlike eval).
        return sep_token.join(ast.literal_eval(raw))

    # Work on a copy: overwriting "ingredients" in the caller's frame would
    # make a second call fail when parsing the already-joined string.
    df = df.copy()
    df["ingredients"] = df["ingredients"].apply(join_ingredients)
    df["description"] = df["description"].fillna("")

    def make_text(row):
        return f"Name: {row['name']} {sep_token} Description: {row['description']} {sep_token} Ingredients: {row['ingredients']}"

    df["text"] = df.apply(make_text, axis=1)

    if train:
        df["vegetarian"] = df["vegetarian"].astype(int)
        dataset = Dataset.from_pandas(df[["text", "vegetarian"]])
    else:
        dataset = Dataset.from_pandas(df[["text"]])

    return dataset

# Build the training dataset (train=True by default, so the label column is included).
dataset = make_dataset(df)


class BertClassifier(nn.Module):
    """Pretrained encoder followed by a Linear -> ReLU -> Dropout -> Linear head.

    The head is applied to the encoder's hidden state at sequence position 0.
    """

    def __init__(self, model_name, num_labels):
        """Load the encoder named ``model_name`` and create the classification head.

        Args:
            model_name: Checkpoint identifier passed to ``AutoModel.from_pretrained``.
            num_labels: Number of output logits produced by the final layer.
        """
        super().__init__()
        # Backbone loaded through AutoModel (a DistilBertModel for the
        # checkpoint used in this notebook).
        self.bert = AutoModel.from_pretrained(model_name)
        width = self.bert.config.hidden_size
        self.pre_classifier = nn.Linear(width, width)
        self.dropout = nn.Dropout(p=0.1)
        self.classifier = nn.Linear(width, num_labels)
        self.activation = nn.ReLU()

    def forward(self, input_ids, attention_mask):
        """Return the raw (unnormalised) logits of the final linear layer."""
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # First element of the encoder output, sliced at sequence position 0.
        first_token = encoded[0][:, 0, :]
        hidden = self.dropout(self.activation(self.pre_classifier(first_token)))
        return self.classifier(hidden)


def tokenize_dataset(dataset, train=True, max_length=512):
    """Tokenize the ``text`` column and return a torch-formatted dataset.

    Args:
        dataset: Dataset with a ``text`` column (and ``vegetarian`` when
            ``train`` is true).
        train: When true, the ``vegetarian`` column is renamed to ``labels``.
        max_length: Length every sequence is truncated/padded to. Defaults
            to 512, the value previously hard-coded here; a smaller value
            reduces memory and compute when the texts are short.

    Returns:
        The tokenized dataset with ``text`` removed and its format set to
        ``"torch"``.
    """
    def tokenize_function(examples):
        # Pad every example to the same fixed length so batches can be stacked
        # without a custom collate function.
        return tokenizer(
            examples["text"],
            truncation=True,
            padding="max_length",
            max_length=max_length,
        )

    tokenized_dataset = dataset.map(tokenize_function, batched=True)
    tokenized_dataset = tokenized_dataset.remove_columns(["text"])
    if train:
        tokenized_dataset = tokenized_dataset.rename_column("vegetarian", "labels")
    tokenized_dataset.set_format("torch")
    return tokenized_dataset

# Tokenize the training dataset (train=True by default, so "vegetarian" is renamed to "labels")
tokenized_dataset = tokenize_dataset(dataset)


def evaluate(model, eval_dataloader, criterion):
    """Compute the mean per-batch loss and the accuracy over ``eval_dataloader``.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            and returning logits.
        eval_dataloader: Iterable of batches with ``input_ids``,
            ``attention_mask`` and ``labels`` entries; must support ``len``.
        criterion: Loss callable applied as ``criterion(logits, labels)``.

    Returns:
        ``(avg_valid_loss, accuracy)``. The loss is the mean of the per-batch
        losses; the accuracy is computed over all examples.

    Raises:
        ValueError: if ``eval_dataloader`` contains no batches.

    The model is switched to eval mode for the duration of the call and then
    returned to whatever mode it was in before. Without the restore, a caller
    that evaluates in the middle of training would silently keep training in
    eval mode (dropout disabled).
    """
    was_training = model.training
    model.eval()
    eval_loss = 0.0
    all_labels = []
    all_preds = []

    try:
        num_batches = len(eval_dataloader)
        if num_batches == 0:
            # Fail with a clear message instead of a division by zero below.
            raise ValueError("eval_dataloader is empty; nothing to evaluate")

        with torch.no_grad():
            for batch in eval_dataloader:
                input_ids = batch["input_ids"].to(device)
                attention_mask = batch["attention_mask"].to(device)
                labels = batch["labels"].to(device)

                outputs = model(input_ids=input_ids, attention_mask=attention_mask)
                loss = criterion(outputs, labels)
                eval_loss += loss.item()

                preds = torch.argmax(outputs, dim=-1)
                all_labels.extend(labels.cpu().numpy())
                all_preds.extend(preds.cpu().numpy())
    finally:
        # Restore the caller's train/eval mode even if evaluation failed.
        model.train(was_training)

    accuracy = accuracy_score(all_labels, all_preds)
    avg_valid_loss = eval_loss / num_batches
    print(f"Evaluation - Loss: {avg_valid_loss:.4f}, Accuracy: {accuracy:.4f}")
    return avg_valid_loss, accuracy


def train_fold(model, train_dataloader, eval_dataloader, optimizer, criterion, fold, num_epochs, eval_steps):
    """Train ``model`` on one fold, checkpointing the best validation loss.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``.
        train_dataloader: Batches with ``input_ids``, ``attention_mask`` and
            ``labels`` used for optimisation.
        eval_dataloader: Batches passed to ``evaluate`` for validation.
        optimizer: Optimizer already bound to ``model``'s parameters.
        criterion: Loss callable applied as ``criterion(logits, labels)``.
        fold: Fold identifier, used in file names and progress messages.
        num_epochs: Number of passes over ``train_dataloader``.
        eval_steps: Validation runs whenever the within-epoch step index is
            a positive multiple of this value.

    Returns:
        Path of the checkpoint file for this fold. The file always exists
        when the function returns.

    Side effects:
        Writes ``best_model_fold{fold}.pth``, ``train_loss_fold{fold}.csv``
        and ``valid_loss_fold{fold}.csv`` into ``output_dir``.
    """
    best_valid_loss = float("inf")
    best_model_path = f"{output_dir}/best_model_fold{fold}.pth"
    train_loss_history = []
    valid_loss_history = []
    checkpoint_saved = False

    model.train()

    for epoch in range(num_epochs):
        for step, batch in enumerate(tqdm(train_dataloader, desc=f"Epoch {epoch+1} - Fold {fold}")):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            optimizer.zero_grad()
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            # Store train loss
            train_loss_history.append({"epoch": epoch+1, "step": step, "train_loss": loss.item()})

            # Evaluate every `eval_steps` steps (step restarts at 0 each epoch)
            if step > 0 and step % eval_steps == 0:
                valid_loss, accuracy = evaluate(model, eval_dataloader, criterion)
                # evaluate() switches the model to eval mode; switch back so
                # the remaining training steps run with dropout enabled.
                model.train()
                valid_loss_history.append({
                    "epoch": epoch+1,
                    "step": step,
                    "valid_loss": valid_loss,
                    "accuracy": accuracy,
                })

                # Save model if validation loss improves. Evaluations from
                # the first epoch are recorded but never checkpointed.
                if epoch > 0 and valid_loss < best_valid_loss:
                    best_valid_loss = valid_loss
                    torch.save(model.state_dict(), best_model_path)
                    checkpoint_saved = True
                    print(f"✅ New best model saved for fold {fold} at step {step} with valid_loss {valid_loss:.4f}, accuracy {accuracy:.4f}")

    if not checkpoint_saved:
        # No evaluation qualified (e.g. a single epoch, or fewer than
        # eval_steps + 1 batches per epoch). Save the final weights so the
        # returned path never points at a missing or stale file.
        torch.save(model.state_dict(), best_model_path)
        print(f"No checkpoint met the save criteria for fold {fold}; saved final weights to {best_model_path}")

    # Save loss history. Explicit columns keep the CSV header present even
    # when a history list is empty, so the files can always be read back.
    train_loss_df = pd.DataFrame(train_loss_history, columns=["epoch", "step", "train_loss"])
    valid_loss_df = pd.DataFrame(valid_loss_history, columns=["epoch", "step", "valid_loss", "accuracy"])
    train_loss_df.to_csv(f"{output_dir}/train_loss_fold{fold}.csv", index=False)
    valid_loss_df.to_csv(f"{output_dir}/valid_loss_fold{fold}.csv", index=False)

    return best_model_path

# K-fold cross-validation: train one freshly initialised model per fold.
num_folds = 5
seed = 42

kf = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
fold_model_paths = []

for fold, (train_idx, valid_idx) in enumerate(kf.split(tokenized_dataset)):
    print(f"\n🔹 Training fold {fold+1}/{num_folds}...\n")

    # Seed torch per fold so head initialisation, dropout masks and batch
    # shuffling are repeatable between runs (GPU kernels may still introduce
    # small nondeterminism). The fold split itself is already seeded above.
    torch.manual_seed(seed + fold)

    # Split dataset into train and validation for this fold
    train_subset = Subset(tokenized_dataset, train_idx)
    valid_subset = Subset(tokenized_dataset, valid_idx)

    train_dataloader = DataLoader(train_subset, batch_size=32, shuffle=True)
    eval_dataloader = DataLoader(valid_subset, batch_size=32, shuffle=False)

    # Initialize new model for each fold so no weights leak between folds
    model = BertClassifier(model_name=model_name, num_labels=2).to(device)
    optimizer = optim.AdamW(model.parameters(), lr=5e-5, weight_decay=0.01)
    criterion = nn.CrossEntropyLoss()

    # Train and get best model path for this fold
    best_model_path = train_fold(
        model, train_dataloader, eval_dataloader, optimizer, criterion,
        fold, num_epochs=3, eval_steps=100,
    )
    fold_model_paths.append(best_model_path)

print("\n🏆 Best models saved for all folds:", fold_model_paths)

## Submission

In [None]:
# List the contents of this submission's checkpoint directory.
!ls "/kaggle/working/subm9/checkpoints/"

In [None]:
# Absolute paths of the per-fold checkpoints (folds 0 through 4).
fold_model_paths = [
    f"/kaggle/working/{model_version}/checkpoints/best_model_fold{fold_id}.pth"
    for fold_id in range(5)
]


def predict_ensemble(test_dataloader, model_paths, model_name, device):
    """Predict class labels by majority vote over several saved checkpoints.

    Args:
        test_dataloader: Batches with ``input_ids`` and ``attention_mask``;
            iterated once per checkpoint, so it must be re-iterable and
            yield examples in the same order every time.
        model_paths: Sequence of state-dict files for ``BertClassifier``.
        model_name: Encoder checkpoint name used to rebuild each model.
        device: Device on which inference runs and onto which the weights
            are loaded.

    Returns:
        1-D ``numpy`` array with the most common predicted class per example.
        On a tie the class predicted by the earliest model in
        ``model_paths`` wins.

    Raises:
        ValueError: if ``model_paths`` is empty.
    """
    if len(model_paths) == 0:
        raise ValueError("model_paths must contain at least one checkpoint path")

    all_fold_preds = []

    for model_path in model_paths:
        print(f"Loading model from {model_path}...")

        # Load model. map_location remaps the stored tensors onto `device`,
        # so checkpoints saved on a GPU also load on a CPU-only runtime.
        model = BertClassifier(model_name=model_name, num_labels=2).to(device)
        model.load_state_dict(torch.load(model_path, map_location=device))
        model.eval()

        fold_preds = []
        with torch.no_grad():
            for batch in tqdm(test_dataloader, desc=f"Predicting with {model_path}"):
                input_ids = batch["input_ids"].to(device)
                attention_mask = batch["attention_mask"].to(device)

                outputs = model(input_ids=input_ids, attention_mask=attention_mask)
                preds = torch.argmax(outputs, dim=-1)  # Get predicted class (0 or 1)
                fold_preds.extend(preds.cpu().numpy())

        all_fold_preds.append(fold_preds)

    # Majority voting: zip(*...) regroups the per-model lists into one tuple
    # of votes per example.
    final_preds = [
        Counter(votes).most_common(1)[0][0]
        for votes in zip(*all_fold_preds)
    ]

    return np.array(final_preds)

# Read the competition test file and turn each row into model input text.
test_file_path = "/kaggle/input/win25-stat-528-kaggle-competition-1/test.csv"
df_test = pd.read_csv(test_file_path)
dataset_test = make_dataset(df_test, train=False)

# train=False: no label column is selected or renamed for the test data.
tokenized_test_dataset = tokenize_dataset(dataset_test, train=False)
test_dataloader = DataLoader(tokenized_test_dataset, shuffle=False, batch_size=32)

# Majority vote across the fold checkpoints.
final_predictions = predict_ensemble(
    test_dataloader, fold_model_paths, model_name, device
)

# Write the (id, vegetarian) pairs to the submission file.
submission_file = f"{model_version}/submission.csv"
submission_df = pd.DataFrame({"id": df_test["id"], "vegetarian": final_predictions})
submission_df.to_csv(submission_file, index=False)

In [None]:
# Print the first 10 lines of the submission file as a quick sanity check.
!head -n10 subm9/submission.csv

## Plot

In [None]:
num_folds = 5

for fold in range(num_folds):
    # Load the loss histories written by train_fold for the current fold
    train_df = pd.read_csv(f"{output_dir}/train_loss_fold{fold}.csv")
    valid_df = pd.read_csv(f"{output_dir}/valid_loss_fold{fold}.csv")

    # Global, 1-based step of every training batch across all epochs
    train_df["cum_step"] = np.arange(1, len(train_df) + 1)

    # The recorded "step" restarts at 0 every epoch, so a fixed "every 100
    # steps" spacing drifts away from the train curve after the first epoch.
    # Look up each validation record's true global step via (epoch, step).
    valid_df = valid_df.merge(
        train_df[["epoch", "step", "cum_step"]],
        on=["epoch", "step"],
        how="left",
    )

    # Get epoch start positions
    epoch_start_steps = train_df.groupby("epoch")["cum_step"].first().values
    epoch_labels = train_df["epoch"].unique()

    # Train loss at the validation steps
    train_interp_loss = np.interp(valid_df["cum_step"], train_df["cum_step"], train_df["train_loss"])

    # Create a new figure for each fold
    fig, ax = plt.subplots(figsize=(10, 5))

    # Plot train loss
    ax.plot(train_df["cum_step"], train_df["train_loss"], label="Train Loss", color="blue", alpha=0.5, linewidth=0.5)

    # Plot train loss at validation steps
    ax.plot(valid_df["cum_step"], train_interp_loss, label="Interpolated Train Loss", color="blue", linestyle="dotted", linewidth=2)

    # Plot valid loss
    ax.plot(valid_df["cum_step"], valid_df["valid_loss"], label="Valid Loss", color="orange", marker="o", linestyle="dashed", linewidth=2)

    # Add vertical lines for epoch transitions
    for epoch_label, start_step in zip(epoch_labels, epoch_start_steps):
        ax.axvline(x=start_step, color="gray", linestyle="--", alpha=0.8)
        ax.text(start_step, ax.get_ylim()[1] * 0.95, f"Epoch {epoch_label}", color="black", fontsize=10, rotation=90, verticalalignment="top")

    # Labels and title with increased font size
    ax.set_xlabel("Step", fontsize=14)
    ax.set_ylabel("Loss", fontsize=14)
    ax.set_title(f"Training and Validation Loss - Fold {fold + 1}", fontsize=16)

    # Set tick label sizes
    ax.tick_params(axis="both", labelsize=12)

    # Hide grid
    ax.grid(False)

    ax.legend(fontsize=12)  # Increase legend font size
    fig.savefig(f"{model_version}/loss_plot_fold{fold + 1}.png")
    plt.show()  # Show each figure separately
    plt.close(fig)  # Release the figure so figures do not accumulate in memory

## Evaluate

In [None]:
num_folds = 5

valid_losses = []
accuracies = []

for fold in range(num_folds):
    # Load the validation history for the current fold
    valid_df = pd.read_csv(f"{output_dir}/valid_loss_fold{fold}.csv")

    # Ensure the CSV contains the required columns
    missing = {"epoch", "valid_loss", "accuracy"} - set(valid_df.columns)
    if missing:
        raise ValueError(f"Missing {sorted(missing)} in valid_loss_fold{fold}.csv")

    # train_fold only saves a checkpoint from the second epoch onward (its
    # `epoch > 0` check; the CSV stores epoch + 1). Restrict the search to
    # those rows so the reported numbers describe the model that was saved.
    eligible = valid_df[valid_df["epoch"] > 1]
    if eligible.empty:
        # No qualifying evaluation recorded; fall back to every row.
        eligible = valid_df

    # Lowest validation loss and the accuracy measured at that same evaluation
    best_row = eligible.loc[eligible["valid_loss"].idxmin()]
    best_valid_loss = best_row["valid_loss"]
    best_accuracy = best_row["accuracy"]

    valid_losses.append(best_valid_loss)
    accuracies.append(best_accuracy)

# Compute the average validation loss and accuracy across folds
avg_valid_loss = np.mean(valid_losses)
avg_accuracy = np.mean(accuracies)

print(f"Average Validation Loss: {avg_valid_loss:.5f}")
print(f"Average Accuracy: {avg_accuracy:.5f}")

## Wrong Prediction

In [None]:
# Only the fold-0 checkpoint is used here; uncomment the others to inspect
# the full ensemble instead.
fold_model_paths = [
    "best_model_fold0.pth",
    # "best_model_fold1.pth",
    # "best_model_fold2.pth",
    # "best_model_fold3.pth",
    # "best_model_fold4.pth",
]
fold_model_paths = [f"/kaggle/working/{model_version}/checkpoints/" + each for each in fold_model_paths]

# Re-score the *training* file (it has true labels) to find misclassified rows.
# NOTE: each fold model was trained on 4 of the 5 folds of this file, so most
# of these rows were seen during training.
test_file_path = "/kaggle/input/win25-stat-528-kaggle-competition-1/train.csv"
df_test = pd.read_csv(test_file_path)
dataset_test = make_dataset(df_test, train=False)

# Tokenize without the label column (train=False)
tokenized_test_dataset = tokenize_dataset(dataset_test, train=False)
test_dataloader = DataLoader(tokenized_test_dataset, batch_size=32, shuffle=False)

# Make ensemble predictions and flag the rows where they disagree with the label
final_predictions = predict_ensemble(test_dataloader, fold_model_paths, model_name, device)
df_test['prediction'] = final_predictions
df_test['wrong'] = df_test.vegetarian != df_test.prediction

# Persist the annotated frame. Reading this file back here would discard the
# columns computed above and fail on a fresh kernel, where the file does not
# exist yet.
df_test.to_csv(f"/kaggle/working/{model_version}/wrong_prediction.csv", index=False)

In [None]:
# Do not truncate cell contents, so long descriptions and ingredient lists display in full.
pd.set_option('display.max_colwidth', None)

In [None]:
# First ten misclassified rows: true label, prediction and the recipe fields.
df_test.loc[
    df_test["wrong"],
    ["vegetarian", "prediction", "name", "description", "ingredients"],
].head(10)