In [1]:
import os
import re
import numpy as np
import pandas as pd
import random
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
)
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import BertTokenizer, BertModel
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import pickle
import shutil
import time
from collections import Counter

# Set random seed for reproducibility
def set_seed(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


set_seed(42)

# Create directory for saving results
SAVE_DIR = "SEFOSS_TEXTData"
os.makedirs(SAVE_DIR, exist_ok=True)

# Device configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Data loading and preprocessing
class YahooAnswersDataset(Dataset):
    def __init__(self, root_dir, max_files=1000, tokenizer=None, max_length=128):
        self.root_dir = root_dir
        self.max_files = max_files
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.data = []
        self.labels = []
        self.files_per_class = {}

        # Load data from each class folder
        for class_id in range(10):  # 10 classes in Yahoo Answers
            class_folder = os.path.join(root_dir, str(class_id))
            if os.path.exists(class_folder):
                files = os.listdir(class_folder)[:max_files]
                self.files_per_class[class_id] = len(files)

                for file_name in files:
                    file_path = os.path.join(class_folder, file_name)
                    with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
                        text = f.read()
                        self.data.append(text)
                        self.labels.append(class_id)

        # Create a DataFrame for easier manipulation
        self.df = pd.DataFrame({"text": self.data, "label": self.labels})

        # Print stats
        print(f"Loaded {len(self.data)} files from {root_dir}")
        print(f"Files per class: {self.files_per_class}")

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        text = self.data[idx]
        label = self.labels[idx]

        if self.tokenizer:
            encoding = self.tokenizer(
                text,
                truncation=True,
                padding="max_length",
                max_length=self.max_length,
                return_tensors="pt",
            )
            return {
                "input_ids": encoding["input_ids"].squeeze(),
                "attention_mask": encoding["attention_mask"].squeeze(),
                "label": torch.tensor(label, dtype=torch.long),
            }
        else:
            return text, label

    def get_text(self, idx):
        return self.data[idx]

    def get_balanced_subset(self, n_per_class):
        """Get a balanced subset with n_per_class samples from each class"""
        subset_data = []
        subset_labels = []

        for label in range(10):
            indices = [i for i, l in enumerate(self.labels) if l == label]
            if len(indices) > n_per_class:
                selected_indices = random.sample(indices, n_per_class)
            else:
                selected_indices = indices

            for idx in selected_indices:
                subset_data.append(self.data[idx])
                subset_labels.append(self.labels[idx])

        subset_df = pd.DataFrame({"text": subset_data, "label": subset_labels})

        # Create a new dataset with the subset
        subset_dataset = YahooAnswersDataset(
            self.root_dir,
            max_files=0,
            tokenizer=self.tokenizer,
            max_length=self.max_length,
        )
        subset_dataset.data = subset_data
        subset_dataset.labels = subset_labels
        subset_dataset.df = subset_df

        return subset_dataset


def clean_text(text):
    """Basic text cleaning"""
    # Convert to lowercase
    text = text.lower()
    # Remove special characters and digits
    text = re.sub(r"[^\w\s]", "", text)
    # Remove extra whitespace
    text = re.sub(r"\s+", " ", text).strip()
    return text


# Model definition
class TextClassifier(nn.Module):
    def __init__(self, bert_model_name="bert-base-uncased", num_classes=10):
        super(TextClassifier, self).__init__()
        self.bert = BertModel.from_pretrained(bert_model_name)
        self.dropout = nn.Dropout(0.1)
        self.fc = nn.Linear(self.bert.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        x = self.dropout(pooled_output)
        logits = self.fc(x)
        return logits

    def get_features(self, input_ids, attention_mask):
        """Extract features from the model for feature consistency"""
        with torch.no_grad():
            outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
            pooled_output = outputs.pooler_output
        return pooled_output


# SeFOSS implementation based on the algorithms in the image
class SeFOSS:
    def __init__(
        self, num_classes=10, device=device, backbone_model_name="bert-base-uncased"
    ):
        # Models
        self.backbone = TextClassifier(
            bert_model_name=backbone_model_name, num_classes=num_classes
        ).to(device)

        # Tokenizer
        self.tokenizer = BertTokenizer.from_pretrained(backbone_model_name)

        # Parameters
        self.num_classes = num_classes
        self.device = device

        # Hyperparameters
        self.lr = 2e-5
        self.batch_size = 16
        self.lambda_s = 1.0  # strong augmentation weight
        self.lambda_w = 1.0  # weak augmentation weight
        self.lambda_f = 1.0  # feature consistency weight
        self.lambda_e = 1.0  # entropy weight
        self.wp = 1.0  # pretraining weight
        self.ws = 1.0  # strong augmentation weight
        self.ww = 1.0  # weak augmentation weight
        self.wu = 1.0  # unlabeled weight

        # Thresholds
        self.tau_d = 0.95  # confidence threshold
        self.tau_s = 0.95  # strong augmentation threshold
        self.tau_w = 0.8  # weak augmentation threshold

        # Optimizer
        self.optimizer = optim.AdamW(self.backbone.parameters(), lr=self.lr)

        # Create directory for saving
        self.save_dir = SAVE_DIR

    def text_augmentation(self, text, augmentation_type="weak"):
        """
        Text augmentation for SeFOSS
        weak: random word deletion
        strong: random word swap + deletion
        """
        if augmentation_type not in ["weak", "strong"]:
            return text

        words = text.split()
        if len(words) <= 3:  # Don't augment very short texts
            return text

        if augmentation_type == "weak":
            # Randomly delete words (15% chance)
            new_words = [word for word in words if random.random() > 0.15]
            if not new_words:  # Ensure we don't delete all words
                new_words = [random.choice(words)]

        elif augmentation_type == "strong":
            # Randomly delete words (20% chance)
            new_words = [word for word in words if random.random() > 0.2]
            if not new_words:  # Ensure we don't delete all words
                new_words = [random.choice(words)]

            # Randomly swap words (15% of pairs)
            for i in range(len(new_words) - 1):
                if random.random() < 0.15:
                    new_words[i], new_words[i + 1] = new_words[i + 1], new_words[i]

        return " ".join(new_words)

    def compute_loss(self, batch, unlabeled_batch=None, wp=0, we=1):
        """
        Compute SeFOSS loss according to Algorithm 2
        """
        # Labeled data loss
        input_ids = batch["input_ids"].to(self.device)
        attention_mask = batch["attention_mask"].to(self.device)
        labels = batch["label"].to(self.device)

        # Forward pass on labeled data
        logits = self.backbone(input_ids, attention_mask)
        labeled_loss = F.cross_entropy(logits, labels)

        # Initialize total loss
        total_loss = labeled_loss

        # If we have unlabeled data
        if unlabeled_batch is not None and len(unlabeled_batch["input_ids"]) > 0:
            # Get unlabeled data
            u_input_ids = unlabeled_batch["input_ids"].to(self.device)
            u_attention_mask = unlabeled_batch["attention_mask"].to(self.device)

            # Get predictions on original unlabeled data
            with torch.no_grad():
                original_logits = self.backbone(u_input_ids, u_attention_mask)
                original_probs = F.softmax(original_logits, dim=1)
                original_features = self.backbone.get_features(
                    u_input_ids, u_attention_mask
                )

                # Get confidence and pseudo-labels
                max_probs, pseudo_labels = torch.max(original_probs, dim=1)

            # Process weakly augmented data
            mask_weak = max_probs >= self.tau_w

            if torch.sum(mask_weak) > 0:
                # Get weak augmentation loss
                weak_logits = self.backbone(
                    u_input_ids[mask_weak], u_attention_mask[mask_weak]
                )
                weak_probs = F.softmax(weak_logits, dim=1)
                weak_loss = F.cross_entropy(weak_logits, pseudo_labels[mask_weak])
                total_loss += self.lambda_w * weak_loss

                # Get feature consistency loss for weakly augmented data
                weak_features = self.backbone.get_features(
                    u_input_ids[mask_weak], u_attention_mask[mask_weak]
                )
                feature_loss_weak = F.mse_loss(
                    weak_features, original_features[mask_weak]
                )
                total_loss += self.lambda_f * feature_loss_weak

            # Process strongly augmented data
            mask_strong = max_probs >= self.tau_s

            if torch.sum(mask_strong) > 0:
                # Get strong augmentation loss
                strong_logits = self.backbone(
                    u_input_ids[mask_strong], u_attention_mask[mask_strong]
                )
                strong_loss = F.cross_entropy(strong_logits, pseudo_labels[mask_strong])
                total_loss += self.lambda_s * strong_loss

                # Get feature consistency loss for strongly augmented data
                strong_features = self.backbone.get_features(
                    u_input_ids[mask_strong], u_attention_mask[mask_strong]
                )
                feature_loss_strong = F.mse_loss(
                    strong_features, original_features[mask_strong]
                )
                total_loss += self.lambda_f * feature_loss_strong

            # Compute entropy loss (encouraging confident predictions)
            entropy = -torch.sum(
                original_probs * torch.log(original_probs + 1e-6), dim=1
            ).mean()
            total_loss += self.lambda_e * entropy * we

        # Add pretraining weight if needed
        if wp > 0:
            total_loss = total_loss * wp

        return total_loss

    def compute_thresholds(self, labeled_loader):
        """Compute confidence thresholds based on labeled data"""
        self.backbone.eval()
        confidences = []

        with torch.no_grad():
            for batch in labeled_loader:
                input_ids = batch["input_ids"].to(self.device)
                attention_mask = batch["attention_mask"].to(self.device)

                logits = self.backbone(input_ids, attention_mask)
                probs = F.softmax(logits, dim=1)
                max_probs, _ = torch.max(probs, dim=1)

                confidences.extend(max_probs.cpu().numpy())

        confidences = np.array(confidences)

        # Set thresholds based on percentiles
        self.tau_d = np.percentile(confidences, 50)  # median
        self.tau_s = np.percentile(confidences, 75)  # 75th percentile
        self.tau_w = np.percentile(confidences, 25)  # 25th percentile

        print(
            f"Computed thresholds: tau_d={self.tau_d:.4f}, tau_s={self.tau_s:.4f}, tau_w={self.tau_w:.4f}"
        )

        self.backbone.train()
        return self.tau_d, self.tau_s, self.tau_w

    def train(
        self,
        labeled_dataset,
        unlabeled_dataset,
        val_dataset,
        test_dataset,
        num_epochs=10,
        pretraining_epochs=5,
    ):
        """
        Train the SeFOSS model according to Algorithm 1
        """
        # Create dataloaders
        labeled_loader = DataLoader(
            labeled_dataset, batch_size=self.batch_size, shuffle=True
        )
        unlabeled_loader = DataLoader(
            unlabeled_dataset, batch_size=self.batch_size, shuffle=True
        )
        val_loader = DataLoader(val_dataset, batch_size=self.batch_size)
        test_loader = DataLoader(test_dataset, batch_size=self.batch_size)

        # For tracking metrics
        train_losses = []
        val_losses = []
        val_accuracies = []
        best_val_accuracy = 0.0

        # Pretraining loop (lines 1-5 in Algorithm 1)
        print("Starting pretraining phase...")
        for epoch in range(pretraining_epochs):
            self.backbone.train()
            epoch_loss = 0.0

            for batch in tqdm(
                labeled_loader, desc=f"Pretraining Epoch {epoch+1}/{pretraining_epochs}"
            ):
                self.optimizer.zero_grad()

                # Compute loss (with wp=1, we=0 to ignore entropy minimization)
                loss = self.compute_loss(batch, wp=1, we=0)

                loss.backward()
                self.optimizer.step()

                epoch_loss += loss.item()

            avg_epoch_loss = epoch_loss / len(labeled_loader)
            train_losses.append(avg_epoch_loss)

            print(
                f"Pretraining Epoch {epoch+1}/{pretraining_epochs}, Loss: {avg_epoch_loss:.4f}"
            )

            # Validate
            val_loss, val_accuracy = self.evaluate(val_loader)
            val_losses.append(val_loss)
            val_accuracies.append(val_accuracy)

            print(f"Validation Loss: {val_loss:.4f}, Accuracy: {val_accuracy:.4f}")

            # Save best model
            if val_accuracy > best_val_accuracy:
                best_val_accuracy = val_accuracy
                self.save_model("best_pretrained_model.pkl")

        # Compute confidence thresholds (lines 6-10 in Algorithm 1)
        print("Computing confidence thresholds...")
        self.compute_thresholds(labeled_loader)

        # Load best pretrained model
        self.load_model("best_pretrained_model.pkl")

        # Training loop with SeFOSS (lines 11-15 in Algorithm 1)
        print("Starting SeFOSS training phase...")
        for epoch in range(num_epochs):
            self.backbone.train()
            epoch_loss = 0.0

            # Create an iterable for the unlabeled data
            unlabeled_iter = iter(unlabeled_loader)

            for batch in tqdm(
                labeled_loader, desc=f"SeFOSS Training Epoch {epoch+1}/{num_epochs}"
            ):
                self.optimizer.zero_grad()

                # Get unlabeled batch
                try:
                    unlabeled_batch = next(unlabeled_iter)
                except StopIteration:
                    unlabeled_iter = iter(unlabeled_loader)
                    unlabeled_batch = next(unlabeled_iter)

                # Compute loss with SeFOSS
                loss = self.compute_loss(batch, unlabeled_batch, wp=0, we=1)

                loss.backward()
                self.optimizer.step()

                epoch_loss += loss.item()

            avg_epoch_loss = epoch_loss / len(labeled_loader)
            train_losses.append(avg_epoch_loss)

            print(
                f"SeFOSS Training Epoch {epoch+1}/{num_epochs}, Loss: {avg_epoch_loss:.4f}"
            )

            # Validate
            val_loss, val_accuracy = self.evaluate(val_loader)
            val_losses.append(val_loss)
            val_accuracies.append(val_accuracy)

            print(f"Validation Loss: {val_loss:.4f}, Accuracy: {val_accuracy:.4f}")

            # Save best model
            if val_accuracy > best_val_accuracy:
                best_val_accuracy = val_accuracy
                self.save_model("best_model.pkl")

        # Load best model for final evaluation
        self.load_model("best_model.pkl")

        # Final evaluation on test set
        test_loss, test_accuracy = self.evaluate(test_loader)
        print(f"Final Test Loss: {test_loss:.4f}, Accuracy: {test_accuracy:.4f}")

        # Save training history
        history = {
            "train_losses": train_losses,
            "val_losses": val_losses,
            "val_accuracies": val_accuracies,
            "final_test_loss": test_loss,
            "final_test_accuracy": test_accuracy,
        }

        with open(os.path.join(self.save_dir, "training_history.pkl"), "wb") as f:
            pickle.dump(history, f)

        # Plot and save learning curves
        self.plot_learning_curves(train_losses, val_losses, val_accuracies)

        # Detailed evaluation and save metrics
        self.detailed_evaluation(test_loader)

        return self.backbone

    def evaluate(self, dataloader):
        """Evaluate the model on given dataloader"""
        self.backbone.eval()
        total_loss = 0
        all_preds = []
        all_labels = []

        with torch.no_grad():
            for batch in dataloader:
                input_ids = batch["input_ids"].to(self.device)
                attention_mask = batch["attention_mask"].to(self.device)
                labels = batch["label"].to(self.device)

                # Forward pass
                logits = self.backbone(input_ids, attention_mask)
                loss = F.cross_entropy(logits, labels)

                # Get predictions
                _, preds = torch.max(logits, 1)

                total_loss += loss.item()
                all_preds.extend(preds.cpu().numpy())
                all_labels.extend(labels.cpu().numpy())

        # Calculate metrics
        accuracy = accuracy_score(all_labels, all_preds)
        avg_loss = total_loss / len(dataloader)

        return avg_loss, accuracy

    def detailed_evaluation(self, dataloader):
        """Detailed evaluation with multiple metrics"""
        self.backbone.eval()
        all_preds = []
        all_labels = []

        with torch.no_grad():
            for batch in dataloader:
                input_ids = batch["input_ids"].to(self.device)
                attention_mask = batch["attention_mask"].to(self.device)
                labels = batch["label"].to(self.device)

                # Forward pass
                logits = self.backbone(input_ids, attention_mask)

                # Get predictions
                _, preds = torch.max(logits, 1)

                all_preds.extend(preds.cpu().numpy())
                all_labels.extend(labels.cpu().numpy())

        # Calculate metrics
        accuracy = accuracy_score(all_labels, all_preds)
        precision_macro = precision_score(all_labels, all_preds, average="macro")
        recall_macro = recall_score(all_labels, all_preds, average="macro")
        f1_macro = f1_score(all_labels, all_preds, average="macro")

        # Per-class metrics
        precision_per_class = precision_score(all_labels, all_preds, average=None)
        recall_per_class = recall_score(all_labels, all_preds, average=None)
        f1_per_class = f1_score(all_labels, all_preds, average=None)

        # Save metrics to CSV
        metrics = {
            "Metric": [
                "Accuracy",
                "Precision (Macro)",
                "Recall (Macro)",
                "F1 Score (Macro)",
            ],
            "Value": [accuracy, precision_macro, recall_macro, f1_macro],
        }
        metrics_df = pd.DataFrame(metrics)
        metrics_df.to_csv(os.path.join(self.save_dir, "test_metrics.csv"), index=False)

        # Save per-class metrics
        class_metrics = {
            "Class": list(range(self.num_classes)),
            "Precision": precision_per_class,
            "Recall": recall_per_class,
            "F1 Score": f1_per_class,
        }
        class_metrics_df = pd.DataFrame(class_metrics)
        class_metrics_df.to_csv(
            os.path.join(self.save_dir, "per_class_metrics.csv"), index=False
        )

        # Confusion matrix
        cm = confusion_matrix(all_labels, all_preds)
        plt.figure(figsize=(10, 8))
        sns.heatmap(cm, annot=True, fmt="d", cmap="Blues")
        plt.title("Confusion Matrix")
        plt.xlabel("Predicted")
        plt.ylabel("True")
        plt.savefig(os.path.join(self.save_dir, "confusion_matrix.png"))
        plt.close()

        print(f"Detailed evaluation saved to {self.save_dir}")
        print(f"Accuracy: {accuracy:.4f}")
        print(f"Precision (Macro): {precision_macro:.4f}")
        print(f"Recall (Macro): {recall_macro:.4f}")
        print(f"F1 Score (Macro): {f1_macro:.4f}")

        return {
            "accuracy": accuracy,
            "precision": precision_macro,
            "recall": recall_macro,
            "f1": f1_macro,
        }

    def plot_learning_curves(self, train_losses, val_losses, val_accuracies):
        """Plot and save learning curves"""
        # Create figure with two subplots
        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))

        # Plot losses
        ax1.plot(train_losses, label="Training Loss")
        ax1.plot(val_losses, label="Validation Loss")
        ax1.set_xlabel("Epochs")
        ax1.set_ylabel("Loss")
        ax1.set_title("Training and Validation Loss")
        ax1.legend()
        ax1.grid(True)

        # Plot accuracy
        ax2.plot(val_accuracies, label="Validation Accuracy", color="green")
        ax2.set_xlabel("Epochs")
        ax2.set_ylabel("Accuracy")
        ax2.set_title("Validation Accuracy")
        ax2.legend()
        ax2.grid(True)

        # Save figure
        plt.tight_layout()
        plt.savefig(os.path.join(self.save_dir, "learning_curves.png"))
        plt.close()

    def save_model(self, filename):
        """Save model parameters"""
        filepath = os.path.join(self.save_dir, filename)
        torch.save(self.backbone.state_dict(), filepath)
        print(f"Model saved to {filepath}")

    def load_model(self, filename):
        """Load model parameters"""
        filepath = os.path.join(self.save_dir, filename)
        if os.path.exists(filepath):
            self.backbone.load_state_dict(torch.load(filepath))
            print(f"Model loaded from {filepath}")
            return True
        else:
            print(f"No model found at {filepath}")
            return False


def analyze_different_labeled_amounts(
    train_dataset,
    val_dataset,
    test_dataset,
    labeled_amounts,
    num_epochs=5,
    pretraining_epochs=2,
):
    """
    Analyze model performance with different amounts of labeled data

    Args:
        train_dataset: Full training dataset
        val_dataset: Validation dataset
        test_dataset: Test dataset
        labeled_amounts: List of integers for number of samples per class
    """
    results = []

    for n_labeled in labeled_amounts:
        print(f"\n\n----- Training with {n_labeled} labeled samples per class -----\n")

        # Create directory for this run
        run_dir = os.path.join(SAVE_DIR, f"labeled_{n_labeled}")
        os.makedirs(run_dir, exist_ok=True)

        # Get balanced subset for labeled data
        labeled_dataset = train_dataset.get_balanced_subset(n_labeled)

        # The rest becomes unlabeled data
        labeled_indices = set()
        for i, label in enumerate(labeled_dataset.labels):
            labeled_indices.add(i)

        unlabeled_data = []
        unlabeled_labels = (
            []
        )  # We keep the labels for evaluation, but don't use them in training

        for i in range(len(train_dataset)):
            if i not in labeled_indices:
                unlabeled_data.append(train_dataset.get_text(i))
                unlabeled_labels.append(train_dataset.labels[i])

        # Create unlabeled dataset
        unlabeled_dataset = YahooAnswersDataset(
            train_dataset.root_dir,
            max_files=0,
            tokenizer=train_dataset.tokenizer,
            max_length=train_dataset.max_length,
        )
        unlabeled_dataset.data = unlabeled_data
        unlabeled_dataset.labels = unlabeled_labels

        print(f"Created labeled dataset with {len(labeled_dataset)} samples")
        print(f"Created unlabeled dataset with {len(unlabeled_dataset)} samples")

        # Initialize and train SeFOSS
        sefoss = SeFOSS(num_classes=10, device=device)
        sefoss.save_dir = run_dir

        # Train the model
        sefoss.train(
            labeled_dataset,
            unlabeled_dataset,
            val_dataset,
            test_dataset,
            num_epochs=num_epochs,
            pretraining_epochs=pretraining_epochs,
        )

        # Get final metrics
        _, accuracy = sefoss.evaluate(DataLoader(test_dataset, batch_size=16))
        metrics = sefoss.detailed_evaluation(DataLoader(test_dataset, batch_size=16))

        # Save results
        results.append(
            {
                "n_labeled": n_labeled,
                "accuracy": metrics["accuracy"],
                "precision": metrics["precision"],
                "recall": metrics["recall"],
                "f1": metrics["f1"],
            }
        )

    # Create and save results dataframe
    results_df = pd.DataFrame(results)
    results_df.to_csv(os.path.join(SAVE_DIR, "labeled_amount_results.csv"), index=False)

    # Plot results
    plt.figure(figsize=(10, 6))
    plt.plot(
        results_df["n_labeled"], results_df["accuracy"], marker="o", label="Accuracy"
    )
    plt.plot(results_df["n_labeled"], results_df["f1"], marker="s", label="F1 Score")
    plt.xlabel("Number of Labeled Samples per Class")
    plt.ylabel("Score")
    plt.title("Performance vs. Amount of Labeled Data")
    plt.legend()
    plt.grid(True)
    plt.savefig(os.path.join(SAVE_DIR, "labeled_amount_performance.png"))
    plt.close()

    return results_df


def main():
    print("Starting SeFOSS training for Yahoo Answers dataset")

    # Load tokenizer
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

    # Load and preprocess data
    print("Loading datasets...")
    train_dataset = YahooAnswersDataset(
        "/kaggle/input/yahooanswerssplited/text/train",
        max_files=1000,
        tokenizer=tokenizer,
    )
    val_dataset = YahooAnswersDataset(
        "/kaggle/input/yahooanswerssplited/text/val",
        max_files=1000,
        tokenizer=tokenizer,
    )
    test_dataset = YahooAnswersDataset(
        "/kaggle/input/yahooanswerssplited/text/test",
        max_files=1000,
        tokenizer=tokenizer,
    )

    # Analyze different labeled amounts
    labeled_amounts = [40, 150, 300, 500]
    result_df = analyze_different_labeled_amounts(
        train_dataset,
        val_dataset,
        test_dataset,
        labeled_amounts,
        num_epochs=3,
        pretraining_epochs=2,
    )

    # Print summary of results
    print("\n=== Summary of Results ===")
    print(result_df)

    # Train the final model with all labeled data
    print("\n=== Training final model with full labeled data ===")
    final_model_dir = os.path.join(SAVE_DIR, "final_model")
    os.makedirs(final_model_dir, exist_ok=True)

    # Create full dataset split
    n_labeled = 500  # Use maximum labeled samples per class
    labeled_dataset = train_dataset.get_balanced_subset(n_labeled)

    # Create unlabeled dataset from remaining samples
    labeled_indices = set()
    for i, label in enumerate(labeled_dataset.labels):
        labeled_indices.add(i)

    unlabeled_data = []
    unlabeled_labels = []
    for i in range(len(train_dataset)):
        if i not in labeled_indices:
            unlabeled_data.append(train_dataset.get_text(i))
            unlabeled_labels.append(train_dataset.labels[i])

    unlabeled_dataset = YahooAnswersDataset(
        train_dataset.root_dir,
        max_files=0,
        tokenizer=train_dataset.tokenizer,
        max_length=train_dataset.max_length,
    )
    unlabeled_dataset.data = unlabeled_data
    unlabeled_dataset.labels = unlabeled_labels

    print(f"Final model: Using {len(labeled_dataset)} labeled samples")
    print(f"Final model: Using {len(unlabeled_dataset)} unlabeled samples")

    # Initialize SeFOSS with final settings
    final_sefoss = SeFOSS(num_classes=10, device=device)
    final_sefoss.save_dir = final_model_dir

    # Train final model with full pretraining and training cycles
    final_sefoss.train(
        labeled_dataset,
        unlabeled_dataset,
        val_dataset,
        test_dataset,
        num_epochs=5,
        pretraining_epochs=3,
    )

    # Generate class distribution analysis
    class_distribution = Counter(train_dataset.labels)
    plt.figure(figsize=(10, 6))
    plt.bar(range(10), [class_distribution.get(i, 0) for i in range(10)])
    plt.xlabel("Class")
    plt.ylabel("Number of Samples")
    plt.title("Class Distribution in Yahoo Answers Dataset")
    plt.xticks(range(10))
    plt.grid(axis="y")
    plt.savefig(os.path.join(SAVE_DIR, "class_distribution.png"))
    plt.close()

    # Generate a summary report
    summary = {
        "Dataset": "Yahoo Answers",
        "Total Samples": len(train_dataset),
        "Classes": 10,
        "Best Accuracy (500 labeled)": result_df.loc[
            result_df["n_labeled"] == 500, "accuracy"
        ].values[0],
        "Best F1 Score (500 labeled)": result_df.loc[
            result_df["n_labeled"] == 500, "f1"
        ].values[0],
        "Accuracy with 40 labeled": result_df.loc[
            result_df["n_labeled"] == 40, "accuracy"
        ].values[0],
        "Model": "BERT + SeFOSS",
    }

    summary_df = pd.DataFrame(list(summary.items()), columns=["Metric", "Value"])
    summary_df.to_csv(os.path.join(SAVE_DIR, "experiment_summary.csv"), index=False)

    print("\n=== Experiment complete ===")
    print(f"All results saved to {SAVE_DIR}")

    return result_df


if __name__ == "__main__":
    main()

Using device: cuda
Starting SeFOSS training for Yahoo Answers dataset


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Loading datasets...
Loaded 10000 files from /kaggle/input/yahooanswerssplited/text/train
Files per class: {0: 1000, 1: 1000, 2: 1000, 3: 1000, 4: 1000, 5: 1000, 6: 1000, 7: 1000, 8: 1000, 9: 1000}
Loaded 10000 files from /kaggle/input/yahooanswerssplited/text/val
Files per class: {0: 1000, 1: 1000, 2: 1000, 3: 1000, 4: 1000, 5: 1000, 6: 1000, 7: 1000, 8: 1000, 9: 1000}
Loaded 10000 files from /kaggle/input/yahooanswerssplited/text/test
Files per class: {0: 1000, 1: 1000, 2: 1000, 3: 1000, 4: 1000, 5: 1000, 6: 1000, 7: 1000, 8: 1000, 9: 1000}


----- Training with 40 labeled samples per class -----

Loaded 0 files from /kaggle/input/yahooanswerssplited/text/train
Files per class: {0: 0, 1: 0, 2: 0, 3: 0, 4: 0, 5: 0, 6: 0, 7: 0, 8: 0, 9: 0}
Loaded 0 files from /kaggle/input/yahooanswerssplited/text/train
Files per class: {0: 0, 1: 0, 2: 0, 3: 0, 4: 0, 5: 0, 6: 0, 7: 0, 8: 0, 9: 0}
Created labeled dataset with 400 samples
Created unlabeled dataset with 9600 samples


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Starting pretraining phase...


Pretraining Epoch 1/2: 100%|██████████| 25/25 [00:06<00:00,  3.77it/s]


Pretraining Epoch 1/2, Loss: 2.3245
Validation Loss: 2.2596, Accuracy: 0.1362
Model saved to SEFOSS_TEXTData/labeled_40/best_pretrained_model.pkl


Pretraining Epoch 2/2: 100%|██████████| 25/25 [00:06<00:00,  4.17it/s]


Pretraining Epoch 2/2, Loss: 2.1119
Validation Loss: 1.9733, Accuracy: 0.4352
Model saved to SEFOSS_TEXTData/labeled_40/best_pretrained_model.pkl
Computing confidence thresholds...
Computed thresholds: tau_d=0.1806, tau_s=0.2084, tau_w=0.1613


  self.backbone.load_state_dict(torch.load(filepath))


Model loaded from SEFOSS_TEXTData/labeled_40/best_pretrained_model.pkl
Starting SeFOSS training phase...


SeFOSS Training Epoch 1/3: 100%|██████████| 25/25 [00:19<00:00,  1.25it/s]


SeFOSS Training Epoch 1/3, Loss: 6.0399
Validation Loss: 2.3942, Accuracy: 0.2330


SeFOSS Training Epoch 2/3: 100%|██████████| 25/25 [00:21<00:00,  1.17it/s]


SeFOSS Training Epoch 2/3, Loss: 4.4465
Validation Loss: 2.0625, Accuracy: 0.3294


SeFOSS Training Epoch 3/3: 100%|██████████| 25/25 [00:21<00:00,  1.16it/s]


SeFOSS Training Epoch 3/3, Loss: 3.8956
Validation Loss: 1.9268, Accuracy: 0.3977
No model found at SEFOSS_TEXTData/labeled_40/best_model.pkl
Final Test Loss: 1.9205, Accuracy: 0.3974


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Detailed evaluation saved to SEFOSS_TEXTData/labeled_40
Accuracy: 0.3974
Precision (Macro): 0.4580
Recall (Macro): 0.3974
F1 Score (Macro): 0.2903


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Detailed evaluation saved to SEFOSS_TEXTData/labeled_40
Accuracy: 0.3974
Precision (Macro): 0.4580
Recall (Macro): 0.3974
F1 Score (Macro): 0.2903


----- Training with 150 labeled samples per class -----

Loaded 0 files from /kaggle/input/yahooanswerssplited/text/train
Files per class: {0: 0, 1: 0, 2: 0, 3: 0, 4: 0, 5: 0, 6: 0, 7: 0, 8: 0, 9: 0}
Loaded 0 files from /kaggle/input/yahooanswerssplited/text/train
Files per class: {0: 0, 1: 0, 2: 0, 3: 0, 4: 0, 5: 0, 6: 0, 7: 0, 8: 0, 9: 0}
Created labeled dataset with 1500 samples
Created unlabeled dataset with 8500 samples
Starting pretraining phase...


Pretraining Epoch 1/2: 100%|██████████| 94/94 [00:22<00:00,  4.22it/s]


Pretraining Epoch 1/2, Loss: 2.0773
Validation Loss: 1.6123, Accuracy: 0.6204
Model saved to SEFOSS_TEXTData/labeled_150/best_pretrained_model.pkl


Pretraining Epoch 2/2: 100%|██████████| 94/94 [00:22<00:00,  4.22it/s]


Pretraining Epoch 2/2, Loss: 1.2711
Validation Loss: 1.1099, Accuracy: 0.6785
Model saved to SEFOSS_TEXTData/labeled_150/best_pretrained_model.pkl
Computing confidence thresholds...
Computed thresholds: tau_d=0.6312, tau_s=0.7875, tau_w=0.4588


  self.backbone.load_state_dict(torch.load(filepath))


Model loaded from SEFOSS_TEXTData/labeled_150/best_pretrained_model.pkl
Starting SeFOSS training phase...


SeFOSS Training Epoch 1/3: 100%|██████████| 94/94 [01:09<00:00,  1.36it/s]


SeFOSS Training Epoch 1/3, Loss: 2.2389
Validation Loss: 1.0642, Accuracy: 0.6858
Model saved to SEFOSS_TEXTData/labeled_150/best_model.pkl


SeFOSS Training Epoch 2/3: 100%|██████████| 94/94 [01:14<00:00,  1.27it/s]


SeFOSS Training Epoch 2/3, Loss: 1.6055
Validation Loss: 1.2409, Accuracy: 0.6817


SeFOSS Training Epoch 3/3: 100%|██████████| 94/94 [01:15<00:00,  1.24it/s]


SeFOSS Training Epoch 3/3, Loss: 1.2736
Validation Loss: 1.3034, Accuracy: 0.6802


  self.backbone.load_state_dict(torch.load(filepath))


Model loaded from SEFOSS_TEXTData/labeled_150/best_model.pkl
Final Test Loss: 1.0516, Accuracy: 0.6905
Detailed evaluation saved to SEFOSS_TEXTData/labeled_150
Accuracy: 0.6905
Precision (Macro): 0.6945
Recall (Macro): 0.6905
F1 Score (Macro): 0.6880
Detailed evaluation saved to SEFOSS_TEXTData/labeled_150
Accuracy: 0.6905
Precision (Macro): 0.6945
Recall (Macro): 0.6905
F1 Score (Macro): 0.6880


----- Training with 300 labeled samples per class -----

Loaded 0 files from /kaggle/input/yahooanswerssplited/text/train
Files per class: {0: 0, 1: 0, 2: 0, 3: 0, 4: 0, 5: 0, 6: 0, 7: 0, 8: 0, 9: 0}
Loaded 0 files from /kaggle/input/yahooanswerssplited/text/train
Files per class: {0: 0, 1: 0, 2: 0, 3: 0, 4: 0, 5: 0, 6: 0, 7: 0, 8: 0, 9: 0}
Created labeled dataset with 3000 samples
Created unlabeled dataset with 7000 samples
Starting pretraining phase...


Pretraining Epoch 1/2: 100%|██████████| 188/188 [00:44<00:00,  4.22it/s]


Pretraining Epoch 1/2, Loss: 1.7246
Validation Loss: 1.0995, Accuracy: 0.6792
Model saved to SEFOSS_TEXTData/labeled_300/best_pretrained_model.pkl


Pretraining Epoch 2/2: 100%|██████████| 188/188 [00:44<00:00,  4.23it/s]


Pretraining Epoch 2/2, Loss: 0.9210
Validation Loss: 0.9581, Accuracy: 0.7104
Model saved to SEFOSS_TEXTData/labeled_300/best_pretrained_model.pkl
Computing confidence thresholds...
Computed thresholds: tau_d=0.8188, tau_s=0.8978, tau_w=0.6701


  self.backbone.load_state_dict(torch.load(filepath))


Model loaded from SEFOSS_TEXTData/labeled_300/best_pretrained_model.pkl
Starting SeFOSS training phase...


SeFOSS Training Epoch 1/3: 100%|██████████| 188/188 [02:15<00:00,  1.39it/s]


SeFOSS Training Epoch 1/3, Loss: 1.5148
Validation Loss: 1.1538, Accuracy: 0.6955


SeFOSS Training Epoch 2/3: 100%|██████████| 188/188 [02:25<00:00,  1.29it/s]


SeFOSS Training Epoch 2/3, Loss: 1.1113
Validation Loss: 1.2952, Accuracy: 0.6875


SeFOSS Training Epoch 3/3: 100%|██████████| 188/188 [02:28<00:00,  1.27it/s]


SeFOSS Training Epoch 3/3, Loss: 0.8851
Validation Loss: 1.3359, Accuracy: 0.6906
No model found at SEFOSS_TEXTData/labeled_300/best_model.pkl
Final Test Loss: 1.3231, Accuracy: 0.6978
Detailed evaluation saved to SEFOSS_TEXTData/labeled_300
Accuracy: 0.6978
Precision (Macro): 0.7050
Recall (Macro): 0.6978
F1 Score (Macro): 0.6960
Detailed evaluation saved to SEFOSS_TEXTData/labeled_300
Accuracy: 0.6978
Precision (Macro): 0.7050
Recall (Macro): 0.6978
F1 Score (Macro): 0.6960


----- Training with 500 labeled samples per class -----

Loaded 0 files from /kaggle/input/yahooanswerssplited/text/train
Files per class: {0: 0, 1: 0, 2: 0, 3: 0, 4: 0, 5: 0, 6: 0, 7: 0, 8: 0, 9: 0}
Loaded 0 files from /kaggle/input/yahooanswerssplited/text/train
Files per class: {0: 0, 1: 0, 2: 0, 3: 0, 4: 0, 5: 0, 6: 0, 7: 0, 8: 0, 9: 0}
Created labeled dataset with 5000 samples
Created unlabeled dataset with 5000 samples
Starting pretraining phase...


Pretraining Epoch 1/2: 100%|██████████| 313/313 [01:14<00:00,  4.21it/s]


Pretraining Epoch 1/2, Loss: 1.4556
Validation Loss: 0.9699, Accuracy: 0.7030
Model saved to SEFOSS_TEXTData/labeled_500/best_pretrained_model.pkl


Pretraining Epoch 2/2: 100%|██████████| 313/313 [01:14<00:00,  4.21it/s]


Pretraining Epoch 2/2, Loss: 0.7855
Validation Loss: 0.9397, Accuracy: 0.7115
Model saved to SEFOSS_TEXTData/labeled_500/best_pretrained_model.pkl
Computing confidence thresholds...
Computed thresholds: tau_d=0.8744, tau_s=0.9307, tau_w=0.7163


  self.backbone.load_state_dict(torch.load(filepath))


Model loaded from SEFOSS_TEXTData/labeled_500/best_pretrained_model.pkl
Starting SeFOSS training phase...


SeFOSS Training Epoch 1/3: 100%|██████████| 313/313 [03:54<00:00,  1.33it/s]


SeFOSS Training Epoch 1/3, Loss: 1.1842
Validation Loss: 1.1022, Accuracy: 0.7039


SeFOSS Training Epoch 2/3: 100%|██████████| 313/313 [04:07<00:00,  1.26it/s]


SeFOSS Training Epoch 2/3, Loss: 0.8141
Validation Loss: 1.2111, Accuracy: 0.7089


SeFOSS Training Epoch 3/3: 100%|██████████| 313/313 [04:13<00:00,  1.24it/s]


SeFOSS Training Epoch 3/3, Loss: 0.6000
Validation Loss: 1.3236, Accuracy: 0.7000
No model found at SEFOSS_TEXTData/labeled_500/best_model.pkl
Final Test Loss: 1.3202, Accuracy: 0.7020
Detailed evaluation saved to SEFOSS_TEXTData/labeled_500
Accuracy: 0.7020
Precision (Macro): 0.7057
Recall (Macro): 0.7020
F1 Score (Macro): 0.6993
Detailed evaluation saved to SEFOSS_TEXTData/labeled_500
Accuracy: 0.7020
Precision (Macro): 0.7057
Recall (Macro): 0.7020
F1 Score (Macro): 0.6993

=== Summary of Results ===
   n_labeled  accuracy  precision  recall        f1
0         40    0.3974   0.457971  0.3974  0.290291
1        150    0.6905   0.694454  0.6905  0.688038
2        300    0.6978   0.705037  0.6978  0.696024
3        500    0.7020   0.705710  0.7020  0.699283

=== Training final model with full labeled data ===
Loaded 0 files from /kaggle/input/yahooanswerssplited/text/train
Files per class: {0: 0, 1: 0, 2: 0, 3: 0, 4: 0, 5: 0, 6: 0, 7: 0, 8: 0, 9: 0}
Loaded 0 files from /kaggle/input/y

Pretraining Epoch 1/3: 100%|██████████| 313/313 [01:13<00:00,  4.23it/s]


Pretraining Epoch 1/3, Loss: 1.3568
Validation Loss: 0.9755, Accuracy: 0.6962
Model saved to SEFOSS_TEXTData/final_model/best_pretrained_model.pkl


Pretraining Epoch 2/3: 100%|██████████| 313/313 [01:14<00:00,  4.22it/s]


Pretraining Epoch 2/3, Loss: 0.7641
Validation Loss: 0.9567, Accuracy: 0.7100
Model saved to SEFOSS_TEXTData/final_model/best_pretrained_model.pkl


Pretraining Epoch 3/3: 100%|██████████| 313/313 [01:14<00:00,  4.22it/s]


Pretraining Epoch 3/3, Loss: 0.5075
Validation Loss: 1.0208, Accuracy: 0.7054
Computing confidence thresholds...
Computed thresholds: tau_d=0.9437, tau_s=0.9669, tau_w=0.8860


  self.backbone.load_state_dict(torch.load(filepath))


Model loaded from SEFOSS_TEXTData/final_model/best_pretrained_model.pkl
Starting SeFOSS training phase...


SeFOSS Training Epoch 1/5: 100%|██████████| 313/313 [03:30<00:00,  1.49it/s]


SeFOSS Training Epoch 1/5, Loss: 1.1469
Validation Loss: 1.0810, Accuracy: 0.6957


SeFOSS Training Epoch 2/5: 100%|██████████| 313/313 [03:51<00:00,  1.35it/s]


SeFOSS Training Epoch 2/5, Loss: 0.8273
Validation Loss: 1.2112, Accuracy: 0.7050


SeFOSS Training Epoch 3/5: 100%|██████████| 313/313 [04:03<00:00,  1.29it/s]


SeFOSS Training Epoch 3/5, Loss: 0.6053
Validation Loss: 1.3099, Accuracy: 0.6981


SeFOSS Training Epoch 4/5: 100%|██████████| 313/313 [04:06<00:00,  1.27it/s]


SeFOSS Training Epoch 4/5, Loss: 0.5007
Validation Loss: 1.4287, Accuracy: 0.6953


SeFOSS Training Epoch 5/5: 100%|██████████| 313/313 [04:10<00:00,  1.25it/s]


SeFOSS Training Epoch 5/5, Loss: 0.3912
Validation Loss: 1.4593, Accuracy: 0.6956
No model found at SEFOSS_TEXTData/final_model/best_model.pkl
Final Test Loss: 1.4733, Accuracy: 0.6953
Detailed evaluation saved to SEFOSS_TEXTData/final_model
Accuracy: 0.6953
Precision (Macro): 0.6984
Recall (Macro): 0.6953
F1 Score (Macro): 0.6956

=== Experiment complete ===
All results saved to SEFOSS_TEXTData


In [2]:
import os
import zipfile

# Define the directory to zip and the output zip file path
dir_to_zip = "/kaggle/working/SEFOSS_TEXTData"
zip_path = "/kaggle/working/SEFOSS_TEXTData.zip"

# Create a zip file
with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
    for root, dirs, files in os.walk(dir_to_zip):
        for file in files:
            file_path = os.path.join(root, file)
            # Add file to zip, maintaining relative path
            zipf.write(file_path, os.path.relpath(file_path, dir_to_zip))

print(f"Zipped output directory to: {zip_path}")


Zipped output directory to: /kaggle/working/SEFOSS_TEXTData.zip
