In [None]:
# Install the third-party packages used by the training cell below.
!pip install datasets pandas evaluate numpy transformers scipy scikit-learn accelerate emoji

Collecting evaluate
  Downloading evaluate-0.4.4-py3-none-any.whl.metadata (9.5 kB)
Collecting emoji
  Downloading emoji-2.14.1-py3-none-any.whl.metadata (5.7 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl

In [None]:
import os
import argparse
import logging
import re
import emoji

import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

from datasets import Dataset
from sklearn.model_selection import train_test_split
from scipy.special import softmax
import evaluate

from transformers.models.auto.modeling_auto import AutoModel
from transformers.models.auto.tokenization_auto import AutoTokenizer
from transformers.models.auto.configuration_auto import AutoConfig
from transformers.data.data_collator import DataCollatorWithPadding
from transformers.training_args import TrainingArguments
from transformers.trainer import Trainer
from transformers.trainer_utils import set_seed
from transformers.trainer_callback import EarlyStoppingCallback
from transformers import PreTrainedModel
from transformers.modeling_outputs import SequenceClassifierOutput


class CustomClassifierHead(nn.Module):
    """Small MLP head that turns a pooled hidden vector into class logits.

    Pipeline: LayerNorm -> Dropout -> Linear(hidden_size, hidden_size // 2)
    -> GELU -> Dropout -> Linear(hidden_size // 2, num_labels).
    A single dropout module is reused at both dropout points.
    """

    def __init__(self, hidden_size: int, num_labels: int, dropout_rate: float = 0.4):
        super().__init__()
        bottleneck = hidden_size // 2
        self.norm = nn.LayerNorm(hidden_size)
        self.dropout = nn.Dropout(dropout_rate)
        self.fc = nn.Linear(hidden_size, bottleneck)
        self.act = nn.GELU()
        self.out = nn.Linear(bottleneck, num_labels)

    def forward(self, hidden_states):
        """Return unnormalised logits; the last dimension has size ``num_labels``."""
        normalised = self.dropout(self.norm(hidden_states))
        projected = self.dropout(self.act(self.fc(normalised)))
        return self.out(projected)

class FocalLoss(nn.Module):
    """Multi-class focal loss averaged over the batch.

    Per-sample loss: ``weight[y] * (1 - p_y) ** gamma * (-log p_y)`` where
    ``p_y`` is the softmax probability assigned to the true class ``y``.
    With ``gamma=0`` and no weights this is plain cross-entropy.

    Args:
        gamma: Focusing exponent; larger values down-weight easy examples.
        weight: Optional 1-D tensor of per-class weights indexed by label id.
    """

    def __init__(self, gamma: float = 2.0, weight: torch.Tensor = None):
        super().__init__()
        self.gamma = gamma
        # Non-persistent buffer: follows .to()/.cuda() with the module but does
        # not add a key to state_dict (the weights are configuration, not state).
        self.register_buffer("weight", weight, persistent=False)

    def forward(self, logits, labels):
        """Compute the mean focal loss.

        Args:
            logits: Float tensor of shape (N, num_classes).
            labels: Integer tensor of shape (N,) holding class ids.

        Returns:
            Scalar tensor: the unweighted mean of the per-sample losses.
        """
        log_probs = F.log_softmax(logits, dim=-1)

        # Log-probability of the true class for every sample.
        logpt = log_probs.gather(dim=-1, index=labels.long().unsqueeze(-1)).squeeze(-1)
        pt = logpt.exp()

        if self.weight is not None:
            # Index on the logits' device so CPU-created weights work with GPU batches.
            weight = self.weight.to(logits.device)[labels]
        else:
            weight = 1.0

        focal_term = (1 - pt) ** self.gamma
        ce_term = -logpt

        loss_per_sample = weight * focal_term * ce_term

        return loss_per_sample.mean()


class TransformerWithCustomClassifier(PreTrainedModel):
    """
    Transformer encoder loaded through ``AutoModel`` with a ``CustomClassifierHead`` on top.

    The head consumes the hidden state of the first token of each sequence.
    When the model is built with an explicit ``model_name`` the embeddings and
    the first ``freeze_layers`` encoder layers are frozen; when it is rebuilt
    for loading (``model_name=None``) nothing is frozen.
    """
    # Add config_class attribute - this is crucial for from_pretrained to work
    config_class = AutoConfig

    def __init__(self, config, model_name: str = None, num_labels: int = None, freeze_layers: int = 6):
        """
        Args:
            config: Backbone configuration; ``config.num_labels`` is the fallback label count.
            model_name: Hub id or path of the pretrained backbone. ``None`` selects the
                "loading" scenario where the backbone is rebuilt from ``config``.
            num_labels: Number of output classes (falls back to ``config.num_labels``).
            freeze_layers: Number of bottom encoder layers to freeze (training scenario only).
        """
        super().__init__(config)

        if model_name is not None:
            # Direct initialization with model_name
            self.transformer = AutoModel.from_pretrained(model_name)
            self.num_labels = num_labels or config.num_labels
        else:
            # Loading scenario: rebuild the backbone from what the config records
            self.transformer = self._build_backbone_from_config(config)
            self.num_labels = config.num_labels

        self.config = config

        # Custom classifier head
        hidden_size = self.transformer.config.hidden_size
        self.classifier = CustomClassifierHead(
            hidden_size=hidden_size,
            num_labels=self.num_labels,
            dropout_rate=0.4
        )

        # Freeze only when training from a named checkpoint. This runs after the
        # head is created so the printed trainable/total figures include the head.
        if model_name is not None:
            self.freeze_transformer_layers(freeze_layers)

        # Initialize weights
        self.post_init()

    @staticmethod
    def _build_backbone_from_config(config):
        """Build the backbone for the loading scenario.

        Tries ``AutoModel.from_pretrained(config._name_or_path)`` first and falls
        back to an architecture-only ``AutoModel.from_config(config)``.
        """
        name_or_path = getattr(config, '_name_or_path', None)
        if name_or_path:
            try:
                return AutoModel.from_pretrained(name_or_path)
            except Exception as exc:
                # Best-effort fallback, but no longer swallows KeyboardInterrupt/SystemExit
                # and no longer hides the reason.
                print(f"Warning: could not load backbone from {name_or_path!r} ({exc}); building it from config")
        return AutoModel.from_config(config)

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
        """
        Load a model saved by this class from a local directory.

        The directory must contain a config plus one of ``pytorch_model.bin``,
        ``model.safetensors`` or ``pytorch_model.safetensors`` (checked in that
        order). ``kwargs`` are forwarded to ``AutoConfig.from_pretrained``;
        ``model_args`` are accepted for signature compatibility and ignored.

        Raises:
            FileNotFoundError: if none of the expected weight files exists.
        """
        # Load config first
        config = AutoConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)

        # Create model instance without model_name (loading scenario)
        model = cls(config, model_name=None)

        # Try different file formats
        possible_files = [
            "pytorch_model.bin",
            "model.safetensors",
            "pytorch_model.safetensors"
        ]
        model_file = None
        for filename in possible_files:
            filepath = os.path.join(pretrained_model_name_or_path, filename)
            if os.path.exists(filepath):
                model_file = filepath
                break

        if model_file is None:
            raise FileNotFoundError(f"No model file found in {pretrained_model_name_or_path}")

        # Load state dict based on file type
        if model_file.endswith('.safetensors'):
            from safetensors.torch import load_file
            state_dict = load_file(model_file)
        else:
            # weights_only=True restricts unpickling to tensors and plain containers,
            # so a tampered checkpoint cannot execute arbitrary code.
            state_dict = torch.load(model_file, map_location="cpu", weights_only=True)

        # Load the state dict into the model
        missing_keys, unexpected_keys = model.load_state_dict(state_dict, strict=False)

        if missing_keys:
            print(f"Warning: Missing keys when loading model: {missing_keys}")
        if unexpected_keys:
            print(f"Warning: Unexpected keys when loading model: {unexpected_keys}")

        return model

    def freeze_transformer_layers(self, num_layers_to_freeze: int):
        """
        Freeze the embeddings and the bottom N encoder layers, then print a summary.

        The encoder layer list is looked up at ``transformer.encoder.layer``,
        ``transformer.encoder.layers`` or ``transformer.layers``; if none exists
        only the embeddings (when present) are frozen and a warning is printed.
        The printed counts reflect every parameter of this module whose
        ``requires_grad`` is False after freezing.
        """
        # Freeze embeddings - handle different architectures
        if hasattr(self.transformer, 'embeddings'):
            for param in self.transformer.embeddings.parameters():
                param.requires_grad = False

        # Locate the encoder layer list - handle different architectures
        encoder_layers = None
        if hasattr(self.transformer, 'encoder') and hasattr(self.transformer.encoder, 'layer'):
            encoder_layers = self.transformer.encoder.layer
        elif hasattr(self.transformer, 'encoder') and hasattr(self.transformer.encoder, 'layers'):
            encoder_layers = self.transformer.encoder.layers
        elif hasattr(self.transformer, 'layers'):
            encoder_layers = self.transformer.layers

        if encoder_layers is not None:
            print(f"total layers: {len(encoder_layers)}")
            # Clamp to [0, number of layers] so out-of-range requests are harmless
            num_to_freeze = max(0, min(num_layers_to_freeze, len(encoder_layers)))

            for i in range(num_to_freeze):
                for param in encoder_layers[i].parameters():
                    param.requires_grad = False
        else:
            print("Warning: Could not find encoder layers to freeze")
            num_to_freeze = 0

        # Count from the requires_grad flags so the report matches the real state
        total_params = 0
        frozen_params = 0
        for param in self.parameters():
            total_params += param.numel()
            if not param.requires_grad:
                frozen_params += param.numel()

        trainable_params = total_params - frozen_params
        print(f"Frozen {frozen_params:,} parameters")
        print(f"Trainable {trainable_params:,} parameters ({100*trainable_params/total_params:.1f}%)")
        print(f"Frozen {num_to_freeze} transformer layers + embeddings")

    def forward(self, input_ids=None, attention_mask=None, labels=None, **kwargs):
        """
        Forward pass through transformer + custom classifier.

        Only a fixed set of backbone arguments is forwarded from ``kwargs``;
        anything else is ignored. When ``labels`` is given, an unweighted focal
        loss (gamma=2.0) is computed.

        Returns:
            SequenceClassifierOutput with ``loss`` (or None), ``logits`` and the
            backbone's ``hidden_states`` / ``attentions``.
        """
        # Filter kwargs to only include valid transformer arguments.
        # return_dict is deliberately not forwarded: the code below reads
        # attributes of the backbone output, which requires a ModelOutput.
        transformer_forward_keys = (
            'token_type_ids', 'position_ids', 'head_mask', 'inputs_embeds',
            'encoder_hidden_states', 'encoder_attention_mask',
            'output_attentions', 'output_hidden_states',
        )
        valid_kwargs = {key: kwargs[key] for key in transformer_forward_keys if key in kwargs}

        # Get transformer outputs
        transformer_outputs = self.transformer(
            input_ids=input_ids,
            attention_mask=attention_mask,
            return_dict=True,
            **valid_kwargs
        )

        # Use [CLS] token representation (first token)
        pooled_output = transformer_outputs.last_hidden_state[:, 0, :]  # [batch_size, hidden_size]

        # Pass through custom classifier
        logits = self.classifier(pooled_output)

        loss = None
        if labels is not None:
            # No per-class weights are supplied, so every class counts equally.
            loss_fn = FocalLoss(gamma=2.0)
            loss = loss_fn(logits.view(-1, self.num_labels), labels.view(-1))

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
        )


class AlternativeCustomClassifier(nn.Module):
    """
    Alternative classifier design with attention pooling.

    Runs one multi-head self-attention layer over the token states, mean-pools
    the result over the real (non-padding) tokens, then applies LayerNorm,
    dropout and a three-layer MLP.

    Args:
        hidden_size: Size of the incoming token representations; must be divisible by ``num_heads``.
        num_labels: Number of output classes.
        dropout_rate: Dropout probability used in attention, pooling and the MLP.
        num_heads: Number of attention heads (default 8).
    """
    def __init__(self, hidden_size: int, num_labels: int, dropout_rate: float = 0.3, num_heads: int = 8):
        super().__init__()

        # Attention pooling layer
        self.attention = nn.MultiheadAttention(
            embed_dim=hidden_size,
            num_heads=num_heads,
            dropout=dropout_rate,
            batch_first=True
        )

        # Classification layers
        self.dropout = nn.Dropout(dropout_rate)
        self.layer_norm = nn.LayerNorm(hidden_size)

        # Deep classifier
        self.classifier = nn.Sequential(
            nn.Linear(hidden_size, hidden_size),
            nn.LayerNorm(hidden_size),
            nn.ReLU(),
            nn.Dropout(dropout_rate),

            nn.Linear(hidden_size, hidden_size // 2),
            nn.LayerNorm(hidden_size // 2),
            nn.ReLU(),
            nn.Dropout(dropout_rate),

            nn.Linear(hidden_size // 2, num_labels)
        )

    def forward(self, hidden_states, attention_mask=None):
        """
        Args:
            hidden_states: Tensor of shape (batch, seq_len, hidden_size).
            attention_mask: Optional (batch, seq_len) mask, non-zero for real tokens
                and zero for padding.

        Returns:
            Logits of shape (batch, num_labels).
        """
        key_padding_mask = None
        if attention_mask is not None:
            # True marks keys to ignore, so padding cannot leak into real tokens.
            key_padding_mask = attention_mask == 0
            # A row made only of padding would mask every key and yield NaN;
            # leave such rows unmasked (they pool to zeros below anyway).
            fully_padded = key_padding_mask.all(dim=1, keepdim=True)
            key_padding_mask = key_padding_mask & ~fully_padded

        # Apply attention pooling across sequence length
        attended_output, _ = self.attention(
            hidden_states, hidden_states, hidden_states,
            key_padding_mask=key_padding_mask,
        )

        # Mean pooling with attention mask
        if attention_mask is not None:
            mask_expanded = attention_mask.unsqueeze(-1).expand(attended_output.size()).float()
            sum_embeddings = torch.sum(attended_output * mask_expanded, 1)
            sum_mask = torch.clamp(mask_expanded.sum(1), min=1e-9)
            pooled_output = sum_embeddings / sum_mask
        else:
            pooled_output = attended_output.mean(dim=1)

        # Apply layer norm and dropout
        pooled_output = self.layer_norm(pooled_output)
        pooled_output = self.dropout(pooled_output)

        # Classification
        logits = self.classifier(pooled_output)
        return logits


def clean_text(text: str) -> str:
    """
    Lightly normalise a text while preserving stylistic cues.

    Case, @mentions, #hashtags and common punctuation are kept on purpose.
    URLs become the token ``[URL]``, HTML tags are dropped, emojis are replaced
    by their textual names, any other character outside the allowed set is
    replaced by a space, and whitespace runs are collapsed. An empty result is
    returned as ``[BLANK]``.
    """
    early_substitutions = (
        (r'https?://\S+', ' [URL] '),  # keep a placeholder token for links
        (r'<[^>]+>', ' '),             # strip HTML tags
    )
    for pattern, replacement in early_substitutions:
        text = re.sub(pattern, replacement, text)

    # Emojis become their names, separated from neighbours by spaces
    text = emoji.demojize(text, delimiters=(" ", " "))

    # Drop everything except word characters, whitespace and listed punctuation
    text = re.sub(r'[^\w\s@#.,!?;:()\[\]{}"\'-]', ' ', text)

    # split()/join collapses whitespace and trims both ends in one go
    collapsed = " ".join(text.split())
    return collapsed or "[BLANK]"


def preprocess_function(examples, tokenizer, max_length: int = 512):
    """
    Tokenize ``examples["text"]`` with ``tokenizer``.

    Every sequence is truncated and padded to exactly ``max_length`` tokens;
    whatever the tokenizer returns is passed back unchanged.
    """
    tokenizer_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": max_length,
    }
    return tokenizer(examples["text"], **tokenizer_options)


def get_data(train_path: str, test_path: str, random_seed: int):
    """
    Read the train/test JSONL files, clean their ``text`` column and split off a dev set.

    The training frame is split 85% / 15% with stratification on ``label``.
    The class distribution of the full training frame is printed beforehand.

    Returns:
        (train_df, val_df, test_df), each with a fresh 0..n-1 index.
    """
    full_train_df, test_df = (
        pd.read_json(path, lines=True) for path in (train_path, test_path)
    )

    # Same light cleaning for both frames
    for frame in (full_train_df, test_df):
        frame["text"] = frame["text"].map(clean_text)

    print("Class distribution in training data:")
    print(full_train_df["label"].value_counts().sort_index())

    train_part, dev_part = train_test_split(
        full_train_df,
        test_size=0.15,
        stratify=full_train_df["label"],
        random_state=random_seed,
    )
    return tuple(
        part.reset_index(drop=True) for part in (train_part, dev_part, test_df)
    )


def compute_metrics(eval_pred):
    """
    Compute accuracy and micro-averaged F1 from ``(logits, labels)``.

    The metrics are computed directly with numpy, so nothing is loaded or
    downloaded on each evaluation pass. For single-label multi-class
    predictions every error is exactly one false positive and one false
    negative, so micro-F1 always equals accuracy; both keys are kept because
    callers read them by name.

    Args:
        eval_pred: Pair ``(logits, labels)``. ``logits`` is a 2-D array of shape
            (n_samples, n_classes), or a tuple whose first element is that array.

    Returns:
        ``{"accuracy": float, "f1": float}``; both are 0.0 for an empty batch.
    """
    logits, labels = eval_pred
    if isinstance(logits, tuple):
        # Models that return extra tensors hand over a tuple; logits come first.
        logits = logits[0]

    preds = np.argmax(np.asarray(logits), axis=1)
    labels = np.asarray(labels).reshape(-1)

    n_samples = labels.size
    if n_samples == 0:
        return {"accuracy": 0.0, "f1": 0.0}

    true_pos = int(np.sum(preds == labels))
    errors = n_samples - true_pos  # each error is one FP and one FN

    accuracy = true_pos / n_samples
    micro_f1 = (2 * true_pos) / (2 * true_pos + errors + errors)

    return {"accuracy": float(accuracy), "f1": float(micro_f1)}


def fine_tune(
    train_df: pd.DataFrame,
    valid_df: pd.DataFrame,
    output_dir: str,
    id2label: dict,
    label2id: dict,
    model_name: str,
    device: torch.device,
    num_epochs: int = 5,
    per_device_batch: int = 16,
    gradient_accumulation_steps: int = 1,
    max_length: int = 512,
    freeze_layers: int = 6,  # NEW: Number of layers to freeze
    use_custom_classifier: bool = True,  # NEW: Whether to use custom classifier
):
    """
    Fine-tune a classifier and save the best checkpoint to ``<output_dir>/best``.

    Args:
        train_df: Training frame with at least a ``text`` column (removed after tokenization).
        valid_df: Validation frame with the same columns.
        output_dir: Directory for checkpoints and logs.
        id2label: Mapping class id -> label name.
        label2id: Mapping label name -> class id; its length is the number of classes.
        model_name: Hugging Face model name or path of the backbone.
        device: Device the model is moved to; fp16 is enabled only for CUDA.
        num_epochs: Maximum number of training epochs.
        per_device_batch: Per-device batch size for training and evaluation.
        gradient_accumulation_steps: Batches accumulated per optimizer step.
        max_length: Token length every sequence is truncated/padded to.
        freeze_layers: Number of bottom encoder layers to freeze (custom model only).
        use_custom_classifier: Use ``TransformerWithCustomClassifier`` when True,
            otherwise ``AutoModelForSequenceClassification``.
    """
    # Convert to HF Datasets
    train_ds = Dataset.from_pandas(train_df)
    valid_ds = Dataset.from_pandas(valid_df)

    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # Load model with custom classifier and frozen layers
    if use_custom_classifier:
        # Only the configuration is needed here; the custom model loads the
        # backbone weights itself, so loading a full AutoModel would load them twice.
        config = AutoConfig.from_pretrained(model_name)
        config.num_labels = len(label2id)
        config.id2label = id2label
        config.label2id = label2id

        model = TransformerWithCustomClassifier(
            config=config,
            model_name=model_name,
            num_labels=len(label2id),
            freeze_layers=freeze_layers
        )
    else:
        # Use standard approach (for comparison)
        from transformers.models.auto.modeling_auto import AutoModelForSequenceClassification
        model = AutoModelForSequenceClassification.from_pretrained(
            model_name,
            num_labels=len(label2id),
            id2label=id2label,
            label2id=label2id,
        )

    model.config.pad_token_id = tokenizer.pad_token_id
    model.to(device)

    # Tokenize dataset
    def _tokenize(batch):
        return preprocess_function(batch, tokenizer, max_length)

    tokenized_train = train_ds.map(_tokenize, batched=True, remove_columns=["text"])
    tokenized_valid = valid_ds.map(_tokenize, batched=True, remove_columns=["text"])

    data_collator = DataCollatorWithPadding(tokenizer)

    # Single source of truth for the training configuration. An earlier,
    # immediately overwritten TrainingArguments was removed; these are the values
    # that were actually in effect, plus two fixes:
    #   * gradient_accumulation_steps is now honoured (it was silently ignored);
    #   * fp16 is requested only on CUDA, because mixed precision cannot start on CPU.
    # No metric_for_best_model is set, so the best checkpoint is chosen with the
    # Trainer's default criterion.
    use_fp16 = torch.device(device).type == "cuda"
    training_args = TrainingArguments(
        output_dir=output_dir,
        learning_rate=3e-5,
        lr_scheduler_type="cosine",
        per_device_train_batch_size=per_device_batch,
        per_device_eval_batch_size=per_device_batch,
        gradient_accumulation_steps=gradient_accumulation_steps,
        num_train_epochs=num_epochs,
        weight_decay=0.01,
        fp16=use_fp16,
        eval_strategy="epoch",
        save_strategy="epoch",
        logging_steps=25,
        logging_dir=os.path.join(output_dir, "logs"),
        load_best_model_at_end=True,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_valid,
        processing_class=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
    )

    # Start training
    trainer.train()

    # Save the best model
    best_dir = os.path.join(output_dir, "best")
    os.makedirs(best_dir, exist_ok=True)
    trainer.save_model(best_dir)


def test(
    test_df: pd.DataFrame,
    model_path: str,
    id2label: dict,
    label2id: dict,
    device: torch.device,
    max_length: int = 512,
    use_custom_classifier: bool = True,
):
    """
    Predict labels for ``test_df`` with the model stored at ``model_path``.

    Returns:
        (results, preds): ``preds`` holds the argmax class id per row;
        ``results`` is the "bstrai/classification_report" output when the
        prediction output carries label ids, otherwise None.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    if not use_custom_classifier:
        from transformers.models.auto.modeling_auto import AutoModelForSequenceClassification
        model = AutoModelForSequenceClassification.from_pretrained(
            model_path,
            num_labels=len(label2id),
            id2label=id2label,
            label2id=label2id,
        )
    else:
        # The custom class provides its own from_pretrained
        model = TransformerWithCustomClassifier.from_pretrained(model_path)

    model.config.pad_token_id = tokenizer.pad_token_id
    model.to(device)

    def _encode(batch):
        return preprocess_function(batch, tokenizer, max_length)

    tokenized_test = Dataset.from_pandas(test_df).map(
        _encode,
        batched=True,
        remove_columns=["text"],
    )

    trainer = Trainer(
        model=model,
        processing_class=tokenizer,
        data_collator=DataCollatorWithPadding(tokenizer),
        compute_metrics=compute_metrics,
    )

    prediction_output = trainer.predict(tokenized_test)
    class_probs = softmax(prediction_output.predictions, axis=-1)
    preds = np.argmax(class_probs, axis=1)

    # A report is only possible when gold labels came with the data
    results = None
    gold_labels = prediction_output.label_ids
    if gold_labels is not None:
        report_metric = evaluate.load("bstrai/classification_report")
        results = report_metric.compute(predictions=preds, references=gold_labels)

    return results, preds


if __name__ == "__main__":
    def _str2bool(value):
        """Parse a command-line boolean.

        ``type=bool`` would turn every non-empty string, including "False",
        into True; this accepts the usual spellings and rejects anything else.
        """
        if isinstance(value, bool):
            return value
        lowered = value.strip().lower()
        if lowered in ("true", "t", "yes", "y", "1"):
            return True
        if lowered in ("false", "f", "no", "n", "0"):
            return False
        raise argparse.ArgumentTypeError(f"Expected a boolean value, got {value!r}")

    parser = argparse.ArgumentParser(description="Transformer fine-tuning with custom classifier and layer freezing")
    parser.add_argument("--train_file_path", "-tr", required=False, default='./subtaskB_train.jsonl', help="Path to the train JSONL file", type=str)
    parser.add_argument("--test_file_path", "-t", required=False, default='./subtaskB.jsonl', help="Path to the test JSONL file", type=str)
    parser.add_argument("--model", "-m", required=False, default="xlm-roberta-base", help="HuggingFace model name", type=str)
    parser.add_argument("--output_dir", "-o", required=False, default="./checkpoints", help="Directory to save checkpoints", type=str)
    parser.add_argument("--prediction_file_path", "-p", required=False, default="./subtaskB_predictions.jsonl", help="Where to save predictions", type=str)
    parser.add_argument("--num_epochs", "-e", required=False, default=5, help="Number of training epochs", type=int)
    parser.add_argument("--batch_size", "-b", required=False, default=16, help="Per-device batch size", type=int)
    parser.add_argument("--accum_steps", "-a", required=False, default=1, help="Gradient accumulation steps", type=int)
    parser.add_argument("--max_length", "-l", required=False, default=512, help="Max sequence length", type=int)
    parser.add_argument("--seed", "-s", required=False, default=42, help="Random seed", type=int)
    parser.add_argument("--freeze_layers", "-f", required=False, default=2, help="Number of transformer layers to freeze", type=int)
    parser.add_argument("--use_custom_classifier", "-c", required=False, default=True, help="Use custom classifier head", type=_str2bool)

    # An explicit empty argument list is parsed, so the defaults above are always
    # used and the notebook kernel's own command line is never read.
    args = parser.parse_args([])
    set_seed(args.seed)

    # Map labels for Subtask B
    id2label = {0: "human", 1: "chatGPT", 2: "cohere", 3: "davinci", 4: "bloomz", 5: "dolly"}
    label2id = {v: k for k, v in id2label.items()}

    # Device check
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    # Validate file paths
    if not os.path.exists(args.train_file_path):
        raise FileNotFoundError(f"Train file not found: {args.train_file_path}")
    if not os.path.exists(args.test_file_path):
        raise FileNotFoundError(f"Test file not found: {args.test_file_path}")

    # Prepare data
    train_df, valid_df, test_df = get_data(
        args.train_file_path, args.test_file_path, random_seed=args.seed
    )

    print(f"Training samples: {len(train_df)}")
    print(f"Validation samples: {len(valid_df)}")
    print(f"Test samples: {len(test_df)}")

    # Fine-tune with custom classifier and frozen layers
    ckpt_dir = os.path.join(args.output_dir, f"{args.model.replace('/', '_')}_custom_classifier")
    os.makedirs(ckpt_dir, exist_ok=True)

    fine_tune(
        train_df=train_df,
        valid_df=valid_df,
        output_dir=ckpt_dir,
        id2label=id2label,
        label2id=label2id,
        model_name=args.model,
        device=device,
        num_epochs=args.num_epochs,
        per_device_batch=args.batch_size,
        gradient_accumulation_steps=args.accum_steps,
        max_length=args.max_length,
        freeze_layers=args.freeze_layers,
        use_custom_classifier=args.use_custom_classifier,
    )

    # Test
    best_model_dir = os.path.join(ckpt_dir, "best")
    results, preds = test(
        test_df=test_df,
        model_path=best_model_dir,
        id2label=id2label,
        label2id=label2id,
        device=device,
        max_length=args.max_length,
        use_custom_classifier=args.use_custom_classifier,
    )

    # Log metrics
    if results is not None:
        print("=== Classification Report ===")
        for k, v in results.items():
            print(f"{k}: {v}")

    # Save predictions (one JSON record per line: id + predicted class id)
    pd.DataFrame({"id": test_df["id"], "label": preds}).to_json(
        args.prediction_file_path, orient="records", lines=True
    )
    print(f"Done. Predictions saved to {args.prediction_file_path}")

Using device: cuda
Class distribution in training data:
label
0    11997
1    11995
2    11336
3    11999
4    11998
5    11702
Name: count, dtype: int64
Training samples: 60372
Validation samples: 10655
Test samples: 18000
total layers: 12
Frozen 213,662,208 parameters
Trainable 64,381,440 parameters (23.2%)
Frozen 3 transformer layers + embeddings


Map:   0%|          | 0/60372 [00:00<?, ? examples/s]

Map:   0%|          | 0/10655 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.1331,0.201413,0.86138,0.86138
2,0.0872,0.165754,0.892257,0.892257
3,0.0591,0.225496,0.889348,0.889348
4,0.0232,0.317322,0.881276,0.881276
5,0.0265,0.410663,0.868512,0.868512


Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

Some weights of XLMRobertaModel were not initialized from the model checkpoint at ./checkpoints/xlm-roberta-base_custom_classifier/best and are newly initialized: ['embeddings.LayerNorm.bias', 'embeddings.LayerNorm.weight', 'embeddings.position_embeddings.weight', 'embeddings.token_type_embeddings.weight', 'embeddings.word_embeddings.weight', 'encoder.layer.0.attention.output.LayerNorm.bias', 'encoder.layer.0.attention.output.LayerNorm.weight', 'encoder.layer.0.attention.output.dense.bias', 'encoder.layer.0.attention.output.dense.weight', 'encoder.layer.0.attention.self.key.bias', 'encoder.layer.0.attention.self.key.weight', 'encoder.layer.0.attention.self.query.bias', 'encoder.layer.0.attention.self.query.weight', 'encoder.layer.0.attention.self.value.bias', 'encoder.layer.0.attention.self.value.weight', 'encoder.layer.0.intermediate.dense.bias', 'encoder.layer.0.intermediate.dense.weight', 'encoder.layer.0.output.LayerNorm.bias', 'encoder.layer.0.output.LayerNorm.weight', 'encoder.la

Map:   0%|          | 0/18000 [00:00<?, ? examples/s]

Downloading builder script: 0.00B [00:00, ?B/s]

=== Classification Report ===
0: {'precision': 0.9960199004975124, 'recall': 0.33366666666666667, 'f1-score': 0.4998751560549313, 'support': 3000.0}
1: {'precision': 0.4207104345068189, 'recall': 0.8843333333333333, 'f1-score': 0.5701697829357404, 'support': 3000.0}
2: {'precision': 0.0851063829787234, 'recall': 0.008, 'f1-score': 0.014625228519195612, 'support': 3000.0}
3: {'precision': 0.49916072177926984, 'recall': 0.793, 'f1-score': 0.6126706155034767, 'support': 3000.0}
4: {'precision': 0.9963406520292748, 'recall': 0.9983333333333333, 'f1-score': 0.9973359973359973, 'support': 3000.0}
5: {'precision': 0.47210626185958254, 'recall': 0.4146666666666667, 'f1-score': 0.4415261756876664, 'support': 3000.0}
accuracy: 0.572
macro avg: {'precision': 0.5782407256085303, 'recall': 0.572, 'f1-score': 0.5227004926728346, 'support': 18000.0}
weighted avg: {'precision': 0.5782407256085302, 'recall': 0.572, 'f1-score': 0.5227004926728345, 'support': 18000.0}
Done. Predictions saved to ./subtask

In [None]:
# Bundle the checkpoints directory and the predictions file into checkpoints1.zip.
!zip -r checkpoints1.zip checkpoints ./subtaskB_predictions.jsonl

  adding: checkpoints/ (stored 0%)
  adding: checkpoints/xlm-roberta-base_custom_classifier/ (stored 0%)
  adding: checkpoints/xlm-roberta-base_custom_classifier/checkpoint-3774/ (stored 0%)
  adding: checkpoints/xlm-roberta-base_custom_classifier/checkpoint-3774/rng_state.pth (deflated 25%)
  adding: checkpoints/xlm-roberta-base_custom_classifier/checkpoint-3774/special_tokens_map.json (deflated 52%)
  adding: checkpoints/xlm-roberta-base_custom_classifier/checkpoint-3774/trainer_state.json (deflated 78%)
  adding: checkpoints/xlm-roberta-base_custom_classifier/checkpoint-3774/scheduler.pt (deflated 56%)
  adding: checkpoints/xlm-roberta-base_custom_classifier/checkpoint-3774/training_args.bin (deflated 52%)
  adding: checkpoints/xlm-roberta-base_custom_classifier/checkpoint-3774/tokenizer.json (deflated 76%)
  adding: checkpoints/xlm-roberta-base_custom_classifier/checkpoint-3774/optimizer.pt (deflated 8%)
  adding: checkpoints/xlm-roberta-base_custom_classifier/checkpoint-3774/model

In [None]:
# Hand checkpoints1.zip to Colab's file-download helper.
# NOTE(review): google.colab is presumably only importable inside Google Colab - confirm before running elsewhere.
from google.colab import files

files.download('checkpoints1.zip')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>