In [None]:
# 1. SETUP AND INSTALLS
# ---------------------
# Install the third-party packages used by the rest of this notebook.
# NOTE: the leading `!` is an IPython/Jupyter shell escape, so the pip line is
# only valid when this cell runs inside a notebook kernel, not as a plain
# Python script. `-q` keeps pip's output short.
print("Installing necessary libraries...")
!pip install -q transformers[torch] datasets accelerate scikit-learn evaluate

import pandas as pd
import numpy as np
import torch
import zipfile
import os

import evaluate
from torch.utils.data import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score
from transformers import EarlyStoppingCallback

# Progress marker: reaching this line means every import above resolved.
print("Imports successful.")

#
# 2. CONFIGURATION
# ------------------
class Config:
    """Central place for every tunable value used by this notebook.

    All attributes are plain class-level constants and are read as
    ``Config.NAME``; the class is never instantiated.
    """

    # --- Data -------------------------------------------------------------
    # Directory expected to contain train.csv, validation.csv and test.csv.
    DATASET_PATH = "/kaggle/input/mahed-task-1"

    # --- Model ------------------------------------------------------------
    # Hugging Face Hub checkpoint to fine-tune.
    MODEL_NAME = "microsoft/mdeberta-v3-base"
    # Token length every example is padded / truncated to.
    MAX_LENGTH = 256

    # --- Training ---------------------------------------------------------
    # Where checkpoints and logs are written.
    OUTPUT_DIR = "./results"
    LOGGING_DIR = "./logs"
    # Upper bound on epochs.
    NUM_EPOCHS = 10
    # Per-device batch sizes.
    TRAIN_BATCH_SIZE = 16
    EVAL_BATCH_SIZE = 32
    # Optimiser / scheduler settings.
    LEARNING_RATE = 2e-5
    WEIGHT_DECAY = 0.01
    WARMUP_STEPS = 200

    # --- Submission -------------------------------------------------------
    # The CSV of predictions and the archive that wraps it.
    SUBMISSION_FILE = "prediction.csv"
    ZIP_FILE = "prediction.zip"

#
# 3. LOAD DATASETS
# ----------------
# Read the three competition splits from Config.DATASET_PATH into DataFrames.
print("Loading datasets...")
try:
    # Only the reads live inside the try: they are the statements that can
    # raise FileNotFoundError.
    train_df = pd.read_csv(os.path.join(Config.DATASET_PATH, 'train.csv'))
    validation_df = pd.read_csv(os.path.join(Config.DATASET_PATH, 'validation.csv'))
    test_df = pd.read_csv(os.path.join(Config.DATASET_PATH, 'test.csv'))
except FileNotFoundError as e:
    print(f"Error: {e}")
    print(f"Please make sure your CSV files are in the directory: {Config.DATASET_PATH}")
    print("Update the 'DATASET_PATH' in the Config class if needed.")
    # Re-raise after printing the hint: every later step needs these
    # DataFrames, so continuing would only fail further down with a
    # misleading NameError on `train_df`.
    raise
else:
    print("Dataframes created successfully.")
    print(f"Training samples: {len(train_df)}")
    print(f"Validation samples: {len(validation_df)}")
    print(f"Test samples: {len(test_df)}")


#
# 4. PREPROCESSING AND LABEL MAPPING
# ------------------------------------
# Build the label <-> integer-id lookup tables from the training split.
# Ids follow the order in which each label first appears in train_df['label'].
labels = list(train_df['label'].unique())
id2label = dict(enumerate(labels))
label2id = {label: i for i, label in id2label.items()}

train_df['label_id'] = train_df['label'].map(label2id)
validation_df['label_id'] = validation_df['label'].map(label2id)

# Series.map produces NaN for any value that is not a key of label2id.
# Fail here with a clear message instead of letting a NaN label id reach
# the tensor conversion inside the dataset class.
unmapped_rows = validation_df['label_id'].isna()
if unmapped_rows.any():
    unknown_labels = sorted(
        validation_df.loc[unmapped_rows, 'label'].astype(str).unique()
    )
    raise ValueError(
        "Validation set contains labels that do not appear in the training "
        f"set: {unknown_labels}"
    )

print("Label mapping created:")
print(label2id)

#
# 5. INITIALIZE MODEL AND TOKENIZER
# -----------------------------------
# Load the pretrained tokenizer and a sequence-classification model for
# the checkpoint named in Config.MODEL_NAME.
print(f"Initializing tokenizer and model for '{Config.MODEL_NAME}'...")
tokenizer = AutoTokenizer.from_pretrained(Config.MODEL_NAME)

# Size the classification head to the label set and store both lookup
# tables in the model config.
model = AutoModelForSequenceClassification.from_pretrained(
    Config.MODEL_NAME, num_labels=len(labels), id2label=id2label, label2id=label2id
)

#
# 6. CREATE PYTORCH DATASETS
# ----------------------------
class ArabHateHopeDataset(Dataset):
    """Map-style dataset that tokenizes one text per item on access.

    Args:
        texts: Indexable sequence of raw texts; each entry is passed
            through ``str()`` before tokenization.
        labels: Optional indexable sequence of integer class ids, parallel
            to ``texts``. When ``None`` (e.g. for a test split) the
            returned items carry no ``'labels'`` key.
        tokenizer: Callable tokenizer invoked as ``tokenizer(text, ...)``.
        max_len: Length every example is padded / truncated to.

    Raises:
        ValueError: If ``labels`` is given and its length differs from
            ``texts``.
    """

    def __init__(self, texts, labels=None, tokenizer=None, max_len=128):
        # A length mismatch would otherwise surface late as an IndexError
        # or as silently misaligned text/label pairs.
        if labels is not None and len(labels) != len(texts):
            raise ValueError(
                f"texts and labels must have the same length "
                f"(got {len(texts)} and {len(labels)})"
            )
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of texts."""
        return len(self.texts)

    def __getitem__(self, item):
        """Tokenize the text at index ``item``.

        Returns:
            A dict with 1-D ``'input_ids'`` and ``'attention_mask'``
            tensors, plus a scalar ``torch.long`` ``'labels'`` tensor when
            labels were supplied.
        """
        text = str(self.texts[item])
        # Call the tokenizer directly: `encode_plus` is deprecated in
        # transformers in favour of `tokenizer(...)`, which accepts the
        # same keyword arguments.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        # return_tensors='pt' yields a leading batch dimension of size 1;
        # drop it so each item is a flat vector.
        item_dict = {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
        }
        if self.labels is not None:
            item_dict['labels'] = torch.tensor(self.labels[item], dtype=torch.long)

        return item_dict

# Wrap each split in an ArabHateHopeDataset. The tokenizer and sequence
# length are the same for all three, so they are collected once.
_dataset_settings = {"tokenizer": tokenizer, "max_len": Config.MAX_LENGTH}

train_dataset = ArabHateHopeDataset(
    texts=train_df["text"].tolist(),
    labels=train_df["label_id"].tolist(),
    **_dataset_settings,
)
val_dataset = ArabHateHopeDataset(
    texts=validation_df["text"].tolist(),
    labels=validation_df["label_id"].tolist(),
    **_dataset_settings,
)
# No labels are passed for the test split.
test_dataset = ArabHateHopeDataset(texts=test_df["text"].tolist(), **_dataset_settings)
print("PyTorch datasets created.")

#
# 7. DEFINE EVALUATION METRICS (MACRO F1-SCORE)
# -----------------------------------------------
# Load the three `evaluate` metrics used by compute_metrics, in the same
# order as before: f1, precision, recall.
print("Defining evaluation metrics...")
metric_f1, metric_precision, metric_recall = [
    evaluate.load(metric_name) for metric_name in ("f1", "precision", "recall")
]

def compute_metrics(eval_pred):
    """Compute macro F1 / precision / recall and accuracy for one evaluation.

    Args:
        eval_pred: A ``(logits, labels)`` pair; the predicted class is the
            argmax of ``logits`` over the last axis.

    Returns:
        dict with keys ``'f1'``, ``'precision'``, ``'recall'`` and
        ``'accuracy'``.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    # Each loaded metric returns a dict keyed by its own name, so the same
    # key is used both to read the value and to store it.
    macro_metrics = (
        ("f1", metric_f1),
        ("precision", metric_precision),
        ("recall", metric_recall),
    )
    scores = {}
    for key, metric in macro_metrics:
        result = metric.compute(
            predictions=predictions, references=labels, average="macro"
        )
        scores[key] = result[key]

    scores["accuracy"] = accuracy_score(labels, predictions)
    return scores

#
# 8. SET TRAINING ARGUMENTS 
# --------------------------------------
# Hyper-parameters and bookkeeping options for the Trainer.
training_args = TrainingArguments(
    output_dir=Config.OUTPUT_DIR,
    num_train_epochs=Config.NUM_EPOCHS,
    learning_rate=Config.LEARNING_RATE,
    per_device_train_batch_size=Config.TRAIN_BATCH_SIZE,
    per_device_eval_batch_size=Config.EVAL_BATCH_SIZE,
    weight_decay=Config.WEIGHT_DECAY,
    warmup_steps=Config.WARMUP_STEPS,
    # Evaluate and checkpoint once per epoch; the two strategies must match
    # for load_best_model_at_end to work.
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    # Select the best checkpoint by the 'f1' value that compute_metrics
    # returns (macro F1); higher is better.
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    logging_dir=Config.LOGGING_DIR,
    logging_steps=100,
    # Mixed-precision fp16 training needs a CUDA device; enable it only
    # when one is present so the notebook still runs on a CPU-only session.
    fp16=torch.cuda.is_available(),
    report_to="none",
)

#
# 9. INITIALIZE AND RUN TRAINER
# -------------------------------
# Build the Trainer and run fine-tuning.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    # `tokenizer=` is deprecated on Trainer.__init__; `processing_class`
    # is its replacement and takes the same object.
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
    # Stop once the monitored metric (metric_for_best_model) has failed to
    # improve for 3 consecutive evaluations.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3, early_stopping_threshold=0.0)]
)

print("Starting model training...")
trainer.train()
print("Training finished.")

#
# 10. MAKE PREDICTIONS ON THE TEST SET
# --------------------------------------
# Run the trained model over the test split and turn logits into labels.
print("Making predictions on the test set...")
predictions = trainer.predict(test_dataset)

# Highest-scoring class index per row.
predicted_class_ids = np.argmax(predictions.predictions, axis=1)

# Translate each class index back to its original label.
predicted_labels = [id2label[class_id] for class_id in predicted_class_ids]

#
# 11. CREATE SUBMISSION FILE
# ----------------------------
# Write the predictions to a CSV, wrap it in a zip archive, and print a
# short summary.
print("Creating submission file...")
submission_columns = {
    'id': test_df['id'],
    'prediction': predicted_labels,
}
submission_df = pd.DataFrame(submission_columns)

# Persist the predictions as CSV, without the DataFrame index.
submission_df.to_csv(Config.SUBMISSION_FILE, index=False, encoding='utf-8')

# Put the CSV into the zip archive.
with zipfile.ZipFile(Config.ZIP_FILE, 'w') as archive:
    archive.write(Config.SUBMISSION_FILE)

separator = "-" * 50
print(separator)
print(f"✅ Submission file '{Config.ZIP_FILE}' created successfully!")
print("You can now download this file from the Kaggle output directory and submit it to the competition.")
print(separator)
print("Sample predictions:")
print(submission_df.head())

Installing necessary libraries...
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m83.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m68.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m43.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m


2025-07-27 18:54:26.855676: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1753642467.043319      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1753642467.100086      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Imports successful.
Loading datasets...
Dataframes created successfully.
Training samples: 6890
Validation samples: 1476
Test samples: 1477
Label mapping created:
{'not_applicable': 0, 'hope': 1, 'hate': 2}
Initializing tokenizer and model for 'microsoft/mdeberta-v3-base'...


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/1.33G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.33G [00:00<?, ?B/s]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/mdeberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


PyTorch datasets created.
Defining evaluation metrics...


Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

  trainer = Trainer(


Starting model training...


Epoch,Training Loss,Validation Loss,F1,Precision,Recall,Accuracy
1,0.7765,0.753837,0.597721,0.640552,0.589741,0.642276
2,0.7253,0.735603,0.629618,0.62747,0.64016,0.650407
3,0.6083,0.837187,0.617441,0.603318,0.646887,0.624661
4,0.5404,0.837645,0.615433,0.606166,0.628484,0.625339
5,0.4734,0.98145,0.62045,0.607759,0.643433,0.631436


Training finished.
Making predictions on the test set...


Creating submission file...
--------------------------------------------------
✅ Submission file 'prediction.zip' created successfully!
You can now download this file from the Kaggle output directory and submit it to the competition.
--------------------------------------------------
Sample predictions:
     id      prediction
0  5813  not_applicable
1  5853            hope
2   251  not_applicable
3  7213  not_applicable
4  6848            hate
