In [None]:
#
# 1. SETUP AND INSTALLS
# ---------------------
# Notebook-only step: the leading "!" below is IPython shell syntax, so this
# cell has to run inside Jupyter/Kaggle rather than as a plain Python script.
print("Installing necessary libraries...")
# -q keeps pip's output quiet. No versions are pinned, so the packages
# resolved at run time may differ between runs.
!pip install -q transformers[torch] datasets accelerate scikit-learn evaluate

import pandas as pd
import numpy as np
import torch
import zipfile
import os
import shutil  

import evaluate
from torch.utils.data import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    EarlyStoppingCallback
)
from sklearn.metrics import accuracy_score

# Reached only when every import above succeeded; a failed import would have
# raised before this line.
print("Imports successful.")

#
# 2. CONFIGURATION
# ------------------
class Config:
    """Central settings for the fine-tuning run, read as class attributes."""

    # --- Data and model -------------------------------------------------
    DATASET_PATH = "/kaggle/input/mahed-task-1/"  # directory with the CSV splits
    MODEL_NAME = "aubmindlab/bert-large-arabertv2"  # checkpoint identifier
    MAX_LENGTH = 256  # token length each example is padded/truncated to

    # --- Optimisation ---------------------------------------------------
    NUM_EPOCHS = 7
    LEARNING_RATE = 1e-5
    WEIGHT_DECAY = 0.01
    WARMUP_STEPS = 200
    TRAIN_BATCH_SIZE = 32  # per-device batch size while training
    EVAL_BATCH_SIZE = 64  # per-device batch size while evaluating

    # --- Output locations -----------------------------------------------
    OUTPUT_DIR = "./results"  # checkpoints
    LOGGING_DIR = "./logs"  # training logs
    SUBMISSION_FILE = "prediction.csv"
    ZIP_FILE = "prediction.zip"

#
# 3. LOAD DATASETS
# ----------------
# Read the three CSV splits from Config.DATASET_PATH into DataFrames.
print("Loading datasets...")
try:
    # Only the reads live inside ``try`` so the handler covers exactly the
    # statements that can raise FileNotFoundError.
    train_df = pd.read_csv(os.path.join(Config.DATASET_PATH, 'train.csv'))
    validation_df = pd.read_csv(os.path.join(Config.DATASET_PATH, 'validation.csv'))
    test_df = pd.read_csv(os.path.join(Config.DATASET_PATH, 'test.csv'))
except FileNotFoundError as e:
    print(f"Error: {e}")
    print(f"Please make sure your CSV files are in the directory: {Config.DATASET_PATH}")
    # Stop execution if files are not found. A bare ``raise`` re-raises the
    # active exception with its original traceback.
    raise
else:
    # Success path: report the size of each split.
    print("Dataframes created successfully.")
    print(f"Training samples: {len(train_df)}")
    print(f"Validation samples: {len(validation_df)}")
    print(f"Test samples: {len(test_df)}")

#
# 4. PREPROCESSING AND LABEL MAPPING
# ------------------------------------
# Build the label/id mapping from the training labels. Sorting makes the ids
# independent of row order; missing labels are excluded here and reported by
# the check below instead of breaking the sort.
labels = sorted(train_df['label'].dropna().unique())
id2label = dict(enumerate(labels))
label2id = {label: i for i, label in id2label.items()}

train_df['label_id'] = train_df['label'].map(label2id)
validation_df['label_id'] = validation_df['label'].map(label2id)

# ``Series.map`` yields NaN for any value absent from the mapping (a missing
# label, or a validation label never seen in training). Fail fast with a clear
# message rather than letting NaN reach the tensor conversion later on.
for split_name, frame in (("train", train_df), ("validation", validation_df)):
    unmapped = frame.loc[frame['label_id'].isna(), 'label']
    if not unmapped.empty:
        raise ValueError(
            f"{split_name} split has {len(unmapped)} row(s) whose label is missing "
            f"or not in the training label set: {sorted(set(map(str, unmapped)))}"
        )

print("Label mapping created:")
print(label2id)

#
# 5. INITIALIZE MODEL AND TOKENIZER
# -----------------------------------
print(f"Initializing tokenizer and model for '{Config.MODEL_NAME}'...")

# Tokenizer and model both come from the same checkpoint name.
tokenizer = AutoTokenizer.from_pretrained(Config.MODEL_NAME)

# The classification head is sized to the number of labels, and both
# label/id mappings are handed to the model alongside it.
model = AutoModelForSequenceClassification.from_pretrained(
    Config.MODEL_NAME,
    label2id=label2id,
    id2label=id2label,
    num_labels=len(id2label),
)

#
# 6. CREATE PYTORCH DATASETS
# ----------------------------
class ArabHateHopeDataset(Dataset):
    """Map-style dataset that tokenizes one text per access.

    Args:
        texts: Sequence of raw texts; each entry is passed through ``str()``
            before tokenization.
        labels: Optional sequence of integer class ids aligned with ``texts``.
            Leave as ``None`` for unlabeled data; items then carry no
            ``'labels'`` key.
        tokenizer: Callable tokenizer (a Hugging Face tokenizer in this file).
        max_len: Length every example is padded/truncated to.

    Raises:
        ValueError: If no tokenizer is supplied, or if ``labels`` is given but
            its length differs from ``texts``.
    """

    def __init__(self, texts, labels=None, tokenizer=None, max_len=128):
        # Validate up front so a misconfiguration surfaces at construction
        # time instead of on the first item access.
        if tokenizer is None:
            raise ValueError("ArabHateHopeDataset requires a tokenizer.")
        if labels is not None and len(labels) != len(texts):
            raise ValueError(
                f"texts and labels must have the same length "
                f"(got {len(texts)} and {len(labels)})."
            )
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of texts."""
        return len(self.texts)

    def __getitem__(self, item):
        """Return the encoded example at index ``item`` as a dict of tensors."""
        text = str(self.texts[item])
        # Calling the tokenizer directly replaces the deprecated
        # ``encode_plus`` method; the keyword arguments are unchanged.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        # ``return_tensors='pt'`` adds a leading batch dimension of size 1;
        # drop it so the default collation can stack single examples.
        item_dict = {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
        }
        if self.labels is not None:
            item_dict['labels'] = torch.tensor(self.labels[item], dtype=torch.long)

        return item_dict

# The labeled splits pass their integer label ids along with the texts; the
# test split is built without labels.
train_dataset = ArabHateHopeDataset(
    tokenizer=tokenizer,
    max_len=Config.MAX_LENGTH,
    texts=train_df["text"].tolist(),
    labels=train_df["label_id"].tolist(),
)
val_dataset = ArabHateHopeDataset(
    tokenizer=tokenizer,
    max_len=Config.MAX_LENGTH,
    texts=validation_df["text"].tolist(),
    labels=validation_df["label_id"].tolist(),
)
test_dataset = ArabHateHopeDataset(
    tokenizer=tokenizer,
    max_len=Config.MAX_LENGTH,
    texts=test_df["text"].tolist(),
)
print("PyTorch datasets created.")

#
# 7. DEFINE EVALUATION METRICS (MACRO F1-SCORE)
# -----------------------------------------------
print("Defining evaluation metrics...")
# Load the three metric objects in the order f1, precision, recall.
metric_f1, metric_precision, metric_recall = (
    evaluate.load(metric_name) for metric_name in ("f1", "precision", "recall")
)

def compute_metrics(eval_pred):
    """Score one evaluation pass.

    Args:
        eval_pred: A ``(logits, labels)`` pair.

    Returns:
        dict with macro-averaged ``'f1'``, ``'precision'`` and ``'recall'``
        plus plain ``'accuracy'``, in that order.
    """
    logits, references = eval_pred
    # The predicted class is the index of the largest logit on the last axis.
    predicted = np.argmax(logits, axis=-1)

    macro_metrics = (
        ("f1", metric_f1),
        ("precision", metric_precision),
        ("recall", metric_recall),
    )
    scores = {}
    for key, metric in macro_metrics:
        result = metric.compute(
            predictions=predicted, references=references, average="macro"
        )
        scores[key] = result[key]
    scores['accuracy'] = accuracy_score(references, predicted)
    return scores

#
# 8. SET TRAINING ARGUMENTS
# ---------------------------
# Training setup: evaluate and checkpoint once per epoch, keep at most two
# checkpoints, and reload the checkpoint with the best "f1" when training ends.
training_args = TrainingArguments(
    output_dir=Config.OUTPUT_DIR,
    num_train_epochs=Config.NUM_EPOCHS,
    learning_rate=Config.LEARNING_RATE,
    per_device_train_batch_size=Config.TRAIN_BATCH_SIZE,
    per_device_eval_batch_size=Config.EVAL_BATCH_SIZE,
    weight_decay=Config.WEIGHT_DECAY,
    warmup_steps=Config.WARMUP_STEPS,
    eval_strategy="epoch",
    # Save on the same schedule as evaluation so the best checkpoint can be
    # identified and restored.
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    # "f1" is the key returned by compute_metrics (macro F1); higher is better.
    metric_for_best_model="f1",
    greater_is_better=True,
    logging_dir=Config.LOGGING_DIR,
    logging_steps=100,
    # Mixed precision needs a CUDA device; fall back to full precision on
    # CPU-only machines instead of failing during argument validation.
    fp16=torch.cuda.is_available(),
    report_to="none",
)

#
# 9. INITIALIZE AND RUN TRAINER
# -------------------------------
# Wire model, data, metrics and early stopping into a single Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=val_dataset,
    train_dataset=train_dataset,
    # Early stopping is configured with a patience of two evaluations.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
    compute_metrics=compute_metrics,
)

print("Starting model training...")
trainer.train()
print("Training finished.")

#
# 10. MAKE PREDICTIONS ON THE TEST SET
# --------------------------------------
# Run the trained model over the unlabeled test split and turn the logits
# into label strings.
print("Making predictions on the test set...")
predictions = trainer.predict(test_dataset)
# Class index with the largest logit per example (last axis, matching
# compute_metrics).
predicted_class_ids = np.argmax(predictions.predictions, axis=-1)
# Cast numpy integers to plain ``int`` for the dict lookup, and avoid
# shadowing the builtin ``id`` with the loop variable.
predicted_labels = [id2label[int(class_id)] for class_id in predicted_class_ids]

#
# 11. CREATE SUBMISSION FILE AND CLEAN UP
# -----------------------------------------
print("Creating submission file...")
# One row per test example: its id next to the predicted label string.
submission_df = pd.DataFrame({'id': test_df['id'], 'prediction': predicted_labels})
submission_df.to_csv(Config.SUBMISSION_FILE, encoding='utf-8', index=False)

# Wrap the CSV in a zip archive.
with zipfile.ZipFile(Config.ZIP_FILE, mode='w') as archive:
    archive.write(Config.SUBMISSION_FILE)

print("Cleaning up checkpoint directories...")
# Remove the checkpoint directory first, then the logging directory, each only
# if it exists.
for scratch_dir in (Config.OUTPUT_DIR, Config.LOGGING_DIR):
    if os.path.exists(scratch_dir):
        shutil.rmtree(scratch_dir)

separator = "-" * 50
print(separator)
print(f"✅ Submission file '{Config.ZIP_FILE}' created successfully!")
print("You can now download this file and submit it.")
print(separator)
print("Sample predictions:")
print(submission_df.head())

Installing necessary libraries...
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m102.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m78.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m42.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m


2025-07-22 13:00:26.166714: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1753189226.369557      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1753189226.425107      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Imports successful.
Loading datasets...
Dataframes created successfully.
Training samples: 6890
Validation samples: 1476
Test samples: 1477
Label mapping created:
{'hate': 0, 'hope': 1, 'not_applicable': 2}
Initializing tokenizer and model for 'aubmindlab/bert-large-arabertv2'...


tokenizer_config.json:   0%|          | 0.00/611 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.48G [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at aubmindlab/bert-large-arabertv2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


PyTorch datasets created.
Defining evaluation metrics...


Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

Starting model training...


Epoch,Training Loss,Validation Loss,F1,Precision,Recall,Accuracy
1,0.9888,0.901841,0.253297,0.290324,0.334404,0.538618
2,0.8424,0.806499,0.549224,0.621852,0.541998,0.618564
3,0.9213,0.869241,0.467475,0.589915,0.48383,0.587398
4,0.8229,0.811678,0.533744,0.593714,0.51794,0.605014


Training finished.
Making predictions on the test set...


Creating submission file...
Cleaning up checkpoint directories...
--------------------------------------------------
✅ Submission file 'prediction.zip' created successfully!
You can now download this file and submit it.
--------------------------------------------------
Sample predictions:
     id      prediction
0  5813  not_applicable
1  5853  not_applicable
2   251  not_applicable
3  7213  not_applicable
4  6848  not_applicable
