In [None]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, get_linear_schedule_with_warmup
from torch.optim import AdamW
from sklearn.metrics import f1_score, classification_report
import warnings
import csv
import os 

# Suppress every Python warning for the remainder of the run (process-wide filter).
warnings.filterwarnings('ignore')

def set_seed(seed=42):
    """Seed every random number generator this script can touch.

    Seeds Python's ``random`` module, NumPy and PyTorch (CPU and all CUDA
    devices), and switches cuDNN to deterministic, non-benchmarking mode.

    Args:
        seed: Integer seed applied to all generators.
    """
    # Imported locally so the function is self-contained; the stdlib generator
    # must be seeded too, otherwise code relying on `random` stays unseeded.
    import random

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # Trade cuDNN autotuning speed for repeatable kernel selection.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

# Seed the random number generators (set_seed's default seed) before any other work.
set_seed()

# Directory that holds the Arabic TSV splits.
DATA_BASE_PATH = '/kaggle/input/clef2025-checkthat-lab-track-01/arabic/'

TRAIN_PATH = os.path.join(DATA_BASE_PATH, 'train_ar.tsv')
DEV_PATH = os.path.join(DATA_BASE_PATH, 'dev_ar.tsv')
TEST_PATH = os.path.join(DATA_BASE_PATH, 'test_ar_unlabeled.tsv')
OUTPUT_PATH = 'subtask_arabic.tsv'  # where the predictions TSV is written

MODEL_NAME = 'UBC-NLP/MARBERTv2'  # checkpoint used for both tokenizer and classifier
BATCH_SIZE = 16
MAX_LENGTH = 128  # every sentence is padded/truncated to this many tokens
EPOCHS = 5  # epochs of the first phase (train split, evaluated on dev)
FINAL_EPOCHS = 3  # epochs of the second phase (train + dev combined)
LEARNING_RATE = 1.8e-5
FINAL_LR_FACTOR = 0.5  # second-phase learning rate = LEARNING_RATE * FINAL_LR_FACTOR
WEIGHT_DECAY = 0.01
WARMUP_PROPORTION = 0.1  # fraction of the scheduler's total steps spent in warmup

# Use the GPU when one is visible to PyTorch, otherwise fall back to CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

print(f"Loading tokenizer: {MODEL_NAME}")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
print("Tokenizer loaded.")

class SubjectivityDataset(Dataset):
    """Torch dataset that tokenizes sentences on access.

    Rows with a missing sentence (and, when ``has_labels`` is true, a missing
    label) are dropped. The caller's DataFrame is never modified.

    Attributes set on the instance: ``tokenizer``, ``max_length``,
    ``has_labels``, ``text`` (list of str), ``ids`` (list of str) and, when
    ``has_labels`` is true, ``labels`` (list of int: 1 for SUBJ, 0 otherwise).
    """

    def __init__(self, dataframe, tokenizer, max_length, has_labels=True):
        """Validate the columns and materialize texts, ids and labels.

        Args:
            dataframe: DataFrame with 'sentence' and 'sentence_id' columns,
                plus 'label' when ``has_labels`` is true.
            tokenizer: Object exposing ``encode_plus`` (used in ``__getitem__``).
            max_length: Length every encoded sentence is padded/truncated to.
            has_labels: Whether a 'label' column must be read.

        Raises:
            ValueError: If a required column is missing.
        """
        if 'sentence' not in dataframe.columns:
            raise ValueError("DataFrame must contain a 'sentence' column.")
        if 'sentence_id' not in dataframe.columns:
            raise ValueError("DataFrame must contain a 'sentence_id' column.")
        if has_labels and 'label' not in dataframe.columns:
            raise ValueError("DataFrame must contain a 'label' column when has_labels=True.")

        self.tokenizer = tokenizer
        self.max_length = max_length
        self.has_labels = has_labels

        # Drop missing values BEFORE casting to str: astype(str) would turn a
        # missing value into literal text and make dropna a no-op. dropna()
        # returns a new frame, so the caller's DataFrame stays untouched.
        required = ['sentence', 'label'] if has_labels else ['sentence']
        frame = dataframe.dropna(subset=required)

        self.text = frame['sentence'].astype(str).tolist()
        self.ids = frame['sentence_id'].astype(str).tolist()

        if has_labels:
            # Anything that is not SUBJ (case/whitespace-insensitive) maps to 0.
            self.labels = [
                1 if str(label).strip().upper() == 'SUBJ' else 0
                for label in frame['label'].tolist()
            ]

    def __len__(self):
        """Return the number of retained sentences."""
        return len(self.text)

    def __getitem__(self, index):
        """Encode one sentence.

        Returns:
            dict with 1-D 'input_ids' and 'attention_mask' tensors, the
            'sentence_id' string and, when labels are available, a scalar
            long tensor under 'labels'.

        Raises:
            IndexError: If ``index`` is out of range.
        """
        encoding = self.tokenizer.encode_plus(
            self.text[index],
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        # The tokenizer returns a leading batch axis of size 1; flatten it so
        # the DataLoader can stack items into (batch, max_length).
        result = {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'sentence_id': self.ids[index]
        }

        if self.has_labels:
            result['labels'] = torch.tensor(self.labels[index], dtype=torch.long)

        return result

def clean_text(text):
    """Return ``text`` coerced to ``str`` with surrounding whitespace removed."""
    return str(text).strip()

def load_and_preprocess_data(file_path, has_labels=True):
    """Read a tab-separated split and return its cleaned rows.

    The file is parsed with ``csv.QUOTE_NONE`` first; if parsing fails or the
    required columns are absent, it is parsed again with
    ``csv.QUOTE_MINIMAL``. Rows with missing values in a required column, or
    with a sentence that is empty after ``clean_text``, are dropped.

    Args:
        file_path: Path of the TSV file.
        has_labels: When true, a 'label' column is required in addition to
            'sentence_id' and 'sentence'.

    Returns:
        The cleaned DataFrame, or ``None`` if the file is missing, cannot be
        parsed, or lacks the required columns under both quoting modes.
    """
    print(f"Loading data from: {file_path}")

    required_cols = ['sentence_id', 'sentence']
    if has_labels:
        required_cols.append('label')

    df = None
    for quoting, quoting_name in ((csv.QUOTE_NONE, 'QUOTE_NONE'), (csv.QUOTE_MINIMAL, 'QUOTE_MINIMAL')):
        try:
            candidate = pd.read_csv(
                file_path,
                sep='\t',
                quoting=quoting,
                on_bad_lines='warn',
                dtype={'sentence_id': str},
            )
        except FileNotFoundError:
            print(f"Error: File not found at {file_path}")
            return None
        except (pd.errors.ParserError, ValueError) as e:
            # A parse failure under one quoting mode may succeed under the other.
            print(f"ParserError/ValueError loading {file_path} with {quoting_name}: {e}.")
            continue
        except Exception as e:
            print(f"An unexpected error occurred loading {file_path}: {e}")
            return None

        print(f"Initial rows loaded ({quoting_name}): {len(candidate)}")
        if all(col in candidate.columns for col in required_cols):
            df = candidate
            break
        print(f"Warning: File {file_path} parsed with {quoting_name} is missing required columns "
              f"(needed: {required_cols}, found: {list(candidate.columns)}).")

    if df is None:
        print(f"Error loading {file_path}: no quoting mode produced the required columns {required_cols}.")
        return None

    # Drop missing values BEFORE cleaning: clean_text() goes through str(),
    # which would turn a missing sentence into literal text that no later
    # NaN/empty filter can catch.
    initial_rows = len(df)
    df = df.dropna(subset=required_cols)
    if len(df) < initial_rows:
        print(f"Dropped {initial_rows - len(df)} rows due to NaNs in essential columns ({required_cols}).")

    df['sentence'] = df['sentence'].apply(clean_text)
    initial_rows = len(df)
    # .copy() hands the caller an independent frame rather than a filtered view.
    df = df[df['sentence'] != ''].copy()
    if len(df) < initial_rows:
        print(f"Dropped {initial_rows - len(df)} rows due to empty 'sentence' after cleaning.")

    print(f"Rows after preprocessing: {len(df)}")
    return df

# Load the three splits; the loader returns None when a file cannot be used.
train_df = load_and_preprocess_data(TRAIN_PATH)
dev_df = load_and_preprocess_data(DEV_PATH)
test_df = load_and_preprocess_data(TEST_PATH, has_labels=False)

# Train and dev are mandatory. Abort by raising SystemExit instead of calling
# exit(): exit() is the interactive-shell helper, it reports success (status 0),
# and under a notebook kernel it does not stop the running cell, so the code
# below would go on to fail on None. SystemExit(msg) stops here, prints the
# message to stderr and yields a non-zero status when run as a script.
if train_df is None or dev_df is None:
    raise SystemExit("Error loading train or dev data files. Exiting.")
if train_df.empty or dev_df.empty:
    raise SystemExit("Train or Dev DataFrame is empty after loading/preprocessing. Exiting.")

# The test split is optional: report why it is unavailable and carry on.
if test_df is None and os.path.exists(TEST_PATH):
    print(f"Warning: Test file {TEST_PATH} exists but could not be loaded or is empty.")
elif test_df is not None and test_df.empty and os.path.exists(TEST_PATH):
    print(f"Warning: Test DataFrame ({TEST_PATH}) is empty after loading/preprocessing.")
elif not os.path.exists(TEST_PATH):
    print(f"Info: Test file {TEST_PATH} not found. Prediction step will be skipped.")
    test_df = None

print(f"Train data shape after preprocessing: {train_df.shape}")
print(f"Dev data shape after preprocessing: {dev_df.shape}")
print(f"Test data shape after preprocessing: {test_df.shape if test_df is not None else 'None'}")

# Train + dev together feed the second training phase.
print("Combining train and dev data...")
combined_train_df = pd.concat([train_df, dev_df], ignore_index=True)
print(f"Combined train data shape: {combined_train_df.shape}")

print("Creating Datasets and DataLoaders...")
has_test_rows = test_df is not None and not test_df.empty
try:
    train_dataset = SubjectivityDataset(train_df, tokenizer, MAX_LENGTH)
    dev_dataset = SubjectivityDataset(dev_df, tokenizer, MAX_LENGTH)
    combined_dataset = SubjectivityDataset(combined_train_df, tokenizer, MAX_LENGTH)
    test_dataset = (
        SubjectivityDataset(test_df, tokenizer, MAX_LENGTH, has_labels=False)
        if has_test_rows
        else None
    )
except ValueError as e:
    raise SystemExit(f"Error creating Dataset: {e}") from e

# Only the training loaders shuffle; dev/test keep file order.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
dev_loader = DataLoader(dev_dataset, batch_size=BATCH_SIZE)
combined_loader = DataLoader(combined_dataset, batch_size=BATCH_SIZE, shuffle=True)
# Truthiness of a dataset goes through __len__, so an empty test set yields no loader.
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE) if test_dataset else None
print("DataLoaders created.")

print(f"Loading model: {MODEL_NAME}")
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=2
)
model = model.to(device)
print("Model loaded and moved to device.")

def train_model(model, dataloader, optimizer, scheduler, device):
    """Run one training epoch and return the mean loss of the completed batches.

    A batch that lacks a required key, cannot be moved to ``device``, yields no
    loss, or raises during the forward/backward pass is reported and skipped.

    Args:
        model: Callable returning an object with a ``loss`` attribute when
            given ``input_ids``, ``attention_mask`` and ``labels``.
        dataloader: Iterable of dict batches.
        optimizer: Optimizer stepped once per completed batch.
        scheduler: LR scheduler stepped once per completed batch.
        device: Target device for the batch tensors.

    Returns:
        Sum of batch losses divided by the number of completed batches, or 0
        when no batch completed.
    """
    model.train()
    loss_sum = 0
    completed = 0

    for step, batch in enumerate(dataloader):
        optimizer.zero_grad()

        # Stage 1: fetch the tensors and move them to the target device.
        try:
            ids, mask, targets = (
                batch[key].to(device)
                for key in ('input_ids', 'attention_mask', 'labels')
            )
        except KeyError as e:
            print(f"Error: Missing key in train batch {step}: {e}. Keys: {batch.keys()}. Skipping batch.")
            continue
        except Exception as e:
            print(f"Error moving train batch {step} to device: {e}. Skipping batch.")
            continue

        # Stage 2: forward, backward, clip, step.
        try:
            loss = model(input_ids=ids, attention_mask=mask, labels=targets).loss
            if loss is None:
                print(f"Warning: Loss is None for train batch {step}. Skipping backward pass.")
                continue

            loss.backward()
            loss_sum += loss.item()
            # Cap the global gradient norm at 1.0 before the parameter update.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()
            completed += 1
        except Exception as e:
            print(f"Error during train forward/backward pass for batch {step}: {e}. Skipping batch.")
            continue

    if completed > 0:
        return loss_sum / completed
    return 0

def evaluate_model(model, dataloader, device):
    """Run inference over ``dataloader`` and score it when labels are present.

    Batches that lack a required key or fail during the forward pass are
    reported and skipped.

    Args:
        model: Callable returning an object with a ``logits`` attribute.
        dataloader: Iterable of dict batches with 'input_ids',
            'attention_mask', 'sentence_id' and optionally 'labels'.
        device: Target device for the batch tensors.

    Returns:
        The macro F1 score when at least one label was collected (a
        classification report is printed as well); otherwise a tuple
        ``(sentence_ids, label_strings)`` with 'SUBJ'/'OBJ' predictions.
    """
    model.eval()
    predicted = []
    gold = []
    seen_ids = []

    with torch.no_grad():
        for step, batch in enumerate(dataloader):
            # Stage 1: fetch inputs (and labels, if this batch carries them).
            try:
                ids = batch['input_ids'].to(device)
                mask = batch['attention_mask'].to(device)
                batch_ids = batch['sentence_id']

                labelled = 'labels' in batch
                if labelled:
                    targets = batch['labels'].to(device)
            except KeyError as e:
                print(f"Error: Missing key in eval batch {step}: {e}. Skipping batch.")
                continue
            except Exception as e:
                print(f"Error moving eval batch {step} to device: {e}. Skipping batch.")
                continue

            # Stage 2: forward pass and bookkeeping.
            try:
                logits = model(input_ids=ids, attention_mask=mask).logits
                batch_preds = torch.argmax(logits, dim=1)

                predicted.extend(batch_preds.cpu().tolist())
                seen_ids.extend(batch_ids)
                if labelled:
                    gold.extend(targets.cpu().tolist())
            except Exception as e:
                print(f"Error during evaluation forward pass for batch {step}: {e}. Skipping batch.")
                continue

    # Without any gold label there is nothing to score: hand back predictions.
    if not gold:
        print("No labels found in evaluation data. Returning predictions only.")
        return seen_ids, ['SUBJ' if p == 1 else 'OBJ' for p in predicted]

    if len(predicted) != len(gold):
        print(f"Warning: Mismatch in prediction ({len(predicted)}) and label ({len(gold)}) counts during evaluation.")

    if not predicted:
        print("Warning: No predictions generated during evaluation.")
        return 0.0

    distinct_gold = set(gold)
    if len(distinct_gold) < 2:
        print(f"Warning: Only one class present in evaluation labels: {distinct_gold}. Macro F1 might be ill-defined or 0.")

    f1 = f1_score(gold, predicted, average='macro', zero_division=0)
    try:
        report = classification_report(gold, predicted, target_names=['OBJ', 'SUBJ'], zero_division=0)
    except ValueError as e:
        print(f"Could not generate classification report: {e}")
        print(f"Actual labels unique: {np.unique(gold)}")
        print(f"Predictions unique: {np.unique(predicted)}")
    else:
        print("\n--- Evaluation Report ---")
        print(report)
        print("-------------------------\n")
    return f1

def predict(model, dataloader, device):
    """Predict 'SUBJ'/'OBJ' labels for every sentence served by ``dataloader``.

    Batches that lack a required key or fail during the forward pass are
    reported and skipped.

    Args:
        model: Callable returning an object with a ``logits`` attribute.
        dataloader: Iterable of dict batches with 'input_ids',
            'attention_mask' and 'sentence_id', or ``None``.
        device: Target device for the batch tensors.

    Returns:
        Tuple ``(sentence_ids, label_strings)``; two empty lists when
        ``dataloader`` is ``None``.
    """
    if dataloader is None:
        print("Test dataloader is None. Skipping prediction.")
        return [], []

    model.eval()
    class_indices = []
    collected_ids = []

    with torch.no_grad():
        for step, batch in enumerate(dataloader):
            # Stage 1: fetch the inputs.
            try:
                ids = batch['input_ids'].to(device)
                mask = batch['attention_mask'].to(device)
                batch_ids = batch['sentence_id']
            except KeyError as e:
                print(f"Error: Missing key in prediction batch {step}: {e}. Skipping batch.")
                continue
            except Exception as e:
                print(f"Error processing prediction batch {step}: {e}. Skipping batch.")
                continue

            # Stage 2: forward pass; class index = position of the largest logit.
            try:
                logits = model(input_ids=ids, attention_mask=mask).logits
                batch_preds = torch.argmax(logits, dim=1)

                class_indices.extend(batch_preds.cpu().tolist())
                collected_ids.extend(batch_ids)
            except Exception as e:
                print(f"Error during prediction forward pass for batch {step}: {e}. Skipping batch.")
                continue

    label_predictions = ['SUBJ' if idx == 1 else 'OBJ' for idx in class_indices]

    if len(collected_ids) != len(label_predictions):
        print(f"Warning: Mismatch in sentence ID count ({len(collected_ids)}) and prediction count ({len(label_predictions)}). Submission file might be incorrect.")

    return collected_ids, label_predictions

# ---------------------------------------------------------------------------
# Phase 1 setup: optimizer and linear warmup/decay schedule sized for EPOCHS
# passes over the train split.
# ---------------------------------------------------------------------------
print("Setting up optimizer and scheduler...")
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)

# Abort via SystemExit rather than exit(): exit() is the interactive-shell
# helper, reports status 0, and does not stop the running cell under a
# notebook kernel.
num_train_batches = len(train_loader)
if num_train_batches == 0:
    raise SystemExit("Error: Training loader has zero batches. Cannot train.")

total_steps = num_train_batches * EPOCHS
warmup_steps = int(total_steps * WARMUP_PROPORTION)

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=warmup_steps,
    num_training_steps=total_steps
)
print(f"Optimizer: AdamW (LR={LEARNING_RATE}, WD={WEIGHT_DECAY})")
print(f"Scheduler: LinearWarmup (Total Steps={total_steps}, Warmup Steps={warmup_steps})")

best_f1 = 0.0
best_model_state = None

print("\n--- Starting Initial Training Phase (Train on Train, Evaluate on Dev) ---")
if len(train_loader) == 0 or len(dev_loader) == 0:
    raise SystemExit("Error: Train or Dev loader is empty. Cannot proceed with training.")

for epoch in range(EPOCHS):
    print(f"\nEpoch {epoch + 1}/{EPOCHS}")

    print("Training...")
    train_loss = train_model(model, train_loader, optimizer, scheduler, device)
    print(f"  Training Loss: {train_loss:.4f}")

    print("Evaluating on dev set...")
    eval_result = evaluate_model(model, dev_loader, device)

    # evaluate_model returns a float only when it could score against labels.
    if isinstance(eval_result, float):
        dev_f1 = eval_result
        print(f"  Dev Macro F1 Score: {dev_f1:.4f}")

        if dev_f1 > best_f1:
            best_f1 = dev_f1
            # clone() is essential: Tensor.cpu() returns the very same tensor
            # when it already lives on the CPU, so without a copy the snapshot
            # would alias the live weights and be overwritten by later epochs.
            best_model_state = {
                k: v.detach().cpu().clone() for k, v in model.state_dict().items()
            }
            print(f"  * New best F1: {best_f1:.4f}. Model state saved. *")
        else:
            print(f"  F1 did not improve from {best_f1:.4f}.")
    else:
        print("  Evaluation on dev set did not return an F1 score. Cannot determine best model based on F1.")

print(f"\nBest validation F1 achieved during initial training: {best_f1:.4f}")

if best_model_state is not None:
    print("\nLoading best model state for final training phase...")
    # load_state_dict copies values into the existing parameters, so the model
    # stays on its current device; the .to(device) call is a cheap safeguard.
    model.load_state_dict(best_model_state)
    model.to(device)
    print("Best model loaded successfully.")
else:
    print("\nWarning: No best model state was saved (possibly due to no improvement or eval issues).")
    print("Proceeding with the model state from the last epoch of initial training for final training.")

# ---------------------------------------------------------------------------
# Phase 2: continue from the selected weights on train + dev with a reduced
# learning rate and a fresh optimizer/scheduler.
# ---------------------------------------------------------------------------
print("\n--- Starting Final Training Phase (Train on Train + Dev Combined) ---")
if len(combined_loader) == 0:
    print("Error: Combined loader is empty. Skipping final training.")
else:
    final_lr = LEARNING_RATE * FINAL_LR_FACTOR
    print(f"Setting up final optimizer and scheduler (LR={final_lr})...")
    optimizer_final = AdamW(model.parameters(), lr=final_lr, weight_decay=WEIGHT_DECAY)

    total_steps_final = len(combined_loader) * FINAL_EPOCHS
    warmup_steps_final = int(total_steps_final * WARMUP_PROPORTION)

    scheduler_final = get_linear_schedule_with_warmup(
        optimizer_final,
        num_warmup_steps=warmup_steps_final,
        num_training_steps=total_steps_final
    )
    print(f"Final Scheduler: LinearWarmup (Total Steps={total_steps_final}, Warmup Steps={warmup_steps_final})")

    for epoch in range(FINAL_EPOCHS):
        print(f"\nFinal Training - Epoch {epoch + 1}/{FINAL_EPOCHS}")
        train_loss = train_model(model, combined_loader, optimizer_final, scheduler_final, device)
        print(f"  Training Loss: {train_loss:.4f}")
    print("Final training phase completed.")

# ---------------------------------------------------------------------------
# Prediction: write one (sentence_id, label) row per test sentence.
# ---------------------------------------------------------------------------
print("\n--- Generating Predictions on Test Set ---")
if test_loader:
    sentence_ids, predictions = predict(model, test_loader, device)

    if sentence_ids and predictions:
        # Never write a file whose ids and labels are out of step.
        if len(sentence_ids) == len(predictions):
            print(f"Generated {len(predictions)} predictions.")
            submission_df = pd.DataFrame({
                'sentence_id': sentence_ids,
                'label': predictions
            })

            try:
                submission_df.to_csv(OUTPUT_PATH, sep='\t', index=False, quoting=csv.QUOTE_MINIMAL, header=True)
                print(f"Predictions successfully saved to {OUTPUT_PATH}")
                print("\nIMPORTANT: Remember to zip this file as subtask_arabic.zip for submission!")
            except Exception as e:
                print(f"Error saving submission file to {OUTPUT_PATH}: {e}")
        else:
            print(f"Error: Mismatch between generated sentence IDs ({len(sentence_ids)}) and predictions ({len(predictions)}). Submission file not saved.")

    elif not sentence_ids and not predictions and test_dataset is not None:
        print("Prediction resulted in empty lists. This might be due to errors during the prediction loop or an empty test dataset after filtering.")
    else:
        print("No predictions were generated. Check for errors during the prediction process.")
else:
    print("Test loader was not created (Test data likely missing, empty, or failed to load). No submission file generated.")

print("\n--- Script Finished ---")

Using device: cuda
Loading tokenizer: UBC-NLP/MARBERTv2


tokenizer_config.json:   0%|          | 0.00/439 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/1.10M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Tokenizer loaded.
Loading data from: /kaggle/input/clef2025-checkthat-lab-track-01/arabic/train_ar.tsv
Initial rows loaded: 2446
Rows after preprocessing: 2446
Loading data from: /kaggle/input/clef2025-checkthat-lab-track-01/arabic/dev_ar.tsv
Initial rows loaded: 467
Rows after preprocessing: 467
Loading data from: /kaggle/input/clef2025-checkthat-lab-track-01/arabic/test_ar_unlabeled.tsv
Initial rows loaded: 1036
Rows after preprocessing: 1036
Train data shape after preprocessing: (2446, 3)
Dev data shape after preprocessing: (467, 3)
Test data shape after preprocessing: (1036, 2)
Combining train and dev data...
Combined train data shape: (2913, 3)
Creating Datasets and DataLoaders...
DataLoaders created.
Loading model: UBC-NLP/MARBERTv2


config.json:   0%|          | 0.00/757 [00:00<?, ?B/s]

2025-05-01 08:51:40.887527: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1746089501.062743      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1746089501.112923      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


pytorch_model.bin:   0%|          | 0.00/654M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at UBC-NLP/MARBERTv2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model loaded and moved to device.
Setting up optimizer and scheduler...
Optimizer: AdamW (LR=1.8e-05, WD=0.01)
Scheduler: LinearWarmup (Total Steps=765, Warmup Steps=76)

--- Starting Initial Training Phase (Train on Train, Evaluate on Dev) ---

Epoch 1/5
Training...


model.safetensors:   0%|          | 0.00/654M [00:00<?, ?B/s]

  Training Loss: 0.6851
Evaluating on dev set...

--- Evaluation Report ---
              precision    recall  f1-score   support

         OBJ       0.60      0.80      0.69       266
        SUBJ       0.52      0.28      0.37       201

    accuracy                           0.58       467
   macro avg       0.56      0.54      0.53       467
weighted avg       0.57      0.58      0.55       467

-------------------------

  Dev Macro F1 Score: 0.5268
  * New best F1: 0.5268. Model state saved. *

Epoch 2/5
Training...
  Training Loss: 0.6416
Evaluating on dev set...

--- Evaluation Report ---
              precision    recall  f1-score   support

         OBJ       0.58      0.93      0.71       266
        SUBJ       0.55      0.11      0.19       201

    accuracy                           0.58       467
   macro avg       0.56      0.52      0.45       467
weighted avg       0.57      0.58      0.49       467

-------------------------

  Dev Macro F1 Score: 0.4521
  F1 did not 