In [None]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, get_linear_schedule_with_warmup
from torch.optim import AdamW
from sklearn.metrics import f1_score, classification_report
import warnings
import csv
import os

# Silence only the noisy, non-actionable warning categories. A blanket
# filterwarnings('ignore') would also hide warnings that matter for this
# script, e.g. the ones read_csv(on_bad_lines='warn') uses in recent pandas
# versions to report rows it had to skip.
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=DeprecationWarning)
warnings.filterwarnings('ignore', category=UserWarning)

def set_seed(seed=42):
    """Seed every random number generator this script can touch.

    Seeds Python's ``random`` module, NumPy and PyTorch (CPU and all CUDA
    devices) and switches cuDNN to deterministic mode.

    Args:
        seed: Integer seed applied to all generators.
    """
    # Local import keeps this function self-contained; the file's import
    # block does not bring in the stdlib ``random`` module.
    import random

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # Trade cuDNN autotuning speed for run-to-run reproducibility.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

# Seed all RNGs before any stochastic work (shuffling, weight init) happens.
set_seed()

# --- Data locations -------------------------------------------------------
LANGUAGE = 'german'
BASE_DATA_PATH = '/kaggle/input/clef2025-checkthat-lab-track-01/'

# The three splits live side by side in the per-language directory.
TRAIN_PATH, DEV_PATH, TEST_PATH = (
    os.path.join(BASE_DATA_PATH, LANGUAGE, split_file)
    for split_file in ('train_de.tsv', 'dev_de.tsv', 'test_de_unlabeled.tsv')
)
OUTPUT_PATH = f'subtask_{LANGUAGE}.tsv'

# --- Model ----------------------------------------------------------------
MODEL_NAME = 'deepset/gelectra-large'

# --- Hyperparameters ------------------------------------------------------
BATCH_SIZE = 16            # samples per DataLoader batch
MAX_LENGTH = 128           # tokenizer pads/truncates every sentence to this
EPOCHS = 5                 # epochs of the initial train-only phase
FINAL_EPOCHS = 3           # epochs of the final train+dev phase
LEARNING_RATE = 1.5e-5     # AdamW learning rate of the initial phase
WEIGHT_DECAY = 0.01        # AdamW weight decay
WARMUP_PROPORTION = 0.1    # fraction of total steps used for LR warmup

# --- Runtime --------------------------------------------------------------
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")

print(f"Loading tokenizer: {MODEL_NAME}")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
print("Tokenizer loaded.")

class SubjectivityDataset(Dataset):
    """Torch dataset that tokenizes sentences for SUBJ/OBJ classification.

    Each item is a dict with ``input_ids`` and ``attention_mask`` (1-D
    tensors of length ``max_length``), ``sentence_id`` (str) and, when
    ``has_labels`` is true, ``labels`` (0 = OBJ, 1 = SUBJ).

    Args:
        dataframe: Frame with ``sentence`` and ``sentence_id`` columns, plus
            ``label`` when ``has_labels`` is true.
        tokenizer: Callable tokenizer accepting the keyword arguments used in
            ``__getitem__`` and returning ``input_ids``/``attention_mask``.
        max_length: Length every sentence is padded/truncated to.
        has_labels: Whether a ``label`` column is expected and exposed.

    Raises:
        ValueError: If a required column is missing.
    """

    def __init__(self, dataframe, tokenizer, max_length, has_labels=True):
        for column in ('sentence', 'sentence_id'):
            if column not in dataframe.columns:
                raise ValueError(f"DataFrame must contain a '{column}' column.")
        if has_labels and 'label' not in dataframe.columns:
            raise ValueError("DataFrame must contain a 'label' column when has_labels=True.")

        # Filter rows by value, not by index label: selecting via
        # .loc[index] would duplicate (or re-introduce) rows whenever the
        # incoming frame has repeated index labels.
        must_be_present = ['sentence', 'label'] if has_labels else ['sentence']
        dataframe = dataframe.dropna(subset=must_be_present)

        self.tokenizer = tokenizer
        self.max_length = max_length
        self.has_labels = has_labels
        self.text = dataframe['sentence'].tolist()
        self.ids = dataframe['sentence_id'].tolist()

        if has_labels:
            # Anything that is not 'SUBJ' (case/whitespace-insensitive) is OBJ.
            self.labels = [
                1 if str(label).strip().upper() == 'SUBJ' else 0
                for label in dataframe['label'].tolist()
            ]
            print(f"Label distribution after filtering: OBJ={self.labels.count(0)}, SUBJ={self.labels.count(1)}")

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        # Calling the tokenizer directly is the supported API; encode_plus
        # is the legacy spelling of the same operation.
        encoding = self.tokenizer(
            str(self.text[index]),
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        # return_tensors='pt' yields a leading batch dimension of 1; flatten
        # it so the DataLoader can stack items itself.
        item = {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'sentence_id': str(self.ids[index]),
        }

        if self.has_labels:
            item['labels'] = torch.tensor(self.labels[index], dtype=torch.long)

        return item

def clean_text(text):
    """Normalize one sentence cell.

    Strings are stripped of surrounding whitespace. Missing values (``None``
    or a float NaN) are returned as ``None`` instead of being turned into the
    literal text ``'None'``/``'nan'``, so later NA filtering can still drop
    them. Any other value is converted with ``str``.

    Args:
        text: Raw cell value.

    Returns:
        The cleaned string, or ``None`` for a missing value.
    """
    if isinstance(text, str):
        return text.strip()
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return None
    return str(text)

def _read_required_tsv(file_path, required_cols):
    """Read a TSV that must contain ``required_cols``; return None on failure.

    Tries ``csv.QUOTE_NONE`` first and falls back to ``csv.QUOTE_MINIMAL``
    when parsing fails or required columns are missing. A missing file is
    reported once and not retried.
    """
    attempts = (
        (csv.QUOTE_NONE, 'QUOTE_NONE', ''),
        (csv.QUOTE_MINIMAL, 'QUOTE_MINIMAL', ' (QUOTE_MINIMAL)'),
    )
    for quoting, mode_name, log_suffix in attempts:
        try:
            df = pd.read_csv(
                file_path,
                sep='\t',
                quoting=quoting,
                on_bad_lines='warn',
                dtype={'sentence_id': str},
            )
        except FileNotFoundError:
            print(f"Error: File not found at {file_path}")
            return None
        except Exception as e:
            # Best-effort loader: report the problem and try the next mode.
            print(f"Error loading {file_path} with {mode_name}: {e}")
            continue

        print(f"Initial rows loaded{log_suffix}: {len(df)}")
        missing = [col for col in required_cols if col not in df.columns]
        if not missing:
            return df
        print(f"Warning: {file_path} read with {mode_name} is missing required columns {missing} (found: {list(df.columns)}).")

    print(f"Error: Could not load {file_path} with required columns {required_cols}.")
    return None


def load_and_preprocess_data(file_path, has_labels=True):
    """Load a tab-separated split file and drop unusable rows.

    Rows with a missing or blank ``sentence``, a missing or blank
    ``sentence_id`` or (when ``has_labels``) a missing ``label`` are removed.
    For unlabeled files a leading ``'Unnamed: ...'`` column is dropped.

    Args:
        file_path: Path of the TSV file.
        has_labels: Whether a ``label`` column is required.

    Returns:
        The cleaned DataFrame, or ``None`` when the file is missing, cannot
        be parsed, or lacks a required column.
    """
    required_cols = ['sentence_id', 'sentence']
    if has_labels:
        required_cols.append('label')

    print(f"Loading data from: {file_path}")
    df = _read_required_tsv(file_path, required_cols)
    if df is None:
        return None

    # na_action='ignore' leaves missing sentences as NaN; passing them to
    # clean_text would stringify them into the literal text 'nan', which the
    # filters below could no longer recognise as missing.
    df['sentence'] = df['sentence'].map(clean_text, na_action='ignore')
    df = df[df['sentence'].fillna('').astype(str).str.strip() != '']
    print(f"Rows after cleaning 'sentence': {len(df)}")

    initial_rows_before_na_drop = len(df)
    df = df.dropna(subset=required_cols)
    print(f"Rows after dropping NAs in essential columns: {len(df)} (dropped {initial_rows_before_na_drop - len(df)})")

    # str() guards against non-string column labels.
    if not has_labels and str(df.columns[0]).startswith('Unnamed: '):
        print(f"Detected potential index column '{df.columns[0]}' in test set. Removing it.")
        df = df.iloc[:, 1:]

    df = df[df['sentence_id'].fillna('').astype(str).str.strip() != '']
    print(f"Rows after ensuring non-empty 'sentence_id': {len(df)}")

    return df

# ---------------------------------------------------------------------------
# Load the three splits, build datasets/loaders and instantiate the model.
#
# Fatal problems abort with `raise SystemExit(1)`. The `exit()` helper used
# before is provided by the `site` module for interactive shells and is not
# meant for programs (in a notebook it may not stop the running cell).
# ---------------------------------------------------------------------------
train_df = load_and_preprocess_data(TRAIN_PATH)
dev_df = load_and_preprocess_data(DEV_PATH)
test_df = load_and_preprocess_data(TEST_PATH, has_labels=False)

if train_df is None or dev_df is None:
    print("Error loading train or dev data files. Exiting.")
    raise SystemExit(1)
if train_df.empty or dev_df.empty:
    print("Train or Dev DataFrame is empty after loading/preprocessing. Exiting.")
    raise SystemExit(1)

# The test split is optional: without it only the prediction step is skipped.
if not os.path.exists(TEST_PATH):
    print(f"Info: Test file {TEST_PATH} not found. Prediction step will be skipped.")
    test_df = None
elif test_df is None:
    print(f"Warning: Test file {TEST_PATH} could not be loaded.")
elif test_df.empty:
    print(f"Warning: Test DataFrame ({TEST_PATH}) is empty after loading/preprocessing.")

print(f"Train data shape after preprocessing: {train_df.shape}")
print(f"Dev data shape after preprocessing: {dev_df.shape}")
if test_df is not None:
    print(f"Test data shape after preprocessing: {test_df.shape}")
    print("Test columns:", test_df.columns)
else:
    print("Test data not available.")

# Train + dev together feed the final training phase.
combined_train_df = pd.concat([train_df, dev_df], ignore_index=True)
print(f"Combined train data shape: {combined_train_df.shape}")

try:
    print("Creating Train Dataset...")
    train_dataset = SubjectivityDataset(train_df, tokenizer, MAX_LENGTH)
    print("Creating Dev Dataset...")
    dev_dataset = SubjectivityDataset(dev_df, tokenizer, MAX_LENGTH)
    print("Creating Combined Dataset...")
    combined_dataset = SubjectivityDataset(combined_train_df, tokenizer, MAX_LENGTH)
    test_dataset = None
    if test_df is not None and not test_df.empty:
        print("Creating Test Dataset...")
        test_dataset = SubjectivityDataset(test_df, tokenizer, MAX_LENGTH, has_labels=False)
    else:
        print("Skipping Test Dataset creation (data not available or empty).")
except ValueError as e:
    print(f"Error creating Dataset: {e}")
    raise SystemExit(1) from e
except Exception as e:
    print(f"An unexpected error occurred during Dataset creation: {e}")
    raise SystemExit(1) from e

# Only the training loaders shuffle; dev/test keep file order.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
dev_loader = DataLoader(dev_dataset, batch_size=BATCH_SIZE)
combined_loader = DataLoader(combined_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE) if test_dataset else None
print("DataLoaders created.")

print(f"Loading model: {MODEL_NAME}")
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=2
)
print("Model loaded.")

model = model.to(device)

def train_model(model, dataloader, optimizer, scheduler, device):
    """Run one training epoch and return the mean loss.

    Every batch must provide 'input_ids', 'attention_mask' and 'labels'.
    A batch that cannot be prepared, yields no loss, or fails during the
    forward/backward pass is reported and skipped.

    Args:
        model: Model whose output exposes ``.loss`` when called with labels.
        dataloader: Iterable of batch dicts.
        optimizer: Optimizer stepped once per successful batch.
        scheduler: LR scheduler stepped once per successful batch.
        device: Device the batch tensors are moved to.

    Returns:
        Mean loss over the batches that produced a loss, or 0 if none did.
    """
    model.train()
    loss_sum = 0
    counted_batches = 0
    tensor_keys = ('input_ids', 'attention_mask', 'labels')

    for batch_idx, batch in enumerate(dataloader):
        optimizer.zero_grad()

        # Stage 1: move the inputs to the target device.
        try:
            model_inputs = {key: batch[key].to(device) for key in tensor_keys}
        except KeyError as e:
            print(f"Error: Missing key in train batch {batch_idx}: {e}")
            print("Batch keys:", batch.keys())
            continue
        except Exception as e:
            print(f"Error moving train batch {batch_idx} to device: {e}")
            continue

        # Stage 2: forward pass, backward pass and parameter update.
        try:
            loss = model(**model_inputs).loss
            if loss is None:
                print(f"Warning: Loss is None for train batch {batch_idx}. Skipping backward pass.")
                continue
            loss_sum += loss.item()
            counted_batches += 1

            loss.backward()
            # Cap the gradient norm at 1.0 before stepping.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()
        except Exception as e:
            print(f"Error during train forward/backward pass for batch {batch_idx}: {e}")
            if "CUDA out of memory" in str(e):
                print("CUDA OOM Error detected. Try reducing BATCH_SIZE.")
            continue

    if counted_batches > 0:
        return loss_sum / counted_batches
    return 0

def evaluate_model(model, dataloader, device):
    """Run inference over ``dataloader`` and score it when labels exist.

    Batches that cannot be prepared or that fail in the forward pass are
    reported and skipped.

    Args:
        model: Model whose output exposes ``.logits``.
        dataloader: Iterable of batch dicts with 'input_ids',
            'attention_mask', 'sentence_id' and optionally 'labels'.
        device: Device the batch tensors are moved to.

    Returns:
        The macro F1 score when at least one label was collected (a
        classification report is printed as well); otherwise a tuple
        ``(sentence_ids, label_strings)`` with 'SUBJ'/'OBJ' predictions.
    """
    model.eval()
    predictions, actual_labels, collected_ids = [], [], []

    with torch.no_grad():
        for batch_idx, batch in enumerate(dataloader):
            # Stage 1: pull the inputs out of the batch.
            try:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                batch_sentence_ids = batch['sentence_id']

                has_labels_in_batch = 'labels' in batch
                labels = batch['labels'].to(device) if has_labels_in_batch else None
            except KeyError as e:
                print(f"Error: Missing key in eval batch {batch_idx}: {e}")
                continue
            except Exception as e:
                print(f"Error moving eval batch {batch_idx} to device: {e}")
                if "CUDA out of memory" in str(e):
                    print("CUDA OOM Error during evaluation. Consider reducing BATCH_SIZE.")
                continue

            # Stage 2: forward pass and bookkeeping.
            try:
                logits = model(input_ids=input_ids, attention_mask=attention_mask).logits
                preds = torch.max(logits, dim=1).indices

                predictions.extend(preds.cpu().tolist())
                collected_ids.extend(batch_sentence_ids)
                if has_labels_in_batch:
                    actual_labels.extend(labels.cpu().tolist())
            except Exception as e:
                print(f"Error during evaluation forward pass for batch {batch_idx}: {e}")
                if "CUDA out of memory" in str(e):
                    print("CUDA OOM Error during evaluation inference.")
                continue

    # Unlabeled data: hand back the raw predictions instead of a score.
    if not actual_labels:
        print("No labels found in evaluation data. Returning predictions and IDs.")
        label_predictions = ['SUBJ' if pred == 1 else 'OBJ' for pred in predictions]
        return collected_ids, label_predictions

    if len(predictions) == len(actual_labels):
        f1 = f1_score(actual_labels, predictions, average='macro', zero_division=0)
    else:
        print(f"Warning: Mismatch in prediction ({len(predictions)}) and label ({len(actual_labels)}) counts during evaluation.")
        min_len = min(len(predictions), len(actual_labels))
        if min_len == 0:
            return 0.0
        f1 = f1_score(actual_labels[:min_len], predictions[:min_len], average='macro', zero_division=0)
        print(f"Calculated F1 on minimum aligned length: {min_len}")

    try:
        report = classification_report(actual_labels, predictions, target_names=['OBJ', 'SUBJ'], zero_division=0)
        print("Evaluation Report:")
        print(report)
    except ValueError as e:
        print(f"Could not generate classification report: {e}")
        print(f"Actual labels unique: {np.unique(actual_labels)}")
        print(f"Predictions unique: {np.unique(predictions)}")
    return f1

def predict(model, dataloader, device):
    """Predict 'SUBJ'/'OBJ' labels for every sentence in ``dataloader``.

    Batches that cannot be prepared or that fail in the forward pass are
    reported and skipped. Because a skipped batch silently shortens the
    output, the number of skipped batches is reported once at the end.

    Args:
        model: Model whose output exposes ``.logits``.
        dataloader: Iterable of batch dicts with 'input_ids',
            'attention_mask' and 'sentence_id', or ``None``.
        device: Device the batch tensors are moved to.

    Returns:
        Tuple ``(sentence_ids, label_predictions)`` of equal-length lists;
        two empty lists when ``dataloader`` is ``None``.
    """
    if dataloader is None:
        print("Test dataloader is None. Skipping prediction.")
        return [], []

    model.eval()
    predictions = []
    sentence_ids = []
    skipped_batches = 0
    print("Starting prediction loop...")

    with torch.no_grad():
        for batch_idx, batch in enumerate(dataloader):
            try:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                batch_sentence_ids = batch['sentence_id']
            except KeyError as e:
                print(f"Error: Missing key in prediction batch {batch_idx}: {e}")
                skipped_batches += 1
                continue
            except Exception as e:
                print(f"Error processing prediction batch {batch_idx}: {e}")
                if "CUDA out of memory" in str(e):
                    print("CUDA OOM Error during prediction. Consider reducing BATCH_SIZE.")
                skipped_batches += 1
                continue

            try:
                outputs = model(
                    input_ids=input_ids,
                    attention_mask=attention_mask
                )
                batch_preds = outputs.logits.argmax(dim=1).cpu().tolist()

                predictions.extend(batch_preds)
                sentence_ids.extend(batch_sentence_ids)
            except Exception as e:
                print(f"Error during prediction forward pass for batch {batch_idx}: {e}")
                if "CUDA out of memory" in str(e):
                    print("CUDA OOM Error during prediction inference.")
                skipped_batches += 1
                continue
            if (batch_idx + 1) % 50 == 0:
                print(f"  Predicted batch {batch_idx + 1}/{len(dataloader)}")

    label_predictions = ['SUBJ' if pred == 1 else 'OBJ' for pred in predictions]
    print(f"Prediction loop finished. Generated {len(label_predictions)} predictions for {len(sentence_ids)} IDs.")

    if skipped_batches:
        # IDs and predictions grow together, so the length check below cannot
        # detect rows lost to skipped batches; report them explicitly.
        print(f"WARNING: {skipped_batches} batch(es) were skipped because of errors. Their sentences have no prediction and will be missing from the output.")

    if len(sentence_ids) != len(label_predictions):
        print(f"CRITICAL WARNING: Mismatch in sentence ID count ({len(sentence_ids)}) and prediction count ({len(label_predictions)}). Submission file might be incorrect.")
        min_len = min(len(sentence_ids), len(label_predictions))
        print(f"Returning aligned results up to length {min_len}")
        return sentence_ids[:min_len], label_predictions[:min_len]

    return sentence_ids, label_predictions

# ---------------------------------------------------------------------------
# Phase 1: train on the train split, keep the weights with the best dev F1.
# Phase 2: continue from those weights on train + dev with half the LR.
# Phase 3: predict the test split and write the submission file.
#
# Fatal problems abort with `raise SystemExit(1)`; the `exit()` helper is
# meant for interactive shells, not for programs.
# ---------------------------------------------------------------------------
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)
num_train_batches = len(train_loader)
if num_train_batches == 0:
    print("Error: Training loader has zero batches. Cannot train.")
    raise SystemExit(1)

# The scheduler is stepped once per batch, so its horizon is batches * epochs.
total_steps = num_train_batches * EPOCHS
warmup_steps = int(total_steps * WARMUP_PROPORTION)

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=warmup_steps,
    num_training_steps=total_steps
)
print(f"Optimizer and Scheduler configured for initial training ({total_steps} total steps, {warmup_steps} warmup steps).")

best_f1 = 0.0
best_model_state = None

print("\n--- Starting Initial Training Phase ---")
if len(train_loader) == 0 or len(dev_loader) == 0:
    print("Error: Train or Dev loader is empty. Cannot proceed with training.")
    raise SystemExit(1)

for epoch in range(EPOCHS):
    print(f"\nEpoch {epoch + 1}/{EPOCHS}")

    print("Training...")
    train_loss = train_model(model, train_loader, optimizer, scheduler, device)
    print(f"Training loss: {train_loss:.4f}")

    print("Evaluating on dev set...")
    eval_result = evaluate_model(model, dev_loader, device)

    # evaluate_model returns a float F1 only when the data carried labels.
    if isinstance(eval_result, float):
        dev_f1 = eval_result
        print(f"Dev F1 Score: {dev_f1:.4f}")

        if dev_f1 > best_f1:
            best_f1 = dev_f1
            # clone() is required: on a CPU model `.cpu()` returns the live
            # parameter tensor itself, so without a copy the snapshot would
            # be overwritten by the following epochs.
            best_model_state = {
                k: v.detach().cpu().clone() for k, v in model.state_dict().items()
            }
            print(f"*** New best F1: {best_f1:.4f}. Model state saved. ***")
        else:
            print(f"F1 did not improve from {best_f1:.4f}")
    else:
        print("Evaluation on dev set did not return an F1 score. Cannot determine best model based on F1.")

print("\n--- Initial Training Finished ---")
print(f"Best validation F1 achieved: {best_f1:.4f}")

if best_model_state is not None:
    print("Loading best model state for final training...")
    model.cpu()
    model.load_state_dict(best_model_state)
    model.to(device)
    print("Best model loaded successfully.")
else:
    print("Warning: No best model state was saved (possibly due to no improvement or errors). Proceeding with the final model state from initial training.")

print("\n--- Starting Final Training on Combined Data (train + dev) ---")
if len(combined_loader) == 0:
    print("Error: Combined loader is empty. Skipping final training.")
else:
    # Fresh optimizer/scheduler: the first schedule has already decayed.
    final_lr = LEARNING_RATE / 2
    print(f"Using final learning rate: {final_lr}")
    optimizer_final = AdamW(model.parameters(), lr=final_lr, weight_decay=WEIGHT_DECAY)

    total_steps_final = len(combined_loader) * FINAL_EPOCHS
    warmup_steps_final = int(total_steps_final * WARMUP_PROPORTION)

    scheduler_final = get_linear_schedule_with_warmup(
        optimizer_final,
        num_warmup_steps=warmup_steps_final,
        num_training_steps=total_steps_final
    )
    print(f"Optimizer and Scheduler reconfigured for final training ({total_steps_final} total steps, {warmup_steps_final} warmup steps).")

    # No evaluation here: the dev split is part of the training data now.
    for epoch in range(FINAL_EPOCHS):
        print(f"\nFinal training - Epoch {epoch + 1}/{FINAL_EPOCHS}")
        train_loss = train_model(model, combined_loader, optimizer_final, scheduler_final, device)
        print(f"Training loss: {train_loss:.4f}")

print("\n--- Final Training Finished ---")

print("\n--- Generating Predictions on Test Set ---")
if test_loader:
    sentence_ids, predictions = predict(model, test_loader, device)

    if sentence_ids and predictions:
        if len(sentence_ids) == len(predictions):
            print(f"Successfully generated {len(predictions)} predictions.")
            submission_df = pd.DataFrame({
                'sentence_id': sentence_ids,
                'label': predictions
            })

            try:
                submission_df[['sentence_id', 'label']].to_csv(
                    OUTPUT_PATH,
                    sep='\t',
                    index=False,
                    quoting=csv.QUOTE_MINIMAL,
                    header=True
                )
                print(f"Predictions successfully saved to {OUTPUT_PATH}")
                print("\nSubmission File Head:")
                print(submission_df.head())
                print(f"\nReminder: Zip this file into '{OUTPUT_PATH.replace('.tsv', '.zip')}' for submission.")

            except Exception as e:
                print(f"Error saving submission file to {OUTPUT_PATH}: {e}")
        else:
            print(f"Error: Mismatch between sentence IDs ({len(sentence_ids)}) and predictions ({len(predictions)}) after predict function. Submission file not generated.")

    elif not sentence_ids and not predictions and test_dataset is not None:
        print("Prediction resulted in empty lists, possibly due to errors during prediction loop or empty test set after processing.")
    else:
        print("No predictions were generated (Test data might be empty or errors occurred). No submission file generated.")
else:
    print("Test loader was not created (Test data likely missing, empty, or failed to load). No submission file generated.")

print("\n--- Script Finished ---")

Using device: cuda
Loading tokenizer: deepset/gelectra-large


tokenizer_config.json:   0%|          | 0.00/83.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/443 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/240k [00:00<?, ?B/s]

Tokenizer loaded.
Loading data from: /kaggle/input/clef2025-checkthat-lab-track-01/german/train_de.tsv
Initial rows loaded: 800
Rows after cleaning 'sentence': 800
Rows after dropping NAs in essential columns: 800 (dropped 0)
Rows after ensuring non-empty 'sentence_id': 800
Loading data from: /kaggle/input/clef2025-checkthat-lab-track-01/german/dev_de.tsv
Initial rows loaded: 491
Rows after cleaning 'sentence': 491
Rows after dropping NAs in essential columns: 491 (dropped 0)
Rows after ensuring non-empty 'sentence_id': 491
Loading data from: /kaggle/input/clef2025-checkthat-lab-track-01/german/test_de_unlabeled.tsv
Initial rows loaded: 347
Rows after cleaning 'sentence': 347
Rows after dropping NAs in essential columns: 347 (dropped 0)
Detected potential index column 'Unnamed: 0' in test set. Removing it.
Rows after ensuring non-empty 'sentence_id': 347
Train data shape after preprocessing: (800, 3)
Dev data shape after preprocessing: (491, 3)
Test data shape after preprocessing: (347

2025-05-01 10:00:24.825110: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1746093625.010893      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1746093625.065219      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at deepset/gelectra-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model loaded.
Optimizer and Scheduler configured for initial training (250 total steps, 25 warmup steps).

--- Starting Initial Training Phase ---

Epoch 1/5
Training...
Training loss: 0.6402
Evaluating on dev set...
Evaluation Report:
              precision    recall  f1-score   support

         OBJ       0.80      0.90      0.85       317
        SUBJ       0.76      0.59      0.67       174

    accuracy                           0.79       491
   macro avg       0.78      0.75      0.76       491
weighted avg       0.79      0.79      0.78       491

Dev F1 Score: 0.7568
*** New best F1: 0.7568. Model state saved. ***

Epoch 2/5
Training...
Training loss: 0.3911
Evaluating on dev set...
Evaluation Report:
              precision    recall  f1-score   support

         OBJ       0.81      0.89      0.85       317
        SUBJ       0.75      0.63      0.68       174

    accuracy                           0.79       491
   macro avg       0.78      0.76      0.77       491
weighte