In [None]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaTokenizer, RobertaForSequenceClassification, get_linear_schedule_with_warmup
from torch.optim import AdamW
from sklearn.metrics import f1_score, classification_report
import warnings
import csv
warnings.filterwarnings('ignore')

def set_seed(seed=42):
    """Seed every random number generator this script can draw from.

    Args:
        seed: Integer seed applied to Python's ``random``, NumPy, and
            PyTorch (CPU and all CUDA devices).

    Besides seeding, cuDNN is switched to deterministic kernels and its
    benchmark auto-tuner is disabled, since the auto-tuner may select
    different algorithms from run to run.
    """
    # Local import: ``random`` is not imported at the top of the file.
    import random

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seeds every visible GPU, not only the current one; silently ignored
    # when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

# Seed the random number generators once, before the tokenizer, datasets,
# and model are created.
set_seed()

# Input splits (tab-separated files under the Kaggle input directory) and
# the path the submission file is written to.
TRAIN_PATH = '/kaggle/input/clef2025-checkthat-lab-track-01/english/train_en.tsv'
DEV_PATH = '/kaggle/input/clef2025-checkthat-lab-track-01/english/dev_en.tsv'
TEST_PATH = '/kaggle/input/clef2025-checkthat-lab-track-01/english/test_en_unlabeled.tsv'
OUTPUT_PATH = 'subtask_english.tsv'
# Checkpoint name passed to from_pretrained for both tokenizer and model.
MODEL_NAME = 'roberta-large'
BATCH_SIZE = 16
# Every sentence is padded or truncated to this many tokens.
MAX_LENGTH = 128
# Number of epochs for the first training phase (train split only).
EPOCHS = 5
LEARNING_RATE = 2e-5
WEIGHT_DECAY = 0.01
# Fraction of the scheduler's total steps spent on linear warmup.
WARMUP_PROPORTION = 0.1

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

tokenizer = RobertaTokenizer.from_pretrained(MODEL_NAME)

class SubjectivityDataset(Dataset):
    """Map-style dataset that tokenizes one sentence per item.

    The dataframe must provide ``sentence`` and ``sentence_id`` columns, and
    a ``label`` column when ``has_labels`` is true. Labels are encoded as
    1 for ``'SUBJ'`` and 0 for anything else.
    """

    def __init__(self, dataframe, tokenizer, max_length, has_labels=True):
        """Copy the needed columns out of ``dataframe`` into plain lists."""
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.has_labels = has_labels
        self.text = dataframe['sentence'].tolist()
        self.ids = dataframe['sentence_id'].tolist()

        if has_labels:
            raw_labels = dataframe['label'].tolist()
            self.labels = [int(raw == 'SUBJ') for raw in raw_labels]

    def __len__(self):
        """Return the number of sentences."""
        return len(self.text)

    def __getitem__(self, index):
        """Return the encoded sentence at ``index`` as a dict.

        Keys are ``input_ids`` and ``attention_mask`` (flattened tensors),
        ``sentence_id`` (passed through untouched) and, for labelled data,
        ``labels`` (a ``torch.long`` scalar tensor).
        """
        sentence = str(self.text[index])
        item_id = self.ids[index]

        encoded = self.tokenizer.encode_plus(
            sentence,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        # The tokenizer is asked for 'pt' tensors, which come back with a
        # leading batch axis; flatten() drops it for single-sentence items.
        item = {
            key: encoded[key].flatten()
            for key in ('input_ids', 'attention_mask')
        }
        item['sentence_id'] = item_id

        if self.has_labels:
            item['labels'] = torch.tensor(self.labels[index], dtype=torch.long)

        return item

def clean_text(text):
    """Strip surrounding whitespace from strings; return other values as-is."""
    return text.strip() if isinstance(text, str) else text

def load_and_preprocess_data(file_path, has_labels=True):
    """Read a tab-separated split and clean its ``sentence`` column.

    Args:
        file_path: Path of the TSV file to read (quote characters are
            treated as ordinary text).
        has_labels: Accepted for call-site symmetry; not used here.

    Returns:
        The loaded DataFrame, or ``None`` if anything goes wrong while
        reading or cleaning (the error is printed, not raised).
    """
    try:
        frame = pd.read_csv(file_path, sep='\t', quoting=csv.QUOTE_NONE)
        frame['sentence'] = frame['sentence'].apply(clean_text)
    except Exception as err:
        print(f"Error loading {file_path}: {err}")
        return None
    else:
        return frame

# Load the three splits. load_and_preprocess_data returns None when a file
# cannot be read, in which case the .shape accesses below raise
# AttributeError.
train_df = load_and_preprocess_data(TRAIN_PATH)
dev_df = load_and_preprocess_data(DEV_PATH)
test_df = load_and_preprocess_data(TEST_PATH, has_labels=False)

print(f"Train data shape: {train_df.shape}")
print(f"Dev data shape: {dev_df.shape}")
print(f"Test data shape: {test_df.shape}")

# Train and dev rows stacked into one frame with a fresh 0..n-1 index; this
# feeds the final training pass that runs after dev-based model selection.
combined_train_df = pd.concat([train_df, dev_df], ignore_index=True)
print(f"Combined train data shape: {combined_train_df.shape}")

# One dataset per split; the test split has no label column.
train_dataset = SubjectivityDataset(train_df, tokenizer, MAX_LENGTH)
dev_dataset = SubjectivityDataset(dev_df, tokenizer, MAX_LENGTH)
combined_dataset = SubjectivityDataset(combined_train_df, tokenizer, MAX_LENGTH)
test_dataset = SubjectivityDataset(test_df, tokenizer, MAX_LENGTH, has_labels=False)

# Only the loaders used for training shuffle; dev and test keep file order.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
dev_loader = DataLoader(dev_dataset, batch_size=BATCH_SIZE)
combined_loader = DataLoader(combined_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

# Inverse-frequency class weights (total / per-class count), ordered
# [OBJ, SUBJ] to match the 0/1 label ids produced by SubjectivityDataset.
# NOTE(review): `weights` is not referenced by any other code visible in
# this script; training uses the loss returned by the model itself.
# Confirm whether a weighted loss was intended.
class_counts = train_df['label'].value_counts()
total = len(train_df)
weights = torch.tensor([total/class_counts['OBJ'], total/class_counts['SUBJ']]).to(device)

# Pretrained encoder with a two-way classification head.
model = RobertaForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=2,
    problem_type="single_label_classification"
)

model = model.to(device)

def train_model(model, dataloader, optimizer, scheduler, device):
    """Run one training epoch and return the mean per-batch loss.

    Args:
        model: Callable returning an object with a ``loss`` attribute when
            given ``input_ids``, ``attention_mask`` and ``labels``.
        dataloader: Iterable of batches (dicts holding those three tensors);
            must support ``len()``.
        optimizer: Optimizer stepped once per batch.
        scheduler: Learning-rate scheduler stepped once per batch.
        device: Device the batch tensors are moved to.

    Returns:
        Sum of the batch losses divided by ``len(dataloader)``.
    """
    model.train()
    running_loss = 0
    tensor_keys = ('input_ids', 'attention_mask', 'labels')

    for batch in dataloader:
        optimizer.zero_grad()

        model_inputs = {key: batch[key].to(device) for key in tensor_keys}
        loss = model(**model_inputs).loss
        running_loss += loss.item()

        loss.backward()
        # Cap the global gradient norm at 1.0 before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

    return running_loss / len(dataloader)

def evaluate_model(model, dataloader, device):
    """Predict over ``dataloader`` and score against labels when present.

    Args:
        model: Callable returning an object with a ``logits`` attribute when
            given ``input_ids`` and ``attention_mask``.
        dataloader: Iterable of batch dicts; a ``labels`` entry is optional.
        device: Device the input tensors are moved to.

    Returns:
        The macro-averaged F1 score (after printing a classification
        report) when at least one batch carried labels; otherwise the list
        of predicted class indices.
    """
    model.eval()
    predicted, gold = [], []

    with torch.no_grad():
        for batch in dataloader:
            logits = model(
                input_ids=batch['input_ids'].to(device),
                attention_mask=batch['attention_mask'].to(device),
            ).logits

            # Index of the largest logit in each row is the predicted class.
            predicted.extend(torch.max(logits, dim=1).indices.cpu().tolist())
            if 'labels' in batch:
                gold.extend(batch['labels'].cpu().tolist())

    if not gold:
        return predicted

    macro_f1 = f1_score(gold, predicted, average='macro')
    print(classification_report(gold, predicted, target_names=['OBJ', 'SUBJ']))
    return macro_f1

def predict(model, dataloader, device):
    """Label every item in ``dataloader`` as ``'SUBJ'`` or ``'OBJ'``.

    Args:
        model: Callable returning an object with a ``logits`` attribute when
            given ``input_ids`` and ``attention_mask``.
        dataloader: Iterable of batch dicts that also carry ``sentence_id``.
        device: Device the input tensors are moved to.

    Returns:
        A ``(sentence_ids, label_predictions)`` pair of equally ordered
        lists; class index 1 maps to ``'SUBJ'`` and anything else to
        ``'OBJ'``.
    """
    model.eval()
    collected_ids = []
    class_indices = []

    with torch.no_grad():
        for batch in dataloader:
            ids_tensor = batch['input_ids'].to(device)
            mask_tensor = batch['attention_mask'].to(device)
            ids_in_batch = batch['sentence_id']

            logits = model(input_ids=ids_tensor, attention_mask=mask_tensor).logits
            winners = torch.max(logits, dim=1).indices

            class_indices.extend(winners.cpu().tolist())
            collected_ids.extend(ids_in_batch)

    label_predictions = []
    for class_index in class_indices:
        if class_index == 1:
            label_predictions.append('SUBJ')
        else:
            label_predictions.append('OBJ')

    return collected_ids, label_predictions

# Phase 1: fine-tune on the train split only and keep the weights from the
# epoch with the highest dev macro-F1.
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)
# The scheduler is stepped once per batch, so its horizon is batches * epochs.
total_steps = len(train_loader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=int(total_steps * WARMUP_PROPORTION),
    num_training_steps=total_steps
)

best_f1 = 0
best_model_state = None

print("Starting training...")
for epoch in range(EPOCHS):
    print(f"Epoch {epoch + 1}/{EPOCHS}")

    train_loss = train_model(model, train_loader, optimizer, scheduler, device)
    print(f"Training loss: {train_loss:.4f}")

    print("Evaluating on dev set...")
    dev_f1 = evaluate_model(model, dev_loader, device)
    print(f"Dev F1 Score: {dev_f1:.4f}")

    if dev_f1 > best_f1:
        best_f1 = dev_f1
        # state_dict() hands back references to the live parameter tensors,
        # and dict.copy() is shallow, so a plain copy would keep changing as
        # training continues. Clone every tensor to take a real snapshot;
        # keeping the clones on the CPU avoids holding a second copy of the
        # model in GPU memory.
        best_model_state = {
            name: tensor.detach().cpu().clone()
            for name, tensor in model.state_dict().items()
        }
        print(f"New best F1: {best_f1:.4f}")

print(f"Best validation F1: {best_f1:.4f}")

# Restore the best-epoch snapshot (load_state_dict copies the values onto
# the model's own device). Stays None if no epoch scored above 0.
if best_model_state is not None:
    print("Loading best model...")
    model.load_state_dict(best_model_state)

# Phase 2: continue training the current model weights on train + dev
# combined. A fresh optimizer and warmup schedule are created, the epoch
# count is fixed at 3, and no evaluation is run inside this loop.
print("Training on combined data (train + dev)...")
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)
# Scheduler horizon: one step per batch over the 3 epochs below.
total_steps = len(combined_loader) * 3
scheduler = get_linear_schedule_with_warmup(
    optimizer, 
    num_warmup_steps=int(total_steps * WARMUP_PROPORTION),
    num_training_steps=total_steps
)

for epoch in range(3):
    print(f"Final training - Epoch {epoch + 1}/3")
    train_loss = train_model(model, combined_loader, optimizer, scheduler, device)
    print(f"Training loss: {train_loss:.4f}")

# Label the unlabeled test split and write the submission file.
print("Generating predictions on test set...")
sentence_ids, predictions = predict(model, test_loader, device)

# Two columns, one row per test sentence, in test_loader order.
submission_df = pd.DataFrame({
    'sentence_id': sentence_ids,
    'label': predictions
})

# Tab-separated, no index column, and no quoting of field values.
submission_df.to_csv(OUTPUT_PATH, sep='\t', index=False, quoting=csv.QUOTE_NONE)

print(f"Predictions saved to {OUTPUT_PATH}")
print("Done!")

2025-05-01 07:15:40.531335: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1746083740.716159      18 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1746083740.769291      18 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Using device: cuda


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

Train data shape: (830, 4)
Dev data shape: (462, 4)
Test data shape: (300, 2)
Combined train data shape: (1292, 4)


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Starting training...
Epoch 1/5
Training loss: 0.5964
Evaluating on dev set...
              precision    recall  f1-score   support

         OBJ       0.79      0.71      0.75       222
        SUBJ       0.75      0.82      0.79       240

    accuracy                           0.77       462
   macro avg       0.77      0.77      0.77       462
weighted avg       0.77      0.77      0.77       462

Dev F1 Score: 0.7667
New best F1: 0.7667
Epoch 2/5
Training loss: 0.4216
Evaluating on dev set...
              precision    recall  f1-score   support

         OBJ       0.81      0.79      0.80       222
        SUBJ       0.81      0.82      0.82       240

    accuracy                           0.81       462
   macro avg       0.81      0.81      0.81       462
weighted avg       0.81      0.81      0.81       462

Dev F1 Score: 0.8069
New best F1: 0.8069
Epoch 3/5
Training loss: 0.2629
Evaluating on dev set...
              precision    recall  f1-score   support

         OBJ     