Import Required Libraries


In [None]:
import pandas as pd
import torch
import os
import tqdm
from conllu import parse
from torch.utils.data.dataset import Dataset
from sklearn.model_selection import train_test_split
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from torch.utils.data import DataLoader
from torch.optim import AdamW
from sklearn.metrics import hamming_loss, f1_score, classification_report

Load and Map Labels


In [None]:
def load_and_map_labels(label_file_path):
    labels_df = pd.read_csv(label_file_path, sep="\\t", header=None, names=["article_id", "narratives", "subnarratives"])
    labels_mapping = {
        row["article_id"]: {
            "narratives": row["narratives"].split(";"),
            "subnarratives": row["subnarratives"].split(";")
        }
        for _, row in labels_df.iterrows()
    }
    return labels_mapping

label_file = "../data/training_data_16_October_release/EN/subtask-2-annotations.txt"
labels_mapping = load_and_map_labels(label_file)

Parse and Load Articles


In [None]:
def parse_conllu_file(file_path):
    with open(file_path, "r", encoding="utf-8") as f:
        data = f.read()
    token_lists = parse(data)
    all_tokens = [token["form"] for token_list in token_lists for token in token_list]
    return " ".join(all_tokens)

def load_articles_from_conllu(articles_path, article_ids, labels_mapping):
    articles_data = []
    for article_id in article_ids:
        file_path = os.path.join(articles_path, f"{article_id.replace('.txt', '.conllu')}")
        if os.path.exists(file_path):
            article_text = parse_conllu_file(file_path)
            labels = labels_mapping.get(article_id, {"narratives": [], "subnarratives": []})
            articles_data.append({
                "article_id": article_id,
                "text": article_text,
                "narratives": labels["narratives"],
                "subnarratives": labels["subnarratives"]
            })
    return articles_data

articles_path = "../data/tmp/EN"
article_ids = labels_mapping.keys()
articles_data = load_articles_from_conllu(articles_path, article_ids, labels_mapping)

Initialize Tokenizer


In [None]:
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

Generate Label Vocabulary


all_narratives = sorted({n for article in articles_data for n in article["narratives"]})
all_subnarratives = sorted({sn for article in articles_data for sn in article["subnarratives"]})

narrative_to_index = {n: i for i, n in enumerate(all_narratives)}
subnarrative_to_index = {sn: i for i, sn in enumerate(all_subnarratives)}

def encode_labels(narratives, subnarratives):
    narrative_vector = [0] * len(all_narratives)
    subnarrative_vector = [0] * len(all_subnarratives)
    for n in narratives:
        narrative_vector[narrative_to_index[n]] = 1
    for sn in subnarratives:
        subnarrative_vector[subnarrative_to_index[sn]] = 1
    return narrative_vector + subnarrative_vector

for article in articles_data:
    article["labels"] = encode_labels(article["narratives"], article["subnarratives"])

Create Dataset Class


In [None]:
class NarrativeDataset(Dataset):
    def __init__(self, articles):
        self.articles = articles

    def __len__(self):
        return len(self.articles)

    def __getitem__(self, idx):
        article = self.articles[idx]
        inputs = tokenizer(
            article["text"],
            padding="max_length",
            truncation=True,
            max_length=512,
            return_tensors="pt"
        )
        labels = torch.tensor(article["labels"], dtype=torch.float)
        return {**inputs, "labels": labels}

Initialize Model


In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
num_labels = len(all_narratives) + len(all_subnarratives)
model = RobertaForSequenceClassification.from_pretrained(
    "roberta-base", 
    num_labels=num_labels
)
model = model.to(device)
model.train()

Train-Validation Split


In [None]:
train_data, val_data = train_test_split(articles_data, test_size=0.2, random_state=42)
train_dataset = NarrativeDataset(train_data)
val_dataset = NarrativeDataset(val_data)

train_loader = DataLoader(train_dataset, batch_size=6, shuffle=True, pin_memory=True)
val_loader = DataLoader(val_dataset, batch_size=6, shuffle=False, pin_memory=True)

Training Loop


In [None]:
optimizer = AdamW(model.parameters(), lr=2e-5)
num_epochs = 5  # Increased number of epochs

for epoch in range(num_epochs): 
    print(f"Epoch {epoch + 1}/{num_epochs}")
    epoch_loss = 0
    progress_bar = tqdm.tqdm(train_loader, desc="Processing Batches", leave=True)
    
    for batch in progress_bar:
        optimizer.zero_grad()
        inputs = {key: val.squeeze(1).to(device) for key, val in batch.items() if key != "labels"}
        labels = batch["labels"].to(device)
        outputs = model(**inputs, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
        progress_bar.set_postfix({"Batch Loss": f"{loss.item():.4f}"})
    
    print(f"Epoch {epoch + 1} completed.")
    print(f"- Average Loss: {epoch_loss / len(train_loader):.4f}")

Evaluation


In [17]:
def get_predictions(model, data_loader, device, threshold=0.5):
    model.eval()
    all_predictions = []
    all_labels = []
    with torch.no_grad():
        for batch in data_loader:
            inputs = {key: val.squeeze(1).to(device) for key, val in batch.items() if key != "labels"}
            labels = batch["labels"].to(device)
            outputs = model(**inputs)
            logits = outputs.logits
            probs = torch.sigmoid(logits)  
            preds = (probs > threshold).int() 
            all_predictions.append(preds.cpu())
            all_labels.append(labels.cpu())
    all_predictions = torch.cat(all_predictions, dim=0)
    all_labels = torch.cat(all_labels, dim=0)
    return all_predictions.numpy(), all_labels.numpy()

def evaluate_model(model, data_loader, device, val_data, class_labels, print_report=True):
    y_pred, y_true = get_predictions(model, data_loader, device)
    hamming = hamming_loss(y_true, y_pred)
    macro_f1 = f1_score(y_true, y_pred, average='macro', zero_division=0)
    micro_f1 = f1_score(y_true, y_pred, average='micro', zero_division=0)
    subset_accuracy = (y_true == y_pred).all(axis=1).mean()
    if print_report:
        report = classification_report(
            y_true, y_pred, target_names=class_labels, digits=2, zero_division=0
        )
        print("\nClassification Report:\n")
        print(report)
    return {"Hamming Loss": hamming, "Macro F1": macro_f1, "Micro F1": micro_f1, "Subset Accuracy": subset_accuracy}

print("Evaluating on Validation Set...")
evaluation_results = evaluate_model(model, val_loader, device, val_data, all_narratives + all_subnarratives)
print(evaluation_results)

Evaluating on Validation Set...

Classification Report:

                                                                                                            precision    recall  f1-score   support

                                                                          CC: Climate change is beneficial       0.00      0.00      0.00         1
                                                                  CC: Controversy about green technologies       0.00      0.00      0.00         3
                                                                         CC: Criticism of climate movement       0.00      0.00      0.00         4
                                                                         CC: Criticism of climate policies       0.00      0.00      0.00         4
                                                             CC: Criticism of institutions and authorities       0.00      0.00      0.00         5
                                                      

Error Analysis


In [18]:
def error_analysis(model, data_loader, device, val_data, class_labels, num_examples=5):
    y_pred, y_true = get_predictions(model, data_loader, device)
    incorrect_indices = (y_pred != y_true).any(axis=1)
    incorrect_examples = [(i, y_pred[i], y_true[i]) for i in range(len(y_pred)) if incorrect_indices[i]]
    print(f"Total Incorrect Predictions: {len(incorrect_examples)}")
    print(f"Showing {min(num_examples, len(incorrect_examples))} examples:\n")
    for idx, pred, true in incorrect_examples[:num_examples]:
        print(f"Example {idx}:")
        print(f"Predicted: {pred}")
        print(f"True: {true}")
        print(f"Text: {val_data[idx]['text']}\n")

print("Error Analysis on Validation Set...")
error_analysis(model, val_loader, device, val_data, all_narratives + all_subnarratives)

Error Analysis on Validation Set...
Total Incorrect Predictions: 40
Showing 5 examples:

Example 0:
Predicted: [0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
True: [0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
Text: interview chinese wisdom aids sustainable development new world heritage site beijing sept xinhua cultural landscape old tea forests jingmai mountain puer southwest china embodies wisdom chinas agrarian culture example positive interaction people nature sustainable development chinese official said li qun deputy minister culture tourism head national cultural heritage administration made remarks interview