# Code Pre-run

In [1]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import AutoTokenizer, AutoModel, Trainer, TrainingArguments, PreTrainedModel, AutoConfig
from torch.nn import BCEWithLogitsLoss
import torch
from torch.utils.data import Dataset
from transformers.modeling_outputs import SequenceClassifierOutput
import numpy as np
import pickle
import matplotlib as mpl
import matplotlib.pyplot as plt
import plotly.graph_objects as go

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Sanity Check for device usage
torch.cuda.is_available()
torch.cuda.get_device_name()

'NVIDIA GeForce GTX 1060 with Max-Q Design'

In [3]:
# Optional setting to handle wandb trigger
os.environ['WANDB_DISABLED'] = 'true'

# Global Declarations

In [38]:
# Global Configuration
drive_path = './training_data_16_October_release/'
dev_documents = './dev-documents_4_December/'
gold_label_path = './cleaned_dev_10_january_2025/'
# dev_documents_langs = [dev_documents+"EN/subtask-2-annotations.txt", dev_documents + "PT/subtask-2-annotations.txt"]
# gold_labels_langs = [gold_labels+"EN/subtask-2-annotations.txt", gold_labels + "PT/subtask-2-annotations.txt"]

annotation_files = {
    "EN": drive_path + "EN/subtask-2-annotations.txt",
    "PT": drive_path + "PT/subtask-2-annotations.txt",
}
raw_documents_dirs = {
    "EN": drive_path + "EN/raw-documents",
    "PT": drive_path + "PT/raw-documents",
}
max_len = 512
batch_size = 8
num_epochs = 20
pretrained_model_name = "bert-base-multilingual-cased"  # Use multilingual model

# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_dir="./logs",
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    weight_decay=0.01,
    logging_steps=10,
    save_total_limit=2,
    seed=42
)

torch.manual_seed(42)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(42)

np.random.seed(42)

torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

os.environ['PYTHONHASHSEED'] = str(42)

# Set a dark theme
mpl.rcParams.update({
    'axes.facecolor': '#2E2E2E',
    'axes.edgecolor': 'white',
    'axes.labelcolor': 'white',
    'xtick.color': 'white',
    'ytick.color': 'white',
    'text.color': 'white',
    'figure.facecolor': '#2E2E2E',
    'grid.color': '#444444',
    'legend.facecolor': '#444444',
    'legend.edgecolor': 'white'
})

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [11]:
# Sanity check for file location
os.path.isdir(raw_documents_dirs["EN"])

True

# Dataset Handling

In [12]:
# Dataset class for handling text and labels
class NarrativeDataset(Dataset):
    def __init__(self, texts, narrative_labels, sub_narrative_labels, tokenizer, max_len):
        self.texts = texts
        self.narrative_labels = narrative_labels
        self.sub_narrative_labels = sub_narrative_labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        if self.sub_narrative_labels is None:
            labels = torch.tensor(self.narrative_labels[idx], dtype=torch.float)
        elif self.narrative_labels is None:
            labels = torch.tensor(self.sub_narrative_labels[idx], dtype=torch.float)
        else:
            labels = torch.cat(
            [
                torch.tensor(self.narrative_labels[idx], dtype=torch.float),
                torch.tensor(self.sub_narrative_labels[idx], dtype=torch.float),
            ]
        )

        encoding = self.tokenizer(
            text,
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )

        return {
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "labels": labels,
        }

In [13]:
# Updated preprocess_data function to handle multilingual data
def preprocess_data(annotation_file, raw_documents_dir):
    data = pd.read_csv(
        annotation_file,
        sep="\t",
        names=["filename", "narratives", "sub_narratives"],
    )

    def read_file(filename):
        with open(os.path.join(raw_documents_dir, filename), "r", encoding="utf-8") as file:
            return file.read()

    data["text"] = data["filename"].apply(read_file)

    def process_labels(label_string):
        if isinstance(label_string, str):
            return [label.strip() for label in label_string.split(";") if label.strip()]
        else:
            return ["Other"]

    data["narratives"] = data["narratives"].apply(process_labels)
    data["sub_narratives"] = data["sub_narratives"].apply(process_labels)

    return data

In [14]:
# Load and combine datasets
datasets = []
for lang, annotation_file in annotation_files.items():
    raw_documents_dir = raw_documents_dirs[lang]
    datasets.append(preprocess_data(annotation_file, raw_documents_dir))
data = pd.concat(datasets).reset_index(drop=True)


# MultiLabelBinarizer for narratives and sub-narratives
narrative_mlb = MultiLabelBinarizer()
sub_narrative_mlb = MultiLabelBinarizer()

y_narratives = narrative_mlb.fit_transform(data["narratives"])
y_sub_narratives = sub_narrative_mlb.fit_transform(data["sub_narratives"])


In [15]:
# Sanity check for binarizer of narratives
y_narratives

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 1, 0, 0],
       [0, 0, 0, ..., 1, 0, 0],
       [0, 0, 0, ..., 1, 0, 0]], shape=(799, 22))

In [16]:
# Sanity check for binarizer of subnarratives
y_sub_narratives

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], shape=(799, 92))

In [17]:
# Train-test split
train_texts, test_texts, y_train_narratives, y_test_narratives, y_train_sub, y_test_sub = train_test_split(
    data["text"], y_narratives, y_sub_narratives, test_size=0.2, random_state=42
)
val_texts, test_texts, y_val_narratives, y_test_narratives, y_val_sub, y_test_sub = train_test_split(
    test_texts, y_test_narratives, y_test_sub, test_size=0.5, random_state=42
)

# Tokenizer and model initialization
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name)


# Utilities

In [18]:
def compute_metrics(pred):
    labels = pred.label_ids
    logits = pred.predictions

    # Apply threshold to logits to get binary predictions
    preds = (logits > 0).astype(int)

    # Compute precision, recall, f1
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average="weighted", zero_division=0
    )

    # Compute multi-label accuracy
    accuracy = (preds == labels).mean()  # Fraction of correctly predicted labels

    return {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1,
    }

# Narrative Classification

In [19]:
# Create narratives datasets
train_narrative_dataset = NarrativeDataset(
    train_texts.tolist(), y_train_narratives, None, tokenizer, max_len
)
val_narrative_dataset = NarrativeDataset(
    val_texts.tolist(), y_val_narratives, None, tokenizer, max_len
)
test_narrative_dataset = NarrativeDataset(
    test_texts.tolist(), y_test_narratives, None, tokenizer, max_len
)

In [20]:
class NarrativeClassificationModel(PreTrainedModel):
    def __init__(self, pretrained_model_name, num_narrative_labels):
        config = AutoConfig.from_pretrained(pretrained_model_name)
        config.num_labels = num_narrative_labels
        super().__init__(config)

        self.bert = AutoModel.from_pretrained(pretrained_model_name)
        self.classifier = torch.nn.Linear(config.hidden_size, config.num_labels)
        self.loss_fn = BCEWithLogitsLoss()

    def forward(self, input_ids, attention_mask, labels=None):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        logits = self.classifier(outputs.last_hidden_state[:, 0, :])  # Use [CLS] token embeddings

        loss = None
        if labels is not None:
            loss = self.loss_fn(logits, labels)

        return SequenceClassifierOutput(loss=loss, logits=logits)


In [21]:
# Create narrative model
narrative_model = NarrativeClassificationModel(
    pretrained_model_name,
    num_narrative_labels=y_train_narratives.shape[1],
)

In [46]:
# Create narrative model trainer
# narrative_trainer = Trainer(
#     model=narrative_model,
#     args=training_args,
#     train_dataset=train_narrative_dataset,
#     eval_dataset=val_narrative_dataset,
#     compute_metrics=compute_metrics,
# )

# # Train and evaluate
# narrative_trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2361,0.227861,0.909659,0.171987,0.145455,0.155146
2,0.1942,0.218538,0.909659,0.265486,0.248485,0.239919
3,0.1645,0.210208,0.914205,0.397596,0.333333,0.329145
4,0.1608,0.18814,0.922727,0.519531,0.29697,0.35861
5,0.1303,0.204079,0.917614,0.488657,0.333333,0.354454
6,0.109,0.193636,0.922727,0.707456,0.490909,0.513662
7,0.0981,0.20166,0.923864,0.656111,0.448485,0.478995
8,0.0743,0.190836,0.928977,0.692165,0.406061,0.482555
9,0.0661,0.207096,0.925,0.63887,0.454545,0.478179
10,0.0514,0.18995,0.932386,0.686912,0.515152,0.556204


TrainOutput(global_step=1600, training_loss=0.0819653993844986, metrics={'train_runtime': 1779.5961, 'train_samples_per_second': 7.181, 'train_steps_per_second': 0.899, 'total_flos': 3363163108761600.0, 'train_loss': 0.0819653993844986, 'epoch': 20.0})

In [47]:
# Evaluate the model
# print("Evaluating on test set...")
# results = narrative_trainer.evaluate(test_narrative_dataset)
# results

Evaluating on test set...


{'eval_loss': 0.16147944331169128,
 'eval_accuracy': 0.9460227272727273,
 'eval_precision': 0.6261813350048644,
 'eval_recall': 0.5714285714285714,
 'eval_f1': 0.5881386010929698,
 'eval_runtime': 2.4445,
 'eval_samples_per_second': 32.726,
 'eval_steps_per_second': 4.091,
 'epoch': 20.0}

In [48]:
# Save narrative model for more downstream use
# with open('narrative_model.pkl','wb') as f:
#     pickle.dump(narrative_model,f)

In [22]:
# Sanity Check for narrative_model store

with open('narrative_model.pkl', 'rb') as f:
    n_model = pickle.load(f)

n_trainer = Trainer(
    model=n_model,
    args=training_args,
    train_dataset=train_narrative_dataset,
    eval_dataset=val_narrative_dataset,
    compute_metrics=compute_metrics,
)

# Evaluate the model
print("Evaluating on test set...")
results = n_trainer.evaluate(test_narrative_dataset)
results

Evaluating on test set...


{'eval_loss': 0.16147944331169128,
 'eval_model_preparation_time': 0.004,
 'eval_accuracy': 0.9460227272727273,
 'eval_precision': 0.6261813350048644,
 'eval_recall': 0.5714285714285714,
 'eval_f1': 0.5881386010929698,
 'eval_runtime': 10.9713,
 'eval_samples_per_second': 7.292,
 'eval_steps_per_second': 0.911}

In [90]:
from sklearn.metrics import f1_score, precision_score, recall_score

# Test documents to be predicted by our model n_trainer 
dev_documents_langs = [dev_documents+"EN/subtask-2-documents", dev_documents + "PT/subtask-2-documents"]
# print('dev_documents_langs', dev_documents_langs)

# Gold labels which have the file name and the correct prediction in tab separated format: filename\tNarrative1;Narrative2;...;NarrativeN\tSubnarrative1;Subnarrative2;...;SubnarrativeN
gold_label_annotations = [gold_label_path + "EN/subtask-2-annotations.txt", gold_label_path + "PT/subtask-2-annotations.txt"]
gold_labels_raw_doc_dir = [gold_label_path + "EN/subtask-2-documents/", gold_label_path + "PT/subtask-2-documents/"]

# Function to read and preprocess dev documents
def read_dev_documents(dev_documents_langs):
    dev_texts = []
    dev_filenames = []
    for dev_doc_dir in dev_documents_langs:
        for filename in os.listdir(dev_doc_dir):
            file_path = os.path.join(dev_doc_dir, filename)
            with open(file_path, "r", encoding="utf-8") as file:
                dev_texts.append(file.read())
                dev_filenames.append(filename)
    return dev_texts, dev_filenames

# Read and preprocess dev documents
dev_texts, dev_filenames = read_dev_documents(dev_documents_langs)

# Tokenize dev documents
dev_encodings = tokenizer(dev_texts, truncation=True, padding=True, max_length=max_len, return_tensors="pt")

# Create a dataset for dev documents
class DevDataset(Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __len__(self):
        return len(self.encodings["input_ids"])

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        return item

dev_dataset = DevDataset(dev_encodings)

# Reinitialize the Trainer to avoid AcceleratorState issues
n_trainer = Trainer(
    model=n_model,
    args=training_args,
    train_dataset=train_narrative_dataset,
    eval_dataset=val_narrative_dataset,
    compute_metrics=compute_metrics,
)

# Predict narratives for dev documents
predictions = n_trainer.predict(dev_dataset)
predicted_logits = predictions.predictions
predicted_labels = (predicted_logits > 0).astype(int)

# Convert the binary matrix back to label names
predicted_label_names = narrative_mlb.inverse_transform(predicted_labels)

# for text, predicted_label, filename in zip(dev_texts, predicted_label_names, dev_filenames):
#     print(f"Filename: {filename}")
#     print(f"Text: {text[:100]}...")  # Print the first 100 characters of the text for brevity
#     print(f"Predicted Labels: {predicted_label}")
#     print()

# Function to preprocess data
def preprocess_data(annotation_file, raw_documents_dir):
    data = []
    with open(annotation_file, "r", encoding="utf-8") as file:
        for line in file:
            parts = line.strip().split("\t")
            filename = parts[0]
            narratives = parts[1].split(";")
            sub_narratives = parts[2].split(";")
            with open(os.path.join(raw_documents_dir, filename), "r", encoding="utf-8") as doc_file:
                text = doc_file.read()
            data.append({
                "filename": filename,
                "narratives": narratives,
                "sub_narratives": sub_narratives,
                "text": text
            })
    return pd.DataFrame(data)

gold_data = []
for annotation_file, raw_documents_dir in zip(gold_label_annotations, gold_labels_raw_doc_dir):
    gold_data.append(preprocess_data(annotation_file, raw_documents_dir))
gold_data = pd.concat(gold_data).reset_index(drop=True)

# Calculate F1 score, precision, and recall for each document
f1_scores = []
precision_scores = []
recall_scores = []

# print('predicted_label_names: ', len(predicted_label_names))
# print('predicted_label_names: ', predicted_label_names)
# Align the gold_data labels with the predicted_label_names based on filenames
aligned_gold_labels = []

for filename in dev_filenames:
    gold_row = gold_data[gold_data['filename'] == filename]
    if not gold_row.empty:
        aligned_gold_labels.append(gold_row.iloc[0]['narratives'])

# Ensure the lengths match
assert len(aligned_gold_labels) == len(predicted_label_names)

# Calculate F1 score, precision, and recall for each document
f1_scores = []
precision_scores = []
recall_scores = []

for gold_labels, predicted_labelss, filename in zip(aligned_gold_labels, predicted_label_names, dev_filenames):
    # print(f"filename: {filename}")
    # print(f"Gold Labels: {gold_labels}")
    # print(f"Predicted Labels: {predicted_labelss}")

    # Convert the gold labels and predicted labels to binary arrays
    binary_gold_labels = narrative_mlb.transform([gold_labels])[0]
    binary_predicted_labels = narrative_mlb.transform([predicted_labelss])[0]


    
    # Calculate F1 score, precision, and recall for this document
    f1 = f1_score(binary_gold_labels, binary_predicted_labels, average='weighted', zero_division=1)
    precision = precision_score(binary_gold_labels, binary_predicted_labels, average='weighted', zero_division=1)
    recall = recall_score(binary_gold_labels, binary_predicted_labels, average='weighted', zero_division=1)
    
    # print(f"F1 Score: {f1}")
    # print(f"Precision: {precision}")
    # print(f"Recall: {recall}")
    
    f1_scores.append(f1)
    precision_scores.append(precision)
    recall_scores.append(recall)

# Calculate average scores
average_f1 = np.mean(f1_scores)
average_precision = np.mean(precision_scores)
average_recall = np.mean(recall_scores)

print(f"Average F1 Score: {average_f1}")
print(f"Average Precision: {average_precision}")
print(f"Average Recall: {average_recall}")



Filename: EN_CC_200030.txt
Text: Bangladesh, Nordic countries to strengthen coop: minister 

 Nordic countries pledged to join hands ...
Predicted Labels: ('Other',)

Filename: EN_CC_200033.txt
Text: Greta Thunberg Calls For 'Overthrow of Whole Capitalist System' 

 Climate activist Greta Thunberg h...
Predicted Labels: ('CC: Criticism of climate movement', 'CC: Criticism of institutions and authorities')

Filename: EN_CC_200034.txt
Text: If we “just stop oil” like climate protesters want, six BILLION people could die 

 Six billion peop...
Predicted Labels: ('CC: Controversy about green technologies', 'CC: Criticism of climate movement', 'CC: Criticism of climate policies', 'CC: Criticism of institutions and authorities')

Filename: EN_CC_200035.txt
Text: Gretchen Whitmer Orders Michigan’s State Fleet Be 100% Electric by 2040 

 Gov. Gretchen Whitmer (D-...
Predicted Labels: ()

Filename: EN_CC_200036.txt
Text: Climate cultists push bizarre scare language about weather: “rain bombs” a

## Narrative Model Output Analysis

In [None]:
# Get the predicted labels for the test set
predictions = n_trainer.predict(test_narrative_dataset)
predicted_logits = predictions.predictions
predicted_labels = (predicted_logits > 0).astype(int)

# Convert the binary matrix back to label names
predicted_label_names = narrative_mlb.inverse_transform(predicted_labels)
actual_label_names = narrative_mlb.inverse_transform(y_test_narratives)

# Initialize counters for the final score
total_labels = 0
correct_labels = 0

# Print the test texts with their predicted and actual labels
for text, predicted_labels, actual_labels in zip(test_texts, predicted_label_names, actual_label_names):
    matched_labels = set(predicted_labels) & set(actual_labels)
    matched_percentage = len(matched_labels) / len(actual_labels) * 100 if actual_labels else 0
    total_labels += len(actual_labels)
    correct_labels += len(matched_labels)

    print(f"Predicted Labels: {predicted_labels}\nActual Labels: {actual_labels}")
    print(f"Matched Labels: {matched_labels}\nMatched Percentage: {matched_percentage:.2f}%")
    print("--------------------------------------------------\n")

# Calculate the final score
final_score = correct_labels / total_labels * 100 if total_labels else 0
print(f"Final Score: {final_score:.2f}%")

Predicted Labels: ('Other',)
Actual Labels: ('URW: Blaming the war on others rather than the invader', 'URW: Russia is the Victim')
Matched Labels: set()
Matched Percentage: 0.00%
--------------------------------------------------

Predicted Labels: ('URW: Discrediting Ukraine', 'URW: Discrediting the West, Diplomacy', 'URW: Speculating war outcomes')
Actual Labels: ('URW: Discrediting Ukraine', 'URW: Discrediting the West, Diplomacy', 'URW: Praise of Russia', 'URW: Speculating war outcomes')
Matched Labels: {'URW: Speculating war outcomes', 'URW: Discrediting the West, Diplomacy', 'URW: Discrediting Ukraine'}
Matched Percentage: 75.00%
--------------------------------------------------

Predicted Labels: ('URW: Amplifying war-related fears', 'URW: Discrediting the West, Diplomacy')
Actual Labels: ('URW: Discrediting the West, Diplomacy', 'URW: Negative Consequences for the West')
Matched Labels: {'URW: Discrediting the West, Diplomacy'}
Matched Percentage: 50.00%
---------------------

# Subnarrative Classification


In [92]:
class SubNarrativeClassificationModel(PreTrainedModel):
    def __init__(self, pretrained_model_name, num_sub_narrative_labels):
        config = AutoConfig.from_pretrained(pretrained_model_name)
        config.num_labels = num_sub_narrative_labels
        super().__init__(config)

        self.bert = AutoModel.from_pretrained(pretrained_model_name)
        self.classifier = torch.nn.Linear(config.hidden_size, config.num_labels)
        self.loss_fn = BCEWithLogitsLoss()

    def forward(self, input_ids, attention_mask, labels=None):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        logits = self.classifier(outputs.last_hidden_state[:, 0, :])  # Use [CLS] token embeddings

        loss = None
        if labels is not None:
            loss = self.loss_fn(logits, labels)

        return SequenceClassifierOutput(loss=loss, logits=logits)


### **Subnarratives (Without Narrative Context)**

In [93]:
# Create subnarrative datasets
train_sub_narrative_dataset = NarrativeDataset(
    train_texts.tolist(), y_train_sub, None, tokenizer, max_len
)
val_sub_narrative_dataset = NarrativeDataset(
    val_texts.tolist(), y_val_sub, None, tokenizer, max_len
)
test_sub_narrative_dataset = NarrativeDataset(
    test_texts.tolist(), y_test_sub, None, tokenizer, max_len
)

In [94]:
# Create subnarrative model
sub_narrative_model_baseline = SubNarrativeClassificationModel(
    pretrained_model_name,
    num_sub_narrative_labels=y_train_sub.shape[1],
)

In [53]:
# Create subnarrative model trainer
sub_narrative_baseline_trainer = Trainer(
    model=sub_narrative_model_baseline,
    args=training_args,
    train_dataset=train_sub_narrative_dataset,
    eval_dataset=val_sub_narrative_dataset,
    compute_metrics=compute_metrics,
)

# Training the second layer model
sub_narrative_baseline_trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.1661,0.158631,0.969158,0.0,0.0,0.0
2,0.12,0.12522,0.968342,0.012115,0.013216,0.012641
3,0.1079,0.113849,0.969158,0.0,0.0,0.0
4,0.1039,0.1047,0.970788,0.177313,0.061674,0.087355
5,0.0866,0.104337,0.970788,0.23348,0.092511,0.120745
6,0.0795,0.097773,0.970516,0.172745,0.118943,0.127999
7,0.0775,0.097855,0.969565,0.238815,0.105727,0.129265
8,0.069,0.093911,0.970109,0.262315,0.136564,0.16177
9,0.0607,0.092508,0.969837,0.256891,0.145374,0.176627
10,0.055,0.093182,0.969158,0.233407,0.14978,0.16667


TrainOutput(global_step=1600, training_loss=0.07053177824243903, metrics={'train_runtime': 1690.8764, 'train_samples_per_second': 7.558, 'train_steps_per_second': 0.946, 'total_flos': 3365276483174400.0, 'train_loss': 0.07053177824243903, 'epoch': 20.0})

In [54]:
# Evaluate the model
print("Evaluating on test set...")
results = sub_narrative_baseline_trainer.evaluate(test_sub_narrative_dataset)
results

Evaluating on test set...


{'eval_loss': 0.07413209974765778,
 'eval_accuracy': 0.9767663043478261,
 'eval_precision': 0.3178451982799809,
 'eval_recall': 0.21195652173913043,
 'eval_f1': 0.24643466082085008,
 'eval_runtime': 2.3902,
 'eval_samples_per_second': 33.47,
 'eval_steps_per_second': 4.184,
 'epoch': 20.0}

In [55]:
# Save narrative model for more downstream use
with open('sub_narrative_model_baseline.pkl','wb') as f:
    pickle.dump(sub_narrative_model_baseline,f)

In [68]:
# Sanity Check for sub_narrative_model_baseline store

with open('sub_narrative_model_baseline.pkl', 'rb') as f:
    sn_model_b = pickle.load(f)

sn_b_trainer = Trainer(
    model=sn_model_b,
    args=training_args,
    train_dataset=train_sub_narrative_dataset,
    eval_dataset=val_sub_narrative_dataset,
    compute_metrics=compute_metrics,
)

# Evaluate the model
print("Evaluating on test set...")
results = sn_b_trainer.evaluate(test_sub_narrative_dataset)
results

Evaluating on test set...


{'eval_loss': 0.07413209974765778,
 'eval_model_preparation_time': 0.0031,
 'eval_accuracy': 0.9767663043478261,
 'eval_precision': 0.3178451982799809,
 'eval_recall': 0.21195652173913043,
 'eval_f1': 0.24643466082085008,
 'eval_runtime': 2.4615,
 'eval_samples_per_second': 32.501,
 'eval_steps_per_second': 4.063}

### **Subnarratives (With Narrative Context)**

In [95]:
def augment_input_with_narratives(texts, narrative_preds, mlb):
    augmented_texts = []
    for text, pred in zip(texts, narrative_preds):
        predicted_labels = mlb.inverse_transform(pred.reshape(1, -1))[0]
        narrative_prefix = " ".join([f"[Narrative: {label}]" for label in predicted_labels])
        augmented_text = f"{narrative_prefix} {text}"
        augmented_texts.append(augmented_text)
    return augmented_texts


In [100]:
# Create augmented dataset
threshold = 0.5

# Augmenting training data with true labels
augmented_train_texts = augment_input_with_narratives(train_texts, y_train_narratives, narrative_mlb)
# display('augmented_train_texts: ', augmented_train_texts)
train_sub_narrative_dataset = NarrativeDataset(
    augmented_train_texts, None, y_train_sub, tokenizer, max_len
)

# The validation set is augmented with prediction results
val_narrative_predictions = n_trainer.predict(val_narrative_dataset).predictions
val_narrative_preds = (val_narrative_predictions > threshold).astype(int)
augmented_val_texts = augment_input_with_narratives(val_texts, val_narrative_preds, narrative_mlb)
val_sub_narrative_dataset = NarrativeDataset(
    augmented_val_texts, None, y_val_sub, tokenizer, max_len
)

# The test set maintains the original logic
test_narrative_predictions = n_trainer.predict(test_narrative_dataset).predictions
test_narrative_preds = (test_narrative_predictions > threshold).astype(int)
augmented_test_texts = augment_input_with_narratives(test_texts, test_narrative_preds, narrative_mlb)
test_sub_narrative_dataset = NarrativeDataset(
    augmented_test_texts, None, y_test_sub, tokenizer, max_len
)


In [101]:
# Create subnarrative model
sub_narrative_model = SubNarrativeClassificationModel(
    pretrained_model_name,
    num_sub_narrative_labels=y_train_sub.shape[1],
)

In [60]:
# Create subnarrative model trainer
sub_narrative_trainer = Trainer(
    model=sub_narrative_model,
    args=training_args,
    train_dataset=train_sub_narrative_dataset,
    eval_dataset=val_sub_narrative_dataset,
    compute_metrics=compute_metrics,
)

# Training the second layer model
sub_narrative_trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.1692,0.162261,0.969158,0.0,0.0,0.0
2,0.1213,0.126673,0.968886,0.018172,0.013216,0.015303
3,0.1025,0.114368,0.969565,0.076909,0.07489,0.075472
4,0.0981,0.10708,0.969293,0.113769,0.127753,0.111072
5,0.0783,0.104892,0.968886,0.168048,0.14978,0.137981
6,0.0742,0.103988,0.968207,0.207552,0.127753,0.131948
7,0.075,0.101634,0.969158,0.209414,0.140969,0.150289
8,0.0678,0.101802,0.969022,0.20437,0.167401,0.165416
9,0.0613,0.101521,0.968478,0.211576,0.193833,0.187421
10,0.0565,0.1022,0.968342,0.252261,0.167401,0.180365


TrainOutput(global_step=1600, training_loss=0.07038670759648084, metrics={'train_runtime': 1700.9797, 'train_samples_per_second': 7.513, 'train_steps_per_second': 0.941, 'total_flos': 3365276483174400.0, 'train_loss': 0.07038670759648084, 'epoch': 20.0})

In [62]:
# Save narrative model for more downstream use
with open('sub_narrative_model_with_augment.pkl','wb') as f:
    pickle.dump(sub_narrative_model,f)

In [113]:
# Sanity Check for sub_narrative_model_with_augment store

with open('sub_narrative_model_with_augment.pkl', 'rb') as f:
    sn_model_wa = pickle.load(f)

sn_wa_trainer = Trainer(
    model=sn_model_wa,
    args=training_args,
    train_dataset=train_sub_narrative_dataset,
    eval_dataset=val_sub_narrative_dataset,
    compute_metrics=compute_metrics,
)

# Evaluate the model
print("Evaluating on test set...")
results = sn_wa_trainer.evaluate(test_sub_narrative_dataset)
results

Evaluating on test set...


{'eval_loss': 0.08247312903404236,
 'eval_model_preparation_time': 0.003,
 'eval_accuracy': 0.9773097826086956,
 'eval_precision': 0.3606323326432022,
 'eval_recall': 0.31521739130434784,
 'eval_f1': 0.3216928694874919,
 'eval_runtime': 5.2302,
 'eval_samples_per_second': 15.296,
 'eval_steps_per_second': 1.912}

In [107]:
from sklearn.metrics import f1_score, precision_score, recall_score

####################

with open('sub_narrative_model_with_augment.pkl', 'rb') as f:
    sn_model_wa = pickle.load(f)

sn_wa_trainer = Trainer(
    model=sn_model_wa,
    args=training_args,
    train_dataset=train_sub_narrative_dataset,
    eval_dataset=val_sub_narrative_dataset,
    compute_metrics=compute_metrics,
)

# Evaluate the model
print("Evaluating on test set...")
results = sn_wa_trainer.evaluate(test_sub_narrative_dataset)
results

# Predict subnarratives for dev documents
sub_predictions = sn_wa_trainer.predict(dev_dataset)
sub_predicted_logits = sub_predictions.predictions
sub_predicted_labels = (sub_predicted_logits > 0).astype(int)

# Convert the binary matrix back to label names
sub_predicted_label_names = sub_narrative_mlb.inverse_transform(sub_predicted_labels)
for filename, sub_predicted_label in zip(dev_filenames, sub_predicted_label_names):
    print(f"Filename: {filename}")
    print(f"Predicted Sub-Narrative Labels: {sub_predicted_label}")
    print("--------------------------------------------------")
# Align the gold_data sub_narrative labels with the sub_predicted_label_names based on filenames
aligned_gold_sub_labels = []

for filename in dev_filenames:
    gold_row = gold_data[gold_data['filename'] == filename]
    if not gold_row.empty:
        aligned_gold_sub_labels.append(gold_row.iloc[0]['sub_narratives'])

# Ensure the lengths match
assert len(aligned_gold_sub_labels) == len(sub_predicted_label_names)

# Calculate F1 score, precision, and recall for each document
sub_f1_scores = []
sub_precision_scores = []
sub_recall_scores = []

for gold_sub_labels, sub_predicted_labelss, filename in zip(aligned_gold_sub_labels, sub_predicted_label_names, dev_filenames):
    print(f"filename: {filename}")
    print(f"Gold Labels: {gold_labels}")
    print(f"Predicted Labels: {predicted_labelss}")
    
    # Convert the gold labels and predicted labels to binary arrays
    binary_gold_sub_labels = sub_narrative_mlb.transform([gold_sub_labels])[0]
    binary_predicted_sub_labels = sub_narrative_mlb.transform([sub_predicted_labelss])[0]

    # Calculate F1 score, precision, and recall for this document
    sub_f1 = f1_score(binary_gold_sub_labels, binary_predicted_sub_labels, average='weighted', zero_division=1)
    sub_precision = precision_score(binary_gold_sub_labels, binary_predicted_sub_labels, average='weighted', zero_division=1)
    sub_recall = recall_score(binary_gold_sub_labels, binary_predicted_sub_labels, average='weighted', zero_division=1)

    print(f"sub_f1: {sub_f1}")
    print(f"sub_precision: {sub_precision}")
    print(f"sub_recall: {sub_recall}")
    
    sub_f1_scores.append(sub_f1)
    sub_precision_scores.append(sub_precision)
    sub_recall_scores.append(sub_recall)

# Calculate average scores
average_sub_f1 = np.mean(sub_f1_scores)
average_sub_precision = np.mean(sub_precision_scores)
average_sub_recall = np.mean(sub_recall_scores)

print(f"Average Sub-Narrative F1 Score: {average_sub_f1}")
print(f"Average Sub-Narrative Precision: {average_sub_precision}")
print(f"Average Sub-Narrative Recall: {average_sub_recall}")





Evaluating on test set...


Filename: EN_CC_200030.txt
Predicted Sub-Narrative Labels: ('Other',)
--------------------------------------------------
Filename: EN_CC_200033.txt
Predicted Sub-Narrative Labels: ('Other',)
--------------------------------------------------
Filename: EN_CC_200034.txt
Predicted Sub-Narrative Labels: ('Other',)
--------------------------------------------------
Filename: EN_CC_200035.txt
Predicted Sub-Narrative Labels: ()
--------------------------------------------------
Filename: EN_CC_200036.txt
Predicted Sub-Narrative Labels: ()
--------------------------------------------------
Filename: EN_CC_200040.txt
Predicted Sub-Narrative Labels: ('Other',)
--------------------------------------------------
Filename: EN_CC_200046.txt
Predicted Sub-Narrative Labels: ()
--------------------------------------------------
Filename: EN_CC_200047.txt
Predicted Sub-Narrative Labels: ('Other',)
--------------------------------------------------
Filename: EN_CC_200049.txt
Predicted Sub-Narrative Label



sub_f1: 1.0
sub_precision: 1.0
sub_recall: 1.0
filename: EN_UA_DEV_20.txt
Gold Labels: ['URW: Discrediting Ukraine', 'URW: Discrediting Ukraine']
Predicted Labels: ('URW: Blaming the war on others rather than the invader', 'URW: Discrediting Ukraine', 'URW: Discrediting the West, Diplomacy')
sub_f1: 0.8190091001011123
sub_precision: 0.7740086000955566
sub_recall: 0.8695652173913043
filename: EN_UA_DEV_213.txt
Gold Labels: ['URW: Discrediting Ukraine', 'URW: Discrediting Ukraine']
Predicted Labels: ('URW: Blaming the war on others rather than the invader', 'URW: Discrediting Ukraine', 'URW: Discrediting the West, Diplomacy')
sub_f1: 0.8873741095553918
sub_precision: 0.9297022684310019
sub_recall: 0.9239130434782609
filename: EN_UA_DEV_214.txt
Gold Labels: ['URW: Discrediting Ukraine', 'URW: Discrediting Ukraine']
Predicted Labels: ('URW: Blaming the war on others rather than the invader', 'URW: Discrediting Ukraine', 'URW: Discrediting the West, Diplomacy')
sub_f1: 0.9192373087199417
su

### Submission format

In [132]:
import os
import pickle

dev_documents = './dev-documents_4_December/'
dev_documents_langs = [dev_documents+"PT/subtask-2-documents/"]


# Function to read and preprocess dev documents
def read_dev_documents(dev_documents_langs):
    dev_texts = []
    dev_filenames = []
    for dev_doc_dir in dev_documents_langs:
        for filename in os.listdir(dev_doc_dir):
            file_path = os.path.join(dev_doc_dir, filename)
            with open(file_path, "r", encoding="utf-8") as file:
                dev_texts.append(file.read())
                dev_filenames.append(filename)
    return dev_texts, dev_filenames

# Read and preprocess dev documents
dev_texts, dev_filenames = read_dev_documents(dev_documents_langs)

# Tokenize dev documents
dev_encodings = tokenizer(dev_texts, truncation=True, padding=True, max_length=max_len, return_tensors="pt")

# Create a dataset for dev documents
class DevDataset(Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __len__(self):
        return len(self.encodings["input_ids"])

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        return item

dev_dataset = DevDataset(dev_encodings)

# Reinitialize the Trainer for narrative prediction
n_trainer = Trainer(
    model=n_model,
    args=training_args,
    train_dataset=train_narrative_dataset,
    eval_dataset=val_narrative_dataset,
    compute_metrics=compute_metrics,
)

# Predict narratives for dev documents
predictions = n_trainer.predict(dev_dataset)
predicted_logits = predictions.predictions
predicted_labels = (predicted_logits > 0).astype(int)

# Convert narrative predictions back to label names
predicted_label_names = narrative_mlb.inverse_transform(predicted_labels)

# Load the sub-narrative model
with open('sub_narrative_model_baseline.pkl', 'rb') as f:
    sn_model_wa = pickle.load(f)

# Initialize the Trainer for sub-narrative prediction
sn_wa_trainer = Trainer(
    model=sn_model_wa,
    args=training_args,
    train_dataset=train_sub_narrative_dataset,
    eval_dataset=val_sub_narrative_dataset,
    compute_metrics=compute_metrics,
)

# Predict sub-narratives for dev documents
sub_predictions = sn_wa_trainer.predict(dev_dataset)
sub_predicted_logits = sub_predictions.predictions
print(sub_predicted_logits)
sub_predicted_labels = (sub_predicted_logits > 0).astype(int)


# Convert sub-narrative predictions back to label names
sub_predicted_label_names = sub_narrative_mlb.inverse_transform(sub_predicted_labels)

# Format predictions into the required output format
output_lines = []

for idx, (narratives, sub_narratives) in enumerate(zip(predicted_label_names, sub_predicted_label_names)):
    # Extract file name for each document
    article_id = dev_filenames[idx]

    # Convert narrative and sub-narrative predictions to semicolon-separated strings
    narratives_str = ";".join(narratives) if narratives else "Other"
    sub_narratives_str = ";".join(sub_narratives) if sub_narratives else "Other"

    # Create the formatted output line
    output_line = f"{article_id}\t{narratives_str}\t{sub_narratives_str}"
    output_lines.append(output_line)

# Save predictions to a file
output_file = "predictions_subtask2_pt_dev.txt"
with open(output_file, "w", encoding="utf-8") as file:
    file.write("\n".join(output_lines))

print(f"Predictions saved to {output_file}")


[[-4.0639596 -5.617237  -5.8707914 ... -5.3103714 -5.8869705 -5.519807 ]
 [-3.765885  -5.6189613 -4.9781237 ... -4.881353  -5.428876  -5.4265733]
 [-2.425073  -5.319734  -5.053     ... -5.2430506 -5.707826  -5.3044972]
 ...
 [-3.4629765 -4.3725796 -4.9724045 ... -4.164311  -4.788498  -4.467118 ]
 [-5.1191278 -5.412079  -5.9389477 ... -5.889707  -5.7471547 -5.6139436]
 [-3.8561044 -4.1560965 -5.213524  ... -2.4904811 -4.283989  -3.5664353]]
Predictions saved to predictions_subtask2_pt_dev.txt
