Ran the code on Google Colab since it crashed locally.

In [1]:
!pip install pymupdf
!pip install faiss-cpu
!pip install evaluate

Collecting pymupdf
  Downloading pymupdf-1.26.1-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Downloading pymupdf-1.26.1-cp39-abi3-manylinux_2_28_x86_64.whl (24.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m49.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pymupdf
Successfully installed pymupdf-1.26.1
Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.8 kB)
Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl (31.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.3/31.3 MB[0m [31m32.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.11.0
Collecting evaluate
  Downloading evaluate-0.4.4-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.4-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m7.9 MB/s[0m

In [None]:
import numpy as np
from sentence_transformers import SentenceTransformer
import nltk
from nltk.tokenize import sent_tokenize
import fitz
import re
import faiss
import pandas as pd
from datasets import Dataset
from transformers import GPT2Tokenizer, GPT2ForSequenceClassification, TrainingArguments, Trainer
import evaluate
import torch.nn.functional as F
import os
import torch

SEED = 42

# Keep the Hugging Face Trainer from logging to Weights & Biases.
os.environ["WANDB_DISABLED"] = "true"
# NLTK tokenizer data (presumably what sent_tokenize needs at runtime).
nltk.download('punkt')
nltk.download('punkt_tab')
# Seed NumPy *and* PyTorch. The GPT-2 classification head is freshly
# initialised when the model is loaded ("newly initialized: score.weight"),
# which presumably draws from torch's global RNG before the Trainer exists,
# so seeding only NumPy leaves that initialisation unrepeatable.
np.random.seed(SEED)
torch.manual_seed(SEED)

def extract_text(pdf_path, skip_pages):
    """Return the text of a PDF, ignoring its first ``skip_pages`` pages.

    Args:
        pdf_path: Path of the PDF file, opened with PyMuPDF (``fitz``).
        skip_pages: Number of leading pages to skip (page indices
            ``skip_pages`` .. ``len(doc) - 1`` are read).

    Returns:
        The text of every remaining page that yielded non-empty text,
        joined with newlines.
    """
    # Open the document as a context manager so the file handle is released
    # even if text extraction raises; the original never closed it.
    with fitz.open(pdf_path) as doc:
        page_texts = (
            doc[page_num].get_text() for page_num in range(skip_pages, len(doc))
        )
        # join() consumes the generator while the document is still open.
        return "\n".join(page_text for page_text in page_texts if page_text)

def clean_text(text):
    """Normalise raw PDF text before sentence chunking.

    Steps, in order:
      1. Drop lines that contain only digits (page numbers).
      2. Collapse every run of whitespace into a single space.
      3. Remove characters other than word characters, spaces and ``.,!?%$€-``.
      4. Remove comma-separated lists of (dotted) numbers such as ``1.2, 3.4``.
      5. Remove section labels of the form ``A.1`` / ``B.2.3``.

    Args:
        text: Raw text as extracted from the PDF.

    Returns:
        The cleaned text with leading/trailing whitespace stripped.
    """
    # Page numbers must be removed BEFORE whitespace is collapsed: once "\s+"
    # has turned every newline into a space, r"\n\d+\n" can never match.
    text = re.sub(r'\n\d+\n', '\n', text)
    text = re.sub(r"\s+", " ", text)
    text = re.sub(r"[^\w.,!? %$€-]", "", text)
    text = re.sub(r"(\d+(\.\d+)*)(,\s*\d+(\.\d+)*)+", "", text)
    text = re.sub(r"\b[A-Z]\.\d+(\.\d+)*", "", text)
    return text.strip()

# Knowledge-base PDFs, each paired with the number of leading pages that is
# handed to extract_text as skip_pages.
knowledge_base_sources = [
    ("knowledge_base_articles/SPM_version_report_LR.pdf", 3),
    ("knowledge_base_articles/01_SROCC_SPM_FINAL.pdf", 3),
    ("knowledge_base_articles/IPCC_AR6_SYR_SPM.pdf", 8),
    ("knowledge_base_articles/IPCC_AR6_WGIII_SummaryForPolicymakers.pdf", 7),
    ("knowledge_base_articles/IPCC_AR6_WGI_SPM.pdf", 3),
    ("knowledge_base_articles/SRCCL_SPM.pdf", 5),
    # Currently disabled sources (text_7 .. text_9):
    # ("knowledge_base_articles/s41558-025-02337-7.pdf", 0),
    # ("knowledge_base_articles/s44168-025-00220-x.pdf", 0),
    # ("knowledge_base_articles/weisner-et-al-cumulative-human-health-risk-assessment-of-regional-ozone-and-volatile-organic-compounds-from.pdf", 0),
]

# Extract every document, then clean each one; the numbered names are kept
# because later statements refer to them individually.
raw_texts = [
    extract_text(pdf_path, skip_pages)
    for pdf_path, skip_pages in knowledge_base_sources
]
text_1, text_2, text_3, text_4, text_5, text_6 = raw_texts
(
    text_1_cleaned,
    text_2_cleaned,
    text_3_cleaned,
    text_4_cleaned,
    text_5_cleaned,
    text_6_cleaned,
) = [clean_text(raw_text) for raw_text in raw_texts]

def chunk_text(text, max_sentences=4):
    """Split ``text`` into chunks of at most ``max_sentences`` sentences.

    Sentences come from ``sent_tokenize``; consecutive groups of
    ``max_sentences`` sentences are joined with single spaces. The last
    chunk may hold fewer sentences.

    Returns:
        A list of chunk strings (empty if no sentences were found).
    """
    sentences = sent_tokenize(text)
    return [
        " ".join(sentences[start:start + max_sentences])
        for start in range(0, len(sentences), max_sentences)
    ]

# Cleaned documents that feed the retrieval knowledge base, in order.
cleaned_documents = (
    text_1_cleaned,
    text_2_cleaned,
    text_3_cleaned,
    text_4_cleaned,
    text_5_cleaned,
    text_6_cleaned,
    # text_7_cleaned,
    # text_8_cleaned,
    # text_9_cleaned,
)

# Flat list of sentence chunks across all documents.
all_chunks = [
    chunk
    for document in cleaned_documents
    for chunk in chunk_text(document)
]

# Embed every knowledge-base chunk with the all-MiniLM-L6-v2 sentence encoder;
# convert_to_numpy=True asks for a NumPy array back.
embedder = SentenceTransformer('all-MiniLM-L6-v2')
chunk_embeddings = embedder.encode(all_chunks, convert_to_numpy=True)

# Build a flat L2 FAISS index sized to the embedding width (second axis of
# the embedding array) and add all chunk vectors. Row i of the index
# corresponds to all_chunks[i], which retrieval relies on.
embedding_dim = chunk_embeddings.shape[1]
index = faiss.IndexFlatL2(embedding_dim)
index.add(chunk_embeddings)


# Articles (one JSON object per line) and their manual real-news annotations.
df = pd.read_json("without_assessment_updated.jsonl", lines=True)
labels_df = pd.read_csv("group31_stage1.csv", sep=";")

# Encode yes/no as 1/0. Series.map is used instead of Series.replace: it
# avoids the pandas downcasting FutureWarning that replace() emitted here and
# turns any unexpected value into NaN, which is caught below.
labels_df["label"] = labels_df["real_news"].map({"yes": 1, "no": 0})

# The CSV "index" column is shifted down by one to address rows of df
# (presumably it is 1-based while df uses the default 0-based index).
adjusted_index = labels_df["index"] - 1
df.loc[adjusted_index, "labels"] = labels_df["label"].values

# Fail with an explicit message rather than an opaque integer-cast error when
# an article has no label or a label other than "yes"/"no".
unlabeled = df["labels"].isna()
if unlabeled.any():
    raise ValueError(
        f"{int(unlabeled.sum())} article(s) have a missing or unrecognised label; "
        f"first affected rows: {list(df.index[unlabeled][:5])}"
    )
df["labels"] = df["labels"].astype(int)

# 80/20 split into (train + validation) and test ...
dataset = Dataset.from_pandas(df).train_test_split(test_size=0.2)
train_all, test = dataset['train'], dataset['test']

# ... then a second 80/20 split of the training part into train / validation.
dataset_train = train_all.train_test_split(test_size=0.2)
train, val = dataset_train['train'], dataset_train['test']

# Pretrained GPT-2 tokenizer. The end-of-sequence token doubles as the padding
# token (presumably because the GPT-2 tokenizer defines no pad token of its
# own), which the padded tokenization below depends on.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

def tokenize_function(articles):
    """Tokenize the ``Text`` field of a batch of articles.

    Every sequence is padded to the tokenizer's maximum length and truncated
    if longer. Intended for use with ``Dataset.map(..., batched=True)``.
    """
    article_texts = articles["Text"]
    return tokenizer(article_texts, padding="max_length", truncation=True)

# Tokenize the train/validation DatasetDict and the held-out test split.
tokenized_dataset = dataset_train.map(tokenize_function, batched=True)
tokenized_test = test.map(tokenize_function, batched=True)

# GPT-2 with a 2-class sequence-classification head. The model config must be
# told which token id is padding, matching the tokenizer's pad token.
model = GPT2ForSequenceClassification.from_pretrained("gpt2", num_labels=2)
model.config.pad_token_id = tokenizer.pad_token_id

# Accuracy metric used by compute_metrics during evaluation.
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Compute accuracy for a ``(logits, labels)`` evaluation pair.

    The predicted class is the arg-max over the last axis of the logits.
    Returns whatever the loaded ``metric`` computes for those predictions.
    """
    logits, labels = eval_pred
    return metric.compute(
        predictions=np.argmax(logits, axis=-1),
        references=labels,
    )

# Evaluate, log and checkpoint once per epoch; after training, reload the
# checkpoint with the best validation accuracy and keep at most two
# checkpoints on disk.
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    num_train_epochs=20,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    save_total_limit=2,
    # The WANDB_DISABLED environment variable is deprecated (see the warning
    # this run printed); report_to="none" is the supported way to switch off
    # experiment-tracking integrations. Note this disables all reporters.
    report_to="none",
)

# Train on the train split and validate on the validation split
# (the "test" key of the second train_test_split).
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    compute_metrics=compute_metrics,
)

trainer.train()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

  labels_df["label"] = labels_df["real_news"].replace({"yes": 1, "no": 0})


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Map:   0%|          | 0/96 [00:00<?, ? examples/s]

Map:   0%|          | 0/24 [00:00<?, ? examples/s]

Map:   0%|          | 0/30 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading builder script: 0.00B [00:00, ?B/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Accuracy
1,2.8905,1.071191,0.458333
2,0.5487,0.573339,0.666667
3,0.3173,0.453879,0.791667
4,0.2713,0.480108,0.833333
5,0.1162,0.493641,0.833333
6,0.1151,0.77571,0.791667
7,0.0506,1.017443,0.708333
8,0.1173,0.857952,0.833333
9,0.0042,1.009154,0.791667
10,0.0115,0.904246,0.875


TrainOutput(global_step=240, training_loss=0.22465080868350923, metrics={'train_runtime': 788.8533, 'train_samples_per_second': 2.434, 'train_steps_per_second': 0.304, 'total_flos': 1003379522273280.0, 'train_loss': 0.22465080868350923, 'epoch': 20.0})

In [None]:
# Use the trainer's (fine-tuned) model for inference: move it to the GPU when
# one is available and switch it to evaluation mode.
model = trained_model = trainer.model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device).eval()

def retrieve_relevant_chunks(query, k=2):
    """Return up to ``k`` knowledge-base chunks nearest to ``query``.

    Args:
        query: Text to embed and search for.
        k: Number of nearest neighbours requested from the FAISS index.

    Returns:
        The matching entries of ``all_chunks``, in the order the index
        returned them. Fewer than ``k`` items are returned when the index
        holds fewer than ``k`` vectors.
    """
    query_embedding = embedder.encode([query], convert_to_numpy=True)
    _, neighbor_ids = index.search(query_embedding, k)
    # FAISS fills missing neighbours with -1; without this filter a -1 would
    # silently select all_chunks[-1] (the last chunk) as a "match".
    return [all_chunks[i] for i in neighbor_ids[0] if i >= 0]

def build_prompt(context_chunks, article_text, max_context_chars=3000):
    """Assemble the classification prompt from context and article text.

    Args:
        context_chunks: Iterable of context strings, joined with newlines.
        article_text: The article to be classified.
        max_context_chars: Maximum number of context characters kept; a
            longer context is cut at this length and suffixed with "...".

    Returns:
        The prompt string with Context, Article and Task sections.
    """
    joined_context = "\n".join(context_chunks)
    context = (
        joined_context
        if len(joined_context) <= max_context_chars
        else joined_context[:max_context_chars] + "..."
    )
    return f"""Context:
{context}

Article:
{article_text}

Task: Is this article propaganda or real news? Respond with "Real News" or "Propaganda"."""


def classify_prompt(prompt, max_input_length=1000):
    """Classify a prompt as "Propaganda" or "Real News" with the model.

    Args:
        prompt: Text to classify.
        max_input_length: Token limit; longer inputs are truncated.

    Returns:
        "Propaganda" for predicted class 0, "Real News" for class 1.
    """
    encoded = tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=max_input_length,
        padding=True,
    )
    # Move every input tensor to the same device as the model.
    model_inputs = {name: tensor.to(device) for name, tensor in encoded.items()}

    # Inference only: no gradients needed.
    with torch.no_grad():
        probs = F.softmax(model(**model_inputs).logits, dim=-1)
    predicted_class = torch.argmax(probs, dim=-1).item()

    return {0: "Propaganda", 1: "Real News"}[predicted_class]

In [4]:
# Evaluate the trained model on the held-out test split. tokenized_test is
# rebuilt here exactly as in the training cell, so this cell only needs
# `test`, `tokenize_function` and `trainer` to exist.
tokenized_test = test.map(tokenize_function, batched=True)
metrics = trainer.evaluate(eval_dataset=tokenized_test)
print(metrics)

Map:   0%|          | 0/30 [00:00<?, ? examples/s]

{'eval_loss': 0.6668455600738525, 'eval_accuracy': 0.8666666666666667, 'eval_runtime': 2.2673, 'eval_samples_per_second': 13.232, 'eval_steps_per_second': 1.764, 'epoch': 20.0}


In [14]:
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score

# Held-out test split: predicted class = arg-max over the logits.
predictions = trainer.predict(tokenized_test)
predicted_labels = np.argmax(predictions.predictions, axis=-1)
real_labels = np.array(test['labels'])

# Accuracy plus binary F1 / recall / precision (positive class = label 1).
accuracy = accuracy_score(real_labels, predicted_labels)
f1, recall, precision = (
    score_fn(real_labels, predicted_labels, average='binary')
    for score_fn in (f1_score, recall_score, precision_score)
)

print(f"Accuracy: {accuracy}")
print(f"F1 Score: {f1}")
print(f"Recall: {recall}")
print(f"Precision: {precision}")

Accuracy: 0.8666666666666667
F1 Score: 0.8823529411764706
Recall: 1.0
Precision: 0.7894736842105263


In [18]:
# Same metrics on train_all, i.e. the training AND validation articles the
# model was fitted / selected on (useful as an overfitting check).
tokenized_train = train_all.map(tokenize_function, batched=True)

predictions = trainer.predict(tokenized_train)
predicted_labels = np.argmax(predictions.predictions, axis=-1)
real_labels = np.array(train_all['labels'])

accuracy = accuracy_score(real_labels, predicted_labels)
f1, recall, precision = (
    score_fn(real_labels, predicted_labels, average='binary')
    for score_fn in (f1_score, recall_score, precision_score)
)

print(f"Accuracy: {accuracy}")
print(f"F1 Score: {f1}")
print(f"Recall: {recall}")
print(f"Precision: {precision}")

Map:   0%|          | 0/120 [00:00<?, ? examples/s]

Accuracy: 0.975
F1 Score: 0.9714285714285714
Recall: 0.9807692307692307
Precision: 0.9622641509433962


In [23]:
# Baseline: predict label 0 ("Propaganda") for every test article.
real_labels = np.array(test['labels'])
all_prop_labels = np.zeros_like(real_labels)

accuracy = accuracy_score(real_labels, all_prop_labels)
# With no positive predictions, precision (and hence F1) is undefined.
# zero_division=0 reports them as 0.0 explicitly instead of emitting
# scikit-learn's undefined-metric warning; the printed values are unchanged.
f1 = f1_score(real_labels, all_prop_labels, average='binary', zero_division=0)
recall = recall_score(real_labels, all_prop_labels, average='binary', zero_division=0)
precision = precision_score(real_labels, all_prop_labels, average='binary', zero_division=0)

print(f"Accuracy: {accuracy}")
print(f"F1 Score: {f1}")
print(f"Recall: {recall}")
print(f"Precision: {precision}")

Accuracy: 0.5
F1 Score: 0.0
Recall: 0.0
Precision: 0.0


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
