# initializations

In [None]:
!pip install transformers bitsandbytes accelerate

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
from torch.utils.data import DataLoader
from tqdm import tqdm
import pandas as pd
import numpy as np
import random
import pickle
import torch
import json
import ast
import os

# loading data

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# %cd /content/drive/MyDrive/University/Research/SemEval 2025: Task 7
%cd /content/drive/MyDrive/Research/SemEval 2025: Task 7

/content/drive/.shortcut-targets-by-id/1iZ2XHgIpDSkxPjihIgMQ_KPj766HC2So/Research/SemEval 2025: Task 7


In [None]:
import sys
sys.path.append('./src')

In [None]:
import utils

In [None]:
def parse_col(s):
    """Decode one serialized cell back into the Python literal it represents.

    Falsy cells (for example the empty string) are returned untouched. For
    anything else, raw newline characters are escaped first so the text can be
    handed to ``ast.literal_eval``.
    """
    if not s:
        return s
    escaped = s.replace('\n', '\\n')
    return ast.literal_eval(escaped)


# Fact checks: NaNs become empty strings and the frame is keyed by fact_check_id.
fact_checks_df = (
    pd.read_csv('./data/cleaned data/fact_checks.csv')
    .fillna('')
    .set_index('fact_check_id')
)

# 'claim' and 'title' are stored as serialized literals; decode them cell by cell.
for col in ('claim', 'title'):
    fact_checks_df[col] = fact_checks_df[col].apply(parse_col)

# Posts get the same NaN handling and are keyed by post_id.
posts_df = (
    pd.read_csv('./data/cleaned data/posts.csv')
    .fillna('')
    .set_index('post_id')
)

# Post <-> fact-check pairs, read as-is.
mapping_df = pd.read_csv('./data/original data/pairs.csv')

# Task definition file, parsed into `tasks`.
with open('./data/original data/tasks.json', 'r') as file:
    tasks = json.load(file)

# Models

## gte-multilingual-base

In [None]:
from transformers import AutoModel, AutoTokenizer, BitsAndBytesConfig

In [None]:
# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# 4-bit NF4 quantization with double quantization and a float16 compute dtype.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype="float16",
)

model_name = "Alibaba-NLP/gte-multilingual-base"

tokenizer = AutoTokenizer.from_pretrained(model_name)

# trust_remote_code=True allows the modeling code shipped with the checkpoint to run;
# device_map="auto" leaves device placement to the loader.
model = AutoModel.from_pretrained(
    model_name,
    trust_remote_code=True,
    device_map="auto",
    quantization_config=quantization_config,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of the model checkpoint at Alibaba-NLP/gte-multilingual-base were not used when initializing NewModel: {'classifier.bias', 'classifier.weight'}
- This IS expected if you are initializing NewModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing NewModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceCla

In [None]:
def get_embeddings(ids, data, batch_size = 16):
    """Embed texts with the notebook-level ``model``/``tokenizer`` and key them by id.

    Args:
        ids: Identifiers aligned one-to-one with ``data``.
        data: List of texts to embed.
        batch_size: Number of texts tokenized and encoded per forward pass.

    Returns:
        dict mapping each id to a numpy vector taken from sequence position 0
        of the model's last hidden state (the CLS slot). Insertion order
        follows ``ids``; if an id repeats, only its last vector is kept.
    """
    embeddings = {}

    # Inference must not depend on whether a training cell ran earlier: force
    # eval mode (disables dropout) and put the previous mode back afterwards.
    was_training = model.training
    model.eval()
    try:
        for start in tqdm(range(0, len(data), batch_size), desc="Processing Batches"):
            batch_data = data[start:start + batch_size]
            batch_id = ids[start:start + batch_size]

            # Tokenize the whole batch at once, padded to its longest member.
            inputs = tokenizer(batch_data, padding=True, truncation=True, return_tensors="pt").to(device)
            with torch.no_grad():
                emb = model(**inputs).last_hidden_state[:, 0, :]  # CLS token embedding

            embeddings.update(dict(zip(batch_id, emb.cpu().numpy())))
    finally:
        model.train(was_training)

    return embeddings

# Building train dataset

In [None]:
def get_fact_checks(post_id):
    """Return the fact-check ids paired with ``post_id`` in ``mapping_df``.

    The ids are passed through ``fact_checks_df.loc`` before being returned,
    so an id missing from ``fact_checks_df`` surfaces as a lookup error.
    """
    belongs_to_post = mapping_df['post_id'] == post_id
    linked_ids = mapping_df.loc[belongs_to_post, 'fact_check_id'].to_list()
    return fact_checks_df.loc[linked_ids].index.to_list()

def get_negative_samples(FCs, nearest_FCs, k):
    """Pick up to ``k`` negatives from a ranked candidate sequence.

    Args:
        FCs: Fact-check ids that are true matches and must be excluded.
        nearest_FCs: Candidate ids, iterated in the order given.
        k: Maximum number of negatives to return.

    Returns:
        list with the first ``k`` candidates that are not in ``FCs`` (fewer if
        the candidates run out, empty when ``k`` is not positive).
    """
    negatives = []
    if k <= 0:
        return negatives

    for candidate in nearest_FCs:
        if candidate in FCs:
            continue
        negatives.append(candidate)
        # Stop as soon as the quota is met instead of walking the remaining candidates.
        if len(negatives) >= k:
            break
    return negatives

def get_samples(posts_ids, top_indices_ids, num_negatives=3):
    """Build per-post positive and negative fact-check id lists.

    Args:
        posts_ids: Post ids, in the same order as ``top_indices_ids``.
        top_indices_ids: For each post, its ranked candidate fact-check ids.
        num_negatives: Maximum number of negatives kept per post (default 3,
            the value that used to be hard-coded).

    Returns:
        ``(positive_samples, negative_samples)``: two lists aligned with
        ``posts_ids``. Positives come from ``get_fact_checks``; negatives are
        the first candidates that are not positives.
    """
    positive_samples, negative_samples = [], []
    for i, post_id in enumerate(posts_ids):
        linked = get_fact_checks(post_id)

        positive_samples.append(linked)
        negative_samples.append(get_negative_samples(linked, top_indices_ids[i], num_negatives))

    return positive_samples, negative_samples

def sort_indices_by_similarity(similarities, top_indices):
    """Order each row's candidate column indices from most to least similar.

    ``similarities`` is a 2-D array with one row per query; ``top_indices[i]``
    holds the candidate column indices for row ``i``. The result is a list
    with one list per row containing the same indices, sorted by their
    similarity in descending order.
    """
    ranked = []
    for row_no in range(similarities.shape[0]):
        row = similarities[row_no, :]
        candidates = top_indices[row_no]

        # Pair every candidate index with its score in this row.
        scores = row[np.array(candidates)]
        score_of = dict(zip(candidates, scores))

        ranked.append(sorted(score_of, key=score_of.get, reverse=True))

    return ranked

def get_positive_and_negative_samples(posts, fact_checks_embeddings, top_k=10):
    """Mine positive and hard-negative fact-check ids for a set of posts.

    Args:
        posts: DataFrame indexed by post id with a 'content' column.
        fact_checks_embeddings: DataFrame indexed by fact-check id with an
            'embedding' column holding one vector per row.
        top_k: Number of nearest fact checks considered per post (default 10,
            the value that used to be hard-coded). It is capped at the number
            of available fact checks.

    Returns:
        ``(positive_samples, negative_samples)`` as produced by ``get_samples``,
        aligned with ``posts.index``.

    Raises:
        ValueError: if ``top_k`` is smaller than 1.
    """
    if top_k < 1:
        raise ValueError("top_k must be at least 1")

    posts_embedding = get_embeddings(posts.index.to_list(), posts['content'].to_list(), batch_size = 16)

    similarities = cosine_similarity(list(posts_embedding.values()), fact_checks_embeddings['embedding'].to_list())

    # np.argpartition rejects a kth outside the axis bounds, so never request
    # more neighbours than there are fact checks in the pool.
    k = min(top_k, similarities.shape[1])
    nearest = np.argpartition(similarities, -k, axis=1)[:, -k:]
    nearest = sort_indices_by_similarity(similarities, nearest)

    # Translate positional column indices into fact-check ids via the index.
    fc_ids = fact_checks_embeddings.index
    top_indices = [[fc_ids[idx] for idx in sublist] for sublist in nearest]

    positive_samples, negative_samples = get_samples(posts.index, top_indices)

    return positive_samples, negative_samples

In [None]:
fact_checks_embeddings = utils.load_fact_checks_embeddings("gte-multilingual-base")

## manual testing

In [None]:
lang = 'deu'
fc = fact_checks_df.loc[tasks['monolingual'][lang]['fact_checks']]
# fc = fact_checks_df.loc[tasks['crosslingual']['fact_checks']]

fc_emb = fact_checks_embeddings.loc[fc.index]
# fc_emb = get_embeddings(fc.index.to_list(), fc['claim'].apply(lambda x: x[0]).to_list(), batch_size = 16)

In [None]:
posts = posts_df.loc[tasks['monolingual'][lang]['posts_train']].head(10)

In [None]:
# Keep the result: the following cells read `positive_samples` and `negative_samples`.
positive_samples, negative_samples = get_positive_and_negative_samples(posts, fc_emb)
positive_samples, negative_samples

Processing Batches: 100%|██████████| 1/1 [00:00<00:00,  1.84it/s]


([[87108],
  [150241],
  [98619],
  [118585],
  [77239],
  [45291],
  [25311, 45723],
  [53337],
  [111002],
  [58104]],
 [[89646, 53380, 45567],
  [23340, 151813, 151814],
  [62469, 71566, 44050],
  [118586, 151811, 49895],
  [44056, 49733, 77238],
  [71307, 70016, 118142],
  [45735, 44050, 42628],
  [26764, 41249, 40340],
  [68256, 44739, 118811],
  [44031, 45243, 63987]])

In [None]:
positive_samples

[[87108],
 [150241],
 [98619],
 [118585],
 [77239],
 [45291],
 [25311, 45723],
 [53337],
 [111002],
 [58104]]

In [None]:
negative_samples

[[89646, 53380, 45567],
 [23340, 151813, 151814],
 [62469, 71566, 44050],
 [118586, 151811, 49895],
 [44056, 49733, 77238],
 [71307, 70016, 118142],
 [45735, 44050, 42628],
 [26764, 41249, 40340],
 [68256, 44739, 118811],
 [44031, 45243, 63987]]

In [None]:
# For every inspected post, print its 'eng_content' followed by element [1] of
# the 'claim' entry of each positive and each negative fact check.
for i in range(len(posts)):
    post_row = posts.iloc[i]
    print(f"content: {post_row['eng_content']}")
    for heading, samples_by_post in (
        ("\n positive samples:", positive_samples),
        ("\n negative samples:", negative_samples),
    ):
        print(heading)
        for j in samples_by_post[i]:
            claim_text = fc.loc[j]['claim'][1]
            print(f"\t{claim_text}")
    print("============================================================================================================")

content: ! Brazen vaccination fake by Markus Söder! It's really unbelievable how bold Top politicians such as Markus Söder kidding us. On Instagram does Söder busy advertising for vaccination But if you look closely, you can see you that he can't be injected at all. The lid is still on the needle. You can see how much those who want to vaccinate you, the Trust vaccines! markus.soeder TBE ...

 positive samples:
	Markus Söder faked his vaccination.

 negative samples:
	With a combination of vaccination date photos, mood is being made on social media against Bavaria's Prime Minister Markus Söder. Some use the collage (archived here) to question how often the CSU boss got vaccinated. A connection to the corona vaccination is often made. Some make (archived here) - probably jokingly - a connection between the head of government and the shortage of preparations in Germany.
	It is suggested that Markus Söder did not have anything injected with a current corona vaccination because the cap was

In [None]:
posts.index.to_list()

[0, 9, 10, 25, 52, 95, 96, 98, 108, 120]

## building positive and negative samples for posts

In [None]:
positive_samples_indices, negative_samples_indices = {}, {}

### monolingual posts

In [None]:
# Mine samples language by language, searching only that language's fact checks.
for lang in tasks['monolingual']:
    lang_task = tasks['monolingual'][lang]
    posts = posts_df.loc[lang_task['posts_train']]
    fc_embeddings = fact_checks_embeddings.loc[lang_task['fact_checks']]

    print(f"lang: {lang}, posts: { len(posts) }, fc: { len(fc_embeddings) }")

    positive_samples, negative_samples = get_positive_and_negative_samples(posts, fc_embeddings)

    # Record the results under each post id; an id seen again overwrites its earlier entry.
    post_ids = posts.index.to_list()
    positive_samples_indices.update(zip(post_ids, positive_samples))
    negative_samples_indices.update(zip(post_ids, negative_samples))

lang: fra, posts: 1596, fc: 4355


Processing Batches: 100%|██████████| 100/100 [01:31<00:00,  1.09it/s]


lang: spa, posts: 5628, fc: 14082


Processing Batches: 100%|██████████| 352/352 [04:51<00:00,  1.21it/s]


lang: eng, posts: 4351, fc: 85734


Processing Batches: 100%|██████████| 272/272 [03:27<00:00,  1.31it/s]


lang: por, posts: 2571, fc: 21569


Processing Batches: 100%|██████████| 161/161 [01:33<00:00,  1.72it/s]


lang: tha, posts: 465, fc: 382


Processing Batches: 100%|██████████| 30/30 [00:37<00:00,  1.24s/it]


lang: deu, posts: 667, fc: 4996


Processing Batches: 100%|██████████| 42/42 [00:58<00:00,  1.40s/it]


lang: msa, posts: 1062, fc: 8424


Processing Batches: 100%|██████████| 67/67 [00:43<00:00,  1.56it/s]


lang: ara, posts: 676, fc: 14201


Processing Batches: 100%|██████████| 43/43 [00:17<00:00,  2.42it/s]


### crosslingual posts

In [None]:
# Same mining step for the crosslingual split, searching its own fact-check pool.
crosslingual_task = tasks['crosslingual']
posts = posts_df.loc[crosslingual_task['posts_train']]
fc_embeddings = fact_checks_embeddings.loc[crosslingual_task['fact_checks']]

print(f"Crosslingual posts, posts: { len(posts) }, fc: { len(fc_embeddings) }")

positive_samples, negative_samples = get_positive_and_negative_samples(posts, fc_embeddings)

# Record the results under each post id; an id already present is overwritten.
post_ids = posts.index.to_list()
positive_samples_indices.update(zip(post_ids, positive_samples))
negative_samples_indices.update(zip(post_ids, negative_samples))

Crosslingual posts, posts: 4972, fc: 153743


Processing Batches: 100%|██████████| 311/311 [04:51<00:00,  1.07it/s]


### saving samples

In [None]:
folder_path = './data/training data'
model_name = 'gte-multilingual-base'
lang_type = 'multi'
# lang_type = 'eng'

In [None]:
# Convert numpy int64 to Python int before serializing
def convert_to_int(obj):
    """Recursively replace numpy integer scalars with built-in ``int``.

    ``json.dump`` cannot serialize numpy integers, whether they appear as
    values or as dictionary keys. Any numpy integer width is converted, and
    dicts, lists and tuples are walked recursively. Every other object is
    returned unchanged.

    Args:
        obj: A scalar or an arbitrarily nested dict/list/tuple structure.

    Returns:
        A structure of the same shape in which every numpy integer has been
        replaced by the equal Python ``int``.
    """
    if isinstance(obj, np.integer):
        return int(obj)
    if isinstance(obj, dict):
        # Keys are converted as well: numpy integer keys are rejected by json.
        return {convert_to_int(k): convert_to_int(v) for k, v in obj.items()}
    if isinstance(obj, list):
        return [convert_to_int(v) for v in obj]
    if isinstance(obj, tuple):
        return tuple(convert_to_int(v) for v in obj)
    return obj

# Now use the converted dictionary with json.dump
# Positive ids first, then negative ids; both pass through convert_to_int before json.dump.
positive_path = f'{folder_path}/{model_name}_{lang_type}_positive_samples.json'
with open(positive_path, 'w') as json_file:
    json.dump(convert_to_int(positive_samples_indices), json_file, indent=4)

negative_path = f'{folder_path}/{model_name}_{lang_type}_negative_samples.json'
with open(negative_path, 'w') as json_file:
    json.dump(convert_to_int(negative_samples_indices), json_file, indent=4)

# training the Model

## loading and preparing train data

In [None]:
folder_path = './data/training data'
model_name = 'gte-multilingual-base'
lang_type = 'multi'

with open(f'{folder_path}/{model_name}_{lang_type}_positive_samples.json', 'r') as file:
    positive_samples = json.load(file)

# with open(f'{folder_path}/{model_name}_{lang_type}_negative_samples.json', 'r') as file:
#     negative_samples = json.load(file)

In [None]:
# Swap each post's fact-check ids for element [0] of the matching 'claim' entries.
# This cell is not idempotent: once it has run, the values are no longer ids,
# so reload the JSON before running it again.
positive_samples = {
    post_key: fact_checks_df.loc[fc_ids]['claim'].apply(lambda x: x[0]).to_list()
    for post_key, fc_ids in positive_samples.items()
}

# for key, value in negative_samples.items():
#     negative_samples[key] = fact_checks_df.loc[negative_samples[key]]['claim'].apply(lambda x: x[0]).to_list()

In [None]:
# All training post ids: the crosslingual split first, then every monolingual language in turn.
posts_train = list(tasks['crosslingual']['posts_train'])
for lang in tasks['monolingual']:
    posts_train += tasks['monolingual'][lang]['posts_train']

## train the model

In [None]:
from peft import LoraConfig, get_peft_model

# LoRA adapter settings: rank 8, alpha 32, dropout 0.05, bias terms left untouched.
# NOTE(review): target_modules has to name modules that exist in the loaded
# model; check model.named_modules() to make sure this architecture really
# exposes "q_proj"/"v_proj" — TODO confirm.
config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],  # Apply LoRA on attention layers
    lora_dropout=0.05,
    bias="none"
)
# `model` is rebound to the wrapped model, so running this cell a second time
# would wrap an already-wrapped model.
model = get_peft_model(model, config)

In [None]:
# One record per post: the post's 'eng_content' as anchor plus its list of positives.
# Only the Thai monolingual training posts are used here (`posts_train` holds the full list).
dataset = [
    {
        "anchor": posts_df.loc[post_id]['eng_content'],
        "positives": positive_samples[str(post_id)],
    }
    for post_id in tasks['monolingual']['tha']['posts_train']
]

In [None]:
import random
from torch.utils.data import Dataset, DataLoader

class CustomDataset(Dataset):
    """Flat collection of (anchor, positive) text pairs, tokenized on access.

    Each input record is a mapping with an "anchor" text and a list of
    "positives"; it contributes one pair per positive.
    """

    def __init__(self, data, tokenizer, max_length=512):
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.data = []

        for record in data:
            anchor_text = record["anchor"]
            # Only anchor-positive pairs are stored; a record without positives adds nothing.
            self.data.extend((anchor_text, positive_text) for positive_text in record["positives"])

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        anchor_text, positive_text = self.data[idx]

        # Both texts are tokenized together and padded/truncated to max_length,
        # so row 0 belongs to the anchor and row 1 to the positive.
        batch = self.tokenizer(
            [anchor_text, positive_text],
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )
        token_ids = batch["input_ids"]
        masks = batch["attention_mask"]

        return {
            "anchor": token_ids[0],
            "positive": token_ids[1],
            "anchor_mask": masks[0],
            "positive_mask": masks[1],
        }

# Create dataset and dataloader
train_dataset = CustomDataset(dataset, tokenizer)
train_dataloader = DataLoader(train_dataset, batch_size=2, shuffle=True)

In [None]:
import torch.nn as nn
import torch.nn.functional as F

class ContrastiveLoss(nn.Module):
    """InfoNCE-style contrastive loss using in-batch negatives.

    For every query, its own positive (same row) is the target class and the
    positives of all other rows in the batch act as negatives. The previous
    implementation fed a single logit per row into cross-entropy, which is
    always exactly zero and therefore produced no gradient.

    Args:
        temperature: Divisor applied to the cosine similarities before the
            softmax; smaller values sharpen the distribution.

    Notes:
        With a batch of one there are no negatives and the loss is zero. Rows
        that share the same anchor or positive inside one batch are treated as
        negatives of each other.
    """

    def __init__(self, temperature=0.05):
        super().__init__()
        self.temperature = temperature

    def forward(self, query_embeddings, pos_embeddings):
        """Return the mean cross-entropy over the batch similarity matrix.

        Both inputs are 2-D tensors with one embedding per row, where row ``i``
        of ``pos_embeddings`` is the positive for row ``i`` of
        ``query_embeddings``.
        """
        # Unit-normalize so the dot product is the cosine similarity; compute
        # in float32 to keep the softmax stable for half-precision inputs.
        queries = F.normalize(query_embeddings.float(), dim=-1)
        positives = F.normalize(pos_embeddings.float(), dim=-1)

        # logits[i, j] = similarity of query i with positive j, scaled by temperature.
        logits = queries @ positives.T / self.temperature

        # The matching positive sits on the diagonal.
        labels = torch.arange(logits.shape[0], device=logits.device)

        return F.cross_entropy(logits, labels)

In [None]:
from torch.utils.data import DataLoader

# QLoRA configuration (LoRA with quantized model)
from peft import LoraConfig, PeftModel, get_peft_model

lora_config = LoraConfig(
    r=8, lora_alpha=32, target_modules=["q_proj", "v_proj"], lora_dropout=0.1, bias="none"
)

# Attach adapters only when the model is not wrapped yet: wrapping an existing
# PeftModel a second time (earlier cell, or re-running this one) is not intended.
if not isinstance(model, PeftModel):
    model = get_peft_model(model, lora_config)

# Training parameters
epochs = 1
batch_size = 4
learning_rate = 2e-5

# Optimizer: only hand over the parameters that are actually trainable.
trainable_params = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.AdamW(trainable_params, lr=learning_rate)

# Loss function
loss_fn = ContrastiveLoss()

# DataLoader
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Training loop
model.train()

for epoch in range(epochs):
    total_loss = 0

    for batch in tqdm(train_dataloader, desc = "Batch Processing:"):
        optimizer.zero_grad()

        # Move data to GPU
        anchor_input = batch["anchor"].to(device)
        positive_input = batch["positive"].to(device)

        anchor_mask = batch["anchor_mask"].to(device)
        positive_mask = batch["positive_mask"].to(device)

        # Compute embeddings (hidden state at sequence position 0)
        anchor_emb = model(input_ids=anchor_input, attention_mask=anchor_mask).last_hidden_state[:, 0]
        positive_emb = model(input_ids=positive_input, attention_mask=positive_mask).last_hidden_state[:, 0]

        # Compute loss
        loss = loss_fn(anchor_emb, positive_emb)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Mean loss per batch for this epoch.
    print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss/len(train_dataloader):.4f}")

print("Training complete!")

# evaluating the model

In [None]:
def get_fact_checks(post_id):
    """Return the fact-check ids paired with ``post_id`` in ``mapping_df``.

    Same behaviour as the definition in the "Building train dataset" section;
    it is repeated here so the evaluation section has it at hand. An id that
    is missing from ``fact_checks_df`` surfaces as a lookup error.
    """
    belongs_to_post = mapping_df['post_id'] == post_id
    linked_ids = mapping_df.loc[belongs_to_post, 'fact_check_id'].to_list()
    return fact_checks_df.loc[linked_ids].index.to_list()

def common_element(list1, list2):
    """Return True when at least one item of ``list1`` also occurs in ``list2``."""
    for item in list1:
        if item in list2:
            return True
    return False

def get_accuracy(posts_ids, top_indices_ids, show_logs = False):
    """Count the posts whose retrieved candidates contain a true fact check.

    Args:
        posts_ids: Post ids, in the same order as ``top_indices_ids``.
        top_indices_ids: For each post, the retrieved fact-check ids.
        show_logs: When True, print the post content, its true fact checks
            (title and element [1] of the claim) and whether it was a hit.

    Returns:
        ``(corrects, mismatched_posts)``: the number of hits and the list of
        post ids without any hit.
    """
    mismatched_posts = []
    corrects = 0
    for row_no, post_id in enumerate(posts_ids):
        gold_ids = get_fact_checks(post_id)
        hit = common_element(gold_ids, top_indices_ids[row_no])

        if show_logs:
            print("=================================================================")
            print(f'fact_checks for post {post_id}')
            print(f"content: {posts_df.loc[post_id]['content']}")
            print(gold_ids)
            for fc_id in gold_ids:
                print(f"title: {fact_checks_df.loc[fc_id]['title']}")
                print(f"claim: {fact_checks_df.loc[fc_id]['claim'][1]}")
            print(hit)

        if hit:
            corrects += 1
        else:
            mismatched_posts.append(post_id)
    return corrects, mismatched_posts

In [None]:
lang = 'tha'
fc = fact_checks_df.loc[tasks['monolingual'][lang]['fact_checks']]
# fc = fact_checks_df.loc[tasks['crosslingual']['fact_checks']]

# fc_emb = fact_checks_embeddings.loc[fc.index]
fc_emb = get_embeddings(fc.index.to_list(), fc['claim'].apply(lambda x: x[0]).to_list(), batch_size = 16)
# fc_emb = get_embeddings(fc.index.to_list(), fc['content'].to_list(), batch_size = 32)

Processing Batches: 100%|██████████| 24/24 [00:02<00:00,  9.99it/s]


In [None]:
# posts = posts_summaries.loc[posts_summaries.index.isin(tasks['monolingual'][lang]['posts_train'])]
posts = posts_df.loc[tasks['monolingual'][lang]['posts_train']]
# posts = posts_df.loc[tasks['crosslingual']['posts_train']]

print(f"lang: {lang}, posts: { len(posts) }, fc: { len(fc) }")

posts_embedding = get_embeddings(posts.index.to_list(), posts['content'].to_list(), batch_size = 4)

# similarities = cosine_similarity(list(posts_embedding.values()), fc_emb['embedding'].to_list())
similarities = cosine_similarity(list(posts_embedding.values()), list(fc_emb.values()))

# np.argpartition rejects a kth outside the axis bounds, so cap the number of
# retrieved candidates at the size of the fact-check pool.
top_k = min(10, similarities.shape[1])
nearest = np.argpartition(similarities, -top_k, axis=1)[:, -top_k:]

# Materialize the id list once instead of rebuilding it for every looked-up index.
fc_ids = list(fc_emb.keys())
top_indices = [[fc_ids[idx] for idx in sublist] for sublist in nearest]

corrects, mismatched_posts = get_accuracy(posts.index, top_indices)

print(f"accuracy: {corrects/len(posts) * 100}% !")

lang: tha, posts: 465, fc: 382


Processing Batches: 100%|██████████| 117/117 [00:34<00:00,  3.38it/s]


accuracy: 24.516129032258064% !


# saving the model

In [None]:
from google.colab import userdata

def login2HF():
  # Log in to the Hugging Face Hub through the CLI, using the value stored
  # under the Colab secret name 'HF_token'.
  # NOTE(review): the token is interpolated into a shell command line; consider
  # a login path that does not put the secret on the command line.
  !huggingface-cli login --token '{userdata.get('HF_token')}'

In [None]:
login2HF()

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
The token `Colab_notebook` has been saved to /root/.cache/huggingface/stored_tokens
Your token has been saved to /root/.cache/huggingface/token
Login successful.
The current active token is: `Colab_notebook`


In [None]:
# Write the current model and the tokenizer into a local folder of the same name.
model.save_pretrained('gte-multilingual-base_Fine_Tuned_1e')
tokenizer.save_pretrained('gte-multilingual-base_Fine_Tuned_1e')
# Upload through the Hugging Face CLI. Only one argument is passed, so it names
# the repo; presumably the CLI then picks up the same-named local folder — TODO
# confirm against the CLI documentation.
!huggingface-cli upload 'gte-multilingual-base_Fine_Tuned_1e'