# initializations

In [6]:
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
import pandas as pd
import numpy as np
import pickle
import torch
import json
import ast
import os

# loading data

In [7]:
# Mount Google Drive inside the Colab VM; the following cell changes into a
# project folder under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [8]:
# %cd /content/drive/MyDrive/University/Research/SemEval 2025: Task 7
%cd /content/drive/MyDrive/Research/SemEval 2025: Task 7

/content/drive/.shortcut-targets-by-id/1iZ2XHgIpDSkxPjihIgMQ_KPj766HC2So/Research/SemEval 2025: Task 7


In [9]:
def parse_col(s):
    """Parse a stringified Python literal stored in a CSV cell.

    Falsy values (for example the '' produced by fillna('')) are returned
    unchanged. Otherwise raw newline characters are rewritten as the two-character
    escape sequence before `ast.literal_eval` is applied, so string literals
    that span several physical lines still parse.
    """
    if not s:
        return s
    escaped = s.replace('\n', '\\n')
    return ast.literal_eval(escaped)


# Fact checks: missing cells become '', rows are keyed by fact_check_id.
fact_checks_df = (
    pd.read_csv('./data/cleaned data/fact_checks.csv')
    .fillna('')
    .set_index('fact_check_id')
)

# 'claim' and 'title' are stored as stringified Python literals; parse them back.
for col in ('claim', 'title'):
    fact_checks_df[col] = fact_checks_df[col].apply(parse_col)


# Posts: same treatment, keyed by post_id.
posts_df = (
    pd.read_csv('./data/cleaned data/posts.csv')
    .fillna('')
    .set_index('post_id')
)

# Pairs file from the original data folder, loaded as-is.
mapping_df = pd.read_csv('./data/original data/pairs.csv')

# Task definitions, kept as the parsed JSON object.
with open('./data/original data/tasks.json', 'r') as file:
    tasks = json.load(file)

# Models

## UAE-Large-V1

In [None]:
!pip install -U angle-emb

In [None]:
from angle_emb import AnglE, Prompts

In [None]:
# Use the GPU when torch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load UAE-Large-V1 with CLS pooling and move it onto the selected device.
angle = AnglE.from_pretrained('WhereIsAI/UAE-Large-V1', pooling_strategy='cls')
angle = angle.to(device)

tokenizer_config.json:   0%|          | 0.00/1.24k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/655 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

In [None]:
def get_embeddings(ids, data, batch_size=16):
    """Embed `data` in batches with the global `angle` encoder.

    Args:
        ids: Sliceable sequence of identifiers, positionally aligned with `data`.
        data: Sliceable sequence of inputs handed to `angle.encode`.
        batch_size: Number of items passed to the encoder per call.

    Returns:
        dict mapping each id to the matching item returned by
        `angle.encode(..., normalize_embedding=True)`.
    """
    id_to_vector = {}

    for start in tqdm(range(0, len(data), batch_size), desc="Processing Batches"):
        window = slice(start, start + batch_size)
        texts = data[window]
        keys = ids[window]

        vectors = angle.encode(texts, normalize_embedding=True)

        # Pair ids and vectors positionally; a repeated id keeps its last vector.
        for key, vector in zip(keys, vectors):
            id_to_vector[key] = vector

    return id_to_vector

## angle-llama-7b-nli-v2

In [None]:
!pip install -U angle-emb
!pip install transformers bitsandbytes accelerate



In [None]:
from angle_emb import AnglE, Prompts
from transformers import BitsAndBytesConfig

In [None]:
# Use the GPU when torch can see one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Ask for 8-bit loading of the base model.
quantization_config = BitsAndBytesConfig(load_in_8bit=True)

# Llama-2-7b base model with the angle-llama-7b-nli-v2 LoRA weights on top.
angle = AnglE.from_pretrained('NousResearch/Llama-2-7b-hf',
                              pretrained_lora_path='SeanLee97/angle-llama-7b-nli-v2',
                              pooling_strategy='last',
                              is_llm=True,
                              torch_dtype=torch.float16,
                              quantization_config=quantization_config).to(device)

# Prompts.list_prompts() prints the templates itself and returns None, so
# printing its return value shows "All predefined prompts: None" after the
# list. Print the header first, then let it print the templates.
print('All predefined prompts:')
Prompts.list_prompts()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Prompts.A = 'Summarize sentence "{text}" in one word:"'
Prompts.B = 'You can only output one word. Summarize "{text}":"'
Prompts.C = 'Represent this sentence for searching relevant passages: {text}'
All predefined prompts: None


In [None]:
# list_prompts() prints the predefined prompt templates (Prompts.A/B/C in the
# recorded output); get_embeddings in this section uses Prompts.A.
Prompts.list_prompts()

Prompts.A = 'Summarize sentence "{text}" in one word:"'
Prompts.B = 'You can only output one word. Summarize "{text}":"'
Prompts.C = 'Represent this sentence for searching relevant passages: {text}'


In [None]:
def get_embeddings(ids, data, batch_size=16):
    """Embed `data` in batches with the global `angle` encoder and `Prompts.A`.

    Each text is wrapped as {'text': <text>} before encoding, which is the
    input form used together with the `prompt` argument.

    Args:
        ids: Sliceable sequence of identifiers, positionally aligned with `data`.
        data: Sliceable sequence of texts.
        batch_size: Number of texts passed to the encoder per call.

    Returns:
        dict mapping each id to the matching item returned by `angle.encode`.
    """
    collected = {}
    batch_starts = range(0, len(data), batch_size)

    for offset in tqdm(batch_starts, desc="Processing Batches"):
        texts = data[offset:offset + batch_size]
        keys = ids[offset:offset + batch_size]

        # Wrap every text so the prompt's {text} placeholder can be filled.
        wrapped = [{'text': item} for item in texts]
        vectors = angle.encode(wrapped, normalize_embedding=True, prompt=Prompts.A)

        collected.update(zip(keys, vectors))

    return collected

## multilingual-e5-large

In [None]:
from sentence_transformers import SentenceTransformer

In [None]:
# Use the GPU when torch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load multilingual-e5-large and move it onto the selected device.
model = SentenceTransformer('intfloat/multilingual-e5-large')
model = model.to(device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/387 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/160k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/57.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/690 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/418 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/201 [00:00<?, ?B/s]

In [None]:
def get_embeddings(ids, data, batch_size=16, is_post=False):
    """Embed `data` in batches with the global `model`, adding a role prefix.

    Every text is prefixed with 'query: ' when `is_post` is true and with
    'passage: ' otherwise, before being encoded.

    Args:
        ids: Sliceable sequence of identifiers, positionally aligned with `data`.
        data: Iterable of strings (it is materialized into a prefixed list).
        batch_size: Number of texts passed to the encoder per call.
        is_post: True when `data` holds posts, False for passages.

    Returns:
        dict mapping each id to the matching item returned by
        `model.encode(..., normalize_embeddings=True)`.
    """
    prefix = 'query: ' if is_post else 'passage: '
    data = [prefix + text for text in data]

    id_to_vector = {}

    for start in tqdm(range(0, len(data), batch_size), desc="Processing Batches"):
        window = slice(start, start + batch_size)
        texts, keys = data[window], ids[window]

        vectors = model.encode(texts, normalize_embeddings=True)

        for key, vector in zip(keys, vectors):
            id_to_vector[key] = vector

    return id_to_vector

## gte-multilingual-base

In [None]:
from sentence_transformers import SentenceTransformer

In [None]:
# Use the GPU when torch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# trust_remote_code=True is passed to the loader for this checkpoint.
model = SentenceTransformer('Alibaba-NLP/gte-multilingual-base', trust_remote_code=True)
model = model.to(device)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/123k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/55.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.43k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/611M [00:00<?, ?B/s]

Some weights of the model checkpoint at Alibaba-NLP/gte-multilingual-base were not used when initializing NewModel: {'classifier.bias', 'classifier.weight'}
- This IS expected if you are initializing NewModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing NewModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

1_Pooling%2Fconfig.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
def get_embeddings(ids, data, batch_size=16):
    """Embed `data` in batches with the global `model`.

    Args:
        ids: Sliceable sequence of identifiers, positionally aligned with `data`.
        data: Sliceable sequence of texts.
        batch_size: Number of texts passed to the encoder per call.

    Returns:
        dict mapping each id to the matching item returned by
        `model.encode(..., normalize_embeddings=True)`.
    """
    collected = {}
    batch_starts = range(0, len(data), batch_size)

    for offset in tqdm(batch_starts, desc="Processing Batches"):
        end = offset + batch_size
        texts = data[offset:end]
        keys = ids[offset:end]

        vectors = model.encode(texts, normalize_embeddings=True)

        # Ids and vectors are paired by position within the batch.
        collected.update(zip(keys, vectors))

    return collected

## gte-Qwen2-1.5B-instruct

In [None]:
from sentence_transformers import SentenceTransformer

In [None]:
# Use the GPU when torch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# trust_remote_code=True is passed to the loader for this checkpoint.
model = SentenceTransformer('Alibaba-NLP/gte-Qwen2-1.5B-instruct', trust_remote_code=True)
model = model.to(device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
def get_embeddings(ids, data, batch_size = 16, is_post = False):
    """Embed `data` in batches with the global `model`.

    Posts are encoded with the model's stored "query" prompt; everything else
    is encoded without a prompt.

    Args:
        ids: Sliceable sequence of identifiers, positionally aligned with `data`.
        data: Sliceable sequence of texts.
        batch_size: Number of texts passed to the encoder per call.
        is_post: True when `data` holds posts (queries), False for passages.

    Returns:
        dict mapping each id to the matching item returned by
        `model.encode(..., normalize_embeddings=True)`.
    """
    encode_kwargs = {'normalize_embeddings': True}
    if is_post:
        # `prompt=` takes the literal prompt text, so prompt="query" would just
        # prepend the word "query". `prompt_name=` selects the prompt stored
        # under that key in the model configuration.
        encode_kwargs['prompt_name'] = "query"

    embeddings = {}

    for i in tqdm(range(0, len(data), batch_size), desc="Processing Batches"):
        batch_data = data[i:i + batch_size]
        batch_id = ids[i:i + batch_size]

        # Batch process embeddings
        emb = model.encode(batch_data, **encode_kwargs)

        embeddings.update(dict(zip(batch_id, emb)))

    return embeddings

## multilingual-e5-large-instruct

In [None]:
from sentence_transformers import SentenceTransformer

In [None]:
# Use the GPU when torch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load multilingual-e5-large-instruct and move it onto the selected device.
model = SentenceTransformer('intfloat/multilingual-e5-large-instruct')
model = model.to(device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/128 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/140k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/690 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/271 [00:00<?, ?B/s]

In [None]:
def get_detailed_instruct(task_description: str, query: str) -> str:
    """Build the two-line instruction prompt: an 'Instruct:' line followed by a 'Query:' line."""
    instruct_line = f'Instruct: {task_description}'
    query_line = f'Query: {query}'
    return '\n'.join((instruct_line, query_line))

# Each query must come with a one-sentence instruction that describes the task
task = 'Given a social media post, retrieve relevant passages about the post'
# task = 'Given a web search query, retrieve relevant passages that answer the query'

def get_embeddings(ids, data, batch_size=16, is_post=False):
    """Embed `data` in batches with the global `model`.

    When `is_post` is true every text is first wrapped with
    `get_detailed_instruct(task, text)`, where `task` is the module-level
    instruction read at call time. Other texts are encoded unchanged.

    Args:
        ids: Sliceable sequence of identifiers, positionally aligned with `data`.
        data: Sliceable sequence of texts.
        batch_size: Number of texts passed to the encoder per call.
        is_post: True when `data` holds posts (queries), False for passages.

    Returns:
        dict mapping each id to the matching item returned by
        `model.encode(..., normalize_embeddings=True)`.
    """
    if is_post:
        data = [get_detailed_instruct(task, text) for text in data]

    id_to_vector = {}

    for start in tqdm(range(0, len(data), batch_size), desc="Processing Batches"):
        window = slice(start, start + batch_size)
        texts, keys = data[window], ids[window]

        vectors = model.encode(texts, normalize_embeddings=True)

        for key, vector in zip(keys, vectors):
            id_to_vector[key] = vector

    return id_to_vector

## bilingual-embedding-large

In [None]:
from sentence_transformers import SentenceTransformer

In [None]:
# Use the GPU when torch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# French and English
model = SentenceTransformer('Lajavaness/bilingual-embedding-large', trust_remote_code=True)
model = model.to(device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/176 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/242k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

(…)ynb_checkpoints%2Fconfig-checkpoint.json:   0%|          | 0.00/297 [00:00<?, ?B/s]

In [None]:
def get_embeddings(ids, data, batch_size=16):
    """Embed `data` in batches with the global `model`.

    Args:
        ids: Sliceable sequence of identifiers, positionally aligned with `data`.
        data: Sliceable sequence of texts.
        batch_size: Number of texts passed to the encoder per call.

    Returns:
        dict mapping each id to the matching item returned by
        `model.encode(..., normalize_embeddings=True)`.
    """
    results = {}

    for begin in tqdm(range(0, len(data), batch_size), desc="Processing Batches"):
        chunk_texts = data[begin:begin + batch_size]
        chunk_ids = ids[begin:begin + batch_size]

        chunk_vectors = model.encode(chunk_texts, normalize_embeddings=True)

        # Merge this batch's id -> vector pairs into the running result.
        results.update({key: vector for key, vector in zip(chunk_ids, chunk_vectors)})

    return results

## bilingual-embedding-small

In [None]:
from sentence_transformers import SentenceTransformer

In [None]:
# Use the GPU when torch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# French and English
model = SentenceTransformer('Lajavaness/bilingual-embedding-small', trust_remote_code=True)
model = model.to(device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/171 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/242k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.32k [00:00<?, ?B/s]

config.py:   0%|          | 0.00/7.12k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/Lajavaness/bilingual-embedding-small:
- config.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling.py:   0%|          | 0.00/72.2k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/Lajavaness/bilingual-embedding-small:
- modeling.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors:   0%|          | 0.00/471M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.36k [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/965 [00:00<?, ?B/s]

1_Pooling%2Fconfig.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

In [None]:
def get_embeddings(ids, data, batch_size=16):
    """Embed `data` in batches with the global `model`.

    Args:
        ids: Sliceable sequence of identifiers, positionally aligned with `data`.
        data: Sliceable sequence of texts.
        batch_size: Number of texts passed to the encoder per call.

    Returns:
        dict mapping each id to the matching item returned by
        `model.encode(..., normalize_embeddings=True)`.
    """
    id_to_vector = {}

    for start in tqdm(range(0, len(data), batch_size), desc="Processing Batches"):
        window = slice(start, start + batch_size)
        texts = data[window]
        keys = ids[window]

        vectors = model.encode(texts, normalize_embeddings=True)

        for key, vector in zip(keys, vectors):
            id_to_vector[key] = vector

    return id_to_vector

## bge-m3

In [None]:
!pip install -U FlagEmbedding

Collecting FlagEmbedding
  Downloading FlagEmbedding-1.3.4.tar.gz (163 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/163.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m163.8/163.8 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting datasets>=2.19.0 (from FlagEmbedding)
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting ir-datasets (from FlagEmbedding)
  Downloading ir_datasets-0.5.9-py3-none-any.whl.metadata (12 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets>=2.19.0->FlagEmbedding)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets>=2.19.0->FlagEmbedding)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets>=2.19.0->FlagEmbedding)
  Downloading multiprocess-0.70.16-py311-none-an

In [None]:
from FlagEmbedding import FlagAutoModel

In [None]:
# Use the GPU when torch can see one, otherwise fall back to the CPU.
# Note: `device` is computed here but is not passed to the loader below.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load bge-m3 in fp16 and register an instruction string for queries.
model = FlagAutoModel.from_finetuned(
    'BAAI/bge-m3',
    query_instruction_for_retrieval="Represent this post for searching relevant passages:",
    use_fp16=True,
)

tokenizer_config.json:   0%|          | 0.00/444 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

Fetching 30 files:   0%|          | 0/30 [00:00<?, ?it/s]

.gitattributes:   0%|          | 0.00/1.63k [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/191 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/15.8k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/687 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/123 [00:00<?, ?B/s]

imgs/.DS_Store:   0%|          | 0.00/6.15k [00:00<?, ?B/s]

imgs/bm25.jpg:   0%|          | 0.00/132k [00:00<?, ?B/s]

colbert_linear.pt:   0%|          | 0.00/2.10M [00:00<?, ?B/s]

imgs/long.jpg:   0%|          | 0.00/485k [00:00<?, ?B/s]

imgs/mkqa.jpg:   0%|          | 0.00/608k [00:00<?, ?B/s]

imgs/others.webp:   0%|          | 0.00/21.0k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

imgs/nqa.jpg:   0%|          | 0.00/158k [00:00<?, ?B/s]

imgs/miracl.jpg:   0%|          | 0.00/576k [00:00<?, ?B/s]

long.jpg:   0%|          | 0.00/127k [00:00<?, ?B/s]

onnx/Constant_7_attr__value:   0%|          | 0.00/65.6k [00:00<?, ?B/s]

model.onnx_data:   0%|          | 0.00/2.27G [00:00<?, ?B/s]

onnx/config.json:   0%|          | 0.00/698 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

model.onnx:   0%|          | 0.00/725k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.27G [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/54.0 [00:00<?, ?B/s]

sparse_linear.pt:   0%|          | 0.00/3.52k [00:00<?, ?B/s]

onnx/tokenizer_config.json:   0%|          | 0.00/1.17k [00:00<?, ?B/s]

In [None]:
def get_embeddings(ids, data, batch_size=16):
    """Embed `data` in batches with the global FlagEmbedding `model`.

    Only the 'dense_vecs' entry of each `model.encode` result is kept.

    NOTE(review): this calls plain `model.encode`, so the
    `query_instruction_for_retrieval` given when the model was loaded is
    presumably not applied to posts. Confirm against the FlagEmbedding docs.

    Args:
        ids: Sliceable sequence of identifiers, positionally aligned with `data`.
        data: Sliceable sequence of texts.
        batch_size: Number of texts passed to the encoder per call.

    Returns:
        dict mapping each id to its dense vector.
    """
    collected = {}
    batch_starts = range(0, len(data), batch_size)

    for offset in tqdm(batch_starts, desc="Processing Batches"):
        texts = data[offset:offset + batch_size]
        keys = ids[offset:offset + batch_size]

        encoded = model.encode(texts)
        dense_vectors = encoded['dense_vecs']

        collected.update(zip(keys, dense_vectors))

    return collected

## bge-m3-custom-fr

In [None]:
from sentence_transformers import SentenceTransformer

In [None]:
# Use the GPU when torch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# trust_remote_code=True is passed to the loader for this checkpoint.
model = SentenceTransformer('manu/bge-m3-custom-fr', trust_remote_code=True)
model = model.to(device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
def get_embeddings(ids, data, batch_size=16):
    """Embed `data` in batches with the global `model`.

    Args:
        ids: Sliceable sequence of identifiers, positionally aligned with `data`.
        data: Sliceable sequence of texts.
        batch_size: Number of texts passed to the encoder per call.

    Returns:
        dict mapping each id to the matching item returned by
        `model.encode(..., normalize_embeddings=True)`.
    """
    collected = {}
    batch_starts = range(0, len(data), batch_size)

    for offset in tqdm(batch_starts, desc="Processing Batches"):
        end = offset + batch_size
        texts = data[offset:end]
        keys = ids[offset:end]

        vectors = model.encode(texts, normalize_embeddings=True)

        collected.update(zip(keys, vectors))

    return collected

## KaLM-embedding-multilingual-mini-v1

In [None]:
from sentence_transformers import SentenceTransformer

In [None]:
# Use the GPU when torch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load KaLM-embedding-multilingual-mini-v1 and move it onto the selected device.
model = SentenceTransformer('HIT-TMG/KaLM-embedding-multilingual-mini-v1')
model = model.to(device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/208 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/601k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/655 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.63k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/80.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/370 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

In [None]:
def get_embeddings(ids, data, batch_size = 16, is_post = False):
    """Embed `data` with the global `model` in a single `encode` call.

    Batching and the progress bar are delegated to `model.encode` through
    `batch_size` and `show_progress_bar`.

    The instruction prompt was previously built but never passed to the
    encoder. It is now applied when `is_post` is true, mirroring the
    `is_post` switch of the other instruct-style sections. The default
    (`is_post=False`) encodes exactly as before, without a prompt.

    Args:
        ids: Iterable of identifiers, positionally aligned with `data`.
        data: Texts to encode.
        batch_size: Batch size forwarded to `model.encode`.
        is_post: True when `data` holds posts (queries), False for passages.

    Returns:
        dict mapping each id to the matching item returned by `model.encode`.
    """
    encode_kwargs = dict(
        normalize_embeddings=True,
        batch_size=batch_size,
        show_progress_bar=True,
    )

    if is_post:
        # Literal prompt text prepended to every post by `encode(prompt=...)`.
        encode_kwargs['prompt'] = "Instruct: Given a social media post, retrieve relevant passages about the post. \n Query: "

    emb = model.encode(data, **encode_kwargs)

    return dict(zip(ids, emb))

## Linq-Embed-Mistral-GPTQ

In [None]:
from sentence_transformers import SentenceTransformer

In [None]:
# Use the GPU when torch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# NOTE: the recorded run of this cell failed with
# "ImportError: Loading a GPTQ quantized model requires optimum
# (`pip install optimum`)", so install optimum before running this section.
model = SentenceTransformer('shuyuej/Linq-Embed-Mistral-GPTQ')
model = model.to(device)

ImportError: Loading a GPTQ quantized model requires optimum (`pip install optimum`)

In [None]:
def get_detailed_instruct(task_description: str, query: str) -> str:
    """Return the instruction prompt: 'Instruct: <task>' and 'Query: <query>' joined by a newline."""
    prompt_lines = (
        f'Instruct: {task_description}',
        f'Query: {query}',
    )
    return '\n'.join(prompt_lines)

# Each query must come with a one-sentence instruction that describes the task
task = 'Given a social media post, retrieve relevant passages about the post'
# task = 'Given a web search query, retrieve relevant passages that answer the query'

def get_embeddings(ids, data, batch_size=16, is_post=False):
    """Embed `data` in batches with the global `model`.

    Posts (`is_post=True`) are first wrapped with
    `get_detailed_instruct(task, text)`. `task` is the module-level instruction
    and is looked up when the function is called, so any later reassignment of
    `task` changes the prompt used here.

    Args:
        ids: Sliceable sequence of identifiers, positionally aligned with `data`.
        data: Sliceable sequence of texts.
        batch_size: Number of texts passed to the encoder per call.
        is_post: True when `data` holds posts (queries), False for passages.

    Returns:
        dict mapping each id to the matching item returned by
        `model.encode(..., normalize_embeddings=True)`.
    """
    if is_post:
        data = [get_detailed_instruct(task, text) for text in data]

    collected = {}
    batch_starts = range(0, len(data), batch_size)

    for offset in tqdm(batch_starts, desc="Processing Batches"):
        end = offset + batch_size
        texts = data[offset:end]
        keys = ids[offset:end]

        vectors = model.encode(texts, normalize_embeddings=True)

        collected.update(zip(keys, vectors))

    return collected

In [None]:
# Stand-alone sanity check of the loaded model on two question/passage pairs.
#
# The instruction is stored in `demo_task` rather than `task`: get_embeddings()
# in this section reads the global `task` when it is called, so rebinding
# `task` here would silently switch the post instruction to the Wikipedia one.
demo_task = 'Given a question, retrieve Wikipedia passages that answer the question'
prompt = f"Instruct: {demo_task}\nQuery: "
queries = [
    "최초의 원자력 발전소는 무엇인가?",
    "Who invented Hangul?"
]
passages = [
    "현재 사용되는 핵분열 방식을 이용한 전력생산은 1948년 9월 미국 테네시주 오크리지에 설치된 X-10 흑연원자로에서 전구의 불을 밝히는 데 사용되면서 시작되었다. 그리고 1954년 6월에 구소련의 오브닌스크에 건설된 흑연감속 비등경수 압력관형 원자로를 사용한 오브닌스크 원자력 발전소가 시험적으로 전력생산을 시작하였고, 최초의 상업용 원자력 엉더이로를 사용한 영국 셀라필드 원자력 단지에 위치한 콜더 홀(Calder Hall) 원자력 발전소로, 1956년 10월 17일 상업 운전을 시작하였다.",
    "Hangul was personally created and promulgated by the fourth king of the Joseon dynasty, Sejong the Great.[1][2] Sejong's scholarly institute, the Hall of Worthies, is often credited with the work, and at least one of its scholars was heavily involved in its creation, but it appears to have also been a personal project of Sejong."
]

# Encode the queries and passages. We only use the prompt for the queries
query_embeddings = model.encode(queries, prompt=prompt)
passage_embeddings = model.encode(passages)

# Compute the (cosine) similarity scores
scores = model.similarity(query_embeddings, passage_embeddings) * 100
print(scores.tolist())
# [[73.72908782958984, 30.122787475585938], [29.15508460998535, 79.25375366210938]]


## gte-multilingual-base-Fine_Tuned

In [None]:
from sentence_transformers import SentenceTransformer, losses, InputExample

In [None]:
# Use the GPU when torch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Fine-tuned gte-multilingual-base checkpoint, loaded with trust_remote_code=True.
model = SentenceTransformer('am-azadi/gte-multilingual-base_Fine_Tuned_2e', trust_remote_code=True)
model = model.to(device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
def get_embeddings(ids, data, batch_size=16):
    """Embed `data` in batches with the global `model`.

    Args:
        ids: Sliceable sequence of identifiers, positionally aligned with `data`.
        data: Sliceable sequence of texts.
        batch_size: Number of texts passed to the encoder per call.

    Returns:
        dict mapping each id to the matching item returned by
        `model.encode(..., normalize_embeddings=True)`.
    """
    results = {}

    for begin in tqdm(range(0, len(data), batch_size), desc="Processing Batches"):
        chunk_texts = data[begin:begin + batch_size]
        chunk_ids = ids[begin:begin + batch_size]

        chunk_vectors = model.encode(chunk_texts, normalize_embeddings=True)

        results.update({key: vector for key, vector in zip(chunk_ids, chunk_vectors)})

    return results

## bilingual-embedding-large_Fine_Tuned

In [10]:
from sentence_transformers import SentenceTransformer

In [12]:
# Use the GPU when torch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# French and English
# NOTE: the recorded run of this cell raised a ValueError about an
# inconsistent `config_class` between the model class and the passed config.
# Presumably the remote-code modules conflict; needs investigation.
model = SentenceTransformer('am-azadi/bilingual-embedding-large_Fine_Tuned_3e', trust_remote_code=True)
model = model.to(device)

ValueError: The model class you are passing has a `config_class` attribute that is not consistent with the config class you passed (model has <class 'transformers_modules.dangvantuan.bilingual_impl.8079a782f9671d3696b5ab189781480fcca58d25.config.BilingualConfig'> and you passed <class 'transformers_modules.am-azadi.bilingual-embedding-large_Fine_Tuned_3e.640b6b3e3335c350bc728e0f6a83e4491066986b.config.BilingualConfig'>. Fix one of those so they match!

In [None]:
def get_embeddings(ids, data, batch_size=16):
    """Embed `data` in batches with the global `model`.

    Args:
        ids: Sliceable sequence of identifiers, positionally aligned with `data`.
        data: Sliceable sequence of texts.
        batch_size: Number of texts passed to the encoder per call.

    Returns:
        dict mapping each id to the matching item returned by
        `model.encode(..., normalize_embeddings=True)`.
    """
    id_to_vector = {}

    for start in tqdm(range(0, len(data), batch_size), desc="Processing Batches"):
        window = slice(start, start + batch_size)
        texts, keys = data[window], ids[window]

        vectors = model.encode(texts, normalize_embeddings=True)

        for key, vector in zip(keys, vectors):
            id_to_vector[key] = vector

    return id_to_vector

## bilingual-embedding-small_Fine_Tuned

In [None]:
from sentence_transformers import SentenceTransformer

In [None]:
# Use the GPU when torch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# French and English
model = SentenceTransformer('am-azadi/bilingual-embedding-small_Fine_Tuned', trust_remote_code=True)
model = model.to(device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
def get_embeddings(ids, data, batch_size=16):
    """Embed `data` in batches with the global `model`.

    Args:
        ids: Sliceable sequence of identifiers, positionally aligned with `data`.
        data: Sliceable sequence of texts.
        batch_size: Number of texts passed to the encoder per call.

    Returns:
        dict mapping each id to the matching item returned by
        `model.encode(..., normalize_embeddings=True)`.
    """
    collected = {}
    batch_starts = range(0, len(data), batch_size)

    for offset in tqdm(batch_starts, desc="Processing Batches"):
        end = offset + batch_size
        texts = data[offset:end]
        keys = ids[offset:end]

        vectors = model.encode(texts, normalize_embeddings=True)

        collected.update(zip(keys, vectors))

    return collected

## UAE-Large-V1_Fine_Tuned

In [None]:
from sentence_transformers import SentenceTransformer, losses, InputExample

In [None]:
# Use the GPU when torch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Fine-tuned UAE-Large-V1 checkpoint, loaded through sentence-transformers.
model = SentenceTransformer('am-azadi/UAE-Large-V1_Fine_Tuned_2e', trust_remote_code=True)
model = model.to(device)

In [None]:
def get_embeddings(ids, data, batch_size=16):
    """Embed `data` in batches with the global `model`.

    Args:
        ids: Sliceable sequence of identifiers, positionally aligned with `data`.
        data: Sliceable sequence of texts.
        batch_size: Number of texts passed to the encoder per call.

    Returns:
        dict mapping each id to the matching item returned by
        `model.encode(..., normalize_embeddings=True)`.
    """
    results = {}

    for begin in tqdm(range(0, len(data), batch_size), desc="Processing Batches"):
        chunk_texts = data[begin:begin + batch_size]
        chunk_ids = ids[begin:begin + batch_size]

        chunk_vectors = model.encode(chunk_texts, normalize_embeddings=True)

        results.update({key: vector for key, vector in zip(chunk_ids, chunk_vectors)})

    return results

## KaLM-embedding-multilingual-mini-v1_Fine_Tuned

In [13]:
from sentence_transformers import SentenceTransformer

In [14]:
# Use the GPU when torch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Fine-tuned KaLM-embedding-multilingual-mini-v1 checkpoint.
model = SentenceTransformer('am-azadi/KaLM-embedding-multilingual-mini-v1_Fine_Tuned_3e')
model = model.to(device)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/244 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/21.2k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/758 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.75k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/80.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/370 [00:00<?, ?B/s]

1_Pooling%2Fconfig.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

In [15]:
def get_embeddings(ids, data, batch_size = 16):
    """Encode all of `data` with the global `model` in a single `encode` call.

    Args:
        ids: identifiers aligned position-by-position with `data`.
        data: texts to encode.
        batch_size: forwarded to `model.encode` as its batch size.

    Returns:
        dict mapping each id to the embedding of the text at the same position.
    """
    vectors = model.encode(
        data,
        normalize_embeddings=True,
        batch_size=batch_size,
        show_progress_bar=True,
    )
    return dict(zip(ids, vectors))

# building fact checks embeddings

## preparing fact_checks *dataframe*

In [None]:
file_path = './data/summaries/Qwen2.5-7B-Instruct/fact_check_summaries.csv'

# Fact-check summaries indexed by fact_check_id; start from an empty frame when
# no summaries file has been written yet.
if not os.path.exists(file_path):
    FC_summaries = pd.DataFrame(columns=['fact_check_id', 'summary']).set_index('fact_check_id')
    print("File not found. Created an empty dataframe.")
else:
    FC_summaries = pd.read_csv(file_path).fillna('').set_index('fact_check_id')
    print("Dataframe loaded from file.")

Dataframe loaded from file.


In [None]:
# Default content of a fact-check is the first element of its parsed claim.
# Only the 'claim' column is needed, so apply over that Series instead of
# building a row object for every record with DataFrame.apply(axis=1).
fact_checks_df['content'] = fact_checks_df['claim'].apply(lambda claim: claim[0])

In [None]:
# Keep only the summaries whose fact_check_id is also present in fact_checks_df.
fc_summary_mask = FC_summaries.index.isin(fact_checks_df.index)
related_FC_summaries = FC_summaries.loc[fc_summary_mask]
related_FC_summaries

Unnamed: 0_level_0,summary
fact_check_id,Unnamed: 1_level_1
9136,Michel Temer exaggerated the time it took for ...
100280,"Pope Francis, along with other religious leade..."
9135,Michel Temer claimed his government approved a...
36946,"Senators debated the 2016 project 131/2015, wh..."
63469,The government's roadmap to ease COVID-19 rest...
...,...
174690,"Demonstrations occurred in Kherrata, Algeria, ..."
95818,We made 12 CEUs.
95837,We passed the test.
154242,A 2019 picture from the Kolkata Pride Parade w...


In [None]:
# Inserting Summaries: fact-checks that have a summary get it as their content
# and are relabelled with language 'eng'.
fc_summary_ids = related_FC_summaries.index
fact_checks_df.loc[fc_summary_ids, 'content'] = related_FC_summaries['summary']
fact_checks_df.loc[fc_summary_ids, 'language'] = 'eng'

# fact_checks_df.loc[fact_checks_df['language'].apply(lambda x: x == 'hi-Latn'), 'language'] = 'hin'

## building the embeddings

In [None]:
folder_path = "./data/embeddings/fact_checks_embeddings/KaLM-embedding-multilingual-mini-v1"

# Embed the second element of every parsed claim, keyed by fact_check_id.
# NOTE(review): this embeds claim[1], not the 'content' column prepared earlier in the notebook — confirm that is intended.
all_fc_ids = fact_checks_df.index.to_list()
all_fc_claims = fact_checks_df['claim'].apply(lambda claim: claim[1]).to_list()
fact_checks_embeddings = get_embeddings(all_fc_ids, all_fc_claims, batch_size = 512)
# pd.DataFrame({'embedding': fact_checks_embeddings.values()}, index=fact_checks_embeddings.keys()).to_pickle(f'{folder_path}/fact_checks_embeddings.pkl')

In [None]:
# Turn the {fact_check_id: vector} dict returned by get_embeddings into a
# one-column DataFrame indexed by fact_check_id.
# The isinstance guard keeps the cell re-runnable: after the first run the name
# refers to a DataFrame, and calling .values() on it again would raise.
if isinstance(fact_checks_embeddings, dict):
    fact_checks_embeddings = pd.DataFrame({'embedding': fact_checks_embeddings.values()}, index=fact_checks_embeddings.keys())

In [None]:
# Reload the pickled fact-check embeddings stored under folder_path and display them.
# NOTE(review): pickle.load can execute arbitrary code; only load files produced by this project.
with open(f"{folder_path}/fact_checks_embeddings.pkl", "rb") as file:
    my_object = pickle.load(file)

my_object

# building posts embeddings

## preparing posts dataframe

In [None]:
file_path = './data/summaries/Qwen2.5-7B-Instruct/posts_summaries.csv'

# Post summaries indexed by post_id; start from an empty frame when no
# summaries file has been written yet.
if not os.path.exists(file_path):
    posts_summaries = pd.DataFrame(columns=['post_id', 'summary']).set_index('post_id')
    print("File not found. Created an empty dataframe.")
else:
    posts_summaries = pd.read_csv(file_path).fillna('').set_index('post_id')
    print("Dataframe loaded from file.")

Dataframe loaded from file.


In [None]:
# Keep only the summaries whose post_id is also present in posts_df.
post_summary_mask = posts_summaries.index.isin(posts_df.index)
related_post_summaries = posts_summaries.loc[post_summary_mask]
related_post_summaries

Unnamed: 0_level_0,summary
post_id,Unnamed: 1_level_1
21013,Prolonged use of masks can lead to hypercapnia...
5925,Joseph C De Gregorio and Tina M Brown pledge n...
15231,"The Mexican government plans to spend 5,792,11..."
26071,"On May 30, 2021, in Yongin Suji, videos claimi..."
27199,Weakened blood vessels in the head can rupture...
...,...
9720,"While humanity is in quarantine, Planet Earth ..."
3951,Painful scenes of oppression and suffering on ...
24262,The court has asked to seal the Shivling found...
6201,Chile's Prime Minister has nationalized all pr...


In [None]:
# Posts that have a summary get it as their English content and are relabelled 'eng'.
post_summary_ids = related_post_summaries.index
posts_df.loc[post_summary_ids, 'eng_content'] = related_post_summaries['summary']
posts_df.loc[post_summary_ids, 'language'] = 'eng'

# finding matchings

In [16]:
def get_fact_checks(post_id):
    """Return the fact_check_ids that `mapping_df` pairs with `post_id`.

    The ids are looked up in `fact_checks_df` with `.loc` and the index of
    that selection is returned as a list.
    """
    is_post = mapping_df['post_id'] == post_id
    paired_ids = mapping_df[is_post]['fact_check_id'].to_list()
    selected = fact_checks_df.loc[paired_ids]
    return selected.index.to_list()

def common_element(list1, list2):
    """Return True when at least one item of `list1` also occurs in `list2`."""
    for candidate in list1:
        if candidate in list2:
            return True
    return False

def get_accuracy(posts_ids, top_indices_ids, show_logs = False):
    """Count the posts whose retrieved fact-checks contain at least one paired fact-check.

    Args:
        posts_ids: post ids to score.
        top_indices_ids: retrieved fact_check_ids, one list per post, in the
            same order as `posts_ids`.
        show_logs: when True, print the post content, its paired fact-checks
            and the hit/miss result for every post.

    Returns:
        (corrects, mismatched_posts): number of hits and the ids of the posts
        with no hit.
    """
    mismatched_posts = []
    corrects = 0
    for position, post_id in enumerate(posts_ids):
        expected_fcs = get_fact_checks(post_id)
        hit = common_element(expected_fcs, top_indices_ids[position])

        if show_logs:
            print("=================================================================")
            print(f'fact_checks for post {post_id}')
            print(f"content: {posts_df.loc[post_id]['content']}")
            print(expected_fcs)
            for fc_id in expected_fcs:
                print(f"title: {fact_checks_df.loc[fc_id]['title']}")
                print(f"claim: {fact_checks_df.loc[fc_id]['claim'][1]}")
            print(hit)

        if hit:
            corrects += 1
        else:
            mismatched_posts.append(post_id)
    return corrects, mismatched_posts

In [None]:
# Load the precomputed fact-check embeddings stored in the bilingual-embedding-large folder and display them.
# NOTE(review): the crosslingual cell compares these vectors with post embeddings produced by the
# `model` currently in memory — confirm both come from the same embedding model.
# NOTE(review): pickle.load can execute arbitrary code; only load files produced by this project.
with open("./data/embeddings/fact_checks_embeddings/bilingual-embedding-large/fact_checks_embeddings.pkl", "rb") as file:
    fact_checks_embeddings = pickle.load(file)

fact_checks_embeddings

Unnamed: 0,embedding
0,"[0.025219444, -0.010132665, -0.0076080356, 0.0..."
1,"[0.019231465, -0.021279993, 0.009567103, 0.018..."
2,"[-0.0029463407, -0.032426212, 0.03459119, -0.0..."
3,"[0.036301985, 0.011263531, -0.034454156, 0.022..."
4,"[0.03394732, -0.05794316, -0.022736477, 0.0422..."
...,...
205744,"[0.03158213, 0.006222313, -0.011078036, -0.001..."
205745,"[-0.04643174, 0.012115832, -0.00062316406, -0...."
205747,"[0.032638386, -0.011957649, 0.047117464, 0.019..."
205749,"[0.0008446853, 0.0046829437, -0.030404292, -0...."


In [None]:
# Summarised posts in the crosslingual training split vs. size of the crosslingual fact-check pool.
crosslingual_task = tasks['crosslingual']
summarized_crosslingual_train = posts_summaries.loc[posts_summaries.index.isin(crosslingual_task['posts_train'])]
print(f"posts: {len(summarized_crosslingual_train)}, fc: {len(crosslingual_task['fact_checks'])}")

posts: 269, fc: 153743


In [None]:
# summarized posts: per language, how many training posts have a summary and how many fact-checks exist
for lang, split in tasks['monolingual'].items():
    has_summary = posts_summaries.index.isin(split['posts_train'])
    print(f"lang: {lang}, posts: {len(posts_summaries.loc[has_summary])}, fc: {len(split['fact_checks'])}")

lang: fra, posts: 102, fc: 4355
lang: spa, posts: 341, fc: 14082
lang: eng, posts: 232, fc: 85734
lang: por, posts: 174, fc: 21569
lang: tha, posts: 31, fc: 382
lang: deu, posts: 35, fc: 4996
lang: msa, posts: 54, fc: 8424
lang: ara, posts: 43, fc: 14201


In [None]:
# all train + dev posts per language, next to the size of the language's fact-check pool
for lang, split in tasks['monolingual'].items():
    n_posts = len(split['posts_train']) + len(split['posts_dev'])
    n_fact_checks = len(split['fact_checks'])
    print(f"lang: {lang}, posts: {n_posts}, fc: {n_fact_checks}")

lang: fra, posts: 1784, fc: 4355
lang: spa, posts: 6243, fc: 14082
lang: eng, posts: 4829, fc: 85734
lang: por, posts: 2873, fc: 21569
lang: tha, posts: 507, fc: 382
lang: deu, posts: 750, fc: 4996
lang: msa, posts: 1167, fc: 8424
lang: ara, posts: 754, fc: 14201


In [None]:
# Total number of crosslingual posts across the train and dev splits.
sum(len(tasks['crosslingual'][split_name]) for split_name in ('posts_train', 'posts_dev'))

5524

In [None]:
# Count the rows of mapping_df (post / fact-check pairs) per monolingual language.
# A row is attributed to the first language whose train or dev posts contain its
# post_id; every other row is counted under 'cross'. A post paired with several
# fact-checks is counted once per pair.

# Build each language's post-id set once, so every membership test is O(1)
# instead of concatenating and scanning two lists for every row and language.
lang_post_ids = {
    lang: set(split['posts_train']) | set(split['posts_dev'])
    for lang, split in tasks['monolingual'].items()
}

dic = {'cross': 0}
for lang in lang_post_ids:
    dic[lang] = 0

for post in mapping_df['post_id'].to_list():
    matched_lang = next(
        (lang for lang, post_ids in lang_post_ids.items() if post in post_ids),
        'cross',
    )
    dic[matched_lang] += 1

dic

{'cross': 5787,
 'fra': 1667,
 'spa': 6313,
 'eng': 5446,
 'por': 3386,
 'tha': 465,
 'deu': 830,
 'msa': 1169,
 'ara': 680}

In [None]:
# Number of rows (post / fact-check pairs) in mapping_df.
len(mapping_df['post_id'])

25743

In [None]:
# Number of rows in posts_df, for comparison with the number of pairs in mapping_df.
len(posts_df)

24431

In [None]:
# Summaries available for the posts of the French monolingual training split.
fra_train_ids = tasks['monolingual']['fra']['posts_train']
posts_summaries.loc[posts_summaries.index.isin(fra_train_ids)]

Unnamed: 0_level_0,summary
post_id,Unnamed: 1_level_1
1308,The Pasteur Institute recognizes the effective...
10803,The Court of Arbitration for Sport orders FIFA...
15385,"On May 6, 2022, a Live ISS Camera filmed more ..."
25918,Virologists wear protective gear to guard agai...
12470,"Artem Bonov, current first deputy chief of pol..."
...,...
14323,Mashaallah received Britain's COTM award for g...
7489,"Sami Abu Dyak, a 17-year-old Palestinian, died..."
25874,"A young girl was bitten by a mosquito, develop..."
27475,President Vladimir Putin released 800 tigers a...


## manual testing

In [23]:
# Language whose monolingual split is inspected by hand.
lang = 'tha'
fc = fact_checks_df.loc[tasks['monolingual'][lang]['fact_checks']]
# fc = fact_checks_df.loc[tasks['crosslingual']['fact_checks']]

# Embed the first element of each parsed claim for the selected fact-checks.
# fc_emb = fact_checks_embeddings.loc[fc.index]
manual_fc_ids = fc.index.to_list()
manual_fc_claims = fc['claim'].apply(lambda claim: claim[0]).to_list()
fc_emb = get_embeddings(manual_fc_ids, manual_fc_claims, batch_size = 128)
# fc_emb = get_embeddings(fc.index.to_list(), fc['content'].to_list(), batch_size = 32)

Batches:   0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
# Manual check for the language chosen above: embed its training posts, retrieve
# the most similar fact-checks for each one and report how many posts have at
# least one paired fact-check among the retrieved ids.

# posts = posts_summaries.loc[posts_summaries.index.isin(tasks['monolingual'][lang]['posts_train'])]
posts = posts_df.loc[tasks['monolingual'][lang]['posts_train']]
# posts = posts_df.loc[tasks['crosslingual']['posts_train']]

print(f"lang: {lang}, posts: { len(posts) }, fc: { len(fc) }")

posts_embedding = get_embeddings(posts.index.to_list(), posts['eng_content'].to_list(), batch_size = 128)

# similarities = cosine_similarity(list(posts_embedding.values()), fc_emb['embedding'].to_list())
similarities = cosine_similarity(list(posts_embedding.values()), list(fc_emb.values()))

# Never request more neighbours than there are fact-checks; argpartition raises
# when the requested position is out of bounds.
top_k = min(10, similarities.shape[1])
nearest = np.argpartition(similarities, -top_k, axis=1)[:, -top_k:]

# Materialise the id list once; rebuilding it for every retrieved index is
# quadratic in the number of fact-checks.
retrieved_fc_ids = list(fc_emb.keys())
# top_indices = [[fc_emb.iloc[idx].name for idx in sublist] for sublist in nearest]
top_indices = [[retrieved_fc_ids[idx] for idx in sublist] for sublist in nearest]

corrects, mismatched_posts = get_accuracy(posts.index, top_indices)

print(f"accuracy: {corrects/len(posts) * 100}% !")

lang: tha, posts: 465, fc: 382


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

## creating predictions

### loading dev gold labels

In [19]:
def common_element(list1, list2):
    """Return True when at least one item of `list1` also occurs in `list2`."""
    for candidate in list1:
        if candidate in list2:
            return True
    return False

def get_dev_accuracy(posts_ids, top_indices_ids, gold_labels, show_logs = False):
    """Count the posts whose retrieved fact-checks contain at least one gold fact-check.

    Args:
        posts_ids: post ids to score.
        top_indices_ids: retrieved fact_check_ids, one list per post, in the
            same order as `posts_ids`.
        gold_labels: dict keyed by the post id as a string, holding the
            expected fact_check_ids of that post.
        show_logs: when True, print the post content, its gold fact-checks and
            the hit/miss result for every post.

    Returns:
        (corrects, mismatched_posts): number of hits and the ids of the posts
        with no hit.
    """
    mismatched_posts = []
    corrects = 0
    for position, post_id in enumerate(posts_ids):
        expected_fcs = gold_labels[str(post_id)]
        hit = common_element(expected_fcs, top_indices_ids[position])

        if show_logs:
            print("=================================================================")
            print(f'fact_checks for post {post_id}')
            print(f"content: {posts_df.loc[post_id]['content']}")
            print(expected_fcs)
            for fc_id in expected_fcs:
                print(f"title: {fact_checks_df.loc[fc_id]['title']}")
                print(f"claim: {fact_checks_df.loc[fc_id]['claim'][1]}")
            print(hit)

        if hit:
            corrects += 1
        else:
            mismatched_posts.append(post_id)
    return corrects, mismatched_posts

In [20]:
# Load the dev gold labels of both tracks from their JSON files.
with open('./data/SemEval2025-Task7 Dev Labels/monolingual_reference.json', 'r') as file:
    monolingual_reference = json.load(file)

with open('./data/SemEval2025-Task7 Dev Labels/crosslingual_reference.json', 'r') as file:
    crosslingual_reference = json.load(file)

# Merge both tracks into one lookup; on a duplicate key the crosslingual entry wins.
gold_labels = {**monolingual_reference, **crosslingual_reference}

In [21]:
# Load the prediction files from the original data folder; the evaluation cells
# below update these dicts with the retrieved fact_check_ids per post.
with open('./data/original data/monolingual_predictions.json', 'r') as file:
    monolingual_predictions = json.load(file)

with open('./data/original data/crosslingual_predictions.json', 'r') as file:
    crosslingual_predictions = json.load(file)

In [22]:
# Column of posts_df whose text is embedded when building the dev predictions.
content_field = 'content'
# Folder name under ./results/Submissions/ that receives the prediction files.
# NOTE(review): this names gte-multilingual-base while the model cells visible here load other
# checkpoints — confirm it matches the `model` actually in memory before saving.
model_path = 'gte-multilingual-base-Fine_Tuned'

### monolingual posts

In [16]:
# For every monolingual language: embed its fact-checks and dev posts with the
# model currently in memory, retrieve the most similar fact-checks per post,
# score them against the gold labels and store the predictions.
for lang in tasks['monolingual'].keys():
    # posts = posts_df.loc[list(posts_summaries.index.intersection(tasks['monolingual'][lang]['posts_train']))]
    posts = posts_df.loc[tasks['monolingual'][lang]['posts_dev']]
    # posts = posts_df.loc[tasks['monolingual'][lang]['posts_train']]

    # Slice the language's fact-checks once and reuse the selection for ids and texts.
    lang_fact_checks = fact_checks_df.loc[tasks['monolingual'][lang]['fact_checks']]
    fc_embeddings = get_embeddings(lang_fact_checks.index.to_list(), lang_fact_checks['claim'].apply(lambda x: x[0]).to_list(), batch_size = 64)
    # fc_embeddings = fact_checks_embeddings.loc[tasks['monolingual'][lang]['fact_checks']]

    print(f"lang: {lang}, posts: { len(posts) }, fc: { len(fc_embeddings) }")

    posts_embedding = get_embeddings(posts.index.to_list(), posts[content_field].to_list(), batch_size = 64)

    similarities = cosine_similarity(list(posts_embedding.values()), list(fc_embeddings.values()))
    # similarities = cosine_similarity(list(posts_embedding.values()), fc_embeddings['embedding'].to_list())

    # Never request more neighbours than there are fact-checks; argpartition
    # raises when the requested position is out of bounds.
    top_k = min(10, similarities.shape[1])
    nearest = np.argpartition(similarities, -top_k, axis=1)[:, -top_k:]

    # Materialise the id list once; rebuilding it for every retrieved index is
    # quadratic in the number of fact-checks.
    lang_fc_ids = list(fc_embeddings.keys())
    top_indices = [[lang_fc_ids[idx] for idx in sublist] for sublist in nearest]
    # top_indices = [[fc_embeddings.iloc[idx].name for idx in sublist] for sublist in nearest]

    corrects, mismatched_posts = get_dev_accuracy(posts.index, top_indices, gold_labels)

    # saving the results
    monolingual_predictions.update(dict(zip([str(ind) for ind in posts.index.to_list()], top_indices)))

    print(f"accuracy: {corrects/len(posts) * 100}% !")

Processing Batches: 100%|██████████| 69/69 [00:35<00:00,  1.92it/s]


lang: fra, posts: 188, fc: 4355


Processing Batches: 100%|██████████| 3/3 [00:09<00:00,  3.09s/it]


accuracy: 81.38297872340425% !


Processing Batches: 100%|██████████| 221/221 [01:58<00:00,  1.86it/s]


lang: spa, posts: 615, fc: 14082


Processing Batches: 100%|██████████| 10/10 [00:33<00:00,  3.36s/it]


accuracy: 84.71544715447155% !


Processing Batches:   9%|▉         | 127/1340 [01:32<14:43,  1.37it/s]


KeyboardInterrupt: 

### crosslingual posts

In [None]:
# Crosslingual dev evaluation: compare the dev posts with the precomputed
# fact-check embeddings (fact_checks_embeddings must be the DataFrame with an
# 'embedding' column), score against the gold labels and store the predictions.

# posts = posts_df.loc[list(posts_summaries.index.intersection(tasks['crosslingual']['posts_train']))]
posts = posts_df.loc[tasks['crosslingual']['posts_dev']]
fc_embeddings = fact_checks_embeddings.loc[tasks['crosslingual']['fact_checks']]

# Label the log line explicitly: the previous version printed `lang`, a loop
# variable left over from the monolingual cell, which mislabelled this run.
print(f"task: crosslingual, posts: { len(posts) }, fc: { len(fc_embeddings) }")

posts_embedding = get_embeddings(posts.index.to_list(), posts[content_field].to_list(), batch_size = 32)

similarities = cosine_similarity(list(posts_embedding.values()), fc_embeddings['embedding'].to_list())

# Never request more neighbours than there are fact-checks; argpartition raises
# when the requested position is out of bounds.
top_k = min(10, similarities.shape[1])
nearest = np.argpartition(similarities, -top_k, axis=1)[:, -top_k:]

# Read the labels straight from the index instead of building a row Series
# with .iloc for every retrieved position.
crosslingual_fc_ids = fc_embeddings.index
top_indices = [[crosslingual_fc_ids[idx] for idx in sublist] for sublist in nearest]

corrects, mismatched_posts = get_dev_accuracy(posts.index, top_indices, gold_labels)

#saving the results
crosslingual_predictions.update(dict(zip([str(ind) for ind in posts.index.to_list()], top_indices)))

print(f"accuracy: {corrects/len(posts) * 100}% !")

lang: ara, posts: 552, fc: 153743


Processing Batches: 100%|██████████| 18/18 [00:06<00:00,  2.68it/s]


accuracy: 64.85507246376811% !


### saving the results

In [None]:
file_path = f'./results/Submissions/{model_path}'

# Convert numpy int64 to Python int before serializing
def convert_to_int(obj):
    """Recursively replace NumPy integer scalars with built-in ints.

    `json.dump` cannot serialize NumPy integer scalars, so they are converted
    before the predictions are written. Every NumPy integer width (int32,
    int64, unsigned variants, ...) is handled, not only `np.int64`.

    Args:
        obj: a NumPy integer, a dict, a list, or any other value.

    Returns:
        An `int` for NumPy integers; a new dict / list with converted values
        for dicts and lists (dict keys are left as they are); any other value
        unchanged.
    """
    if isinstance(obj, np.integer):
        return int(obj)
    # If it's a dictionary or list, recursively convert values
    if isinstance(obj, dict):
        return {k: convert_to_int(v) for k, v in obj.items()}
    if isinstance(obj, list):
        return [convert_to_int(v) for v in obj]
    return obj

# The target folder is named after model_path and may not exist yet; opening a
# file for writing does not create missing directories.
os.makedirs(file_path, exist_ok=True)

# Now use the converted dictionary with json.dump
with open(f'{file_path}/monolingual_predictions.json', 'w') as json_file:
    json.dump(convert_to_int(monolingual_predictions), json_file, indent=4)

with open(f'{file_path}/crosslingual_predictions.json', 'w') as json_file:
    json.dump(convert_to_int(crosslingual_predictions), json_file, indent=4)