# initializations

In [2]:
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
import pandas as pd
import numpy as np
import pickle
import torch
import json
import ast
import os

# loading data

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
%cd /content/drive/MyDrive/University/Research/SemEval 2025: Task 7
# %cd /content/drive/MyDrive/Research/SemEval 2025: Task 7

/content/drive/MyDrive/University/Research/SemEval 2025: Task 7


In [5]:
parse_col = lambda s: ast.literal_eval(s.replace('\n', '\\n')) if s else s


fact_checks_df = pd.read_csv('./data/cleaned data/fact_checks.csv').fillna('').set_index('fact_check_id')

for col in ['claim', 'title']:
    fact_checks_df[col] = fact_checks_df[col].apply(parse_col)


posts_df = pd.read_csv('./data/cleaned data/posts.csv').fillna('').set_index('post_id')

mapping_df = pd.read_csv('./data/original data/pairs.csv')

with open('./data/original data/tasks.json', 'r') as file:
    tasks = json.load(file)

# Models

In [6]:
!pip install transformers bitsandbytes accelerate



In [7]:
import torch
import torch.nn.functional as F
from torch import Tensor
from transformers import AutoModel, AutoTokenizer, BitsAndBytesConfig

In [8]:
model_name = "Linq-AI-Research/Linq-Embed-Mistral"

# Configure 4-bit quantization
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype="float16",  # Use float16 for better accuracy
    bnb_4bit_use_double_quant=True,    # Improves compression efficiency
    bnb_4bit_quant_type="nf4"          # NF4 works best for LLMs
)

# Load the model with quantization
model = AutoModel.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto"
)

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [9]:
def last_token_pool(last_hidden_states: Tensor,
                 attention_mask: Tensor) -> Tensor:
    left_padding = (attention_mask[:, -1].sum() == attention_mask.shape[0])
    if left_padding:
        return last_hidden_states[:, -1]
    else:
        sequence_lengths = attention_mask.sum(dim=1) - 1
        batch_size = last_hidden_states.shape[0]
        return last_hidden_states[torch.arange(batch_size, device=last_hidden_states.device), sequence_lengths]

def get_detailed_instruct(task_description: str, query: str) -> str:
    return f'Instruct: {task_description}\nQuery: {query}'

def get_embeddings(ids, data, batch_size = 1, is_post = False):

    # Each query must come with a one-sentence instruction that describes the task
    task = 'Given a social media post, retrieve relevant passages about the post'

    if is_post:
        data = [get_detailed_instruct(task, x) for x in data]

    embeddings = {}

    for i in tqdm(range(0, len(data), batch_size), desc="Processing Batches"):
        batch_data = data[i:i + batch_size]
        batch_id = ids[i:i + batch_size]

        # Batch process embeddings
        ## emb = model.encode(batch_data, normalize_embeddings=True)

        max_length = 1024
        # Tokenize the input texts
        batch_dict = tokenizer(batch_data, max_length=max_length, padding=True, truncation=True, return_tensors="pt").to(model.device)
        outputs = model(**batch_dict)
        emb = last_token_pool(outputs.last_hidden_state, batch_dict['attention_mask'])

        # Normalize embeddings
        emb = F.normalize(emb, p=2, dim=1)

        embeddings.update(dict(zip(batch_id, emb)))

    return embeddings

In [16]:

# Each query must come with a one-sentence instruction that describes the task
task = 'Given a social media post, retrieve relevant passages about the post'
x = posts_df.iloc[0]['eng_content']

embeddings = {}

# for i in tqdm(range(0, len(data), batch_size), desc="Processing Batches"):
    # batch_data = data[i:i + batch_size]
    # batch_id = ids[i:i + batch_size]

    # Batch process embeddings
    ## emb = model.encode(batch_data, normalize_embeddings=True)

batch_data = x
max_length = 1024
# Tokenize the input texts
batch_dict = tokenizer(batch_data, max_length=max_length, padding=True, truncation=True, return_tensors="pt").to(model.device)
# outputs = model(**batch_dict)
# emb = last_token_pool(outputs.last_hidden_state, batch_dict['attention_mask'])

# # Normalize embeddings
# emb = F.normalize(emb, p=2, dim=1)

# embeddings.update(dict(zip(batch_id, emb)))

# print(embeddings)

In [18]:
outputs = model(**batch_dict)

In [19]:
emb = last_token_pool(outputs.last_hidden_state, batch_dict['attention_mask'])

In [20]:
emb = F.normalize(emb, p=2, dim=1)

In [23]:
embeddings.update(dict(zip([0], emb)))

In [26]:
lang = 'tha'
fc = fact_checks_df.loc[tasks['monolingual'][lang]['fact_checks']]

In [28]:
fc.iloc[0].name

8144

In [10]:
!pip install datasets



In [11]:
from datasets import Dataset

In [12]:
fact_checks_df

Unnamed: 0_level_0,claim,instances,title,language,content
fact_check_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,"(Are avocados good for you?, Are avocados good...","[(1525653998.0, 'https://metafact.io/factcheck...",,eng,title: ' ' | claim: ' Are avocados good for y...
1,"(Can animals have headaches?, Can animals have...","[(1617955634.0, 'https://metafact.io/factcheck...",,eng,title: ' ' | claim: ' Can animals have headac...
2,"(Can we help prevent Alzheimer's with diet?, C...","[(1525653998.0, 'https://metafact.io/factcheck...",,eng,title: ' ' | claim: ' Can we help prevent Alz...
3,(Do any benefits of alcohol outweigh the risks...,"[(1525653998.0, 'https://metafact.io/factcheck...",,eng,title: ' ' | claim: ' Do any benefits of alco...
4,"(Does acupuncture work for headaches?, Does ac...","[(1617955595.0, 'https://metafact.io/factcheck...",,eng,title: ' ' | claim: ' Does acupuncture work f...
...,...,...,...,...,...
205744,(في فرنسا ، يقرر رجال الشرطة العسكرية والمدنية...,"[(1617976680.0, 'https://factuel.afp.com/ar/Fr...",(هذا الفيديو ليس لتحرّك الشرطة الفرنسيّة ضدّ ا...,ara,title: ' هذا الفيديو ليس لتحرّك الشرطة الفرنسي...
205745,(This little beautiful girl was seen in Mangal...,"[(1576281540.0, 'https://youturn.in/articles/c...",(மங்களூரில் பிச்சை எடுக்கும் குழுவில் மீட்கப்ப...,eng,title: ' மங்களூரில் பிச்சை எடுக்கும் குழுவில் ...
205747,(إيطاليين و أجانب رجال و نساء ، أطفال و عجزة ا...,"[(1616693700.0, 'https://factuel.afp.com/ar/th...",(هذه الصور لطابورٍ أمام مركز توزيع مساعدات غذا...,ara,title: ' هذه الصور لطابورٍ أمام مركز توزيع مسا...
205749,(Confirmado... Amanhã acabarão as mensagens gr...,"[(1570924680.0, 'https://www.boatos.org/tecnol...",(WhatsApp vai cobrar 0.37 centavos por mensage...,por,title: ' WhatsApp vai cobrar 0.37 centavos por...


In [13]:
fact_checks_df['claim_orig'] = fact_checks_df['claim'].apply(lambda x: x[0])

In [14]:
fact_checks_df['claim_eng'] = fact_checks_df['claim'].apply(lambda x: x[1])

['claim', 'instances', 'title', 'language', 'content', 'claim_only']

In [15]:
fact_checks_df.drop(columns=[col for col in fact_checks_df.columns if col not in ['claim_orig', 'claim_eng']], inplace=True)

In [16]:
ds = Dataset.from_pandas(fact_checks_df)

In [17]:
ds

Dataset({
    features: ['claim_orig', 'claim_eng', 'fact_check_id'],
    num_rows: 153743
})

In [18]:
lang = 'tha'
filtered_ds = ds.filter(lambda example: example["fact_check_id"] in tasks['monolingual'][lang]['fact_checks'])

Filter:   0%|          | 0/153743 [00:00<?, ? examples/s]

In [19]:
filtered_ds

Dataset({
    features: ['claim_orig', 'claim_eng', 'fact_check_id'],
    num_rows: 382
})

In [28]:
def get_orig_embeddings(example, is_post = False):

    # Each query must come with a one-sentence instruction that describes the task
    task = 'Given a social media post, retrieve relevant passages about the post'

    # if is_post:
    #     data = [get_detailed_instruct(task, x) for x in data]


    # batch_data = data[i:i + batch_size]
    # batch_id = ids[i:i + batch_size]
    # orig_data = example['claim_orig']
    # eng_data = example('claim_eng')
    # id_data = example['fact_check_id']

    # Batch process embeddings
    ## emb = model.encode(batch_data, normalize_embeddings=True)

    max_length = 1024
    # Tokenize the input texts
    # batch_dict = tokenizer(orig_data, max_length=max_length, padding=True, truncation=True, return_tensors="pt").to(model.device)
    batch_dict = tokenizer(example['claim_orig'], max_length=max_length, padding=True, truncation=True, return_tensors="pt").to('cuda')
    # print(batch_dict)
    outputs = model(**batch_dict)
    # print(outputs)
    emb = last_token_pool(outputs.last_hidden_state, batch_dict['attention_mask'])
    # print(emb)

    # Normalize embeddings
    emb = F.normalize(emb, p=2, dim=1)
    # print(emb)
    # print(emd)
    # return
    return {'orig_emb': emb}

In [25]:
max_length = 1024
orig_data = filtered_ds[1]['claim_orig']
batch_dict = tokenizer(orig_data, max_length=max_length, padding=True, truncation=True, return_tensors="pt").to(model.device)
outputs = model(**batch_dict)
emb = last_token_pool(outputs.last_hidden_state, batch_dict['attention_mask'])

In [None]:
filtered_ds = filtered_ds.map(lambda row: get_orig_embeddings(row), batched=False)

In [None]:
lang = 'tha'
dataset = ds.filter(lambda example: example["fact_check_id"] in tasks['monolingual'][lang]['fact_checks'])
# fc = fact_checks_df.loc[tasks['crosslingual']['fact_checks']]

# fc_emb = fact_checks_embeddings.loc[fc.index]
fc_emb = get_embeddings(fc.index.to_list(), fc['claim'].apply(lambda x: x[0]).to_list(), batch_size = 1)
# fc_emb = get_embeddings(fc.index.to_list(), fc['content'].to_list(), batch_size = 32)

In [54]:
filtered_ds[:10]

{'claim_orig': ["'ฮ.พม่า'โดนสอยร่วงในรัฐคะฉิ่น นักบิน-ผู้โดยสารเสียชีวิต?",
  "'เซเลนสกี' เผลอใส่เสื้อติดเครื่องหมายนาซีแบบฮิตเลอร์",
  '11ข้อคิด บิลเกตส์',
  '6 สิ่งที่ควรทำก่อนฉีดวัคซีนโควิด-19',
  'ATK มันแค่อุปกรณ์ตรวจแอนตี้บอดี้ ไม่ได้สามารถระบุไวรัส',
  'CEO ของ Moderna ขายหุ้นมูลค่า $400 ล้านดอลลาร์ทิ้ง พร้อมกับลบบัญชี Twitter',
  'CEO ของบริษัทไฟเซอร์ ยังไม่ได้รับการฉีดวัคซีน',
  'Merlino ได้รับแต่งตั้งเป็นหัวหน้างานรักษาความปลอดภัยของพระราชวังอักฤษ',
  'PM 2.5 วาสลีน',
  'Rick Simpson น้ำมันกัญชาสกัด สามารถใช้รักษาโรคมะเร็ง'],
 'claim_eng': ["'Myanmar helicopter' was taken down in Kachin state The pilot and the passenger died?",
  "'Zelenski' accidentally wears a shirt with Nazi insignia like Hitler",
  '11 thoughts Bill Gates',
  '6 things to do before vaccinating against COVID-19',
  'ATK is just an anti-body detection device. The virus could not be identified.',
  "Moderna's CEO sells $400 million worth of shares along with deleting the Twitter account.",
  'CEO of Pfizer h

In [30]:
lang = 'tha'
fc = fact_checks_df.loc[tasks['monolingual'][lang]['fact_checks']]
# fc = fact_checks_df.loc[tasks['crosslingual']['fact_checks']]

# fc_emb = fact_checks_embeddings.loc[fc.index]
# fc_emb = get_embeddings(fc.index.to_list(), fc['claim'].apply(lambda x: x[0]).to_list(), batch_size = 1)
for i in tqdm(range(len(fc))):
    get_embeddings([fc.iloc[i].name], [fc.iloc[i]['claim'][0]], batch_size = 1)
# fc_emb = get_embeddings(fc.index.to_list(), fc['content'].to_list(), batch_size = 32)

  0%|          | 0/382 [00:00<?, ?it/s]
Processing Batches:   0%|          | 0/1 [00:00<?, ?it/s][A
Processing Batches: 100%|██████████| 1/1 [00:00<00:00,  3.34it/s]
  0%|          | 1/382 [00:00<01:58,  3.22it/s]
Processing Batches:   0%|          | 0/1 [00:00<?, ?it/s][A
Processing Batches: 100%|██████████| 1/1 [00:00<00:00,  4.68it/s]
  1%|          | 2/382 [00:00<01:39,  3.83it/s]
Processing Batches:   0%|          | 0/1 [00:00<?, ?it/s][A
Processing Batches: 100%|██████████| 1/1 [00:00<00:00,  4.95it/s]
  1%|          | 3/382 [00:00<01:31,  4.15it/s]
Processing Batches:   0%|          | 0/1 [00:00<?, ?it/s][A
Processing Batches: 100%|██████████| 1/1 [00:00<00:00,  4.78it/s]
  1%|          | 4/382 [00:00<01:28,  4.28it/s]
Processing Batches:   0%|          | 0/1 [00:00<?, ?it/s][A
Processing Batches: 100%|██████████| 1/1 [00:00<00:00,  4.73it/s]
  1%|▏         | 5/382 [00:01<01:27,  4.30it/s]
Processing Batches:   0%|          | 0/1 [00:00<?, ?it/s][A
Processing Batches: 100%

KeyboardInterrupt: 

In [10]:
posts_df.iloc[0]['eng_content']

"! Brazen vaccination fake by Markus Söder! It's really unbelievable how bold Top politicians such as Markus Söder kidding us. On Instagram does Söder busy advertising for vaccination But if you look closely, you can see you that he can't be injected at all. The lid is still on the needle. You can see how much those who want to vaccinate you, the Trust vaccines! markus.soeder TBE ..."

In [12]:
import torch
import torch.nn.functional as F
from torch import Tensor
from transformers import AutoTokenizer, AutoModel

def last_token_pool(last_hidden_states: Tensor,
                 attention_mask: Tensor) -> Tensor:
    left_padding = (attention_mask[:, -1].sum() == attention_mask.shape[0])
    if left_padding:
        return last_hidden_states[:, -1]
    else:
        sequence_lengths = attention_mask.sum(dim=1) - 1
        batch_size = last_hidden_states.shape[0]
        return last_hidden_states[torch.arange(batch_size, device=last_hidden_states.device), sequence_lengths]

def get_detailed_instruct(task_description: str, query: str) -> str:
    return f'Instruct: {task_description}\nQuery: {query}'

# Each query must come with a one-sentence instruction that describes the task
task = 'Given a question, retrieve Wikipedia passages that answer the question'
queries = [
    get_detailed_instruct(task, "what is this post about?")
    # get_detailed_instruct(task, '최초의 원자력 발전소는 무엇인가?'),
    # get_detailed_instruct(task, 'Who invented Hangul?')
]
# No need to add instruction for retrieval documents
passages = [
    posts_df.iloc[0]['eng_content']
]


max_length = 4096
input_texts = [*queries, *passages]
# Tokenize the input texts
batch_dict = tokenizer(input_texts, max_length=max_length, padding=True, truncation=True, return_tensors="pt").to('cuda')
outputs = model(**batch_dict)
embeddings = last_token_pool(outputs.last_hidden_state, batch_dict['attention_mask'])

# Normalize embeddings
embeddings = F.normalize(embeddings, p=2, dim=1)
scores = (embeddings[:1] @ embeddings[1:].T) * 100
print(scores.tolist())

[[28.78125]]


In [12]:
model

MistralModel(
  (embed_tokens): Embedding(32000, 4096, padding_idx=2)
  (layers): ModuleList(
    (0-31): 32 x MistralDecoderLayer(
      (self_attn): MistralAttention(
        (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
        (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
        (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
        (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
      )
      (mlp): MistralMLP(
        (gate_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
        (up_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
        (down_proj): Linear4bit(in_features=14336, out_features=4096, bias=False)
        (act_fn): SiLU()
      )
      (input_layernorm): MistralRMSNorm((4096,), eps=1e-05)
      (post_attention_layernorm): MistralRMSNorm((4096,), eps=1e-05)
    )
  )
  (norm): MistralRMSNorm((4096,), eps=1e-05)
  (rotary_emb): MistralRo

# building fact checks embeddings

In [None]:
folder_path = "./data/embeddings/fact_checks_embeddings/KaLM-embedding-multilingual-mini-v1"

fact_checks_embeddings = get_embeddings(fact_checks_df.index.to_list(), fact_checks_df['claim'].apply(lambda x: x[0]).to_list(), batch_size = 128)
pd.DataFrame({'embedding': fact_checks_embeddings.values()}, index=fact_checks_embeddings.keys()).to_pickle(f'{folder_path}/fact_checks_embeddings.pkl')

Batches:   0%|          | 0/1202 [00:00<?, ?it/s]

In [None]:
with open(f"{folder_path}/fact_checks_embeddings.pkl", "rb") as file:
    my_object = pickle.load(file)

my_object

# finding matchings

In [9]:
def get_fact_checks(post_id):
    FCs = mapping_df[mapping_df['post_id'] == post_id]['fact_check_id'].to_list()
    return fact_checks_df.loc[FCs].index.to_list()

def common_element(list1, list2):
    return any(item in list2 for item in list1)

def get_accuracy(posts_ids, top_indices_ids, show_logs = False):
    mismatched_posts = []
    corrects = 0
    for i, p in enumerate(posts_ids):
      FCs = get_fact_checks(p)
      result = common_element(FCs, top_indices_ids[i])

      if show_logs:
        print("=================================================================")
        print(f'fact_checks for post {p}')
        print(f"content: {posts_df.loc[p]['content']}")
        print(FCs)
        for x in FCs:
          print(f"title: {fact_checks_df.loc[x]['title']}")
          print(f"claim: {fact_checks_df.loc[x]['claim'][1]}")
        print(result)
      corrects += result == True
      if not result:
        mismatched_posts.append(p)
    return corrects, mismatched_posts

In [10]:
with open("./data/embeddings/fact_checks_embeddings/KaLM-embedding-multilingual-mini-v1/fact_checks_embeddings.pkl", "rb") as file:
    fact_checks_embeddings = pickle.load(file)

fact_checks_embeddings

Unnamed: 0,embedding
0,"[-0.022592599, -0.02723657, 0.032828655, -0.04..."
1,"[0.021081995, -0.017943902, 0.004556368, 0.001..."
2,"[0.011077701, -0.024155142, -0.029340882, -0.0..."
3,"[0.017757082, -0.0072806375, 0.050494693, -0.0..."
4,"[0.034658972, 0.026354847, 0.012632944, 0.0351..."
...,...
205744,"[-0.019015046, -0.04366706, 0.037821624, 0.003..."
205745,"[-0.07111357, 0.03539614, 0.01701976, 0.024223..."
205747,"[-0.044656698, -0.027773362, 0.009518869, 0.00..."
205749,"[0.026000615, -0.10748318, 0.028869668, -0.042..."


In [None]:
# all train posts
for lang in tasks['monolingual'].keys():
    print(f"lang: {lang}, posts: { len(tasks['monolingual'][lang]['posts_train']) }, fc: { len(tasks['monolingual'][lang]['fact_checks']) }")

lang: fra, posts: 1596, fc: 4355
lang: spa, posts: 5628, fc: 14082
lang: eng, posts: 4351, fc: 85734
lang: por, posts: 2571, fc: 21569
lang: tha, posts: 465, fc: 382
lang: deu, posts: 667, fc: 4996
lang: msa, posts: 1062, fc: 8424
lang: ara, posts: 676, fc: 14201


## manual testing

In [11]:
lang = 'tha'
fc = fact_checks_df.loc[tasks['monolingual'][lang]['fact_checks']]
# fc = fact_checks_df.loc[tasks['crosslingual']['fact_checks']]

# fc_emb = fact_checks_embeddings.loc[fc.index]
fc_emb = get_embeddings(fc.index.to_list(), fc['claim'].apply(lambda x: x[0]).to_list(), batch_size = 1)
# fc_emb = get_embeddings(fc.index.to_list(), fc['content'].to_list(), batch_size = 32)

Processing Batches:   9%|▉         | 36/382 [00:09<01:34,  3.64it/s]


OutOfMemoryError: CUDA out of memory. Tried to allocate 112.00 MiB. GPU 0 has a total capacity of 14.74 GiB of which 112.12 MiB is free. Process 4043 has 14.63 GiB memory in use. Of the allocated memory 14.30 GiB is allocated by PyTorch, and 211.63 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
# posts = posts_summaries.loc[posts_summaries.index.isin(tasks['monolingual'][lang]['posts_train'])]
posts = posts_df.loc[tasks['monolingual'][lang]['posts_train']]
# posts = posts_df.loc[tasks['crosslingual']['posts_train']]

print(f"lang: {lang}, posts: { len(posts) }, fc: { len(fc) }")

posts_embedding = get_embeddings(posts.index.to_list(), posts['content'].to_list(), batch_size = 4)

# similarities = cosine_similarity(list(posts_embedding.values()), fc_emb['embedding'].to_list())
similarities = cosine_similarity(list(posts_embedding.values()), list(fc_emb.values()))

nearest = np.argpartition(similarities, -10, axis=1)[:, -10:]
# top_indices = [[fc_emb.iloc[idx].name for idx in sublist] for sublist in nearest]
top_indices = [[list(fc_emb.keys())[idx] for idx in sublist] for sublist in nearest]

corrects, mismatched_posts = get_accuracy(posts.index, top_indices)

print(f"accuracy: {corrects/len(posts) * 100}% !")

## creating predictions

In [None]:
with open('./data/original data/monolingual_predictions.json', 'r') as file:
    monolingual_predictions = json.load(file)

with open('./data/original data/crosslingual_predictions.json', 'r') as file:
    crosslingual_predictions = json.load(file)

In [None]:
content_field = 'content'
model_path = 'KaLM-embedding-multilingual-mini-v1'

### monolingual posts

In [None]:
for lang in tasks['monolingual'].keys():
    # posts = posts_df.loc[list(posts_summaries.index.intersection(tasks['monolingual'][lang]['posts_train']))]
    posts = posts_df.loc[tasks['monolingual'][lang]['posts_dev']]
    # posts = posts_df.loc[tasks['monolingual'][lang]['posts_train']]
    fc_embeddings = fact_checks_embeddings.loc[tasks['monolingual'][lang]['fact_checks']]

    print(f"lang: {lang}, posts: { len(posts) }, fc: { len(fc_embeddings) }")

    posts_embedding = get_embeddings(posts.index.to_list(), posts[content_field].to_list(), batch_size = 32)

    similarities = cosine_similarity(list(posts_embedding.values()), fc_embeddings['embedding'].to_list())

    nearest = np.argpartition(similarities, -10, axis=1)[:, -10:]
    top_indices = [[fc_embeddings.iloc[idx].name for idx in sublist] for sublist in nearest]

    corrects, mismatched_posts = get_accuracy(posts.index, top_indices)

    # saving the results
    monolingual_predictions.update(dict(zip([str(ind) for ind in posts.index.to_list()], top_indices)))

    print(f"accuracy: {corrects/len(posts) * 100}% !")

### crosslingual posts

In [None]:
# posts = posts_df.loc[list(posts_summaries.index.intersection(tasks['crosslingual']['posts_train']))]
posts = posts_df.loc[tasks['crosslingual']['posts_dev']]
fc_embeddings = fact_checks_embeddings.loc[tasks['crosslingual']['fact_checks']]

print(f"lang: {lang}, posts: { len(posts) }, fc: { len(fc_embeddings) }")

posts_embedding = get_embeddings(posts.index.to_list(), posts[content_field].to_list(), batch_size = 32)

similarities = cosine_similarity(list(posts_embedding.values()), fc_embeddings['embedding'].to_list())

nearest = np.argpartition(similarities, -10, axis=1)[:, -10:]
top_indices = [[fc_embeddings.iloc[idx].name for idx in sublist] for sublist in nearest]

corrects, mismatched_posts = get_accuracy(posts.index, top_indices)

#saving the results
crosslingual_predictions.update(dict(zip([str(ind) for ind in posts.index.to_list()], top_indices)))

print(f"accuracy: {corrects/len(posts) * 100}% !")

### saving the results

In [None]:
file_path = f'./results/Submissions/{model_path}'

# Convert numpy int64 to Python int before serializing
def convert_to_int(obj):
    if isinstance(obj, np.int64):
        return int(obj)
    # If it's a dictionary or list, recursively convert values
    if isinstance(obj, dict):
        return {k: convert_to_int(v) for k, v in obj.items()}
    if isinstance(obj, list):
        return [convert_to_int(v) for v in obj]
    return obj

# Now use the converted dictionary with json.dump
with open(f'{file_path}/monolingual_predictions.json', 'w') as json_file:
    json.dump(convert_to_int(monolingual_predictions), json_file, indent=4)

with open(f'{file_path}/crosslingual_predictions.json', 'w') as json_file:
    json.dump(convert_to_int(crosslingual_predictions), json_file, indent=4)