# Generate BeIR data for translation

This notebook generates the data that is pending translation for the BeIR task.

In [1]:
import os
import sys

# Project root. Defaults to the original author's checkout; set the
# HEBREW_TEXT_RETRIEVAL_DIR environment variable to run this notebook from a
# different location without editing the cell.
base_dir = os.environ.get(
    'HEBREW_TEXT_RETRIEVAL_DIR',
    '/home/nlp/achimoa/projects/hebrew_text_retrieval',
)
src_dir = os.path.join(base_dir, 'src')

# Switch the working directory to the project root and report where we are.
os.chdir(base_dir)
print('Current directory:', os.getcwd())

# Put the project's `src` directory on the import path (presumably so that
# project packages such as `data.beir` resolve -- confirm against the repo
# layout). The membership check keeps re-runs of this cell from appending
# duplicate entries.
if src_dir not in sys.path:
    sys.path.append(src_dir)
    print('Appended source directory:', src_dir)


Current directory: /home/nlp/achimoa/projects/hebrew_text_retrieval
Appended source directory: /home/nlp/achimoa/projects/hebrew_text_retrieval/src


In [2]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import pandas as pd
import numpy as np
from data.beir import *

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to
[nltk_data]     /home/nlp/achimoa/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/nlp/achimoa/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [3]:
# Dataset to sample from and the model whose tokenizer is handed to build_data.
dataset_name = 'BeIR/msmarco'
model_name = 'dicta-il/dictalm2.0-AWQ'

# Tokenizer is configured to pad on the left.
tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side='left')

# build_data comes from the project's `data.beir` module.
# NOTE(review): n=10 presumably caps the number of sampled queries -- confirm
# against build_data's definition.
data = build_data(n=10, tokenizer=tokenizer, dataset_name=dataset_name)

# Display the result as the cell output.
data

([{'id': '168216',
   'text': 'does legionella pneumophila cause pneumonia',
   'context': {'id': '1381477',
    'text': "Legionella pneumophila is the bacterial type that causes the pneumonia known as Legionnaires' disease. Other bacteria types that can cause pneumonia include the bacteria that cause so-called atypical pneumonia, Legionella pneumophila, Mycoplasma pneumoniae, and Chlamydophila pneumonia. Haemophilus influenzae is a type of bacteria that can cause pneumonia."}},
  {'id': '87181',
   'text': 'causes of left ventricular hypertrophy',
   'context': {'id': '47212',
    'text': 'Ventricular hypertrophy. Ventricular hypertrophy (VH) is thickening of the walls of a ventricle (lower chamber) of the heart.[1][2] Although left ventricular hypertrophy (LVH) is more common, right ventricular hypertrophy (RVH), as well as concurrent hypertrophy of both ventricles can also occur.'}},
  {'id': '527433',
   'text': 'types of dysarthria from cerebral palsy',
   'context': {'id': '12127

In [1]:
import os

from huggingface_hub import HfApi, HfFolder

# Never hard-code access tokens in a notebook: anything committed or shared
# exposes the credential. Read the token from the HF_TOKEN environment
# variable instead.
# NOTE: the token that was previously embedded in this cell must be treated
# as leaked and revoked in the Hugging Face account settings.
hf_token = os.environ.get("HF_TOKEN")
if not hf_token:
    raise RuntimeError(
        "HF_TOKEN is not set. Export a Hugging Face access token in the "
        "HF_TOKEN environment variable before running this cell."
    )

# Persist the token through HfFolder, as the original cell did.
HfFolder.save_token(hf_token)


In [4]:
# BeIR datasets grouped by task category. Categories with an empty list have
# no dataset listed here yet; the trailing comments name further datasets of
# the category that are not included.
beir = {
    'Misc': ['BeIR/msmarco'],
    'Fact checking': ['BeIR/fever', 'BeIR/climate-fever', 'BeIR/scifact'],
    'Citation-Prediction': ['BeIR/scidocs'],
    'Duplicate question retrieval': ['BeIR/quora'],  # CQADupStack
    'Argument retrieval': ['BeIR/arguana'],  # Touche-2020
    'News retrieval': [],  # TREC-NEWS, Robust04
    'Question answering': ['BeIR/nq', 'BeIR/hotpotqa'],  # FiQA-2018
    'Tweet retrieval': [],  # Signal-1M
    'Bio-medical IR': ['BeIR/trec-covid', 'BeIR/nfcorpus'],  # BioASQ
    'Entity retrieval': ['BeIR/dbpedia-entity'],
}

# Invert the table: one entry per dataset, keyed by dataset name, carrying its
# category and a fixed per-dataset segment-token setting. Insertion order
# follows the category order above, then the order inside each list.
new_beir = {
    dataset: {
        'category': category,
        'dataset_name': dataset,
        'max_document_segment_tokens': 256,
    }
    for category, datasets_in_category in beir.items()
    for dataset in datasets_in_category
}

print(new_beir)

{'BeIR/msmarco': {'category': 'Misc', 'dataset_name': 'BeIR/msmarco', 'max_document_segment_tokens': 256}, 'BeIR/fever': {'category': 'Fact checking', 'dataset_name': 'BeIR/fever', 'max_document_segment_tokens': 256}, 'BeIR/climate-fever': {'category': 'Fact checking', 'dataset_name': 'BeIR/climate-fever', 'max_document_segment_tokens': 256}, 'BeIR/scifact': {'category': 'Fact checking', 'dataset_name': 'BeIR/scifact', 'max_document_segment_tokens': 256}, 'BeIR/scidocs': {'category': 'Citation-Prediction', 'dataset_name': 'BeIR/scidocs', 'max_document_segment_tokens': 256}, 'BeIR/quora': {'category': 'Duplicate question retrieval', 'dataset_name': 'BeIR/quora', 'max_document_segment_tokens': 256}, 'BeIR/arguana': {'category': 'Argument retrieval', 'dataset_name': 'BeIR/arguana', 'max_document_segment_tokens': 256}, 'BeIR/nq': {'category': 'Question answering', 'dataset_name': 'BeIR/nq', 'max_document_segment_tokens': 256}, 'BeIR/hotpotqa': {'category': 'Question answering', 'dataset_na

In [2]:
from datasets import load_dataset as load_dataset_hf

# Load the "corpus" configuration of the BeIR/msmarco dataset from the
# Hugging Face Hub.
ds = load_dataset_hf(path="BeIR/msmarco", name="corpus")

In [1]:
from datasets import load_dataset as load_dataset_hf

# Load a dataset from the Hugging Face Hub using its default configuration.
# NOTE(review): this dataset is not part of BeIR; the cell looks like a
# standalone download check -- confirm whether it is still needed.
ds = load_dataset_hf(path="fka/awesome-chatgpt-prompts")

  from .autonotebook import tqdm as notebook_tqdm
