<a href="https://colab.research.google.com/github/annanasnas/askqe/blob/main/baseline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
from google.colab import userdata

# Credentials are fetched through google.colab.userdata rather than hardcoded.
# GH_TOKEN is interpolated into the `git clone` URL below; HF_TOKEN is passed
# to huggingface_hub.login().
GH_TOKEN = userdata.get('GH_TOKEN')
HF_TOKEN = userdata.get('HF_TOKEN')

In [10]:
from huggingface_hub import login
# Authenticate this session with the Hugging Face Hub using the token fetched
# above. NOTE(review): presumably required because the Gemma checkpoint loaded
# later is access-gated — confirm.
login(token=HF_TOKEN)

In [13]:
!git clone https://{GH_TOKEN}@github.com/annanasnas/askqe.git

fatal: destination path 'askqe' already exists and is not an empty directory.


### Fact Generation

In [11]:
# Directory for the baseline outputs; the atomic-facts JSONL produced by the
# generation cell is written inside it.
folder_path = "/content/askqe/baseline"

# exist_ok=True keeps this cell safe to re-run when the directory already exists.
os.makedirs(folder_path, exist_ok=True)


In [1]:
# One-shot prompt asking the model to split an English sentence into atomic
# facts, formatted as a Python list. `{{sentence}}` is a literal placeholder:
# the generation cell fills it with str.replace(), not str.format(), so the
# double braces must be kept exactly as written.
atomic_fact_prompt = """Task: You will be given an English sentence. Your goal is to identify a list of atomic facts from the sentence. Atomic fact is a short sentence conveying one piece of information. Output the list of atomic facts in Python list format without giving any additional explanation. Do not output as code format (```python```).

*** Example Starts ***
Sentence: The number of accessory proteins and their function is unique depending on the specific coronavirus.
Atomic facts: ['The number of accessory proteins is unique depending on the specific coronavirus.', 'The function of accessory proteins is unique depending on the specific coronavirus.']
*** Example Ends ***

Sentence: {{sentence}}
Atomic facts: """

In [3]:
import torch
import json
from transformers import pipeline
from tqdm.notebook import tqdm
import sys
import os


# Decide the device once. The original checked CUDA availability before
# clearing the cache but then hard-coded device="cuda", which crashes on a
# CPU-only runtime.
cuda_available = torch.cuda.is_available()
if cuda_available:
    # Release cached GPU memory left over from a previous run of this cell.
    torch.cuda.empty_cache()
else:
    print("WARNING: CUDA is not available; falling back to CPU (generation will be very slow).")

# Text-generation pipeline around the instruction-tuned Gemma 2 2B checkpoint,
# loaded in bfloat16.
pipe = pipeline(
    "text-generation",
    model="google/gemma-2-2b-it",
    model_kwargs={"dtype": torch.bfloat16},
    device="cuda" if cuda_available else "cpu",  # use the GPU when present
)
# Decoder-only models must be left-padded for batched generation so that the
# newly generated tokens directly follow each prompt.
pipe.tokenizer.padding_side = "left"

# Generation knobs.
BATCH_SIZE = 32  # number of prompts handed to the pipeline per batch
LIMIT = 128  # temporary: caps how many prompts are built from the input file

# Source records (JSON Lines) and the destination for records + generated facts.
input_file = "/content/askqe/biomqm/dev_with_backtranslation.jsonl"
output_file = "/content/askqe/baseline/askqe_atomic_facts_backup.jsonl"


# Parallel lists: data_buffer[i] is the input record whose prompt is
# prompts_list[i]. They must stay the same length because the save step zips
# them with the generated outputs.
data_buffer = []
prompts_list = []
skipped_lines = 0  # lines dropped because of invalid JSON or a non-string "src"

with open(input_file, "r", encoding="utf-8") as f:
    for line in f:
        if len(prompts_list) >= LIMIT: # temporary
            break
        if not line.strip():
            continue  # ignore blank lines
        try:
            d = json.loads(line)
        except json.JSONDecodeError:
            # Only malformed JSON is tolerated; other errors should surface.
            skipped_lines += 1
            continue
        # Records without a "src" field are skipped, as before.
        if not isinstance(d, dict) or "src" not in d:
            continue
        src = d["src"]
        if not isinstance(src, str):
            skipped_lines += 1
            continue
        # Build the prompt BEFORE touching either list, then append to both
        # together. The original appended the record first and swallowed any
        # later error, which could leave the lists out of step.
        p = atomic_fact_prompt.replace("{{sentence}}", src)
        # Gemma chat format: one user turn, then an open model turn to complete.
        full_p = f"<start_of_turn>user\n{p}<end_of_turn>\n<start_of_turn>model\n"
        data_buffer.append(d)
        prompts_list.append(full_p)

if skipped_lines:
    print(f"Skipped {skipped_lines} malformed line(s) in {input_file}")

def data_generator():
    """Yield the prepared prompts one at a time, in list order.

    Reads the module-level ``prompts_list`` when iterated; the resulting
    generator is what gets passed to ``pipe(...)`` as its input.
    """
    yield from prompts_list

print(f"Starting generation of {len(prompts_list)} samples...")
final_results = []

# The pipeline is fed a generator, so results are consumed as they are
# produced; tqdm needs an explicit total because a generator has no length.
generation_stream = pipe(
    data_generator(),
    batch_size=BATCH_SIZE,
    max_new_tokens=512,
    return_full_text=False,
)

for out in tqdm(generation_stream, total=len(prompts_list)):
    # Each item is a list of candidates; only the first candidate is kept.
    generated_text = out[0]["generated_text"].strip()
    # Drop a trailing end-of-turn marker if the model emitted one as text.
    ends_with_marker = generated_text.endswith("<end_of_turn>")
    if ends_with_marker:
        generated_text = generated_text[:-len("<end_of_turn>")].strip()
    final_results.append(generated_text)

# zip() silently stops at the shorter list, so a length mismatch would drop or
# mis-pair records without any error. Fail loudly instead; `data_buffer` and
# `final_results` stay in memory for inspection.
if len(final_results) != len(data_buffer):
    raise ValueError(
        f"Cannot save: {len(data_buffer)} input records but "
        f"{len(final_results)} generated outputs."
    )

print("Saving...")
with open(output_file, "w", encoding="utf-8") as f:
    for d, res in zip(data_buffer, final_results):
        # Stored as the raw generated text; it is not parsed into a list here.
        d["atomic_facts"] = res
        # ensure_ascii=False keeps non-ASCII characters readable in the JSONL.
        f.write(json.dumps(d, ensure_ascii=False) + "\n")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cuda


Starting generation of 128 samples...


  0%|          | 0/128 [00:00<?, ?it/s]

Saving...


### NLI