<a href="https://colab.research.google.com/github/annanasnas/askqe/blob/main/baseline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [9]:
import os
from google.colab import userdata

# Read the GitHub and Hugging Face access tokens from Colab's `userdata` store
# so that no credential is hard-coded in the notebook.
GH_TOKEN = userdata.get('GH_TOKEN')
HF_TOKEN = userdata.get('HF_TOKEN')

In [10]:
from huggingface_hub import login
# Authenticate this session with the Hugging Face Hub using HF_TOKEN.
# NOTE(review): presumably required to download the google/gemma-2-2b-it
# weights used later in the notebook — confirm the model is gated.
login(token=HF_TOKEN)

In [8]:
!git clone https://{GH_TOKEN}@github.com/annanasnas/askqe.git

Cloning into 'askqe'...
remote: Enumerating objects: 1159, done.[K
remote: Counting objects: 100% (70/70), done.[K
remote: Compressing objects: 100% (66/66), done.[K
remote: Total 1159 (delta 35), reused 7 (delta 3), pack-reused 1089 (from 2)[K
Receiving objects: 100% (1159/1159), 52.48 MiB | 14.55 MiB/s, done.
Resolving deltas: 100% (905/905), done.
Updating files: 100% (1039/1039), done.


### Fact Generation

In [11]:
# Directory for the baseline outputs; the fact-generation cell below writes its
# JSONL file under this path.
folder_path = "/content/askqe/baseline"

# exist_ok=True makes re-running this cell a no-op once the directory exists.
os.makedirs(folder_path, exist_ok=True)


In [None]:
import torch
import json
from transformers import pipeline
from tqdm.notebook import tqdm
import sys


# Atomic-fact generation: read the source sentences from a JSONL file, ask
# Gemma to split each sentence into atomic facts, and write every input record
# back out with an added "atomic_facts" field.

BATCH_SIZE = 32

atomic_fact_prompt = """Task: You will be given an English sentence. Your goal is to identify a list of atomic facts from the sentence. Atomic fact is a short sentence conveying one piece of information. Output the list of atomic facts in Python list format without giving any additional explanation. Do not output as code format (```python```).

*** Example Starts ***
Sentence: The number of accessory proteins and their function is unique depending on the specific coronavirus.
Atomic facts: ['The number of accessory proteins is unique depending on the specific coronavirus.', 'The function of accessory proteins is unique depending on the specific coronavirus.']
*** Example Ends ***

Sentence: {{sentence}}
Atomic facts: """

input_file = "/content/askqe/biomqm/dev_with_backtranslation.jsonl"
output_file = "/content/askqe/baseline/askqe_atomic_facts_backup.jsonl"


def build_prompt(sentence):
    """Return the full model prompt for one source sentence.

    The sentence is substituted into `atomic_fact_prompt` and the result is
    wrapped in the user/model turn markers written out by hand below.
    """
    p = atomic_fact_prompt.replace("{{sentence}}", sentence)
    return f"<start_of_turn>user\n{p}<end_of_turn>\n<start_of_turn>model\n"


# --- Load the input records -------------------------------------------------
# Done before the model is loaded so that a missing or empty input file fails
# fast. Both lists are rebuilt from scratch on every run and are only ever
# appended to together, so data_buffer[i] always corresponds to prompts_list[i].
prompts_list = []
data_buffer = []
bad_json_lines = 0
missing_src_records = 0

with open(input_file, "r", encoding="utf-8") as f:
    for line in f:
        if not line.strip():
            continue  # blank lines carry no record
        try:
            d = json.loads(line)
        except json.JSONDecodeError:
            # Best effort: tolerate a malformed line, but count it instead of
            # hiding every possible error behind a bare `except`.
            bad_json_lines += 1
            continue
        if not isinstance(d, dict) or not isinstance(d.get("src"), str):
            missing_src_records += 1
            continue
        data_buffer.append(d)
        prompts_list.append(build_prompt(d["src"]))

if bad_json_lines or missing_src_records:
    print(
        f"Skipped {bad_json_lines} malformed line(s) and "
        f"{missing_src_records} record(s) without a string 'src' field."
    )

# --- Load the model ---------------------------------------------------------
# Release cached, unused GPU memory left over from earlier cells.
if torch.cuda.is_available():
    torch.cuda.empty_cache()

pipe = pipeline(
    "text-generation",
    model="google/gemma-2-2b-it",
    model_kwargs={"dtype": torch.bfloat16},
    device="cuda",  # use the GPU
)
# Left padding keeps each prompt adjacent to the tokens generated for it when
# prompts of different lengths are batched together.
pipe.tokenizer.padding_side = "left"


def data_generator():
    """Yield the prompts one by one so the pipeline can batch them itself."""
    for p in prompts_list:
        yield p


# --- Generate ---------------------------------------------------------------
print(f"Starting generation of {len(prompts_list)} samples...")
final_results = []

for out in tqdm(pipe(data_generator(), batch_size=BATCH_SIZE, max_new_tokens=512, return_full_text=False), total=len(prompts_list)):
    generated_text = out[0]["generated_text"].strip()
    # Drop a trailing end-of-turn marker if it appears in the generated text.
    if generated_text.endswith("<end_of_turn>"):
        generated_text = generated_text[:-len("<end_of_turn>")].strip()
    final_results.append(generated_text)

# zip() below would silently truncate on a mismatch and write a partial or
# misaligned file, so verify the one-to-one pairing explicitly first.
if len(final_results) != len(data_buffer):
    raise RuntimeError(
        f"Got {len(final_results)} generations for {len(data_buffer)} records; "
        "refusing to write a misaligned output file."
    )

# --- Save -------------------------------------------------------------------
print("Saving...")
with open(output_file, "w", encoding="utf-8") as f:
    for d, res in zip(data_buffer, final_results):
        d["atomic_facts"] = res
        f.write(json.dumps(d, ensure_ascii=False) + "\n")

config.json:   0%|          | 0.00/838 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/24.2k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/241M [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/187 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/47.0k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

Device set to use cuda


Prompts initialization
Pipe initialization
