<a href="https://colab.research.google.com/github/annanasnas/askqe/blob/QE%2BQA/baseline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# BioMQM + gemma-2-2b-it

In [1]:
import os
from google.colab import userdata

# Read the GitHub and Hugging Face access tokens from the Colab secrets store,
# so that neither token appears in the notebook source.
GH_TOKEN, HF_TOKEN = (
    userdata.get(secret_name) for secret_name in ('GH_TOKEN', 'HF_TOKEN')
)

In [2]:
from huggingface_hub import login
# Authenticate this session with the Hugging Face Hub using the HF_TOKEN secret
# read earlier in the notebook.
login(token=HF_TOKEN)

In [3]:
!git clone https://{GH_TOKEN}@github.com/annanasnas/askqe.git

Cloning into 'askqe'...
remote: Enumerating objects: 1191, done.[K
remote: Counting objects: 100% (102/102), done.[K
remote: Compressing objects: 100% (97/97), done.[K
remote: Total 1191 (delta 53), reused 7 (delta 3), pack-reused 1089 (from 2)[K
Receiving objects: 100% (1191/1191), 52.54 MiB | 9.82 MiB/s, done.
Resolving deltas: 100% (923/923), done.
Updating files: 100% (1041/1041), done.


## 1. Question Generation (QG)

### 1.1 Fact Generation

In [3]:
import json
from tqdm.notebook import tqdm

In [6]:
!pip install -q -U bitsandbytes accelerate

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.4/59.4 MB[0m [31m13.4 MB/s[0m eta [36m0:00:00[0m
[?25h

In [4]:
import torch
from transformers import pipeline, BitsAndBytesConfig
# Release cached CUDA memory (when a GPU is present) before loading the model.
cuda_present = torch.cuda.is_available()
if cuda_present:
    torch.cuda.empty_cache()

# Load the weights in 4-bit through bitsandbytes; computation runs in float16.
quantization_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16)

# Text-generation pipeline around the instruction-tuned Gemma 2 2B checkpoint.
pipe = pipeline(
    task="text-generation",
    model="google/gemma-2-2b-it",
    device_map="auto",
    model_kwargs={"quantization_config": quantization_config},
)

# Pad prompts on the left so that, in a batch, the generated tokens directly
# follow the end of every prompt.
pipe.tokenizer.padding_side = "left"

model.safetensors.index.json:   0%|          | 0.00/24.2k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/241M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/187 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/47.0k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

Device set to use cuda:0


In [5]:
# Prompt template for atomic-fact extraction. It contains one worked example,
# and the literal placeholder "{{sentence}}" is substituted with the source
# sentence via str.replace() in the generation cell. The text is sent to the
# model verbatim, so any wording change here changes the model input.
atomic_fact_prompt = """Task: You will be given an English sentence. Your goal is to identify a list of atomic facts from the sentence. Atomic fact is a short sentence conveying one piece of information. Output the list of atomic facts in Python list format without giving any additional explanation. Do not output as code format (```python```).

*** Example Starts ***
Sentence: The number of accessory proteins and their function is unique depending on the specific coronavirus.
Atomic facts: ['The number of accessory proteins is unique depending on the specific coronavirus.', 'The function of accessory proteins is unique depending on the specific coronavirus.']
*** Example Ends ***

Sentence: {{sentence}}
Atomic facts: """

In [6]:
from collections import deque

BATCH_SIZE = 100
# Temporary restriction: only records whose "lang_tgt" equals this value are
# processed. TODO: drop the language check in is_eligible() to cover all pairs.
TARGET_LANG = "es"
input_file = "/content/askqe/biomqm/dev_with_backtranslation.jsonl"
output_file = "/content/askqe/baseline/askqe_atomic_facts.jsonl"

# FIFO of records whose prompts were handed to the pipeline but whose outputs
# have not been written yet. The pipeline returns results in input order, so
# popleft() pairs each generation with the record it belongs to.
data_buffer = deque()

# Checkpoint: every line already present in the output file is one finished
# record, so a re-run resumes after that many eligible records.
# NOTE: the checkpoint is only valid if the eligibility filter is unchanged
# between runs; delete the output file after changing TARGET_LANG.
processed_count = 0
if os.path.exists(output_file):
    with open(output_file, "r", encoding="utf-8") as f:
        processed_count = sum(1 for _ in f)


def is_eligible(data):
    """Return True if `data` has a "src" field and its "lang_tgt" is TARGET_LANG."""
    return "src" in data and data.get("lang_tgt") == TARGET_LANG


def data_generator():
    """Yield one parsed JSON record per line of `input_file`."""
    with open(input_file, "r", encoding="utf-8") as f:
        for line in f:
            yield json.loads(line)


# total_lines is the raw number of input records; total_eligible is the number
# that will actually be sent to the model. The progress bar must use the
# latter, otherwise it can never reach 100%.
total_lines = 0
total_eligible = 0
for record in data_generator():
    total_lines += 1
    if is_eligible(record):
        total_eligible += 1


def prompt_generator(source_gen):
    """Yield a Gemma chat-formatted prompt for every eligible, unprocessed record.

    Args:
        source_gen: iterable of parsed input records (dicts).

    Yields:
        The atomic-fact prompt for the record's "src" sentence, wrapped in
        Gemma turn markers. The first `processed_count` eligible records are
        skipped (resume from checkpoint). Each record whose prompt is yielded
        is appended to `data_buffer` first.
    """
    skipped_so_far = 0
    for data in source_gen:
        if not is_eligible(data):
            continue
        if skipped_so_far < processed_count:
            skipped_so_far += 1
            continue
        data_buffer.append(data)
        prompt = atomic_fact_prompt.replace("{{sentence}}", data["src"])
        yield f"<start_of_turn>user\n{prompt}<end_of_turn>\n<start_of_turn>model\n"


data_gen = data_generator()
prompts_gen = prompt_generator(data_gen)


with open(output_file, "a", encoding="utf-8") as f_out:
    pipeline_iterator = pipe(prompts_gen, batch_size=BATCH_SIZE, max_new_tokens=1024, return_full_text=False)
    for out in tqdm(pipeline_iterator, total=total_eligible, initial=processed_count):
        current_data = data_buffer.popleft()

        response = out[0]["generated_text"].strip()
        # Drop a trailing end-of-turn marker if the model emitted one as text.
        if response.endswith("<end_of_turn>"):
            response = response[:-len("<end_of_turn>")].strip()

        current_data["atomic_facts"] = response

        f_out.write(json.dumps(current_data, ensure_ascii=False) + "\n")
        # Flush per record so that an interrupted run leaves only complete
        # lines on disk and the line-count checkpoint stays accurate.
        f_out.flush()


  0%|          | 0/5216 [00:00<?, ?it/s]

In [8]:
import pandas as pd

# Load the generated atomic facts (JSON Lines: one record per line) and preview
# the source sentence next to the raw model output.
file_path = "/content/askqe/baseline/askqe_atomic_facts.jsonl"
df = pd.read_json(file_path, lines=True)

preview_columns = ["src", "atomic_facts"]
df[preview_columns].head()

Unnamed: 0,src,atomic_facts
0,"However, in the last years several step forwar...",['Several steps forwards in the field of preci...
1,In this review we focused on some of these ele...,['This review focuses on some of these element...
2,"Although several progresses have been made, at...","['Several progresses have been made.', 'At the..."
3,In this review we focused on some of these ele...,['This review focused on some of these element...
4,Transurethral resection of the bladder represe...,['Transurethral resection of the bladder is a ...


### 1.2 Entailment classification

In [None]:
import torch
import json
import ast
from transformers import pipeline
from tqdm.notebook import tqdm

In [None]:
# NLI checkpoint used to test each atomic fact against its source sentence.
model_name = "roberta-large-mnli" #"roberta-base-mnli"

# Pipeline device index: 0 (first GPU) when CUDA is available, otherwise -1 (CPU).
if torch.cuda.is_available():
    device = 0
else:
    device = -1

# top_k=None makes the pipeline return the score of every label, not only the best one.
nli_pipe = pipeline(task="text-classification", model=model_name, top_k=None, device=device)


Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cpu


In [None]:
# NOTE(review): the fact-generation step writes askqe_atomic_facts.jsonl, but
# this cell reads a "_backup" copy that no cell shown here creates — confirm
# the file exists (or point this at the generated file) before a clean re-run.
input_file = "/content/askqe/baseline/askqe_atomic_facts_backup.jsonl"
output_file = "/content/askqe/baseline/askqe_atomic_facts_filtered.jsonl"

# A fact is kept for NLI checking only if it is strictly longer than this.
MIN_FACT_CHARS = 5


def parse_atomic_facts(raw):
    """Convert the stored model output into a list of candidate facts.

    Args:
        raw: value of the record's "atomic_facts" field (normally the model's
            text output, expected to look like a Python list literal).

    Returns:
        The parsed list if `raw` is a valid list literal; a one-element list
        holding str(value) if it parses to something else; and `[raw]` if it
        cannot be parsed at all.
    """
    try:
        parsed = ast.literal_eval(raw)
    # Only the errors literal_eval documents for malformed input; a bare
    # `except:` would also swallow KeyboardInterrupt and hide real bugs.
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        return [raw]
    if isinstance(parsed, list):
        return parsed
    return [str(parsed)]


with open(input_file, "r", encoding="utf-8") as f_in, open(output_file, "w", encoding="utf-8") as f_out:
    for line in tqdm(f_in):
        item = json.loads(line)

        # Records without a source sentence or generated facts are dropped.
        if "src" not in item or "atomic_facts" not in item:
            continue

        facts = parse_atomic_facts(item["atomic_facts"])
        valid_facts = [
            fact for fact in facts
            if isinstance(fact, str) and len(fact) > MIN_FACT_CHARS
        ]

        # Stays empty when there is nothing to check; the record is still written.
        clean_facts = []
        if valid_facts:
            # Premise = source sentence, hypothesis = candidate fact.
            pairs = [{"text": item["src"], "text_pair": fact} for fact in valid_facts]
            # truncation=True keeps over-long pairs within the model's maximum
            # input length instead of failing the whole run.
            results = nli_pipe(pairs, truncation=True)

            for fact, res in zip(valid_facts, results):
                # `res` holds one {label, score} dict per class; take the best.
                top_label = max(res, key=lambda x: x['score'])['label'].upper()
                if "CONTRADICTION" in top_label:
                    print(f"CONTRADICTION: {fact}. {res}")
                else:
                    clean_facts.append(fact)

        item["atomic_facts"] = clean_facts
        f_out.write(json.dumps(item, ensure_ascii=False) + "\n")

print("Finished")