In [None]:
"""
This script installs necessary packages, logs into Hugging Face Hub, and sets up a language model for generating question-answer pairs using Gemma-2B-IT.
It fetches a dataset from Kubermatic for training, and employs a function 'gemma_result' to generate QA pairs from extracted text snippets, saving results in a CSV file.

Dependencies:
- accelerate
- datasets
- huggingface_hub
- transformers
- torch
- tqdm
- csv, re (Python standard library)
- google.colab

Usage:
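Make sure Google Drive is mounted at /content/gdrive, because the generated QA pairs are appended to a CSV file there.
'question_ratio' is meant to control how many questions are generated per document, but the line that uses it is
currently commented out, so exactly one QA pair is requested per document.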

"""

!pip install accelerate
!pip install datasets
from huggingface_hub import notebook_login
notebook_login()

from transformers import AutoTokenizer, AutoModelForCausalLM
import transformers
from datasets import load_dataset
import torch
import re
from tqdm import tqdm

import csv
from google.colab import drive
drive.mount('/content/gdrive')

# Hugging Face Hub id of the Gemma checkpoint used for generation
# (the "-it" suffix presumably marks the instruction-tuned variant).
model_id = "google/gemma-1.1-2b-it"
# Weights are loaded in bfloat16.
dtype = torch.bfloat16

# Tokenizer and causal LM are both loaded from `model_id`; the model is
# placed on the CUDA device, so a GPU runtime is required.
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="cuda",
    torch_dtype=dtype,
)

# "train" split of the Kubermatic CNCF raw-text dataset; the main loop below
# reads each row's 'tag' and 'content' fields.
dataset = load_dataset("Kubermatic/cncf-raw-data-for-llm-training", split="train")

#taken in parts from https://medium.com/@lucamassaron/sherlock-holmes-q-a-enhanced-with-gemma-2b-it-fine-tuning-2907b06d2645

def gemma_result(question: str,
                 model: torch.nn.Module = model,
                 tokenizer = tokenizer,
                 temperature: float = 0.0,
                 max_new_tokens: int = 256,
                 return_answer: bool = False) -> "str | None":
    """
    Generate a response to a given prompt using a causal language model.

    The decoded text is the full generated sequence (callers in this file
    split it on their own prompt marker to isolate the continuation), with
    the literal "<bos>" / "<eos>" markers removed and surrounding whitespace
    stripped.

    Args:
        question (str): The prompt to generate a response for.
        model (torch.nn.Module): The pretrained model used for generation; it
            must expose a `generate` method.
        tokenizer: The tokenizer used to encode the prompt and decode the output.
        temperature (float, optional): Sampling temperature. Values > 0 enable
            sampling; 0 (the default) requests deterministic, non-sampled decoding.
        max_new_tokens (int, optional): The maximum number of tokens to generate.
            Default is 256.
        return_answer (bool, optional): Whether to return the text instead of
            printing it. Default is False.

    Returns:
        None if return_answer is False (the text is printed), otherwise the
        generated text as a string.
    """
    # Send the inputs to wherever the model's weights live rather than
    # assuming "cuda", so a model passed in explicitly works on any device.
    device = next(model.parameters()).device
    inputs = tokenizer(question, return_tensors="pt").to(device)

    generation_kwargs = {"max_new_tokens": max_new_tokens}
    if temperature > 0:
        generation_kwargs.update(do_sample=True, temperature=temperature)
    else:
        # Pass do_sample explicitly so the model's default generation config
        # cannot silently turn sampling on for the "temperature == 0" case.
        generation_kwargs["do_sample"] = False

    outputs = model.generate(**inputs, **generation_kwargs)
    result = tokenizer.decode(outputs[0]).replace("<bos>", "").replace("<eos>", "").strip()

    if return_answer:
        return result
    print(result)
    return None

# Collects one [question, answer, project_name] row per generated QA pair;
# the rows are written to the CSV file at the end of the script.
qa_data = []
# Failure counter reported by the main loop when a generation attempt raises.
fail_count = 0


def extract_json(text: str, word: str) -> str:
    """
    Extracts the string value associated with a specified key from JSON-like text.

    The text does not have to be a complete, valid JSON document (model output
    often is not), so the value is located with a regular expression instead
    of parsing the whole text. Only the first occurrence of the key is used,
    and the value must be a double-quoted string on a single line.

    Args:
        text (str): The JSON-formatted text string.
        word (str): The key to extract the value for. It is matched literally.

    Returns:
        str: The value associated with the key 'word', with JSON escape
        sequences (e.g. \\" or \\n) decoded when they are valid, or an empty
        string if the key is not found.
    """
    import json  # local import: only needed to decode JSON string escapes

    # - re.escape: keys containing regex metacharacters are matched literally.
    # - \s*:\s*  : tolerate any spacing around the colon.
    # - (?:[^"\\\n]|\\.)* : stop at the first *unescaped* quote so values that
    #   contain \" are not truncated.
    pattern = rf'"{re.escape(word)}"\s*:\s*"((?:[^"\\\n]|\\.)*)"'
    match = re.search(pattern, text)
    if match is None:
        return ""

    raw_value = match.group(1)
    try:
        # Decode escapes by parsing the captured text as a JSON string literal.
        return json.loads(f'"{raw_value}"')
    except json.JSONDecodeError:
        # Not a valid JSON string (e.g. a stray backslash): return it verbatim.
        return raw_value
# chunks = 2 # increment this number up to len(extracted_texts)

question = ""
answer = ""
# Only takes effect if the commented-out `no_questions` formula inside the loop
# is enabled; decrement this number to produce more questions (suggested: 24)
question_ratio = 1000

# Iterate over the dataset row by row. Indexing `dataset['content'][i]` would
# re-read the whole column on every iteration.
for row in tqdm(dataset):
    text_project = row['tag']['project_name']
    information_chunk = row['content'][0]['data']

    question_text = f"""Create a question and its answer from the following piece of information for a project of the Cloud Native Computing Foundation landscape,
    do not assume the reader knows the text hence put all the necessary information into the question,
    and return it exclusively in JSON format in the format {'{"question": "...", "answer": "..."}'}.
    Here is the piece of information to elaborate: 
    "{information_chunk}"

    OUTPUT JSON:
    """
    # no_questions = max(1, len(information_chunk) // question_ratio)
    no_questions = 1
    for _ in range(no_questions):
        try:
            result = gemma_result(question_text, model=model, temperature=0, return_answer=True)
            # The decoded output still contains the prompt; keep only what
            # follows the last "OUTPUT JSON:" marker.
            result = result.split("OUTPUT JSON:")[-1]

            question = extract_json(result, "question")
            answer = extract_json(result, "answer")
            # extract_json returns "" rather than raising, so an unusable
            # response has to be turned into a failure explicitly; otherwise
            # empty rows would end up in the CSV.
            if not question or not answer:
                raise ValueError("model output did not contain a question/answer pair")
            qa_data.append([f"{question}", f"{answer}", f"{text_project}"])
        except Exception as exc:  # not a bare except: Ctrl-C must still stop the run
            # Increment first so the message reports the up-to-date count.
            fail_count += 1
            print(f"Gemma wasn't able to create a proper question answer pair. No. of failed attempts: {fail_count}")
            print(f"Reason: {exc!r}")

# Append the collected rows to the CSV on Google Drive. The context manager
# closes the file even if writing fails; the encoding is explicit so
# non-ASCII text is written consistently.
with open('/content/gdrive/My Drive/filename.csv', 'a+', newline='', encoding='utf-8') as file:
    write = csv.writer(file)
    write.writerows(qa_data)