In [None]:
"""
This script installs and imports necessary packages, logs into Hugging Face Hub, and sets up a training pipeline for a language model using Gemma-1.1-2B-IT. It fine-tunes the model on a specific dataset, configuring hyperparameters and using advanced training techniques.

Dependencies:
- accelerate
- datasets
- trl
- peft
- bitsandbytes
- huggingface_hub
- transformers
- torch
- tqdm
- pandas

Environment Setup:
Authentication with the Hugging Face Hub is interactive: 'notebook_login()' prompts for a Hugging Face access token. This script does not read a 'GITHUB_TOKEN' environment variable.

Training Data:
- Uses the first 50 rows of the 'train' split of "Kubermatic/cncf-question-and-answer-dataset-for-llm-training".

Training Pipeline:
- Utilizes 'SFTTrainer' from 'trl' with 'LoraConfig' for training configuration.
- Fine-tunes 'AutoModelForCausalLM' with custom configurations including 4-bit quantization.

Output:
- Trained model and tokenizer are saved to 'trained_model'.
"""

# Install/upgrade the third-party libraries used below (-q: quiet, -U: upgrade).
# These are IPython shell magics, so this cell only runs inside a notebook.
!pip install -q -U accelerate
!pip install -q -U datasets
!pip install -q -U trl
!pip install -q -U peft
# bitsandbytes is installed with the package index pinned explicitly to PyPI.
!pip install -q -U -i https://pypi.org/simple/ bitsandbytes

# Log in to the Hugging Face Hub before any model download happens.
# NOTE(review): presumably needed because the Gemma weights are gated - confirm.
from huggingface_hub import notebook_login
notebook_login()

from transformers import AutoTokenizer, AutoModelForCausalLM
import transformers
from datasets import load_dataset
import torch
import re
from tqdm import tqdm
import pandas as pd
from datasets import Dataset
from peft import LoraConfig, PeftConfig
import bitsandbytes as bnb
import accelerate
from trl import SFTTrainer
from transformers import (AutoModelForCausalLM,
                          AutoModelForQuestionAnswering,
                          AutoTokenizer,
                          BitsAndBytesConfig,
                          TrainingArguments,
                          )


# training pipeline taken from https://huggingface.co/blog/gemma-peft
model_id = "google/gemma-1.1-2b-it"

# Quantization settings handed to the model loader: 4-bit NF4 weights with
# bfloat16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
    load_in_4bit=True,
)

# Tokenizer and quantized base model both come from the same checkpoint id.
tokenizer = AutoTokenizer.from_pretrained(model_id)
# TODO: Check if this can be changed to AutoModelForQuestionAnswering with GEMMA
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    quantization_config=bnb_config,
)

# Training Data: only the first 50 rows of the train split are requested.
dataset = load_dataset(
    "Kubermatic/cncf-question-and-answer-dataset-for-llm-training",
    split="train[:50]",
)


# Training (hyper)parameters (initial config taken from: https://medium.com/@lucamassaron/sherlock-holmes-q-a-enhanced-with-gemma-2b-it-fine-tuning-2907b06d2645)
# Upper bound on the tokenized sequence length passed to the trainer.
max_seq_length = 1024


# Directory that receives the trainer output and, at the end, the saved
# model and tokenizer.
output_dir = "trained_model"


training_arguments = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=1,
    gradient_checkpointing=True,
    # One sequence per device per step, accumulated over 8 steps before each
    # optimizer update.
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,
    optim="paged_adamw_32bit",
    save_steps=0,
    logging_steps=10,
    learning_rate=5e-4,
    weight_decay=0.001,
    fp16=True,
    bf16=False,
    max_grad_norm=0.3,
    max_steps=-1,
    warmup_ratio=0.03,
    group_by_length=False,
    # The trainer in this script is built without an eval_dataset, so
    # step-based evaluation cannot run: it would raise as soon as an
    # evaluation is attempted (or already at trainer construction on newer
    # Transformers releases). Evaluation is therefore switched off. To
    # re-enable it, pass an eval_dataset to the trainer and set this back to
    # 'steps'; eval_steps / eval_accumulation_steps are kept for that purpose
    # and are ignored while the strategy is "no".
    evaluation_strategy="no",
    eval_steps=500,
    eval_accumulation_steps=1,
    lr_scheduler_type="cosine",
    report_to="tensorboard",
    disable_tqdm=True,
    # debug="underflow_overflow"
)


def formatting_func(example):
    """
    Render question/answer pairs as training prompt strings.

    The input may be a batch, where 'Question' and 'Answer' are parallel
    lists (this is how a batched dataset map hands rows to the function),
    or a single row, where both values are plain strings. Every pair is
    formatted; previously only the first pair of a batch was used and all
    remaining rows were silently dropped from training.

    Args:
        example (dict): A dictionary containing 'Question' and 'Answer' keys,
            each holding either a string or a list of strings.

    Returns:
        list: One formatted string per question/answer pair, in input order.
            If the two lists differ in length, the extra entries of the
            longer one are ignored. An empty batch yields an empty list.
    """
    questions = example['Question']
    answers = example['Answer']

    # A single (non-batched) row carries bare strings; wrap them so both input
    # shapes share one code path instead of indexing into the characters.
    if isinstance(questions, str):
        questions, answers = [questions], [answers]

    # NOTE(review): the "Author:" label precedes the *answer*; it looks like a
    # leftover from the quote/author example this pipeline was adapted from.
    # It is kept unchanged because prompts used at inference time may depend
    # on it - confirm before renaming it to "Answer:".
    return [
        f"### Question: {question}\nAuthor: {answer}<eos>"
        for question, answer in zip(questions, answers)
    ]

# LoRA adapter settings: rank 64, alpha 16, no dropout, no bias terms,
# applied to the attention and MLP projection modules listed below.
lora_config = LoraConfig(
    r=64,
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    # TODO: Check if this can be changed to QUESTION_ANS with GEMMA
    task_type="CAUSAL_LM",
    target_modules=[
        "q_proj",
        "o_proj",
        "k_proj",
        "v_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
)


# Assemble the supervised fine-tuning trainer from the pieces defined above:
# the quantized base model and its tokenizer, the training arguments, the
# LoRA config, the dataset, and the prompt formatter.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_arguments,
    peft_config=lora_config,
    train_dataset=dataset,
    formatting_func=formatting_func,
    max_seq_length=max_seq_length,
)

# Run the fine-tuning loop.
trainer.train()

# Save model (no path given, so the trainer chooses the destination), then
# write the tokenizer into output_dir.
trainer.save_model()
tokenizer.save_pretrained(output_dir)