In [4]:
import os
import torch
from datasets import load_dataset, Dataset
import pandas as pd
import transformers
from transformers import AutoTokenizer
from trl import SFTTrainer
import transformers
from peft import AutoPeftModelForCausalLM
from transformers import GenerationConfig
from pynvml import *

In [None]:
# Hub id of the base checkpoint that is loaded for fine-tuning.
base_model = 'HuggingFaceH4/zephyr-7b-beta'
# Directory the trainer writes to and the LoRA adapter is saved in.
lora_output = 'HAJJLLM_zephyr7b_lora'
# Directory / hub repo name used when saving and pushing the merged model.
full_output = 'HAJJLLM_zephyr7b_beta'
# NOTE(review): DEVICE is not referenced anywhere else in the visible notebook — confirm it is still needed.
DEVICE = 'cuda'

In [None]:
from huggingface_hub import login

# Authenticate with the Hugging Face Hub so the merged model can be uploaded later.
# The access token is read from the HF_TOKEN environment variable instead of being
# pasted into the notebook, so it cannot leak through a shared or committed copy.
# When the variable is unset (or empty) `login` receives None and prompts for the token.
token = os.environ.get("HF_TOKEN") or None
login(token=token)

In [None]:
# Load the tokenizer of the base checkpoint; its chat template is used below to format the training text.
tokenizer = AutoTokenizer.from_pretrained(base_model)

In [None]:
### Read the CSV that holds the Prompt / Answer pairs used for fine-tuning.
data_location = r"path_to_csv" ## replace here
# NOTE(review): 'unicode_escape' also interprets backslash sequences found in the file and
# does not decode multi-byte UTF-8 text — confirm it matches the real encoding of the CSV.
data_df=pd.read_csv( data_location ,encoding='unicode_escape')

### Formatting helper: renders one Prompt/Answer row through the tokenizer's chat template.
### The system message is the fixed Hajj-assistant instruction.
def formatted_text(x):
    """Return one training example as a single chat-formatted string.

    `x` must support `x["Prompt"]` and `x["Answer"]` (for example a DataFrame
    row). The conversation is a fixed system message, then the user prompt,
    then the assistant answer. It is rendered as text (not token ids) and
    without a trailing generation prompt. Uses the module-level `tokenizer`.
    """
    system_message = {
        "role": "system",
        "content": "In this task, you are an AI bot designed to answer queries about Hajj, a significant religious event in Islam. Focus on being conversational and informative. Don't add unnecessary information.",
    }
    user_message = {"role": "user", "content": x["Prompt"]}
    assistant_message = {"role": "assistant", "content": x["Answer"]}
    conversation = [system_message, user_message, assistant_message]
    return tokenizer.apply_chat_template(
        conversation,
        tokenize=False,
        add_generation_prompt=False,
    )

### Build the "text" training column and wrap the frame as a Hugging Face Dataset.
## Replace "Prompt" and "Answer" here if the collected dataset uses different column names.
prompt_answer_pairs = data_df[["Prompt", "Answer"]]
data_df["text"] = prompt_answer_pairs.apply(formatted_text, axis=1)
print(data_df.iloc[0])  # show the first row as a quick sanity check
dataset = Dataset.from_pandas(data_df)

In [None]:

# Get quantized model
# The base checkpoint is loaded with the 8-bit flag set; device placement is left to device_map='auto'.
model = transformers.AutoModelForCausalLM.from_pretrained(base_model,
                                                          load_in_8bit=True,     # call for the 8 bit bnb quantized version
                                                          device_map='auto'
                                                          )

In [None]:
# Print the module tree to look up the layer names that can be used as LoRA target_modules.
print(model)

In [None]:
# Set PEFT adapter config (r=16, lora_alpha=32)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

# Target module names are the ones chosen for the Zephyr base model;
# print the model to find the matching names for another architecture.
config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj","k_proj","o_proj","gate_proj","up_proj","down_proj"],   # adapters on all listed projection layers (this is still LoRA, not full fine-tuning)
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM")

In [None]:
# Prepare the 8-bit model for adapter training (stabilises output layer and layernorms).
# The second parameter of prepare_model_for_kbit_training is the boolean
# `use_gradient_checkpointing`, not a bit width, so it is passed by name here;
# the earlier positional `8` only worked because it is truthy.
model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=True)
# Attach the LoRA adapter described by `config` (last step before training).
model = get_peft_model(model, config)

In [None]:
# Set Hyperparameters
MAXLEN=512                    # presumably the maximum sequence length in tokens — confirm where it is consumed
BATCH_SIZE=4                  # passed as per_device_train_batch_size
GRAD_ACC=4                    # passed as gradient_accumulation_steps
OPTIMIZER='paged_adamw_8bit' # save memory
# NOTE(review): LoRA runs commonly use a considerably larger learning rate — confirm 5e-06 is intended.
LR=5e-06                      # slightly smaller than pretraining lr | and close to LoRA standard

In [None]:
# Set training config
training_config = transformers.TrainingArguments(
    per_device_train_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRAD_ACC,  # effective batch per device = BATCH_SIZE * GRAD_ACC
    optim=OPTIMIZER,
    learning_rate=LR,
    fp16=True,            # consider compatibility when using bf16
    logging_steps=10,
    num_train_epochs=2,
    output_dir=lora_output,
    remove_unused_columns=False,
)

# Set collator: causal-LM objective (mlm=False), batches are padded with the tokenizer.
data_collator = transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)

# Setup trainer. MAXLEN was defined with the other hyperparameters but never
# handed to the trainer, so sequences were cut at the library default instead of
# the configured length; it is now passed explicitly as max_seq_length.
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    data_collator=data_collator,
    args=training_config,
    dataset_text_field="text",
    max_seq_length=MAXLEN,
    # callbacks=[early_stop],  # TODO: add early stopping, LoRA overfits easily
)

In [None]:
# Disable the generation cache while training; it has to be switched back on before inference.
model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
# Run the fine-tuning loop with the trainer configured above.
trainer.train()

In [None]:
# Save the trained model to lora_output; the PEFT config is read back from this directory later.
trainer.save_model(lora_output)

In [None]:
# Get peft config
from peft import PeftConfig
# NOTE: this rebinds `config`, which held the LoraConfig used for training until now.
config = PeftConfig.from_pretrained(lora_output)

In [None]:
# Get base model
# Reloaded from the path recorded in the adapter config and, unlike the training
# load, without the 8-bit flag — presumably so the adapter can be merged into it.
model = transformers.AutoModelForCausalLM.from_pretrained(config.base_model_name_or_path,
                                                          return_dict=True,
                                                          )

In [None]:
# Save a copy of the base tokenizer (pad token set to EOS) under "zephyr-7b-beta-base-full".
# NOTE(review): add_eos_token=True is presumably stored with the saved tokenizer, so anything
# loading that directory would append EOS to every encoded text — confirm this is wanted.
tokenizer = transformers.AutoTokenizer.from_pretrained(base_model,
                                                       add_eos_token=True
                                                       )
tokenizer.pad_token = tokenizer.eos_token
tokenizer.save_pretrained("zephyr-7b-beta-base-full")

In [None]:
# Load the Lora model
from peft import PeftModel
model = PeftModel.from_pretrained(model, lora_output)

# Get tokenizer
# This tokenizer is saved and pushed together with the merged model and is the
# one reloaded for inference. It must not be created with add_eos_token=True:
# that option is persisted by save_pretrained, so every inference prompt would
# get </s> appended and the model would be asked to continue a finished sequence.
tokenizer = transformers.AutoTokenizer.from_pretrained(config.base_model_name_or_path)
tokenizer.pad_token = tokenizer.eos_token

In [None]:
# Merge the LoRA adapter into the base weights and keep the resulting plain model.
merged_model = model.merge_and_unload()

In [None]:
# Write the merged model and its tokenizer side by side into full_output.
merged_model.save_pretrained(full_output)
tokenizer.save_pretrained(full_output)

In [None]:
# push model to hub
# Both uploads use full_output as the repository name.
merged_model.push_to_hub(full_output)
tokenizer.push_to_hub(full_output)

In [None]:
# load for inferencing
# Reloads the merged model and tokenizer from the local full_output directory; rebinds `tokenizer` and `model`.
tokenizer = AutoTokenizer.from_pretrained(full_output)
model = transformers.AutoModelForCausalLM.from_pretrained(full_output)

In [None]:
# load base for comparison
# NOTE: this rebinds `tokenizer` and `model`. If it runs after the cell that loads the
# fine-tuned model, the inference cell below evaluates the base model instead.
tokenizer = AutoTokenizer.from_pretrained(base_model)
model = transformers.AutoModelForCausalLM.from_pretrained(base_model)

In [None]:
# Save whichever model/tokenizer pair is currently bound to `model` and `tokenizer`
# into "zephyr7b-beta-full" (the base pair when the comparison cell ran last).
model.save_pretrained("zephyr7b-beta-full")
tokenizer.save_pretrained("zephyr7b-beta-full")

In [None]:
### inferencing on the new model

def process_data_sample(example):
    """Build the Zephyr-format inference prompt for a single query.

    Args:
        example: mapping with an "instruction" key holding the user question.

    Returns:
        The prompt string: system turn, user turn, then an open assistant turn.

    The layout mirrors what the training data gets from the tokenizer's chat
    template: each completed turn is closed with the end-of-sequence token
    ("</s>") and the role tag is followed directly by the content. The earlier
    hand-written prompt had no "</s>" markers and a stray space in front of the
    system text, so inference prompts did not match the training format.
    """
    system_text = "In this task, you are an AI bot designed to answer queries about Hajj, a significant religious event in Islam. Focus on being conversational and informative. Don't add unnecessary information."
    eos = "</s>"  # Zephyr end-of-sequence token that terminates every finished turn

    processed_example = (
        "<|system|>\n" + system_text + eos + "\n"
        + "<|user|>\n" + example["instruction"] + eos + "\n"
        + "<|assistant|>\n"
    )

    return processed_example

# Build the prompt for one sample question (uncomment a different one to try it).
inp_str = process_data_sample(
    {
        # "instruction": "Tell me about the importance of Tawaf.",
        # "instruction": "What are the steps of Tawaf?",
        # "instruction": "What are the steps of Hajj?",
        # "instruction": "What happens in Mina during Hajj?",
        "instruction": "What should I do after I reach mina while in Hajj?",
        # "instruction": "Tell me about the history of Hajj.",
        # "instruction": "What started Hajj historically?",
        # "instruction": "what is the origin of hajj?",
    }
)

# Tokenize and place the tensors on the same device as the model, so generation
# also works when the model lives on the GPU (a no-op when it is on the CPU).
inputs = tokenizer(inp_str, return_tensors="pt").to(model.device)

generation_config = GenerationConfig(
    do_sample=True,
    top_k=1,                # only the single most likely token is kept at each step
    temperature=0.1,
    max_new_tokens=256,
    pad_token_id=tokenizer.eos_token_id
)

# Time the generation call together with decoding/printing, in seconds.
import time
st_time = time.time()
outputs = model.generate(**inputs, generation_config=generation_config)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
print(time.time()-st_time)

In [None]:
# Print the model's memory footprint floor-divided by 1024**2 (MiB, assuming the call returns bytes — confirm).
print(model.get_memory_footprint()//1024**2)

In [None]:
# Pick the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# additional Info when using cuda
if device.type == 'cuda':
    # CUDA is only queried inside this guard: torch.cuda.current_device()
    # raises on a machine without a usable GPU, which defeated the CPU fallback.
    print(torch.cuda.current_device())
    print(torch.cuda.get_device_name(0))

In [None]:
def print_gpu_utilization(device_index=0):
    """Print how much memory is currently in use on one GPU, as reported by NVML.

    Args:
        device_index: NVML index of the GPU to query; defaults to 0, the device
            this helper has always reported on.

    NVML is initialised for the query and shut down again afterwards, even when
    the query fails, so repeated calls do not leave the library initialised.
    """
    nvmlInit()
    try:
        handle = nvmlDeviceGetHandleByIndex(device_index)
        info = nvmlDeviceGetMemoryInfo(handle)
        print(f"GPU memory occupied: {info.used//1024**2} MB.")
    finally:
        nvmlShutdown()

print_gpu_utilization()