In [1]:
!pip install -q evaluate transformers accelerate bitsandbytes peft

In [2]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from evaluate import load
from datasets import load_from_disk , Dataset
from peft import PeftModel, PeftConfig
import pandas as pd

In [3]:
model_name_or_path = "databricks/dolly-v2-3b"
tokenizer_name_or_path = "databricks/dolly-v2-3b"
peft_model_path = "/content/drive/MyDrive/Colab Notebooks/Table to insights/dolly/prefix_tuned_model_1200"
text_column = "prompt"
label_column = "label"
max_length = 1200

In [4]:
# Specify the directory where the dataset is saved
saved_directory = "/content/drive/MyDrive/Colab Notebooks/Table to insights/Data/Analytical Datset"

# Load the dataset from the specified directory
dataset = load_from_disk(saved_directory)
dataset = dataset['validation']

In [None]:
# The below code is not working as its finishing the memory on free Collab
# the general idea is to have a final dataframe with 3 column the 1st column
# is the human baseline (Label in analytical datset) , 2nd column is
# the inference from base model (model without fine tuning) and the 3rd column
# is to get inference of peft model (model with fine tuning)
# if this dataframe is created then generating the rouge and blue metrics is
# trivial

In [5]:
def generate_summary(model, tokenizer, query, max_length=1024):
    input_ids = tokenizer.encode(query, return_tensors="pt", max_length=max_length, padding="max_length", truncation=True)
    summary_ids = model.generate(input_ids, max_length=150, num_beams=4, length_penalty=2.0, early_stopping=True)
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary

def summarize_example(example, model, tokenizer, text_column, label_column):
    prompt = example[text_column]
    label = example[label_column]
    model_summary = generate_summary(model, tokenizer, prompt)
    return {"prompt": prompt, "label": label, "model_summary": model_summary}

In [6]:
def process_model_and_save_results(tokenizer_name_or_path, model_name_or_path, peft_model_path, dataset, text_column, label_column, batch_size=8, chunk_size=32):
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path)
    base_model = AutoModelForCausalLM.from_pretrained(model_name_or_path, load_in_8bit=True, device_map="auto")

    config = PeftConfig.from_pretrained(peft_model_path)
    peft_base_model = AutoModelForCausalLM.from_pretrained(config.base_model_name_or_path,load_in_8bit=True,device_map="auto")
    peft_tuned_model = PeftModel.from_pretrained(peft_base_model, peft_model_path)
    del peft_base_model

    tqdm_desc_base = "Processing base model"
    tqdm_desc_tuned = "Processing fine-tuned model"

    summarize = lambda examples, model: [summarize_example(example, model, tokenizer, text_column, label_column) for example in examples]

    # Process the base model in chunks
    df_base_list = []
    for i in range(0, len(dataset), chunk_size):
        chunk = dataset[i:i+chunk_size]
        chunk = Dataset.from_dict(chunk)
        df_base_chunk = pd.DataFrame(chunk.map(summarize, batched=True, batch_size=batch_size, desc=tqdm_desc_base))
        df_base_list.append(df_base_chunk)

    df_base = pd.concat(df_base_list, axis=0)

    # Free up memory by deleting the base model
    del base_model

    # Process the fine-tuned model in chunks
    df_tuned_list = []
    for i in range(0, len(dataset), chunk_size):
        chunk = dataset[i:i+chunk_size]
        chunk = Dataset.from_dict(chunk)
        df_tuned_chunk = pd.DataFrame(chunk.map(summarize, batched=True, batch_size=batch_size, desc=tqdm_desc_tuned))
        df_tuned_list.append(df_tuned_chunk)

    df_tuned = pd.concat(df_tuned_list, axis=0)

    df_final = pd.concat([df_base, df_tuned], axis=1)
    return df_final

In [7]:
results = process_model_and_save_results(tokenizer_name_or_path, model_name_or_path, peft_model_path, dataset, text_column, label_column, batch_size=8, chunk_size=10)

Processing base model:   0%|          | 0/10 [00:00<?, ? examples/s]

TypeError: ignored

In [None]:
#