In [1]:
import os
import bitsandbytes as bnb
import pandas as pd
import torch
import torch.nn as nn
import transformers
from datasets import load_dataset
from peft import (
    LoraConfig,
    PeftConfig,
    get_peft_model,
    prepare_model_for_kbit_training,
)
from transformers import (
    AutoConfig,
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
)

In [2]:
#############################################################################################################

In [2]:
model_id = "tiiuae/falcon-7b"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    load_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

model =AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    trust_remote_code=True,
    quantization_config=bnb_config,
)

tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token





Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

  return self.fget.__get__(instance, owner)()


In [3]:
model = prepare_model_for_kbit_training(model)

You are using an old version of the checkpointing format that is deprecated (We will also silently ignore `gradient_checkpointing_kwargs` in case you passed it).Please update to the new format on your modeling file. To use the new format, you need to completely remove the definition of the method `_set_gradient_checkpointing` in your model.


In [4]:
config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.1,
    r=64,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=[
        "query_key_value",
        "dense",
        "dense_h_to_4h",
        "dense_4h_to_h",
        ]
    )

model = get_peft_model(model, config)

In [5]:
def print_trainable_parameters(model):
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

print_trainable_parameters(model)

trainable params: 130547712 || all params: 3739292544 || trainable%: 3.4912409356543783


In [7]:
#############################################################################################################

In [6]:
conv_dataset = load_dataset("json", data_files="data/processed-interview-data_v1.2.json", field="data")["train"]

Generating train split: 0 examples [00:00, ? examples/s]

In [7]:
print(conv_dataset["text"][3])

###INSTRUCTION:
You are 'Lucy' who is intended to interview, 'Madabathula Yashaswini' by asking proper follow-up questions to test the his/her abilities in 'maths'.
follow the below interview parameters:
difficulty: low
medium: English


###CONVERSATION:
interviewer: Hello, I'm Lucy and I'll be interviewing you for the maths job. Can you tell me about your educational background in mathematics?
interviewee: 5th class
interviewer: That's great! Let's start with some basic math questions. What is the value of 2+2?
interviewee: four four
interviewer: Good job! Now, what is the result of 10 divided by 2?
interviewee: 5


###OUTPUT:
Excellent! Can you tell me what is the square of 5? <|endoftext|>


In [8]:
conv_dataset

Dataset({
    features: ['text'],
    num_rows: 15176
})

In [9]:
conv_dataset = conv_dataset.map(lambda samples: tokenizer(samples["text"], truncation=True, max_length=1024), batched=True)

Map:   0%|          | 0/15176 [00:00<?, ? examples/s]

In [10]:
conv_dataset

Dataset({
    features: ['text', 'input_ids', 'attention_mask'],
    num_rows: 15176
})

In [11]:
#############################################################################################################

In [12]:
domain_dataset = load_dataset("json", data_files="data/domain_data.json", field="data")["train"]

Generating train split: 0 examples [00:00, ? examples/s]

In [13]:
def gen_text(sample):
    sample["text"] = sample["prompt"] + sample["completion"]
    return sample

In [14]:
def process_dataset(data):
    return (
        data.shuffle(seed=42)
        .map(gen_text)
        .remove_columns(
            [
                "prompt",
                "completion",
            ]
        )
    )

In [15]:
dataset = process_dataset(domain_dataset)

Map:   0%|          | 0/235 [00:00<?, ? examples/s]

In [16]:
dataset

Dataset({
    features: ['text'],
    num_rows: 235
})

In [17]:
domain_dataset = dataset.map(lambda samples: tokenizer(samples["text"], truncation=True, max_length=1024), batched=True)

Map:   0%|          | 0/235 [00:00<?, ? examples/s]

In [18]:
domain_dataset

Dataset({
    features: ['text', 'input_ids', 'attention_mask'],
    num_rows: 235
})

In [19]:
from datasets import concatenate_datasets
dataset = concatenate_datasets([domain_dataset, conv_dataset])

In [20]:
dataset

Dataset({
    features: ['text', 'input_ids', 'attention_mask'],
    num_rows: 15411
})

In [21]:
#############################################################################################################

In [22]:
training_args = transformers.TrainingArguments(
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    num_train_epochs=1,
    learning_rate=2e-4,
    bf16=True,
    do_eval=False,
    save_total_limit=4,
    logging_steps=10,
    output_dir="/home/sivakrishna/Documents/jupyter/finetuning-test/interviewer_tuning/outputs/falcon7b-out",
    save_strategy='epoch',
)
model.config.use_cache = False

In [23]:
peft_trainer = transformers.Trainer(
    model=model,
    train_dataset=conv_dataset,
    args=training_args,
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)

In [24]:
peft_trainer.train()

Step,Training Loss


KeyboardInterrupt: 

In [16]:
peft_trainer.save_model("/home/sivakrishna/Documents/jupyter/finetuning-test/interviewer_tuning/outputs/")