In [1]:
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    GenerationConfig
)
import transformers
from tqdm import tqdm
from trl import SFTTrainer
import torch
import time
import pandas as pd
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
#############################################################################################################

In [3]:
# bitsandbytes quantization settings used when loading the base model:
# 4-bit NF4 weights, float16 compute dtype, double quantization disabled.
compute_dtype = torch.float16
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=False,
)

In [4]:
# Load the Phi-2 base checkpoint with the 4-bit quantization config.
model_name = "microsoft/phi-2"
# A single empty-string key places the whole model on device index 0.
device_map = {"": 0}
original_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map=device_map,
    trust_remote_code=True,
)

Loading checkpoint shards: 100%|██████████████████| 2/2 [00:02<00:00,  1.01s/it]


In [5]:
# Slow (non-fast) tokenizer for the same checkpoint, configured for left
# padding and with BOS/EOS insertion requested.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    trust_remote_code=True,
    padding_side="left",
    add_eos_token=True,
    add_bos_token=True,
    use_fast=False,
)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [6]:
from transformers import set_seed

# Seed the random number generators so the run is repeatable.
seed = 42
set_seed(seed=seed)

In [7]:
from peft import prepare_model_for_kbit_training

# Run PEFT's k-bit training preparation on the quantized base model.
# The result is bound back to the same name, so this cell is meant to be run
# once per freshly loaded model.
original_model = prepare_model_for_kbit_training(model=original_model)

In [8]:
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

# LoRA adapter settings: rank 32 with alpha 32, 10% dropout, no bias terms,
# attached to the q/k/v projection modules and the modules named "dense".
config = LoraConfig(
    task_type="CAUSAL_LM",
    r=32,
    lora_alpha=32,
    lora_dropout=0.1,
    bias="none",
    target_modules=["q_proj", "k_proj", "v_proj", "dense"],
)

# 1 - Enabling gradient checkpointing to reduce memory usage during fine-tuning
original_model.gradient_checkpointing_enable()

# Wrap the base model so that the LoRA adapters are added on top of it.
peft_model = get_peft_model(original_model, config)

In [9]:
def print_trainable_parameters(model):
    """Print how many of ``model``'s parameters are trainable.

    Walks ``model.named_parameters()``, summing ``numel()`` over every
    parameter and, separately, over the parameters whose ``requires_grad``
    is true, then prints both totals together with the trainable percentage.

    A model that exposes no parameters reports ``0.0`` percent instead of
    raising ``ZeroDivisionError``.

    Args:
        model: Any object whose ``named_parameters()`` yields
            ``(name, parameter)`` pairs, where each parameter provides
            ``numel()`` and ``requires_grad``.

    Returns:
        None. The summary line is written to stdout.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        count = param.numel()
        all_param += count
        if param.requires_grad:
            trainable_params += count
    # NOTE(review): numel() counts stored elements; for 4-bit quantized weights
    # that pack several values per element the "all params" figure is presumably
    # lower than the logical parameter count -- confirm before quoting the ratio.
    # Guard the ratio so an empty model does not divide by zero.
    percent = 100 * trainable_params / all_param if all_param else 0.0
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {percent}"
    )

# Report trainable vs. total parameter counts for the LoRA-wrapped model.
print_trainable_parameters(peft_model)

trainable params: 20971520 || all params: 1542364160 || trainable%: 1.3596996444730667


In [10]:
#############################################################################################################

In [11]:
# Load the conversation examples stored under the "data" field of the local
# JSON file, then take the "train" split of the loaded result.
conv_dataset = load_dataset(
    "json",
    data_files="data/conv_data.json",
    field="data",
)["train"]

In [12]:
# Show the raw text of one conversation example (row 3) as a sanity check.
print(conv_dataset[3]["text"])

### Instruction: You are an interviwer who intends to test the interviewee's technical capabilities by asking proper follow-up question based on your previous conversation with the interviewee.

### Conversation: 
interviewer: Hi, I'm Lucy! I'll be interviewing you for the React.js job. Are you ready to get started, Modern Silpi?
interviewee: hello Lucy yeah I am ready to take the to give my interview for react this job
interviewer: Great to hear, Modern Silpi! Let's start with the first question:
interviewee: yeah sure please go ahead
interviewer: Can you explain the concept of state in React?
interviewee: yeah like the concept of estate is used to change something like for example we have given some initial default value so we can if some that point is clicked or done something some thing with that things then we can change its default value to our requirement you have to import the state on the top like import use State from react like that and then you can use it in our code to sho

In [13]:
# dataset = dataset.train_test_split(test_size=0.997)

In [14]:
# dataset = dataset["train"]

In [15]:
# Display the conversation dataset summary (features and row count) before tokenization.
conv_dataset

Dataset({
    features: ['text'],
    num_rows: 3900
})

In [16]:
# Tokenize the "text" column in batches, truncating each example to at most
# 1024 tokens; the tokenizer outputs are added as new columns.
conv_dataset = conv_dataset.map(
    lambda batch: tokenizer(batch["text"], truncation=True, max_length=1024),
    batched=True,
)

In [17]:
# Display the conversation dataset summary again, now with the tokenizer columns added.
conv_dataset

Dataset({
    features: ['text', 'input_ids', 'attention_mask'],
    num_rows: 3900
})

In [18]:
#############################################################################################################

In [19]:
# Load the domain examples stored under the "data" field of the local JSON
# file, then take the "train" split of the loaded result.
domain_dataset = load_dataset(
    "json",
    data_files="data/domain_data.json",
    field="data",
)["train"]

In [20]:
def gen_text(sample):
    """Add a ``"text"`` entry made of the prompt followed by the completion.

    The record is updated in place and that same object is returned.

    Args:
        sample: Mapping with string-like ``"prompt"`` and ``"completion"``
            entries.

    Returns:
        The same ``sample`` object, now also carrying ``"text"``.
    """
    prompt, completion = sample["prompt"], sample["completion"]
    sample["text"] = prompt + completion
    return sample

In [21]:
def process_dataset(data, seed=42):
    """Shuffle a prompt/completion dataset and reduce it to a ``"text"`` column.

    The rows are shuffled with a fixed seed, ``gen_text`` is mapped over every
    row to build the ``"text"`` field, and the original ``"prompt"`` and
    ``"completion"`` columns are then removed.

    Args:
        data: Dataset object providing ``shuffle``, ``map`` and
            ``remove_columns``, with ``"prompt"`` and ``"completion"`` columns.
        seed: Seed passed to ``shuffle``. Defaults to 42, the value that was
            previously hard-coded, so existing calls behave the same.

    Returns:
        The processed dataset returned by ``remove_columns``.
    """
    shuffled = data.shuffle(seed=seed)
    with_text = shuffled.map(gen_text)
    return with_text.remove_columns(["prompt", "completion"])

In [22]:
# Shuffle the domain examples and merge prompt/completion into a single "text" column.
dataset = process_dataset(domain_dataset)

In [23]:
# Display the processed domain dataset summary (features and row count).
dataset

Dataset({
    features: ['text'],
    num_rows: 235
})

In [24]:
# Tokenize the processed domain examples in batches, truncating each one to at
# most 1024 tokens.
# NOTE(review): this rebinds `domain_dataset` from the raw prompt/completion
# data to the tokenized data, so the earlier `process_dataset(domain_dataset)`
# cell cannot be re-run afterwards without reloading the JSON file.
domain_dataset = dataset.map(
    lambda batch: tokenizer(batch["text"], truncation=True, max_length=1024),
    batched=True,
)

In [25]:
# Display the tokenized domain dataset summary (features and row count).
domain_dataset

Dataset({
    features: ['text', 'input_ids', 'attention_mask'],
    num_rows: 235
})

In [27]:
from datasets import concatenate_datasets

# Build the final training set: tokenized domain examples first, followed by
# the tokenized conversation examples.
dataset = concatenate_datasets(
    [
        domain_dataset,
        conv_dataset,
    ]
)

In [28]:
# Display the combined training dataset summary (features and row count).
dataset

Dataset({
    features: ['text', 'input_ids', 'attention_mask'],
    num_rows: 4135
})

In [29]:
#############################################################################################################

In [30]:
# Hyperparameters for the LoRA fine-tuning run: one epoch, per-device batch
# size 1 with 4 gradient-accumulation steps, 8-bit paged AdamW, and a
# checkpoint plus a log entry every 100 steps.
# NOTE(review): output_dir is an absolute, user-specific path under
# ".../interview_tuning/", whereas the final save_model call in this notebook
# writes under ".../interviewer_tuning/" -- confirm which directory is intended.
peft_training_args = TrainingArguments(
    output_dir="/home/sivakrishna/Documents/jupyter/finetuning-test/interview_tuning/outputs",
    warmup_steps=1,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    num_train_epochs=1,
    learning_rate=2e-4,
    optim="paged_adamw_8bit",
    logging_steps=100,
    logging_dir="./logs",
    save_strategy="steps",
    save_steps=100,
    do_eval=False,
    gradient_checkpointing=True,
    report_to="none",
    # Must be a real boolean: the string 'True' only worked because any
    # non-empty string is truthy.
    overwrite_output_dir=True,
    group_by_length=True,
)
# Turn off the model's generation cache for training; it is typically disabled
# when gradient checkpointing is in use.
peft_model.config.use_cache = False

In [31]:
# Assemble the trainer for causal-LM fine-tuning of the LoRA-wrapped model.
# The collator is built with mlm=False, i.e. without masked-language-model masking.
# NOTE(review): the pad token is aliased to EOS in this notebook; the collator
# presumably excludes pad-token positions from the loss, which would also
# exclude EOS tokens -- confirm this is acceptable for generation stopping.
peft_trainer = transformers.Trainer(
    model=peft_model,
    args=peft_training_args,
    train_dataset=dataset,
    data_collator=transformers.DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False,
    ),
)

In [32]:
# Run the fine-tuning loop; the returned TrainOutput is displayed as the cell result.
peft_trainer.train()

Step,Training Loss
100,1.6936
200,1.5462
300,1.4943
400,1.4439
500,1.3647
600,1.3622
700,1.3951
800,1.3195
900,1.3433
1000,1.329


TrainOutput(global_step=1033, training_loss=1.4246627460490815, metrics={'train_runtime': 3271.4836, 'train_samples_per_second': 1.264, 'train_steps_per_second': 0.316, 'total_flos': 3.641223072872448e+16, 'train_loss': 1.4246627460490815, 'epoch': 1.0})

In [34]:
# Save the final fine-tuned model to disk.
# NOTE(review): this path is under ".../interviewer_tuning/", while the
# TrainingArguments output_dir is under ".../interview_tuning/" -- confirm the
# directory name; both are absolute, user-specific paths.
peft_trainer.save_model(
    output_dir="/home/sivakrishna/Documents/jupyter/finetuning-test/interviewer_tuning/outputs/final"
)