In [1]:
import pathlib
from datasets import load_dataset
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, AutoTokenizer
from transformers import TrainingArguments
from trl import SFTTrainer
from evaluate import load
from peft import LoraConfig, prepare_model_for_kbit_training
from args_parser import get_args
import re
from pathlib import Path
from utils import *

In [2]:
import numpy as np

In [3]:
args = get_args()

args.field = 'prompt'

for arg in vars(args):
    print(arg, getattr(args, arg))







logging_dir ./logs
use_flash_attention_2 False
report_to tensorboard
max_steps 10000
save_steps 400
logging_steps 100
max_seq_length 256
checkpoint_path None
do_eval False
do_train False
evaluation_strategy steps
eval_steps 10
log_level info
logging_strategy steps
save_total_limit 10
run_name LLama2
base_prompt Below is a clue for a decrypting crossword. Your task is to solve this clue. The number of characters in the answer should be same as the number in the parenthesis. Just output the answer only.
dataset_path ../data/disjoint_word_init.json
field prompt
model_name meta-llama/Llama-2-7b-hf
output_dir ./experiments
per_device_train_batch_size 4
per_device_val_batch_size 2
gradient_accumulation_steps 2
optim paged_adamw_32bit
learning_rate 0.0002
max_grad_norm 0.3
warmup_ratio 0.03
lr_scheduler_type constant
group_by_length True
bnb_4bit_quant_type nf4
bnb_4bit_compute_dtype bfloat16
bnb_4bit_use_double_quant True
gradient_checkpointing False
lora_alpha 16
lora_dropout 0.1
lora_r 64


In [4]:

model_name = args.model_name

## Bits and Bytes config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    # bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True
)


model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    trust_remote_code=True,
    use_flash_attention_2=args.use_flash_attention_2,
)


## Enable gradient checkpointing
if args.gradient_checkpointing:
    model.gradient_checkpointing_enable()

## Prepare model for k-bit training
# model = prepare_model_for_kbit_training(model)


## Print the number of trainable parameters
print_trainable_parameters(model)

## Silence the warnings
model.config.use_cache = False

## Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = 'right'


output_dir = args.output_dir

per_device_train_batch_size =  args.per_device_train_batch_size
per_device_val_batch_size = args.per_device_val_batch_size
gradient_accumulation_steps = args.gradient_accumulation_steps



optim = args.optim

save_steps = args.save_steps
logging_steps = args.logging_steps
learning_rate = args.learning_rate
max_grad_norm = args.max_grad_norm



print(f"save_steps: {save_steps}")
print(f"logging_steps: {logging_steps}")



warmup_ratio = args.warmup_ratio
lr_scheduler_type = args.lr_scheduler_type


output_dir = args.output_dir + f"/{model_name.split('/')[-1]}"
loggig_dir = args.logging_dir + f"/{model_name.split('/')[-1]}" + f"/logs"
Path(output_dir).mkdir(parents=True, exist_ok=True)
Path(loggig_dir).mkdir(parents=True, exist_ok=True)
print(f"Saving the model to {output_dir}")





lora_alpha = args.lora_alpha
lora_dropout = args.lora_dropout
lora_r = args.lora_r
lora_target_modules = args.lora_target_modules

peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules = lora_target_modules
)


max_seq_length = args.max_seq_length







Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

trainable params: 262410240 || all params: 3500412928 || trainable%: 7.496550989769399
save_steps: 400
logging_steps: 100
Saving the model to ./experiments/Llama-2-7b-hf


In [5]:
print("Loading the datasets")
train_dataset   = get_dataset(dataset_path=args.dataset_path,tokenizer=tokenizer, field = args.field, split = 'train')
val_dataset     = get_dataset(dataset_path=args.dataset_path,tokenizer=tokenizer, field = args.field, split = 'val')

Loading the datasets


In [6]:
def extract_output(batch):
    
    outputs = []
    for j,sample in enumerate(batch):
        lines = sample.split('\n')
        for i,l in enumerate(lines):

            if l=='### Response:':
                outputs.append( lines[i+1].lower().strip() )
                break
        if len(outputs) <= j :
            outputs.append('')
    
    return outputs
            


In [7]:

# tokenizer_test = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
# tokenizer_test.pad_token = tokenizer.eos_token

def compute_metrics(eval_pred):
    predictions, labels, inputs = eval_pred
    # predictions = predictions[:, 0]

    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    pred = predictions.argmax(-1)
    
    labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    pred = tokenizer.batch_decode(pred, skip_special_tokens=True)

    extracted_labels = extract_output(labels)
    extracted_pred = extract_output(pred)
    

    correct = 0
    for l,p in zip(extracted_labels, extracted_pred):
        correct += int(l==p)

    return { 'accuracy': correct/len(labels) }

   


    

In [8]:
training_arguments = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    per_device_eval_batch_size=per_device_val_batch_size,
    evaluation_strategy=args.evaluation_strategy,
    eval_steps=10,
    logging_steps=10,
    run_name=args.run_name,
    save_steps=save_steps,
    learning_rate=learning_rate,
    fp16=True,
    max_grad_norm=max_grad_norm,
    max_steps=100,
    warmup_ratio=warmup_ratio,
    group_by_length=True,
    lr_scheduler_type=lr_scheduler_type,
    report_to=args.report_to,
    gradient_checkpointing=args.gradient_checkpointing,
    neftune_noise_alpha=0.1,
    logging_dir=loggig_dir,
    include_inputs_for_metrics=True
)

In [9]:
val_dataset

Dataset({
    features: ['prompt', 'clue', 'labels'],
    num_rows: 32628
})

In [10]:
val_dataset = val_dataset.select(range(10))  

In [11]:
val_dataset[0]['prompt']

'### Instruction: Below is a clue for a decrypting crossword. Your task is to solve this clue. The number of characters in the answer should be same as the number in the parenthesis. Just output the answer only.\n\n### Input:\nTension in an arm? Slightly (1,6)\n\n### Response:\na trifle'

In [12]:
trainer = SFTTrainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    peft_config=peft_config,
    dataset_text_field=args.field,
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    args=training_arguments,
    compute_metrics=compute_metrics,
)

In [13]:
for name, module in trainer.model.named_modules():
    if "norm" in name:
        module = module.to(torch.float32)

if args.checkpoint_path:
    trainer.train(resume_from_checkpoint=args.checkpoint_path)
else:
    trainer.train()


# trainer.save_model()


print("Done training")
print(trainer.model)

  0%|          | 0/100 [00:00<?, ?it/s]

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'loss': 1.8151, 'learning_rate': 0.0002, 'epoch': 0.0}


  0%|          | 0/5 [00:00<?, ?it/s]


No chat template is defined for this tokenizer - using the default template for the LlamaTokenizerFast class. If the default is not appropriate for your model, please set `tokenizer.chat_template` to an appropriate template. See https://huggingface.co/docs/transformers/main/chat_templating for more information.



{'eval_loss': 0.8718175888061523, 'eval_accuracy': 0.0, 'eval_runtime': 171.797, 'eval_samples_per_second': 0.058, 'eval_steps_per_second': 0.029, 'epoch': 0.0}
{'loss': 0.8515, 'learning_rate': 0.0002, 'epoch': 0.0}


  0%|          | 0/5 [00:00<?, ?it/s]

{'eval_loss': 0.7333288788795471, 'eval_accuracy': 0.0, 'eval_runtime': 1.64, 'eval_samples_per_second': 6.097, 'eval_steps_per_second': 3.049, 'epoch': 0.0}


KeyboardInterrupt: 

: 

In [4]:
import time

x = []



for i in range(10):
    time.sleep(1)
    x.append(i)

KeyboardInterrupt: 

In [5]:
x

[0, 1, 2]