load and fine-tune llms on human data

here's the colab link --> https://colab.research.google.com/drive/1IFh4vqZRAiiP1aAO2EJBgz7bPVvc2slx#scrollTo=vaiptgAekStt 

In [1]:
from transformers import LlamaTokenizer, LlamaForCausalLM, BitsAndBytesConfig
import os
os.environ['CUDA_VISIBLE_DEVICES'] = "1,2,3"

In [3]:
import os
os.environ['HF_TOKEN'] = ""

In [4]:
import torch
from transformers import LlamaTokenizer, LlamaForCausalLM, BitsAndBytesConfig

model_id = "meta-llama/Llama-2-13b-chat-hf"
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

tokenizer = LlamaTokenizer.from_pretrained(model_id)
model = LlamaForCausalLM.from_pretrained(model_id, 
                                         quantization_config=bnb_config,
                                         #load_in_4bit=True,
                                         device_map="auto")

tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = model.config.eos_token_id

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [5]:
from peft import prepare_model_for_kbit_training

model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

In [6]:
# check lora trainable layers
for name, param in model.named_parameters():
    if param.requires_grad:
        print(name, param.shape)
        break
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

In [7]:
from peft import LoraConfig, get_peft_model, TaskType

lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    target_modules=["q_proj", "k_proj", "q_proj", "v_proj", "o_proj"],
    r=16,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    modules_to_save=['weight']
)

lora_model = get_peft_model(model, lora_config)
print_trainable_parameters(model)

# import torch
# device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# lora_model.to(device)

trainable params: 26214400 || all params: 6698193920 || trainable%: 0.391365199531279


In [10]:
fname = "/home/gunala/LLMDialGen/human_data/human_dat_reformatted.json"
# data processing
import json
with open(fname,'r') as inf:
    human_dat = json.load(inf)

def format_history(text,play):
  input = f"""
      <<SYS>>
      Imagine that you and your criminal partner have been placed under arrest and are being interrogated in separate rooms. Both you and your partner have the option to either 'Silence' or 'Confess'. If both of you are silent, you both receive a prison sentence of 1 year. If one of you confesses and the other stays silent, the partner that confesses will receive a prison sentence of 0 years while the one of the stayed silent will receive a sentence of 5 years. If both of you confess, you will both receive sentences of 3 years. You and your partner will be interrogated for 20 rounds total, and your goal is to minimize overall prison sentence across all rounds. Respond in one word, either 'Silence' or 'Confess'.\n
      <</SYS>>
      [INST]
      User:{text}
      [/INST]\n

      Assistant:{play}
  """
  return input

# now will want to reformat each game into llama2-chat friendly format
train_texts = {'texts':[]}
ind = 0
for game_id in human_dat.keys():
    round_ids = sorted([int(x) for x in human_dat[game_id].keys()],reverse=True)
    game_history = ""
    for round_id in range(len(round_ids) - 1):
        game_history += "Round " + str(round_id) + ": " + \
                        "Player 1 played " + human_dat[game_id][str(round_id)]['p1'] + \
                        ", Player 2 played " + human_dat[game_id][str(round_id)]['p2'] + ". "
    user_prompt = "You are Player 1, and this is the history of actions so far: " + game_history + ". Will you choose 'Silence' or 'Confess' in the next round? Please answer in 1 word."
    play = human_dat[game_id][str(round_id)]['p1']
    input = format_history(user_prompt,play)
    train_texts['texts'].append(input)

    user_prompt = "You are Player 2, and this is the history of actions so far: " + game_history + ". Will you choose 'Silence' or 'Confess' in the next round? Please answer in 1 word."
    play = human_dat[game_id][str(round_id)]['p2']
    input = format_history(user_prompt,play)
    train_texts['texts'].append(input)

    if ind == 5000:
        break

from datasets import Dataset
data = Dataset.from_dict(train_texts)
data = data.map(lambda samples: tokenizer(samples['texts']), batched=True)
data = data.train_test_split(test_size=0.15)

Map:   0%|          | 0/24716 [00:00<?, ? examples/s]

In [15]:
from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling

training_args = TrainingArguments(
      per_device_train_batch_size=1,
     # gradient_accumulation_steps=4,
      evaluation_strategy='epoch',
      num_train_epochs=1,
     # warmup_steps=2,
      max_steps=1,
      learning_rate=2e-4,
     # fp16=True,
      logging_steps=1,
      output_dir="outputs_train_13bchat",
      optim="paged_adamw_8bit"
)

trainer = Trainer(
    model=lora_model,
    args=training_args,
    train_dataset=data['train'],
    eval_dataset=data['test'],
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
)

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [16]:
trainer.train()

Epoch,Training Loss,Validation Loss
0,0.2088,0.152779


TrainOutput(global_step=1, training_loss=0.2087707370519638, metrics={'train_runtime': 1304.2957, 'train_samples_per_second': 0.001, 'train_steps_per_second': 0.001, 'total_flos': 29362384281600.0, 'train_loss': 0.2087707370519638, 'epoch': 0.0})

In [17]:
os.environ['HF_TOKEN'] = "" # need to use write-access token this time
lora_model.push_to_hub("aegunal/llama13bchat_ft_ipd")

adapter_model.safetensors:   0%|          | 0.00/105M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/aegunal/llama13bchat_ft_ipd/commit/af5e08a0e9a161f6e472729947dd1cc0e7cb0183', commit_message='Upload model', commit_description='', oid='af5e08a0e9a161f6e472729947dd1cc0e7cb0183', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
# for inference, can load the model:
from peft import AutoPeftModelForCausalLM
loaded_model = AutoPeftModelForCausalLM.from_pretrained("aegunal/llama7b_ft_ipd")