# CODE LLAMA 7B FINE TUNING - ALPACA 20K DATASET

## IMPORTS

In [None]:
!pip install -q -U accelerate==0.23.0 peft==0.5.0 bitsandbytes==0.41.1 transformers==4.34 trl==0.7.2 wandb
!pip install huggingface_hub
!pip install scipy

In [1]:
import os
import wandb

# Set the notebook name that W&B reads from the environment, then authenticate.
os.environ.update({"WANDB_NOTEBOOK_NAME": "code_llama_alpaca_20K"})
wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33marjuntheprogrammer[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [2]:
import torch
from transformers import (
    AutoModelForCausalLM,
    CodeLlamaTokenizer,
    default_data_collator,
    Trainer,
    TrainingArguments,
    TrainerCallback,
    BitsAndBytesConfig,
    AutoTokenizer,
)
from contextlib import nullcontext
from tqdm import tqdm
import json
import copy
import datasets
from peft import LoraConfig, PeftConfig
from transformers import default_data_collator, Trainer

  from .autonotebook import tqdm as notebook_tqdm
  torch.utils._pytree._register_pytree_node(


## PRETRAINED MODEL AND DATASET SETUP

In [4]:
# Hub identifiers of the base model and of the instruction dataset.
model_name = "codellama/CodeLlama-7b-hf"
dataset_id = "HuggingFaceH4/CodeAlpaca_20K"

# Load the base model with 8-bit weights and let accelerate place it on the
# available device(s). No `trust_remote_code` is passed: the checkpoint is
# loaded through classes that ship with transformers, so permission to run
# repository-provided code is not needed.
# For 4-bit loading instead, pass
#   quantization_config=BitsAndBytesConfig(
#       load_in_4bit=True,
#       bnb_4bit_use_double_quant=True,
#       bnb_4bit_quant_type="nf4",
#       bnb_4bit_compute_dtype=torch.bfloat16,
#   )
# in place of `load_in_8bit=True`.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    load_in_8bit=True,
)

# Disable the generation KV cache on the model config for training.
model.config.use_cache = False

tokenizer = AutoTokenizer.from_pretrained(model_name)
# Reuse the EOS token for padding and pad on the right-hand side.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

Downloading config.json: 100%|██████████| 637/637 [00:00<00:00, 2.49MB/s]
Downloading (…)fetensors.index.json: 100%|██████████| 25.1k/25.1k [00:00<00:00, 49.6MB/s]
Downloading (…)of-00002.safetensors: 100%|██████████| 9.98G/9.98G [00:25<00:00, 388MB/s]
Downloading (…)of-00002.safetensors: 100%|██████████| 3.50G/3.50G [00:09<00:00, 371MB/s]
Downloading shards: 100%|██████████| 2/2 [00:36<00:00, 18.34s/it]
Loading checkpoint shards: 100%|██████████| 2/2 [00:05<00:00,  2.90s/it]
Downloading generation_config.json: 100%|██████████| 116/116 [00:00<00:00, 350kB/s]
Downloading tokenizer_config.json: 100%|██████████| 749/749 [00:00<00:00, 2.64MB/s]
Downloading tokenizer.model: 100%|██████████| 500k/500k [00:00<00:00, 341MB/s]
Downloading tokenizer.json: 100%|██████████| 1.84M/1.84M [00:00<00:00, 7.54MB/s]
Downloading (…)cial_tokens_map.json: 100%|██████████| 411/411 [00:00<00:00, 1.47MB/s]


In [10]:
# Show the tokenizer's configuration (special tokens, padding side, vocabulary size).
print(tokenizer)

Using sep_token, but it is not set yet.
Using cls_token, but it is not set yet.
Using mask_token, but it is not set yet.


CodeLlamaTokenizerFast(name_or_path='codellama/CodeLlama-7b-hf', vocab_size=32016, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '</s>', 'additional_special_tokens': ['▁<PRE>', '▁<MID>', '▁<SUF>', '▁<EOT>']}, clean_up_tokenization_spaces=False),  added_tokens_decoder={
	0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	1: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	32007: AddedToken("▁<PRE>", rstrip=True, lstrip=True, single_word=False, normalized=False, special=True),
	32008: AddedToken("▁<SUF>", rstrip=True, lstrip=True, single_word=False, normalized=False, special=True),
	32009: AddedToken("▁<MID>", rstrip=True, lstrip=True

## PRETRAINED MODEL INFERENCE

In [14]:
import transformers

# Text-generation pipeline built around the model and tokenizer loaded earlier.
pipeline = transformers.pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.float16,
    device_map="auto",
)

# Sampling settings: one low-temperature completion, max_length of 500.
generation_kwargs = dict(
    do_sample=True,
    temperature=0.1,
    top_k=10,
    top_p=0.95,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id,
    max_length=500,
)
code_prompt = 'import socket\n\ndef ping_exponential_backoff(host: str):'

sequences = pipeline(code_prompt, **generation_kwargs)
for generated in sequences:
    print("Result: {}".format(generated['generated_text']))

The model 'PeftModelForCausalLM' is not supported for text-generation. Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'ElectraForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'GitForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'LlamaForCausalLM', 'MarianForCausalLM', 'MBartForCausalLM', 'MegaForCausalLM', 'MegatronBertForCausalLM', 'MistralForCausalLM', 'MptForCausalLM', 'MusicgenForCausalLM', 'MvpForCausalLM', 'OpenLlamaForCausalLM', 'OpenAIGPTLMHeadModel', 'OPTForCausalLM', 'PegasusForCausalLM', 'PersimmonForCausalLM', 'PLBartForCausalLM', 'Prophe

Result: import socket

def ping_exponential_backoff(host: str):
    """
    Ping a host with exponential backoff.

    :param host: host to ping
    :return: True if host is reachable, False otherwise
    """
    for i in range(1, 10):
        try:
            socket.create_connection((host, 80), 2).close()
            return True
        except OSError as e:
            if i < 10:
                time.sleep(2 ** i)
            else:
                raise e


def get_host_ip(host: str):
    """
    Get the IP address of a host.

    :param host: host to get the IP address of
    :return: IP address of host
    """
    return socket.gethostbyname(host)


def get_host_name(host: str):
    """
    Get the hostname of a host.

    :param host: host to get the hostname of
    :return: hostname of host
    """
    return socket.gethostname(host)


def get_host_fqdn(host: str):
    """
    Get the fully qualified domain name of a host.

    :param host: host to get the fully qualified domain 

## SETUP DATASET

In [5]:
def get_preprocessed_cmg_history(
    dataset_id,
    tokenizer,
    split,
    max_prompt_len=200,
    max_message_len=400,
    total_len=601,
):
    """Load a prompt/completion dataset and tokenize it into fixed-length samples.

    Each row's ``prompt`` is encoded with a leading BOS token and each
    ``completion`` with a trailing EOS token; both are truncated to their
    respective limits and the pair is right-padded to ``total_len`` tokens.

    Args:
        dataset_id: Identifier passed to ``datasets.load_dataset``.
        tokenizer: Tokenizer providing ``encode``, ``bos_token``, ``eos_token``,
            ``pad_token_id`` and ``eos_token_id``.
        split: Name of the dataset split to load (e.g. ``"train"``).
        max_prompt_len: Maximum number of prompt tokens kept.
        max_message_len: Maximum number of completion tokens kept.
        total_len: Length every sample is padded to.

    Returns:
        The mapped dataset with ``input_ids``, ``attention_mask`` and ``labels``
        columns. Labels are ``-100`` on prompt and padding positions so only
        completion tokens carry a training target; the attention mask is 1 on
        prompt/completion tokens and 0 on padding.

    Raises:
        ValueError: If ``total_len`` cannot hold a maximally long prompt plus
            a maximally long completion.
    """
    if total_len < max_prompt_len + max_message_len:
        raise ValueError(
            "total_len must be at least max_prompt_len + max_message_len "
            f"({max_prompt_len} + {max_message_len}), got {total_len}"
        )

    dataset = datasets.load_dataset(dataset_id, split=split)

    def apply_prompt_template(sample):
        # Keep only the two text fields, renaming `completion` to `message`.
        return {
            "prompt": sample["prompt"],
            "message": sample["completion"],
        }

    dataset = dataset.map(apply_prompt_template, remove_columns=list(dataset.features))

    # Pad with the tokenizer's pad id, falling back to EOS when none is set.
    pad_id = tokenizer.pad_token_id
    if pad_id is None:
        pad_id = tokenizer.eos_token_id

    def tokenize_add_label(sample):
        prompt = tokenizer.encode(
            tokenizer.bos_token + sample["prompt"],
            add_special_tokens=False,
            max_length=max_prompt_len,
            truncation=True,
        )
        # NOTE: truncation is applied after EOS is appended, so a completion
        # longer than max_message_len may lose its trailing EOS token.
        message = tokenizer.encode(
            sample["message"] + tokenizer.eos_token,
            add_special_tokens=False,
            max_length=max_message_len,
            truncation=True,
        )
        content_len = len(prompt) + len(message)
        pad_len = total_len - content_len

        return {
            "input_ids": prompt + message + [pad_id] * pad_len,
            # Padding positions are masked out instead of being attended to.
            "attention_mask": [1] * content_len + [0] * pad_len,
            "labels": [-100] * len(prompt) + message + [-100] * pad_len,
        }

    return dataset.map(tokenize_add_label, remove_columns=list(dataset.features))

In [6]:
# Build the tokenized, fixed-length training split consumed by the Trainer.
train_dataset = get_preprocessed_cmg_history(
    dataset_id=dataset_id,
    tokenizer=tokenizer,
    split="train",
)


Downloading readme: 100%|██████████| 195/195 [00:00<00:00, 970kB/s]
Downloading metadata: 100%|██████████| 756/756 [00:00<00:00, 4.00MB/s]
Downloading data: 100%|██████████| 3.01M/3.01M [00:03<00:00, 998kB/s]
Downloading data: 100%|██████████| 336k/336k [00:02<00:00, 146kB/s]t]
Downloading data files: 100%|██████████| 2/2 [00:05<00:00,  2.66s/it]
Extracting data files: 100%|██████████| 2/2 [00:00<00:00, 1475.83it/s]
Generating train split: 100%|██████████| 18019/18019 [00:00<00:00, 664426.31 examples/s]
Generating test split: 100%|██████████| 2003/2003 [00:00<00:00, 563422.37 examples/s]
Map: 100%|██████████| 18019/18019 [00:00<00:00, 22602.66 examples/s]
Map: 100%|██████████| 18019/18019 [00:15<00:00, 1177.36 examples/s]


## SETUP TRAINING CONFIG

In [7]:
def create_peft_config(model, r=4, lora_alpha=64, lora_dropout=0.1, target_modules=None):
    """Wrap a quantized causal-LM with LoRA adapters.

    Args:
        model: The quantized base model to adapt.
        r: LoRA rank.
        lora_alpha: LoRA scaling factor.
        lora_dropout: Dropout applied inside the LoRA layers.
        target_modules: Names of the modules that receive adapters; defaults
            to ``["q_proj", "v_proj"]``.

    Returns:
        A ``(peft_model, peft_config)`` tuple: the adapter-wrapped model and
        the ``LoraConfig`` used to build it.
    """
    from peft import (
        LoraConfig,
        TaskType,
        get_peft_model,
        prepare_model_for_kbit_training,
    )

    if target_modules is None:
        target_modules = ["q_proj", "v_proj"]

    peft_config = LoraConfig(
        task_type=TaskType.CAUSAL_LM,
        inference_mode=False,
        r=r,
        lora_alpha=lora_alpha,
        lora_dropout=lora_dropout,
        target_modules=list(target_modules),
    )

    # `prepare_model_for_int8_training` is deprecated in PEFT; the k-bit
    # variant is its replacement for preparing 8-bit/4-bit models for training.
    model = prepare_model_for_kbit_training(model)
    model = get_peft_model(model, peft_config)
    # Report how many parameters are trainable after wrapping.
    model.print_trainable_parameters()
    return model, peft_config


# Attach LoRA adapters; `model` is rebound to the PEFT-wrapped model.
# NOTE(review): re-running this cell would presumably wrap the already-wrapped
# model again — restart from the model-loading cell before re-executing.
model, lora_config = create_peft_config(model)

# Settings for the fine-tuning run: half an epoch, loss logged every
# 10 steps, checkpoint saving disabled.
training_config = dict(
    output_dir="logs",
    num_train_epochs=0.5,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=2,  # 4
    gradient_checkpointing=False,
    group_by_length=True,
    optim="paged_adamw_32bit",
    learning_rate=2e-4,
    fp16=True,
    bf16=False,
    logging_strategy="steps",
    logging_steps=10,
    save_strategy="no",
    save_steps=0,
)
training_arguments = TrainingArguments(**training_config)

# Samples are already padded to a common length, so the default collator suffices.
trainer = Trainer(
    args=training_arguments,
    model=model,
    train_dataset=train_dataset,
    data_collator=default_data_collator,
)



trainable params: 2,097,152 || all params: 6,740,643,840 || trainable%: 0.031112042851977773


In [8]:
# Print the installed PyTorch version.
import torch

print(torch.__version__)


2.3.0+cu121


## START TRAINING

Training Time: 1 hour 47 minutes

GPU Memory Used: 11.6 GB

In [15]:
# Run the fine-tuning loop with the Trainer configured earlier in the notebook.
trainer.train()

# Save the trained model held by the trainer to the local "trained-model" directory.
trainer.model.save_pretrained("trained-model")




Step,Training Loss
10,0.8804
20,0.6159
30,0.5595
40,0.5885
50,0.5287
60,0.4553
70,0.5049
80,0.4357
90,0.4725
100,0.4321


## FINETUNED MODEL INFERENCE

In [16]:
import time

# Wall-clock timestamp taken before evaluation starts.
start_time = time.time()
print("start_time:", start_time)

# Switch to evaluation mode and generate a completion for a single prompt.
model.eval()
eval_prompt = "Create a Python class with the following attributes: firstname, lastname and address.\n"
model_input = tokenizer(eval_prompt, return_tensors="pt").to("cuda")

with torch.no_grad():
    generated_ids = model.generate(
        **model_input,
        max_new_tokens=400,
        pad_token_id=tokenizer.eos_token_id,
    )

# Decode the first (only) returned sequence, dropping special tokens.
output = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
print("output:", output)

# Wall-clock timestamp taken after generation and decoding finish.
end_time = time.time()
print("end_time:", end_time)

# Elapsed time between the two timestamps.
execution_time = end_time - start_time
print("Execution time: {:.2f} seconds".format(execution_time))

start_time: 1717157111.0090144




output: Create a Python class with the following attributes: firstname, lastname and address.
 class Person:
    def __init__(self, firstname, lastname, address):
        self.firstname = firstname
        self.lastname = lastname
        self.address = address
end_time: 1717157122.3060226
Execution time: 11.30 seconds


## SAVE MODEL TO HUGGING FACE

In [17]:
# Authenticate with the Hugging Face Hub without embedding the token in the notebook.
# Export HF_TOKEN in the environment before starting the kernel; when it is unset,
# `login()` receives None and prompts for the token interactively instead.
# CLI alternative: !huggingface-cli login
from huggingface_hub import login

login(token=os.environ.get("HF_TOKEN"))

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /home/cto_auraml_com/.cache/huggingface/token
Login successful


In [19]:
# Upload the fine-tuned model and then its tokenizer to the 'trained-model' repository on the Hub.
# The tokenizer call is last, so its return value is what the cell displays.
model.push_to_hub('trained-model', use_temp_dir=False)
tokenizer.push_to_hub('trained-model', use_temp_dir=False)

adapter_model.bin: 100%|██████████| 8.43M/8.43M [00:02<00:00, 3.56MB/s]


CommitInfo(commit_url='https://huggingface.co/arjuntheprogrammer/trained-model/commit/681585b24fcff61087f57596c3bf05c18cbc2dc3', commit_message='Upload tokenizer', commit_description='', oid='681585b24fcff61087f57596c3bf05c18cbc2dc3', pr_url=None, pr_revision=None, pr_num=None)