In [3]:
# from google.colab import drive
# drive.mount('/content/drive')
# %cd '/content/drive/zolo-ai'

Mounted at /content/drive
[Errno 2] No such file or directory: '/content/drive/zolo-ai'
/content


In [1]:
!pip install -q accelerate peft bitsandbytes transformers trl

In [None]:
# !pip install huggingface_hub["cli"]
# !huggingface-cli delete-cache
# !huggingface-cli scan-cache

# from transformers import TRANSFORMERS_CACHE
# print(TRANSFORMERS_CACHE)

In [1]:
import os
import numpy as np
import pandas as pd

import torch
from datasets import load_dataset
from transformers import (AutoModelForCausalLM,
                   AutoTokenizer,
                   BitsAndBytesConfig,
                   HfArgumentParser,
                   TrainingArguments,
                   TextDataset,
                   DataCollatorForLanguageModeling,
                   pipeline,
                   logging)
from accelerate import Accelerator
from peft import LoraConfig, PeftModel
from trl import SFTTrainer

In [2]:
# The model that you want to train from the Hugging Face hub
model_name = "deepseek-ai/deepseek-coder-7b-instruct-v1.5"

# Fine-tuned model name
new_model = "zolo-deep-seek-7b"

################################################################################
# QLoRA parameters
################################################################################

# LoRA attention dimension
lora_r = 64

# Alpha parameter for LoRA scaling
lora_alpha = 16

# Dropout probability for LoRA layers
lora_dropout = 0.1

################################################################################
# bitsandbytes parameters
################################################################################

# Activate 4-bit precision base model loading
use_4bit = True

# Compute dtype for 4-bit base models
bnb_4bit_compute_dtype = "float16"

# Quantization type (fp4 or nf4)
bnb_4bit_quant_type = "nf4"

# Activate nested quantization for 4-bit base models (double quantization)
use_nested_quant = False

################################################################################
# TrainingArguments parameters
################################################################################

# Output directory where the model predictions and checkpoints will be stored
output_dir = "./results"

# Number of update steps to accumulate the gradients for
gradient_accumulation_steps = 5

# Enable gradient checkpointing
gradient_checkpointing = True

# Maximum gradient normal (gradient clipping)
max_grad_norm = 0.3

# Initial learning rate (AdamW optimizer)
learning_rate = 2e-5

# Weight decay to apply to all layers except bias/LayerNorm weights
weight_decay = 0.001

# Optimizer to use
optim = "paged_adamw_32bit"

# Learning rate schedule
lr_scheduler_type = "cosine"

# Number of training steps (overrides num_train_epochs)
max_steps = -1

# Ratio of steps for a linear warmup (from 0 to learning rate)
warmup_ratio = 0.03

# Group sequences into batches with same length
# Saves memory and speeds up training considerably
group_by_length = True

# Save checkpoint every X updates steps
save_steps = 0


################################################################################
# SFT parameters
################################################################################

# Maximum sequence length to use
# max_seq_length = None

# Pack multiple short examples in the same input sequence to increase efficiency
packing = False

# Load the entire model on the GPU 0
device_map = {"": 0}
# accelerator = Accelerator()
# device_map={"": accelerator.process_index}

# print(device_map)

In [None]:
# Load tokenizer and model with QLoRA configuration
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)

# bnb_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.bfloat16)

# Check GPU compatibility with bfloat16
if compute_dtype == torch.float16 and use_4bit:
    major, _ = torch.cuda.get_device_capability()
    if major >= 8:
        print("=" * 80)
        print("Your GPU supports bfloat16: accelerate training with bf16=True")
        print("=" * 80)

# Load base model
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map=device_map
)
model.config.use_cache = False
model.config.pretraining_tp = 1

# Load LLaMA tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right" # Fix weird overflow issue with fp16 training

# Load LoRA configuration
peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    bias="none",
    task_type="CAUSAL_LM",
      target_modules = [
    "lm_head",
    "q_proj",
    "k_proj",
    "v_proj",
    "up_proj",
    "down_proj",
    "o_proj",
    "gate_proj"
  ]
)

Your GPU supports bfloat16: accelerate training with bf16=True


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [6]:
!huggingface-cli login --token "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"

Token has not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /teamspace/studios/this_studio/.cache/huggingface/token
Login successful


In [7]:
# dataset
dataset_name = "shetumohanto/zoloai"
dataset = load_dataset(dataset_name, split='train')
dataset = dataset.shuffle()
len_data = len(dataset['text'])
print(len(dataset))

Downloading data: 100%|██████████| 5.49M/5.49M [00:00<00:00, 22.9MB/s]


Generating train split: 0 examples [00:00, ? examples/s]

1400


In [18]:
# add zolo vocab
vocabulary = tokenizer.get_vocab().keys()

zolo_vocab = open("vocab.txt", "r").read().split()

for new_word in zolo_vocab:
    if new_word not in vocabulary:
        tokenizer.add_tokens(new_word)
        print(new_word)

model.resize_token_embeddings(len(tokenizer))

Embedding(100057, 4096)

In [17]:
# test tokenizer
# text_in = """<s> [INST] <<SYS>> Generate HTML code in ZOLO format. uniqueId must be of 8 character <<\SYS>> Develop a paragraph describing the capabilities of cars and motorcycles, formatted into two columns. [/INST] <!-- wp:zolo/container {"variationStatus":true,"isBlockRootParent":true,"zolo_ContainerGapGap":31,"zolo_ContentWidthRange":1737,"zolo_MinHeightRange":917,"uniqueId":"container-13ktzz6v","parentClasses":["zolo-block parent-container-13ktzz6v"],"zoloStyles":{"desktop":".container-13ktzz6v.block-editor-block-list__block.wp-block-zolo-container > .zolo-container-inner-blocks-wrap,.wp-block-zolo-container.zolo-root-container.alignfull.container-13ktzz6v > .zolo-container-inner-blocks-wrap { max-width:1737px }.is-root-container > .block-editor-block-list__block .block-editor-block-list__block#block-6fd81acc-a47d-474c-82a3-cf17aa46642e,.wp-block-zolo-container.zolo-root-container.frontend .container-13ktzz6v { max-width:100%; width:100% }.container-13ktzz6v { min-height:917px }.container-13ktzz6v.wp-block-zolo-container > .zolo-container-inner-blocks-wrap > .block-editor-inner-blocks > .block-editor-block-list__layout,.container-13ktzz6v.wp-block-zolo-container.zolo-root-container.alignfull > .zolo-container-inner-blocks-wrap { gap:31px; flex-direction:row; flex-wrap:nowrap; justify-content:center; align-items:center }","tab":".is-root-container > .block-editor-block-list__block .block-editor-block-list__block#block-6fd81acc-a47d-474c-82a3-cf17aa46642e,.wp-block-zolo-container.zolo-root-container.frontend .container-13ktzz6v { width:100% }","mobile":".is-root-container > .block-editor-block-list__block .block-editor-block-list__block#block-6fd81acc-a47d-474c-82a3-cf17aa46642e,.wp-block-zolo-container.zolo-root-container.frontend .container-13ktzz6v { width:100% }"}} --> <div class="wp-block-zolo-container container-13ktzz6v alignfull zolo-root-container frontend zolo-block parent-container-13ktzz6v"><div class="zolo-container-inner-blocks-wrap"><!-- wp:zolo/container {"zolo_ContainerGapGap":21,"zolo_ContainerWidthRange":702,"zolo_TABContainerWidthRange":100,"zolo_TABContainerWidthUnit":"%","zolo_MinHeightRange":948,"FlexDirectionZRPAlign":"column","uniqueId":"container-0724suae","parentClasses":["zolo-block parent-container-0724suae"],"zoloStyles":{"desktop":".container-0724suae.block-editor-block-list__block.wp-block-zolo-container > .zolo-container-inner-blocks-wrap,.wp-block-zolo-container.zolo-root-container.alignfull.container-0724suae > .zolo-container-inner-blocks-wrap { max-width:1200px }.is-root-container > .block-editor-block-list__block .block-editor-block-list__block#block-160a1371-a6fe-4150-9282-4414d1f32e31,.wp-block-zolo-container.zolo-root-container.frontend .container-0724suae { max-width:702px; width:100% }.container-0724suae { min-height:948px }.container-0724suae.wp-block-zolo-container > .block-editor-inner-blocks > .block-editor-block-list__layout,.container-0724suae.wp-block-zolo-container.frontend { gap:21px; flex-direction:column; flex-wrap:nowrap; justify-content:center; align-items:center }","tab":".is-root-container > .block-editor-block-list__block .block-editor-block-list__block#block-160a1371-a6fe-4150-9282-4414d1f32e31,.wp-block-zolo-container.zolo-root-container.frontend .container-0724suae { max-width:100%; width:100% }","mobile":".is-root-container > .block-editor-block-list__block .block-editor-block-list__block#block-160a1371-a6fe-4150-9282-4414d1f32e31,.wp-block-zolo-container.zolo-root-container.frontend .container-0724suae { width:100% }"}} --> <div class="wp-block-zolo-container container-0724suae frontend zolo-block parent-container-0724suae"><!-- wp:paragraph --> <p>Televisions provide entertainment, and sound systems offer immersive audio.</p> <!-- /wp:paragraph --></div> <!-- /wp:zolo/container --> <!-- wp:zolo/container {"zolo_ContainerGapGap":103,"zolo_ContainerWidthRange":1370,"zolo_TABContainerWidthRange":100,"zolo_TABContainerWidthUnit":"%","zolo_MinHeightRange":578,"FlexDirectionZRPAlign":"column","uniqueId":"container-gm2zt4f8","parentClasses":["zolo-block parent-container-gm2zt4f8"],"zoloStyles":{"desktop":".container-gm2zt4f8.block-editor-block-list__block.wp-block-zolo-container > .zolo-container-inner-blocks-wrap,.wp-block-zolo-container.zolo-root-container.alignfull.container-gm2zt4f8 > .zolo-container-inner-blocks-wrap { max-width:1200px }.is-root-container > .block-editor-block-list__block .block-editor-block-list__block#block-0c06dc24-3b16-42ea-9eed-4d4d0b195e36,.wp-block-zolo-container.zolo-root-container.frontend .container-gm2zt4f8 { max-width:1370px; width:100% }.container-gm2zt4f8 { min-height:578px }.container-gm2zt4f8.wp-block-zolo-container > .block-editor-inner-blocks > .block-editor-block-list__layout,.container-gm2zt4f8.wp-block-zolo-container.frontend { gap:103px; flex-direction:column; flex-wrap:nowrap; justify-content:center; align-items:center }","tab":".is-root-container > .block-editor-block-list__block .block-editor-block-list__block#block-0c06dc24-3b16-42ea-9eed-4d4d0b195e36,.wp-block-zolo-container.zolo-root-container.frontend .container-gm2zt4f8 { max-width:100%; width:100% }","mobile":".is-root-container > .block-editor-block-list__block .block-editor-block-list__block#block-0c06dc24-3b16-42ea-9eed-4d4d0b195e36,.wp-block-zolo-container.zolo-root-container.frontend .container-gm2zt4f8 { width:100% }"}} --> <div class="wp-block-zolo-container container-gm2zt4f8 frontend zolo-block parent-container-gm2zt4f8"><!-- wp:paragraph --> <p>Cars provide convenience, while motorcycles offer agility.</p> <!-- /wp:paragraph --></div> <!-- /wp:zolo/container --></div></div> <!-- /wp:zolo/container --> </s>"""
# encoded = tokenizer.encode(text_in)
# tokenized = tokenizer.tokenize(text_in)
# print(f"Encoded: {encoded}")
# print(f"Tokenized: {tokenized}")

In [9]:
max_seq_length = model.config.max_position_embeddings
print(f"Model can handle maximum {max_seq_length} tokens")

Model can handle maximum 4096 tokens


In [19]:
max_seq_length = max([len(tokenizer.encode(data["text"])) for data in dataset])
print(f"max_seq_length set to {max_seq_length}")

max_seq_length set to 1458


In [23]:
# messages_list = [
#     [{"role": "user", "content": "Who are you?"}],
#     [{"role": "user", "content": "What can you do?"}],
#     [{"role": "user", "content": "Explain Transformer briefly."}],
# ]
# prompts = [tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False) for messages in messages_list]
# print(prompts)

In [None]:
# del model
# del tokenizer
# del trainer
# import gc
# gc.collect()
# torch.cuda.empty_cache()

# !nvidia-smi

In [15]:
# adapter_url = "shetumohanto/Code-Llama-2-7b"
# model.load_adapter(adapter_url)

In [23]:
# Number of training epochs
num_train_epochs = 2

# Batch size per GPU for training
per_device_train_batch_size = 2

# Enable fp16/bf16 training (set bf16 to True with an A100)
fp16 = False
bf16 = True

# Batch size per GPU for evaluation
per_device_eval_batch_size = 1

# Set training parameters
training_arguments = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    save_steps=save_steps,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    fp16=fp16,
    bf16=bf16,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    group_by_length=group_by_length,
    lr_scheduler_type=lr_scheduler_type,
    report_to="none"
)

# Set supervised fine-tuning parameters
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    peft_config=peft_config,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    args=training_arguments,
    packing=packing
)

# Train model
trainer.train()

# Save trained model
trainer.model.save_pretrained(new_model)

Map:   0%|          | 0/1400 [00:00<?, ? examples/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


OutOfMemoryError: CUDA out of memory. Tried to allocate 96.00 MiB. GPU 0 has a total capacity of 15.77 GiB of which 94.38 MiB is free. Process 6188 has 15.68 GiB memory in use. Of the allocated memory 14.84 GiB is allocated by PyTorch, and 483.88 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
def ask_ques(question, model, tokenizer):
  prompt = question
  pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=max_seq_length)
  result = pipe(f"<s> [INST] <<SYS>> Generate HTML code in ZOLO format. uniqueId must be of 8 character. <<\SYS>> {prompt} [/INST]")
  return result[0]['generated_text'].replace(f"[INST] {prompt} [/INST]", "")

In [None]:
import warnings
warnings.filterwarnings("ignore")

#-------------Question goes here---------------------
user_ip = "Formulate a paragraph about Computer."
# user_ip = "Write a paragraph about motorbike."
# model.load_adapter(new_model)
response = ask_ques(user_ip, model, tokenizer)
print(response)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


<s> [INST] <<SYS>> Generate HTML code with CSS style in ZOLO format. 'uniqueId' must be of 8 character. <<\SYS>> Formulate a paragraph about Computer. [/INST] <!-- wp:zolo/container {"variationStatus":true,"isBlockRootParent":true,"zolo_ContainerGapGap":10,"zolo_ContentWidthRange":1000,"zolo_MinHeightRange":50,"FlexDirectionZRPAlign":"column","uniqueId":"container-6xq3237","parentClasses":["zolo-block parent-container-6xq3237"],"zoloStyles":{"desktop":".container-6xq3237.block-editor-block-list__block.wp-block-zolo-container >.zolo-container-inner-blocks-wrap,.wp-block-zolo-container.zolo-root-container.alignfull.container-6xq3237 >.zolo-container-inner-blocks-wrap { max-width:1000px }.is-root-container >.block-editor-block-list__block.block-editor-block-list__block#block-8a4350b5-1364-45b6-9dc4-53e1135edc81,.wp-block-zolo-container.zolo-root-container.frontend.container-6xq3237 { max-width:100%; width:100% }.container-6xq3237 { min-height:50px }.container-6xq3237.wp-block-zolo-contain