In [None]:
# Setting Environment

!pip install accelerate
!pip install peft
!pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
!python -m pip install bitsandbytes --prefer-binary --extra-index-url=https://jllllll.github.io/bitsandbytes-windows-webui
!pip install transformers==4.34.0
!pip install trl==0.7.1
!pip install datasets==2.14.5
!pip install wandb

Collecting accelerate
  Downloading accelerate-0.25.0-py3-none-any.whl (265 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m265.7/265.7 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.25.0
Collecting peft
  Downloading peft-0.6.2-py3-none-any.whl (174 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m174.7/174.7 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: peft
Successfully installed peft-0.6.2
Looking in indexes: https://download.pytorch.org/whl/cu121
Looking in indexes: https://pypi.org/simple, https://jllllll.github.io/bitsandbytes-windows-webui
Collecting bitsandbytes
  Downloading bitsandbytes-0.41.2.post2-py3-none-any.whl (92.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.6/92.6 MB[0m [31m19.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0

In [None]:
# All imports go here

import os


from copy import deepcopy
from random import randrange
from functools import partial


import torch
import accelerate
import bitsandbytes as bnb


from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)
from transformers.integrations import WandbCallback
from peft import (
    LoraConfig,
    prepare_model_for_kbit_training,
    get_peft_model,
    PeftModel
)
from trl import SFTTrainer

# Credentials are read from the environment (HF_TOKEN / WANDB_API_KEY) and
# prompted for interactively when unset, so no secret lives in the notebook.
# The token and key that used to be hard-coded here must be treated as leaked
# and revoked.
from getpass import getpass

from huggingface_hub import login
login(token=os.environ.get("HF_TOKEN") or getpass("Hugging Face token: "))

import wandb
wandb.login(key=os.environ.get("WANDB_API_KEY") or getpass("W&B API key: "))

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [None]:
# Select Model here

# Base checkpoint loaded by AutoTokenizer / AutoModelForCausalLM below.
model_name = "codellama/CodeLlama-7b-Instruct-hf"

# Weights & Biases project the runs are logged to.
project_name = "test-model"

# Hub repository that training checkpoints are pushed to.
hf_repo = "Insight244/test-model"

# Hub dataset repository passed to load_dataset.
data_repo = "Insight244/3k_synthetic_python"

In [None]:
# Defining the Tokenizer

tokenizer = AutoTokenizer.from_pretrained(model_name)
# No pad token is assigned here; preprocess_dataset() registers a dedicated
# '[PAD]' token later instead of reusing EOS.
# tokenizer.pad_token = tokenizer.eos_token

Downloading tokenizer_config.json:   0%|          | 0.00/749 [00:00<?, ?B/s]

Downloading tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/411 [00:00<?, ?B/s]

In [None]:
# Quantization settings for loading the base model: 4-bit NF4 weights with
# double quantization, computing in bfloat16.
# NOTE(review): training below sets fp16=True and the recorded GPU is a Tesla
# V100 -- confirm that a bfloat16 compute dtype is intended on that hardware.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

In [None]:
# Load the base checkpoint with the quantization config defined above.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",  # Auto selects device to put model on.
)
# The cache is switched off before gradient checkpointing is enabled in the
# next cell.
model.config.use_cache = False

Downloading config.json:   0%|          | 0.00/646 [00:00<?, ?B/s]

Downloading (…)fetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

Downloading (…)of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

In [None]:
# Gradient checkpointing is requested twice here: once directly on the model
# and once through use_gradient_checkpointing=True.
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=True)  # Explicitly specify!

# This function does the following:
# 1. freezes the model weights
# 2. cast all non INT8 parameters (layer norm and lm head) to fp32 if the model is not gptq quantized

In [None]:
# Finding all LORA supporting layers

def find_all_linear_names(model):
    """Collect the leaf names of every 4-bit linear layer in ``model``.

    Walks ``model.named_modules()``, keeps the last dotted component of the
    name of each ``bnb.nn.Linear4bit`` module, and returns the distinct names
    as a list with ``lm_head`` left out.
    """
    target_cls = bnb.nn.Linear4bit
    # The last component of a dotted name; an undotted name is its own last
    # component.
    leaf_names = {
        qualified_name.split('.')[-1]
        for qualified_name, submodule in model.named_modules()
        if isinstance(submodule, target_cls)
    }

    # lm_head is often excluded from the LoRA targets.
    leaf_names.discard('lm_head')
    return list(leaf_names)


# These names are used as LoRA target_modules in the LoraConfig below.
modules = find_all_linear_names(model)

In [None]:
# Display the detected module names.
modules

['down_proj', 'q_proj', 'k_proj', 'up_proj', 'o_proj', 'gate_proj', 'v_proj']

In [None]:
# LoRA Parameters

lora_alpha = 16
lora_dropout = 0.1
lora_r = 8


# Adapters are attached to every module name collected in `modules`.
peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    target_modules=modules,
    r=lora_r,
    bias="none",
    task_type="CAUSAL_LM"
)

# The parameters are described as follows:
# r: the rank of the update matrices, expressed in `int`. Lower rank results in smaller update matrices with fewer trainable parameters.
# lora_alpha: The alpha parameter for Lora scaling.
# lora_dropout: The dropout probability for Lora layers.
# bias: Bias type for Lora. Can be 'none', 'all' or 'lora_only'. If 'all' or 'lora_only', the corresponding biases will be updated during training. Be aware that this means that, even when disabling the adapters, the model will not produce the same output as the base model would have without adaptation.
# task_type: one of {SEQ_CLS, TOKEN_CLS, CAUSAL_LM, SEQ_2_SEQ_LM, QUESTION_ANS, FEATURE_EXTRACTION}



In [None]:
# Wrap the prepared base model with the LoRA adapters described by peft_config.
model = get_peft_model(model, peft_config)

In [None]:
# Report how many parameters are trainable versus the total parameter count.
trainable, total = model.get_nb_trainable_parameters()
print(f"Trainable: {trainable} | total: {total} | Percentage: {trainable/total*100:.4f}%")

Trainable: 19988480 | total: 6758535168 | Percentage: 0.2958%


In [None]:
# Show the GPU and its current memory usage after loading the model.
!nvidia-smi

Wed Nov 29 12:51:28 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   36C    P0    39W / 300W |   6260MiB / 16384MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# NOTE(review): data_files is defined but never passed to load_dataset, so the
# splits come from whatever the repository itself provides. The validation
# entry also ends in '.parquet.parquet' -- confirm the intended filename before
# wiring this mapping in.
data_files={'train': 'synthetic_static_python_train.parquet', 'test': 'synthetic_static_python_test.parquet', 'validation': 'synthetic_static_python_val.parquet.parquet'}
dataset = load_dataset(data_repo)

Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/131k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/17.6k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/17.2k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [None]:
# Peek at the first raw training example (the "data" column holds the full prompt text).
dataset['train']["data"][0]

'<s>\n[INST] Aggregate customer feedback from project Alpha and assign tasks to team members [/INST]\n```\nvar_1 = create_actionable_tasks_from_text(text="project Alpha")\nvar_2 = prioritize_objects(objects=var_1)\n```\n</s>'

In [None]:
def get_max_length(model, default=1024):
    """Return the maximum sequence length advertised by ``model.config``.

    The config is probed for ``n_positions``, ``max_position_embeddings`` and
    ``seq_length``, in that order. The first attribute holding a truthy value
    wins; if none does, ``default`` is returned. The chosen value is printed.

    Args:
        model: Any object exposing a ``config`` attribute.
        default: Fallback length used when the config advertises none.

    Returns:
        The discovered maximum length, or ``default``.
    """
    config = model.config
    for length_setting in ("n_positions", "max_position_embeddings", "seq_length"):
        max_length = getattr(config, length_setting, None)
        # A missing attribute and a falsy value (None, 0) are both skipped.
        if max_length:
            print(f"Found max length: {max_length}")
            return max_length
    print(f"Using default max length: {default}")
    return default


# Change the max length depending on hardware constraints.
# This value is later passed to preprocess_dataset as the truncation length.
max_length = get_max_length(model)


In [None]:
def preprocess_dataset(
    tokenizer: AutoTokenizer,
    max_length: int,
    dataset: "DatasetDict",
    seed: int = 42,
    model_to_resize=None,
):
    """Tokenize the "data" column of ``dataset`` and shuffle the result.

    Side effects: registers ``'[PAD]'`` as the tokenizer's pad token, resizes
    the model's token embeddings to the new vocabulary size and stores the pad
    token id on the model config.

    Args:
        tokenizer: Tokenizer applied to each batch; it is mutated (pad token).
        max_length: Truncation length handed to the tokenizer.
        dataset: Dataset object with a "data" text column; must support
            ``.map`` and ``.shuffle``.
        seed: Seed for the shuffle.
        model_to_resize: Model whose embeddings are resized. When omitted, the
            notebook-level ``model`` variable is used, as before.

    Returns:
        The tokenized, shuffled dataset with the "data" column removed.
    """
    print("Preprocessing dataset...")

    target_model = model if model_to_resize is None else model_to_resize

    # Register the pad token first so that len(tokenizer) already counts it
    # when the embedding matrix is resized.
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    with torch.no_grad():
        target_model.resize_token_embeddings(len(tokenizer))
    target_model.config.pad_token_id = tokenizer.pad_token_id

    # https://blog.ovhcloud.com/fine-tuning-llama-2-models-using-a-single-gpu-qlora-and-ai-notebooks/
    def preprocess_batch(batch, tokenizer, max_length):
        return tokenizer(
            batch["data"],
            max_length=max_length,
            truncation=True,
        )

    # Tokenize batch-wise and drop the raw "data" text column.
    _preprocessing_function = partial(preprocess_batch, max_length=max_length, tokenizer=tokenizer)
    dataset = dataset.map(
        _preprocessing_function,
        batched=True,
        remove_columns=["data"],
    )

    # No length filter is applied: the tokenizer call above already truncates
    # at max_length.

    # Shuffle dataset.
    return dataset.shuffle(seed=seed)


In [None]:
# Keep an untokenized copy: preprocess_dataset() removes the "data" column.
formatted_dataset = deepcopy(dataset)
# NOTE: `dataset` is rebound to its tokenized version here, so this cell cannot
# simply be re-run -- reload the dataset first, since the "data" column is gone
# afterwards.
dataset = preprocess_dataset(tokenizer, max_length, dataset)

In [None]:
# Training hyperparameters. Effective batch size per optimizer step is
# per_device_train_batch_size * gradient_accumulation_steps = 4 per device.
training_args = TrainingArguments(
    output_dir="outputs",
    per_device_train_batch_size=1,  # Best practice: https://huggingface.co/docs/transformers/main/main_classes/quantization#tips-and-best-practices
    gradient_accumulation_steps=4,  # Powers of 2.
    learning_rate=2e-4,
    max_grad_norm=1.0,
    # max_steps=40,
    lr_scheduler_type="linear",
    warmup_steps=5,
    fp16=True,
    logging_strategy="steps",
    logging_steps=1,
    # save_strategy="epochs",
    # save_steps=10,
    optim="paged_adamw_8bit",
    report_to="wandb",
    num_train_epochs=2,
    evaluation_strategy='steps',
    eval_steps=100,
    push_to_hub=True
)

# Push checkpoints to the hf_repo repository on the Hub.
training_args = training_args.set_push_to_hub(hf_repo, strategy='all_checkpoints')
training_args = training_args.set_save(strategy="epoch", steps=10)  # NOTE(review): strategy is already "epoch"; steps=10 presumably only matters for strategy="steps" -- confirm

In [None]:
# Start the W&B run that the Trainer reports to (report_to="wandb").
run = wandb.init(
    project=project_name,
    name="fine-tuning",  # Sometimes I use the run name as short descriptor for the run.
    config={
        "split": "train",
        # Optionally, you can add all hyperparameters and configs here for better reproducibility!
    },
    group="train",
    tags=["train"],  # Add tags for what might characterize this run.
    notes="Initial finetuning."
)
# You can call wandb.init before instantiating the `Trainer` to customize your run!


In [None]:
# The plain transformers Trainer is used here (SFTTrainer is imported but not
# used). mlm=False selects the causal-LM collation mode of the data collator.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
    train_dataset=dataset['train'],
    eval_dataset=dataset['validation'],

)


In [None]:
results = trainer.train()  # Now we just run train()!
# Close the W&B run opened before the Trainer was created.
run.finish()


You're using a CodeLlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss
100,0.3984,0.468896
200,0.3526,0.429267
300,0.4405,0.41496
400,0.3626,0.40165
500,0.2566,0.397892
600,0.3261,0.400152
700,0.357,0.391582
800,0.3793,0.388457




VBox(children=(Label(value='0.002 MB of 0.002 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

### INFERENCE

In [None]:
# Setting Environment

!pip install accelerate
!pip install peft
!pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
!python -m pip install bitsandbytes --prefer-binary --extra-index-url=https://jllllll.github.io/bitsandbytes-windows-webui
!pip install transformers==4.34.0
!pip install trl==0.7.1
!pip install datasets==2.14.5
!pip install wandb


# All imports go here

import os


from copy import deepcopy
from random import randrange
from functools import partial


import torch
import accelerate
import bitsandbytes as bnb


from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)
from transformers.integrations import WandbCallback
from peft import (
    LoraConfig,
    prepare_model_for_kbit_training,
    get_peft_model,
    PeftModel
)
from trl import SFTTrainer

# Credentials are read from the environment (HF_TOKEN / WANDB_API_KEY) and
# prompted for interactively when unset, so no secret lives in the notebook.
# The token and key that used to be hard-coded here must be treated as leaked
# and revoked.
from getpass import getpass

from huggingface_hub import login
login(token=os.environ.get("HF_TOKEN") or getpass("Hugging Face token: "))

import wandb
wandb.login(key=os.environ.get("WANDB_API_KEY") or getpass("W&B API key: "))

# Base checkpoint used for inference.
inf_model_name = "codellama/CodeLlama-7b-Instruct-hf"


tokenizer = AutoTokenizer.from_pretrained(inf_model_name)
# EOS doubles as the pad token here.
# NOTE(review): the training section registers a separate '[PAD]' token in
# preprocess_dataset instead -- confirm the difference is intended.
tokenizer.pad_token = tokenizer.eos_token


# Same 4-bit NF4 / double-quantization / bfloat16 settings as in the training section.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)


# You can just use model.
inf_model = AutoModelForCausalLM.from_pretrained(
    inf_model_name,
    quantization_config=bnb_config,
    device_map="auto",
)

# Select Model here
# These values repeat the ones from the training section so that the inference
# cell can run in a fresh kernel.

model_name = "codellama/CodeLlama-7b-Instruct-hf"

project_name = "test-model"

hf_repo = "Insight244/test-model"

data_repo = "Insight244/3k_synthetic_python"

# NOTE(review): data_files is defined but never passed to load_dataset, and the
# validation entry ends in '.parquet.parquet' -- confirm before wiring it in.
data_files={'train': 'synthetic_static_python_train.parquet', 'test': 'synthetic_static_python_test.parquet', 'validation': 'synthetic_static_python_val.parquet.parquet'}
dataset = load_dataset(data_repo)
# Untokenized copy that the evaluation cells read their test prompts from.
formatted_dataset = deepcopy(dataset)

# import PEFT adapter
# The Hugging Face login performed earlier in this cell is still in effect, so
# no second login (and no hard-coded token) is needed here.

# Attach the fine-tuned LoRA adapter from the Hub repository to the quantized
# base model.
model = PeftModel.from_pretrained(inf_model, hf_repo, device_map='auto')



In [None]:
# Peek at the first raw test example.
formatted_dataset['test']['data'][0]

In [None]:
def parse_latest_content(txt):
    """Split ``txt`` into query / fenced-output pairs.

    Every span of the form ``<text>```\\n<body>``` `` yields one dictionary:
    ``'query'`` is the text in front of the opening fence and ``'output'`` is
    the fenced body, both stripped of surrounding whitespace.

    Args:
        txt: Text to parse, e.g. a dataset sample or a decoded generation.

    Returns:
        A list of ``{'query': ..., 'output': ...}`` dictionaries in order of
        appearance. The list is empty when ``txt`` contains no complete fenced
        block (for instance when the closing fence is missing), so callers
        must not index it blindly.
    """
    # Local import: the function then works regardless of which cell has
    # imported `re` at module level.
    import re

    # DOTALL lets both groups span several lines; the lazy quantifiers stop at
    # the nearest fence.
    pattern = re.compile(r'(.+?)```\n(.+?)```', re.DOTALL)
    return [
        {'query': query.strip(), 'output': output.strip()}
        for query, output in pattern.findall(txt)
    ]

In [None]:
import re
import time

# Number of test examples to evaluate (fewer if the test split is smaller).
NUM_EVAL_EXAMPLES = 15

# The brackets must be escaped: unescaped, "[INST]" and "[/INST]" are regex
# character classes that match a single character each. DOTALL plus a lazy
# group also lets an instruction span several lines.
INSTRUCTION_RE = re.compile(r'\[INST\](.*?)\[/INST\]', re.DOTALL)


def _first_parsed(text):
    """Return the first query/output pair found in ``text``.

    When nothing can be parsed (e.g. the generation has no closed code fence),
    a placeholder with the stripped text as query and an empty output is
    returned instead of raising IndexError.
    """
    parsed = parse_latest_content(text)
    return parsed[0] if parsed else {'query': text.strip(), 'output': ''}


queries = []
model_outputs = []
actual_outputs = []
total_latency = 0.0
num_timed = 0

# Loop-invariant setup: read the test column once, pick the device once and
# put the model into eval mode once.
test_samples = formatted_dataset['test']['data'][:NUM_EVAL_EXAMPLES]
device = "cuda" if torch.cuda.is_available() else "cpu"
model.eval()

for sample in test_samples:
    start = time.time()
    instruction = INSTRUCTION_RE.search(sample)
    if instruction is None:
        print("Skipping a sample without an [INST] ... [/INST] block")
        continue
    # Rebuild the prompt in the same layout as the training samples.
    prompt = '<s>\n[INST]' + instruction.group(1) + '[/INST]\n'

    model_input = tokenizer(prompt, return_tensors="pt").to(device)
    with torch.no_grad():
        out = model.generate(**model_input, max_new_tokens=200)
    op = _first_parsed(tokenizer.decode(out[0], skip_special_tokens=True))
    stop = time.time()

    total_latency += stop - start
    num_timed += 1
    print(stop - start)

    real = _first_parsed(sample)
    queries.append(op['query'])
    model_outputs.append(op['output'])
    actual_outputs.append(real['output'])

# Mean wall-clock seconds per evaluated example (prompt building, generation
# and parsing of the generation).
mean_latency = total_latency / num_timed if num_timed else 0.0
print(mean_latency)


0
Query:[INST] Generate a list of my P1 and P2 issues and create action items for them [/INST]
Output:# Not implemented yet: Generate a list of my P1 and P2 issues and create action items for them
Real:var_1 = who_am_i()
var_2 = works_list(issue.priority=["p1", "p2"], owned_by=[var_1], type=["issue"])
var_3 = create_actionable_tasks_from_text(text=var_2)

1
Query:[INST] What if our dreams are messages from a deeper part of ourselves, offering insights into our true nature? [/INST]
Output:#Unanswerable_Invalid_Query_Error
Real:#Unanswerable_Invalid_Query_Error

2
Query:[INST] Retrieve the current sprint ID and assign newly created tasks from the brainstorming session notes [/INST]
Output:var_1 = get_sprint_id()
var_2 = create_actionable_tasks_from_text(text="brainstorming session notes")
var_3 = add_work_items_to_sprint(work_ids=var_2, sprint_id=var_1)
Real:var_1 = create_actionable_tasks_from_text(text="brainstorming session notes")
var_2 = get_sprint_id()
var_3 = add_work_items_to_spr

IndexError: ignored

In [None]:
import pandas as pd

# Collect the evaluation results side by side; the three lists are filled in
# lockstep by the evaluation loop above.
eval_data = pd.DataFrame({'Query': queries, 'Actual_Output': actual_outputs, 'Model_Output': model_outputs})
eval_data.head()

In [None]:
run = wandb.init(project=project_name)
# Create and log a new table.
# `data` must be a list of rows with one value per column, so the three
# parallel result lists are zipped into [query, actual, model] rows instead of
# being passed as three long "rows".
eval_rows = [list(row) for row in zip(queries, actual_outputs, model_outputs)]
my_table = wandb.Table(columns=["Query", "Actual_Output", "Model_Output"], data=eval_rows)
run.log({"Eval Table": my_table})