# Finetuning Dolly GPT-J-6B with LoRa

LoRa paper - https://arxiv.org/abs/2106.09685

In [None]:
!git clone https://github.com/gururise/AlpacaDataCleaned.git

Cloning into 'AlpacaDataCleaned'...
remote: Enumerating objects: 747, done.[K
remote: Counting objects: 100% (124/124), done.[K
remote: Compressing objects: 100% (70/70), done.[K
remote: Total 747 (delta 64), reused 94 (delta 53), pack-reused 623[K
Receiving objects: 100% (747/747), 76.51 MiB | 15.41 MiB/s, done.
Resolving deltas: 100% (411/411), done.
Updating files: 100% (69/69), done.


In [None]:
!pip list

Package                       Version
----------------------------- --------------------
absl-py                       1.4.0
alabaster                     0.7.13
albumentations                1.2.1
altair                        4.2.2
anyio                         3.6.2
appdirs                       1.4.4
argon2-cffi                   21.3.0
argon2-cffi-bindings          21.2.0
array-record                  0.2.0
arviz                         0.15.1
astropy                       5.2.2
astunparse                    1.6.3
attrs                         23.1.0
audioread                     3.0.0
autograd                      1.5
Babel                         2.12.1
backcall                      0.2.0
beautifulsoup4                4.11.2
bleach                        6.0.0
blis                          0.7.9
blosc2                        2.0.0
bokeh                         2.4.3
branca                        0.6.0
build                         0.10.0
CacheControl                  0.12.11
cac

In [2]:
!pip install -q datasets loralib sentencepiece
!pip uninstall transformers
!pip -q install git+https://github.com/huggingface/transformers # need to install from github
!pip -q install git+https://github.com/huggingface/peft.git
!pip -q install bitsandbytes


Found existing installation: transformers 4.30.0.dev0
Uninstalling transformers-4.30.0.dev0:
  Would remove:
    /usr/local/bin/transformers-cli
    /usr/local/lib/python3.10/dist-packages/transformers-4.30.0.dev0.dist-info/*
    /usr/local/lib/python3.10/dist-packages/transformers/*
Proceed (Y/n)? Y
  Successfully uninstalled transformers-4.30.0.dev0
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for transformers (pyproject.toml) ... [?25l[?25hdone
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


## Load Tokenizer

In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-j-6B")


# tokenizer.pad_token = tokenizer.eos_token
# tokenizer.pad_token_id = tokenizer.eos_token_id

data = load_dataset("nelson2424/FAQ_NelsMarketplace")



  0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
#Visualize data
print(data)
print(data.data['train'])

DatasetDict({
    train: Dataset({
        features: ['Instruction', 'Question', 'Context/Answer'],
        num_rows: 84
    })
})
MemoryMappedTable
Instruction: string
Question: string
Context/Answer: string
----
Instruction: [["Answer the following question about the company Nels Marketplace:","Answer the following question about the company Nels Marketplace:","Answer the following question about the company Nels Marketplace:","Answer the following question about the company Nels Marketplace:","Answer the following question about the company Nels Marketplace:",...,"Answer the following question about the company Nels Marketplace:","Answer the following question about the company Nels Marketplace:","Answer the following question about the company Nels Marketplace:","Answer the following question about the company Nels Marketplace:","Answer the following question about the company Nels Marketplace:"]]
Question: [["What time will the products be dispatched?","Where do your items come fr

In [None]:
def generate_prompt(data_point):
    # taken from https://github.com/tloen/alpaca-lora
    if data_point["Instruction"]:
        return f"""Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

        ### Instruction:
        {data_point["Instruction"]}

        ### Input:
        {data_point["Question"]}

        ### Response:
        {data_point["Context/Answer"]}"""
        
    else:
        return f"""Below is an instruction that describes a task. Write a response that appropriately completes the request.

        ### Instruction:
        {data_point["Instruction"]}

        ### Response:
        {data_point["Context/Answer"]}"""


data = data.map(lambda data_point: {"prompt": tokenizer(generate_prompt(data_point))})

data

Map:   0%|          | 0/84 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['Instruction', 'Question', 'Context/Answer', 'prompt'],
        num_rows: 84
    })
})

In [None]:
data["train"][5]

{'Instruction': 'Answer the following question about the company Nels Marketplace:',
 'Question': 'Can you provide details on the countries or regions where the items are produced?',
 'Context/Answer': 'Our items are made in Cali and directly distributed to other major cities of Colombia: Barranquilla, Bogotá, Cartagena, and Medellin.',
 'prompt': {'attention_mask': [1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   

## Finetuning Dolly

In [None]:
import os

# os.environ["CUDA_VISIBLE_DEVICES"] = "0"
import torch
import torch.nn as nn
import bitsandbytes as bnb
from datasets import load_dataset
import transformers
from transformers import AutoTokenizer, AutoModel, AutoConfig, GPTJForCausalLM

from peft import prepare_model_for_int8_training, LoraConfig, get_peft_model


Welcome to bitsandbytes. For bug reports, please run

python -m bitsandbytes

 and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
bin /usr/local/lib/python3.10/dist-packages/bitsandbytes/libbitsandbytes_cuda118.so
CUDA SETUP: CUDA runtime path found: /usr/local/cuda/lib64/libcudart.so.11.0
CUDA SETUP: Highest compute capability among GPUs detected: 8.0
CUDA SETUP: Detected CUDA version 118
CUDA SETUP: Loading binary /usr/local/lib/python3.10/dist-packages/bitsandbytes/libbitsandbytes_cuda118.so...


  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
Either way, this might cause trouble in the future:
If you get `CUDA error: invalid device function` errors, the above might be the cause and the solution is to make sure only one ['libcudart.so', 'libcudart.so.11.0', 'libcudart.so.12.0'] in the paths that we search based on your env.
  warn(msg)


In [28]:
# Settings for A100 - For 3090 
MICRO_BATCH_SIZE = 8  # change to 4 for 3090
BATCH_SIZE = 128
GRADIENT_ACCUMULATION_STEPS = BATCH_SIZE // MICRO_BATCH_SIZE
EPOCHS = 3  # paper uses 3
LEARNING_RATE = 2e-5  
CUTOFF_LEN = 512  
LORA_R = 4
LORA_ALPHA = 16
LORA_DROPOUT = 0.05

In [None]:
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-j-6B",
                                          add_eos_token=True, 
                                          )

model = GPTJForCausalLM.from_pretrained("EleutherAI/gpt-j-6B",
                                  load_in_8bit=True,
                                  device_map="auto", 
                                  )


model = prepare_model_for_int8_training(model, use_gradient_checkpointing=True)



Downloading (…)lve/main/config.json:   0%|          | 0.00/930 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/24.2G [00:00<?, ?B/s]

In [29]:
config = LoraConfig(
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=LORA_DROPOUT,
    bias="none",
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, config)
tokenizer.pad_token_id = 0  # unk. we want this to be different from the eos token

data = load_dataset("nelson2424/FAQ_NelsMarketplace")



  0%|          | 0/1 [00:00<?, ?it/s]

In [30]:
data = data.shuffle().map(
    lambda data_point: tokenizer(
        generate_prompt(data_point),
        truncation=True,
        max_length=CUTOFF_LEN,
        padding="max_length",
    )
)

Map:   0%|          | 0/84 [00:00<?, ? examples/s]

In [31]:
data

DatasetDict({
    train: Dataset({
        features: ['Instruction', 'Question', 'Context/Answer', 'input_ids', 'attention_mask'],
        num_rows: 84
    })
})

In [None]:
data['train'][1]

{'Instruction': 'Answer the following question about the company Nels Marketplace:',
 'Question': 'Is there a specific search term or keyword I should use to locate your fashion marketplace?',
 'Context/Answer': 'Our online fashion marketplace is easily accessible from various platforms. Here\'s where you can find us:\n\nWebsite: You can visit our official website by typing our web address into your web browser. Simply enter www.NelsMarketplace.com.co in the address bar, and you\'ll be directed to our online fashion marketplace. Our website is optimized for desktop, laptop, and mobile devices, providing a seamless shopping experience across different platforms.\n\nSocial Media: Stay connected with us through our official social media channels. We have active accounts on platforms such as Facebook, Instagram, Twitter, and Pinterest. Search for our brand name on these platforms and click the follow or like button to receive updates, fashion inspiration, promotions, and exclusive offers d

In [32]:

trainer = transformers.Trainer(
    model=model,
    train_dataset=data["train"],
    args=transformers.TrainingArguments(
        per_device_train_batch_size=MICRO_BATCH_SIZE,
        gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
        warmup_steps=100,
        num_train_epochs=EPOCHS,
        learning_rate=LEARNING_RATE,
        fp16=True,
        logging_steps=1,
        output_dir="lora-dolly",
        save_total_limit=3,
    ),
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)
model.config.use_cache = False
trainer.train(resume_from_checkpoint=False)

model.save_pretrained("gptj6b-Finetune-FAQ_NelsMarketplace")



Step,Training Loss
1,1.336
2,0.6104
3,0.7223


In [26]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [33]:
model.push_to_hub("nelson2424/gptj6b-FAQ-NelsMarketplace", use_auth_token=True)

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

adapter_model.bin:   0%|          | 0.00/7.38M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/nelson2424/gptj6b-FAQ-NelsMarketplace/commit/241036e71422e1c8d5f8063fcd722aabce1f56ac', commit_message='Upload model', commit_description='', oid='241036e71422e1c8d5f8063fcd722aabce1f56ac', pr_url=None, pr_revision=None, pr_num=None)

In [3]:
import torch
from transformers import pipeline
from peft import PeftModel, PeftConfig
from transformers import AutoTokenizer, AutoModelForCausalLM

peft_model_id = "nelson2424/gptj6b-FAQ-NelsMarketplace"
config = PeftConfig.from_pretrained(peft_model_id)
model = AutoModelForCausalLM.from_pretrained("EleutherAI/gpt-j-6B")

In [4]:
model_peft = PeftModel.from_pretrained(model, peft_model_id)

In [10]:
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-j-6B",
                                          add_eos_token=True, 
                                          )
inputs = tokenizer(f' Answer the following question about the company Nels Marketplace: Where would I be able to discover a rebate code?', return_tensors="pt")
print(inputs)

with torch.no_grad():
    outputs = model_peft.generate(input_ids=inputs["input_ids"], max_new_tokens=10)
    print(outputs)
    print(tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


{'input_ids': tensor([[23998,   262,  1708,  1808,   546,   262,  1664,   399,  1424, 36703,
            25,  6350,   561,   314,   307,  1498,   284,  7073,   257, 49099,
          2438,    30]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
tensor([[23998,   262,  1708,  1808,   546,   262,  1664,   399,  1424, 36703,
            25,  6350,   561,   314,   307,  1498,   284,  7073,   257, 49099,
          2438,    30,   198,   198,    32,    13,   383,  1664,   447,   247,
            82,  3052]])
[' Answer the following question about the company Nels Marketplace: Where would I be able to discover a rebate code?\n\nA. The company’s website']


In [11]:
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-j-6B",
                                          add_eos_token=True, 
                                          )
inputs = tokenizer(f' Answer the following question about the company Nels Marketplace: How can I contact customer support?', return_tensors="pt")
print(inputs)

with torch.no_grad():
    outputs = model_peft.generate(input_ids=inputs["input_ids"], max_new_tokens=10)
    print(outputs)
    print(tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


{'input_ids': tensor([[23998,   262,  1708,  1808,   546,   262,  1664,   399,  1424, 36703,
            25,  1374,   460,   314,  2800,  6491,  1104,    30]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
tensor([[23998,   262,  1708,  1808,   546,   262,  1664,   399,  1424, 36703,
            25,  1374,   460,   314,  2800,  6491,  1104,    30,   198,   198,
            32,    13,  4889,   262,  1664,   379,   357,  7410]])
[' Answer the following question about the company Nels Marketplace: How can I contact customer support?\n\nA. Call the company at (800']
