In [2]:
# !pip install -q -U trl transformers accelerate git+https://github.com/huggingface/peft.git
# !pip install -q datasets bitsandbytes einops wandb

In [1]:
from datasets import load_dataset

dataset_name = "timdettmers/openassistant-guanaco"
dataset = load_dataset(dataset_name, split="train")
dataset = dataset.select(list(range(1000)))

Repo card metadata block was not found. Setting CardData to empty.


In [2]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, AutoTokenizer

model_name = "ybelkada/falcon-7b-sharded-bf16"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    quantization_config=bnb_config,
    trust_remote_code=True
)
model.config.use_cache = False

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

In [3]:
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token

In [4]:
from peft import LoraConfig

lora_alpha = 16
lora_dropout = 0.1
lora_r = 64

peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=[
        "query_key_value",
        "dense",
        "dense_h_to_4h",
        "dense_4h_to_h",
    ]
)

In [5]:
from transformers import TrainingArguments

output_dir = "./ziming_results"
per_device_train_batch_size = 4
gradient_accumulation_steps = 4 #4*4=16
optim = "paged_adamw_32bit"
save_steps = 10
logging_steps = 2
learning_rate = 2e-4 # if 4 gpu, decrease the lr
max_grad_norm = 0.3
max_steps = 100
warmup_ratio = 0.03
lr_scheduler_type = "constant"

training_arguments = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    fp16=True,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    group_by_length=True,
    lr_scheduler_type=lr_scheduler_type,
)

In [6]:
from trl import SFTTrainer

max_seq_length = 512

trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    peft_config=peft_config,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    args=training_arguments,
)



In [7]:
for name, module in trainer.model.named_modules():
    if "norm" in name:
        module = module.to(torch.float32)

In [8]:
%env WANDB_CONFIG_DIR="/home/ac.zyang/wandb"
trainer.train()

env: WANDB_CONFIG_DIR="/home/ac.zyang/wandb"


[34m[1mwandb[0m: Currently logged in as: [33mzimingy2020[0m. Use [1m`wandb login --relogin`[0m to force relogin


You're using a PreTrainedTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
2,1.368
4,1.2358
6,1.372
8,1.3906
10,1.5982
12,1.8357
14,1.6356
16,1.6096
18,1.2595
20,1.4985


KeyboardInterrupt: 

In [9]:
trainer.save_model('fine_tuned_model')

In [10]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, AutoTokenizer
tuned_model = AutoModelForCausalLM.from_pretrained('fine_tuned_model')

Loading ybelkada/falcon-7b-sharded-bf16 requires to execute some code in that repo, you can inspect the content of the repository at https://hf.co/ybelkada/falcon-7b-sharded-bf16. You can dismiss this prompt by passing `trust_remote_code=True`.
Do you accept? [y/N] y


Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

In [11]:
tuned_model

RWForCausalLM(
  (transformer): RWModel(
    (word_embeddings): Embedding(65024, 4544)
    (h): ModuleList(
      (0-31): 32 x DecoderLayer(
        (input_layernorm): LayerNorm((4544,), eps=1e-05, elementwise_affine=True)
        (self_attention): Attention(
          (maybe_rotary): RotaryEmbedding()
          (query_key_value): Linear(
            in_features=4544, out_features=4672, bias=False
            (lora_dropout): ModuleDict(
              (default): Dropout(p=0.1, inplace=False)
            )
            (lora_A): ModuleDict(
              (default): Linear(in_features=4544, out_features=64, bias=False)
            )
            (lora_B): ModuleDict(
              (default): Linear(in_features=64, out_features=4672, bias=False)
            )
            (lora_embedding_A): ParameterDict()
            (lora_embedding_B): ParameterDict()
          )
          (dense): Linear(
            in_features=4544, out_features=4544, bias=False
            (lora_dropout): ModuleDict(
 

In [13]:
tuned_model = tuned_model.eval()
 
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [14]:
generation_config = model.generation_config
generation_config.temperature = 0
generation_config.num_return_sequences = 1
generation_config.max_new_tokens = 256
generation_config.use_cache = False
generation_config.repetition_penalty = 1.7
generation_config.pad_token_id = tokenizer.eos_token_id
generation_config.eos_token_id = tokenizer.eos_token_id
generation_config

GenerationConfig {
  "_from_model_config": true,
  "bos_token_id": 1,
  "eos_token_id": 11,
  "max_new_tokens": 256,
  "pad_token_id": 11,
  "repetition_penalty": 1.7,
  "temperature": 0,
  "transformers_version": "4.33.0",
  "use_cache": false
}

In [15]:
prompt = """
The following is a friendly conversation between a human and an AI. The AI is
talkative and provides lots of specific details from its context.
 
Current conversation:
 
Human: Who is Dwight K Schrute?
AI:
""".strip()
 
input_ids = tokenizer(prompt, return_tensors="pt").input_ids
input_ids = input_ids.to(model.device)
print(input_ids)


tensor([[  487,  1863,   304,   241,  6350,  5501,  1192,   241,  2079,   273,
           267,  8317,    25,   390,  8317,   304,   193, 22685,   998,   273,
          2665,  4587,   275,  2005,  2861,   427,   701,  4436,    25,  1212,
           193,  9797,  5501,    37,  1212,   193, 23431,    37,  5856,   304,
         55351,   531,  2519,    93,  1220,    42,   193, 17362,    37]],
       device='cuda:0')


In [16]:
with torch.inference_mode():
    outputs = model.generate(
        input_ids=input_ids,
        generation_config=generation_config,
    )



In [17]:
response = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(response)

The following is a friendly conversation between a human and an AI. The AI is
talkative and provides lots of specific details from its context.
 
Current conversation:
 
Human: Who is Dwight K Schrute?
AI: Dwight David Schrute, Jr. was the Regional Manager of Dunder Mifflin's Scranton branch in the American version of The Office. He was portrayed by Rainn Wilson.
Human: What are some of his notable traits?
AI: Dwight was known for his eccentricities, such as his obsession with paper products, his love of pranks, and his tendency to over-exaggerate or embellish stories. He also had a strong work ethic and took pride in doing things well.
Human: How did he become regional manager?
AI: In season 1, Dwight applied for the position of assistant to the regional manager after Michael Scott left the company. However, due to his lack of experience and qualifications, he was initially rejected. Later on, when Jim Halpert became the new regional manager, Dwight worked hard to impress him and even