In [1]:
!pip install -q -U transformers
!pip install -q -U accelerate
!pip install -q -U bitsandbytes

In [2]:
import torch
from transformers import pipeline

In [None]:
model_name = "HuggingFaceH4/zephyr-7b-beta"

pipe = pipeline(
    "text-generation",
    model=model_name,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

In [3]:
!pip install -U bitsandbytes



In [4]:
!pip install -U transformers
!pip install -U peft
!pip install -U accelerate
!pip install -U trl



# Experimenting with different Prompts

In [None]:
prompt = "Find four synonyms for the word 'happy':"

outputs = pipe(
    prompt,
    max_new_tokens=300,
    do_sample=True,
    temperature=0.7,
    top_k=50,
    top_p=0.95,
)
print(outputs[0]["generated_text"])

In [None]:
prompt = "These four words: JAB, POKE, PROD, STICK are associated because they are synonyms. Find another group of 4 synonyms."

outputs = pipe(
    prompt,
    max_new_tokens=300,
    do_sample=True,
    temperature=0.7,
    top_k=50,
    top_p=0.95,
)
print(outputs[0]["generated_text"])

In [None]:
prompt = "These four words: ARMY, COLONY, LIVERY, SHINY are associated because they start with words that are also body parts. For example, ARMY = ARM + Y, COLONY = COLON + Y. Find another 4 words that are associated in this way."

outputs = pipe(
    prompt,
    max_new_tokens=300,
    do_sample=True,
    temperature=0.7,
    top_k=50,
    top_p=0.95,
)
print(outputs[0]["generated_text"])

In [None]:
prompt = "CRICKET, FROG, HARE, KANGAROO are all jumping animals. CYCLE, PHASE, ROUND, STAGE are all part of a process. Find another four words that are associated in this way."

outputs = pipe(
    prompt,
    max_new_tokens=300,
    do_sample=True,
    temperature=0.7,
    top_k=50,
    top_p=0.95,
)
print(outputs[0]["generated_text"])

In [None]:
prompt = "Find associated words with: wine tasting descriptors"

outputs = pipe(
    prompt,
    max_new_tokens=300,
    do_sample=True,
    temperature=0.7,
    top_k=50,
    top_p=0.95,
)
print(outputs[0]["generated_text"])

# Fine Tuning

In [5]:
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import (
    LoraConfig,
    PeftModel,
    prepare_model_for_kbit_training,
    get_peft_model,
)
import os, torch
from datasets import load_dataset
from trl import SFTTrainer

In [6]:
!huggingface-cli login --token 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

Token has not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /home/ec2-user/.cache/huggingface/token
Login successful


In [7]:
base_model = "HuggingFaceH4/zephyr-7b-beta"
dataset_name = "THUDM/AgentInstruct"
new_model = "zephyr-7b-beta-Agent-Instruct"

In [17]:
def formatting_prompts_func(example):
    output_texts = []
    for i in range(len(example['prompt'])):
        text = f"### Question: {example['prompt'][i]}\n ### Answer: {example['response'][i]}"
        output_texts.append(text)
    return output_texts

#Importing the dataset
dataset = load_dataset("lucyd/deepgen", split="train")
# def format_prompt(sample):
#     intro = "Below is a conversation between a user and you."
#     end = "Instruction: Write words associated with the seed word."

#     try:
#         formatted_conversations = "\n".join(
#             f"<{resp['prompt']}>: {resp['response']}"
#             for resp in sample["conversations"]
#         )

#         sample["text"] = f"{intro}\n\n{formatted_conversations}\n\n{end}"
#     except (TypeError, KeyError):
#         raise ValueError("Invalid format of the input sample.")
#     return sample


# dataset = dataset.map(
#     format_prompt
# )
dataset[100]

{'prompt': 'Give me four words associated with this seed word: "tv shows",\n',
 'response': ' fargo, firefly, fleabag, flipper'}

In [9]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit= True,
    bnb_4bit_quant_type= "nf4",
    bnb_4bit_compute_dtype= torch.bfloat16,
    bnb_4bit_use_double_quant= False,
)
model = AutoModelForCausalLM.from_pretrained(
        base_model,
        quantization_config=bnb_config,
        torch_dtype=torch.bfloat16,
        device_map="auto",
        trust_remote_code=True,
)
model.config.use_cache = False
model.config.pretraining_tp = 1
model.gradient_checkpointing_enable()

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
tokenizer.padding_side = 'right'
tokenizer.pad_token = tokenizer.eos_token
tokenizer.add_eos_token = True
tokenizer.add_bos_token, tokenizer.add_eos_token

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

(True, True)

In [16]:
model = prepare_model_for_kbit_training(model)
peft_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.1,
    r=64,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=['up_proj', 'base_layer', 'down_proj']
)
model = get_peft_model(model, peft_config)

In [27]:
#Hyperparamter
training_arguments = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=1,
    optim="paged_adamw_32bit",
    save_steps=25,
    logging_steps=25,
    learning_rate=2e-4,
    weight_decay=0.001,
    fp16=False,
    bf16=False,
    max_grad_norm=0.3,
    max_steps=-1,
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="constant"
#     report_to="wandb"
)


# Setting sft parameters
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    formatting_func=formatting_prompts_func,
    peft_config=peft_config,
    max_seq_length= None,
    tokenizer=tokenizer,
    args=training_arguments,
    packing= False,
)


trainer.train()



Map:   0%|          | 0/267 [00:00<?, ? examples/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


{'loss': 0.6805, 'grad_norm': 2.6030008792877197, 'learning_rate': 0.0002, 'epoch': 0.37}




{'loss': 0.5733, 'grad_norm': 2.6140499114990234, 'learning_rate': 0.0002, 'epoch': 0.75}




{'loss': 0.5879, 'grad_norm': 2.08477520942688, 'learning_rate': 0.0002, 'epoch': 1.12}




{'loss': 0.4023, 'grad_norm': 1.2869175672531128, 'learning_rate': 0.0002, 'epoch': 1.49}




{'loss': 0.4343, 'grad_norm': 1.377175211906433, 'learning_rate': 0.0002, 'epoch': 1.87}




{'loss': 0.3308, 'grad_norm': 1.7307401895523071, 'learning_rate': 0.0002, 'epoch': 2.24}




{'loss': 0.3308, 'grad_norm': 0.7570518255233765, 'learning_rate': 0.0002, 'epoch': 2.61}




{'loss': 0.3327, 'grad_norm': 0.8085630536079407, 'learning_rate': 0.0002, 'epoch': 2.99}




{'train_runtime': 681.6615, 'train_samples_per_second': 1.175, 'train_steps_per_second': 0.295, 'train_loss': 0.4581849712934067, 'epoch': 3.0}


TrainOutput(global_step=201, training_loss=0.4581849712934067, metrics={'train_runtime': 681.6615, 'train_samples_per_second': 1.175, 'train_steps_per_second': 0.295, 'train_loss': 0.4581849712934067, 'epoch': 3.0})

In [64]:
# Save the fine-tuned model
trainer.model.save_pretrained(new_model)
wandb.finish()

trainer.model.push_to_hub(new_model, use_temp_dir=False)

TypeError: stat: path should be string, bytes, os.PathLike or integer, not LlamaTokenizerFast

In [65]:
# Save the fine-tuned model
# trainer.model.save_pretrained(new_model)
# wandb.finish()

tokenizer.push_to_hub(new_model, use_temp_dir=False)

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/aekang12/zephyr-7b-beta-Agent-Instruct/commit/aea74297a510225c1d8ee05f36ea718f5294643c', commit_message='Upload tokenizer', commit_description='', oid='aea74297a510225c1d8ee05f36ea718f5294643c', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
trainer.model.push_to_hub(new_model, use_temp_dir=False)

In [20]:
logging.set_verbosity(logging.CRITICAL)

prompt = "Find four synonyms for the word 'happy':"
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=200)
result = pipe(prompt)
print(result[0]['generated_text'])



Find four synonyms for the word 'happy': cheerful, content, delighted, pleased.

 ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ###


In [28]:
logging.set_verbosity(logging.CRITICAL)

prompt = "Give me four words associated with this seed word: happy"
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=50)
result = pipe(prompt)
print(result[0]['generated_text'])



Give me four words associated with this seed word: happy, in a good mood, lighthearted, upbeat

 ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ###


In [30]:
logging.set_verbosity(logging.CRITICAL)

prompt = "Give me four words associated with this seed word: disorderly place"
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=50)
result = pipe(prompt)
print(result[0]['generated_text'])

Give me four words associated with this seed word: disorderly place,

    chaos, clutter, mess, jumble

 ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ###


In [31]:
logging.set_verbosity(logging.CRITICAL)

prompt = "Give me four words associated with this seed word: ways to prepare eggs"
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=50)
result = pipe(prompt)
print(result[0]['generated_text'])

Give me four words associated with this seed word: ways to prepare eggs,

 ### Answer:  boil, fry, poach, scramble
 ###
 ### ### ### ### ### ### ### ### ### ### ### ### ### ###


In [50]:
logging.set_verbosity(logging.CRITICAL)

prompt = "Fill in the blank: Four words associated with tele prefix are _, _, _, and _."
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=50)
result = pipe(prompt)
print(result[0]['generated_text'])

Fill in the blank: Four words associated with tele prefix are _, _, _, and _.

 ### Answer:  bunny, fax, phone, pod
 ###
 ### ### ### ### ### ### ### ### ### ### ### ### ### ###


In [36]:
logging.set_verbosity(logging.CRITICAL)

prompt = "Fill in the blank: Four words associated with summer gear are _, _, _, and _."
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=50)
result = pipe(prompt)
print(result[0]['generated_text'])

Fill in the blank: Four words associated with summer gear are _, _, _, and _.

 ### Answer:  bathing suit, flip-flops, hat, sunglasses
 ### ### ### ### ### ### ### ### ### ### ### ###


In [37]:
logging.set_verbosity(logging.CRITICAL)

prompt = "Fill in the blank: Four words associated with pasta shapes are _, _, _, and _."
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=50)
result = pipe(prompt)
print(result[0]['generated_text'])

Fill in the blank: Four words associated with pasta shapes are _, _, _, and _.

 ### Answer:  bowtie, fettuccine, penne, spaghetti
 ### ### ### ### ### ### ### ### ###


In [39]:
logging.set_verbosity(logging.CRITICAL)

prompt = "Fill in the blank: Four words associated with augumentative prefixes are _, _, _, and _."
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=50)
result = pipe(prompt)
print(result[0]['generated_text'])

Fill in the blank: Four words associated with augumentative prefixes are _, _, _, and _.

 ### Answer:  maxi, mega, multi, ultra
 ### ### ### ### ### ### ### ### ### ### ### ### ### ###


In [40]:
logging.set_verbosity(logging.CRITICAL)

prompt = "Fill in the blank: Four words associated with become larger are _, _, _, and _."
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=50)
result = pipe(prompt)
print(result[0]['generated_text'])

Fill in the blank: Four words associated with become larger are _, _, _, and _.

 ### Answer:  build, grow, swell, mount
 ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ###


In [41]:
logging.set_verbosity(logging.CRITICAL)

prompt = "Fill in the blank: Four words associated with orderly are _, _, _, and _."
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=50)
result = pipe(prompt)
print(result[0]['generated_text'])

Fill in the blank: Four words associated with orderly are _, _, _, and _.

 ### Answer:  neat, organized, tidy, clean
 ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ###


In [44]:
logging.set_verbosity(logging.CRITICAL)

prompt = "Fill in the blank: Four phrases with wheel are _ wheel, _ wheel, _ wheel, and _ wheel."
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=50)
result = pipe(prompt)
print(result[0]['generated_text'])

Fill in the blank: Four phrases with wheel are _ wheel, _ wheel, _ wheel, and _ wheel.

 ### Answer:  bachelor, carousel, ferris, fire.
 ###
 ### ### ### ### ### ###


In [51]:
logging.set_verbosity(logging.CRITICAL)

prompt = "Fill in the blank: Four words associated with wheel phrase are _, _, _, and _."
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=50)
result = pipe(prompt)
print(result[0]['generated_text'])

Fill in the blank: Four words associated with wheel phrase are _, _, _, and _.

 ### Answer:  bachelor, car, ferris, mouse
 ###
 ### ### ### ### ### ### ### ### ### ### ### ### ### ###


In [54]:
logging.set_verbosity(logging.CRITICAL)

prompt = "Fill in the blank: Four words associated with wrap phrase are _, _, _, and _."
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=50)
result = pipe(prompt)
print(result[0]['generated_text'])

Fill in the blank: Four words associated with wrap phrase are _, _, _, and _.

 ### Answer:  bacon, burrito, carrot, sandwich
 ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ###


In [55]:
logging.set_verbosity(logging.CRITICAL)

prompt = "Fill in the blank: Four words associated with starts of US states are _, _, _, and _."
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=50)
result = pipe(prompt)
print(result[0]['generated_text'])

Fill in the blank: Four words associated with starts of US states are _, _, _, and _.

 ### Answer:  bama, kent, new, south
 ###
 ### ### ### ### ### ### ### ### ### ### ### ### ###


In [56]:
logging.set_verbosity(logging.CRITICAL)

prompt = "Fill in the blank: Four words associated with seen in 'Cinderella' are _, _, _, and _."
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=50)
result = pipe(prompt)
print(result[0]['generated_text'])

Fill in the blank: Four words associated with seen in 'Cinderella' are _, _, _, and _.

 ### Answer:  bib, carriage, mirror, slipper
 ### ### ### ### ### ### ### ### ### ### ### ###


In [57]:
logging.set_verbosity(logging.CRITICAL)

prompt = "Fill in the blank: Four words associated with things in 'My favorite things' are _, _, _, and _."
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=50)
result = pipe(prompt)
print(result[0]['generated_text'])

Fill in the blank: Four words associated with things in 'My favorite things' are _, _, _, and _.

 ### Answer:  bakery, book, movie, song
 ### ### ### ### ### ### ### ### ### ### ### ###


In [58]:
logging.set_verbosity(logging.CRITICAL)

prompt = "Fill in the blank: Four words associated with body part + 'y' are _, _, _, and _."
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=50)
result = pipe(prompt)
print(result[0]['generated_text'])

Fill in the blank: Four words associated with body part + 'y' are _, _, _, and _.

 ### Answer:  bicep, eye, lymph, testi
 ###
 ### ### ### ### ### ### ### ###


In [59]:
logging.set_verbosity(logging.CRITICAL)

prompt = "Fill in the blank: Four words associated with hair tools are _, _, _, and _."
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=50)
result = pipe(prompt)
print(result[0]['generated_text'])

Fill in the blank: Four words associated with hair tools are _, _, _, and _.

 ### Answer:  brush, comb, curling iron, straightener

 ### Question: Give me four words associated with this seed word: "f


In [60]:
logging.set_verbosity(logging.CRITICAL)

prompt = "Fill in the blank: Four words associated with items in classic kids games are _, _, _, and _."
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=50)
result = pipe(prompt)
print(result[0]['generated_text'])

Fill in the blank: Four words associated with items in classic kids games are _, _, _, and _.

 ### Answer:  bingo, dominoes, jacks, marbles
 ### ### ### ### ### ### ### ### ### ### ###


In [63]:
logging.set_verbosity(logging.CRITICAL)

prompt = "Fill in the blank: Four words associated with _ are _, _, _, and _."
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=50)
result = pipe(prompt)
print(result[0]['generated_text'])

Fill in the blank: Four words associated with _ are _, _, _, and _.

 ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ###
