In [1]:
!pip install datasets



In [2]:
from datasets import load_dataset
deepgen = load_dataset("lucyd/deepgen", split="train")

In [3]:
!pip install -q -U bitsandbytes
!pip install -q -U git+https://github.com/huggingface/transformers.git
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install -q -U git+https://github.com/huggingface/accelerate.git
!pip install -q trl xformers wandb datasets einops sentencepiece

In [4]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig,HfArgumentParser,TrainingArguments,pipeline, logging, TextStreamer
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training, get_peft_model
import os, torch, wandb, platform, warnings
from datasets import load_dataset
from trl import SFTTrainer
from huggingface_hub import notebook_login
#Use a sharded model to fine-tune in the free version of Google Colab.
base_model = "mistralai/Mistral-7B-Instruct-v0.1" #bn22/Mistral-7B-Instruct-v0.1-sharded
dataset_name, new_model = "deepgen/connections", "deepgen/connections_mistral7b"

In [5]:
# Load base model(Mistral 7B)
bnb_config = BitsAndBytesConfig(
    load_in_4bit= True,
    bnb_4bit_quant_type= "nf4",
    bnb_4bit_compute_dtype= torch.bfloat16,
    bnb_4bit_use_double_quant= False,
)
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=bnb_config,
    device_map={"": 0}
)
model.config.use_cache = False # silence the warnings. Please re-enable for inference!
model.config.pretraining_tp = 1
model.gradient_checkpointing_enable()
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.add_eos_token = True
tokenizer.add_bos_token, tokenizer.add_eos_token

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

(True, True)

In [6]:
wandb.login(key = "f2454892ec51c6f33d3046dee54701a325a1fc44")
run = wandb.init(project='Fine tuning mistral 7B instruct', job_type="training", anonymous="allow")

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mlucyduan[0m ([33mdeepgenerative[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/ec2-user/.netrc


In [7]:
model = prepare_model_for_kbit_training(model)
peft_config = LoraConfig(
        r=16,
        lora_alpha=16,
        lora_dropout=0.05,
        bias="none",
        task_type="CAUSAL_LM",
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj","gate_proj"]
    )
model = get_peft_model(model, peft_config)

In [8]:
def formatting_prompts_func(example):
    output_texts = []
    for i in range(len(example['prompt'])):
        text = f"{example['prompt'][i]}\n{example['response'][i]}"
        output_texts.append(text)
    return output_texts

In [9]:
# Training Arguments
# Hyperparameters should beadjusted based on the hardware you using
training_arguments = TrainingArguments(
    output_dir= "./results",
    num_train_epochs= 3,
    per_device_train_batch_size= 8,
    gradient_accumulation_steps= 2,
    optim = "paged_adamw_8bit",
    save_steps= 5000,
    logging_steps= 30,
    learning_rate= 2e-4,
    weight_decay= 0.001,
    fp16= False,
    bf16= False,
    max_grad_norm= 0.3,
    max_steps= -1,
    warmup_ratio= 0.3,
    group_by_length= True,
    lr_scheduler_type= "constant",
    report_to="wandb"
)
# Setting sft parameters
trainer = SFTTrainer(
    model=model,
    train_dataset=deepgen,
    formatting_func=formatting_prompts_func,
    peft_config=peft_config,
    max_seq_length= None,
    tokenizer=tokenizer,
    args=training_arguments,
    packing= False,
)



Map:   0%|          | 0/267 [00:00<?, ? examples/s]



In [10]:
trainer.train()
# Save the fine-tuned model
trainer.model.save_pretrained(new_model)
wandb.finish()
model.config.use_cache = True
model.eval()



Step,Training Loss
30,2.0635


VBox(children=(Label(value='0.002 MB of 0.002 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
train/epoch,▁█
train/global_step,▁█
train/grad_norm,▁
train/learning_rate,▁
train/loss,▁

0,1
total_flos,994555929919488.0
train/epoch,3.0
train/global_step,51.0
train/grad_norm,1.68288
train/learning_rate,0.0002
train/loss,2.0635
train_loss,1.54199
train_runtime,377.3529
train_samples_per_second,2.123
train_steps_per_second,0.135


PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): MistralForCausalLM(
      (model): MistralModel(
        (embed_tokens): Embedding(32000, 4096)
        (layers): ModuleList(
          (0-31): 32 x MistralDecoderLayer(
            (self_attn): MistralSdpaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
              )
              (k_proj): lora.Linear4bit(
                (base_layer):

In [11]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [12]:
model.push_to_hub("mistral_instruct")
tokenizer.push_to_hub("mistral_instruct")

adapter_model.safetensors:   0%|          | 0.00/92.3M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/lucyd/mistral_instruct/commit/72f09b5ba01f86b267ada5653ef467251ede3fb9', commit_message='Upload tokenizer', commit_description='', oid='72f09b5ba01f86b267ada5653ef467251ede3fb9', pr_url=None, pr_revision=None, pr_num=None)

In [13]:
logging.set_verbosity(logging.CRITICAL)

# prompt = """
# You will be given a seed word. Your task to provide exactly four different words that are associated with the seed word.

# Provide your answer in the format as follows:
# Answer:::

# Now here is the seed word.
# Seed word: summer gear\n
# """

prompt = """
Generate five words associated with: species of eagle.
"""

pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=100)
result = pipe(f"{prompt}", temperature=0.7)
print(result[0]['generated_text'])




Generate five words associated with: species of eagle.
 golden, bald, brown, white, harpy


In [50]:
evalset = deepgen = load_dataset("lucyd/deepgen_eval", split="train")

Downloading readme:   0%|          | 0.00/302 [00:00<?, ?B/s]

Downloading data: 100%|██████████| 3.84k/3.84k [00:00<00:00, 22.5kB/s]


Generating train split:   0%|          | 0/43 [00:00<?, ? examples/s]

In [71]:
def extract_four_words_after_answer(model_output):
    # Find the index of the first occurrence of 'Answer:'
    first_index = model_output.find('Answer:')
    if first_index == -1:
        return None  # 'Answer:' not found in the model output

    # Find the index of the second occurrence of 'Answer:'
    second_index = model_output.find('Answer:', first_index + 1)
    if second_index == -1:
        return None  # Second 'Answer:' not found in the model output

    # Extract substring starting from the second 'Answer:' to the end
    substring = model_output[second_index + len('Answer:'):]

    # Tokenize the substring into words
    words = substring.split()

    # Extract the first four words after the second 'Answer:'
    four_words = ' '.join(words[:4])

    return four_words

In [87]:
gen_ans = []
real_ans = []
for line in evalset:

    prompt, response = line['prompt'], line['response']
    format_prompt = prompt_format.format(eval_prompt=prompt)
    pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=100)
    result = pipe(f"{format_prompt}", temperature=0.7)
    result = result[0]['generated_text']
    gen_an = extract_four_words_after_answer(result)
    if gen_an is not None:
        gen_ans.append(gen_an)
    else:
        print(result)
    real_ans.append(response)


KeyboardInterrupt: 

In [74]:
print(len(gen_ans))
print(len(real_ans))

43
43


In [75]:
from semscore import EmbeddingModelWrapper
from statistics import mean

em = EmbeddingModelWrapper()
similarities = em.get_similarities(
    em.get_embeddings(gen_ans),
    em.get_embeddings(real_ans),
)

In [76]:
print(similarities)

[0.3435998857021332, 0.6570336818695068, 0.16413824260234833, 0.5825481414794922, 0.17860861122608185, 0.8117187023162842, 0.26339519023895264, 0.2896694242954254, 0.16583339869976044, 0.674803614616394, 0.48852917551994324, 0.10901779681444168, 0.4350118041038513, 0.7604237198829651, 0.28480106592178345, 0.26413607597351074, 0.5928313732147217, 0.6101856231689453, 0.3057096600532532, 0.5407426357269287, 0.24475297331809998, 0.6230244636535645, 0.6528857946395874, 0.4020737111568451, 0.680112361907959, 0.23881296813488007, 0.0536806657910347, 0.40123242139816284, 0.36349576711654663, 0.44656020402908325, 0.3494698405265808, 0.22100679576396942, 0.3997664451599121, 0.6010756492614746, 0.6462447047233582, 0.2501295804977417, 0.31834667921066284, 0.4244151711463928, 0.6849631071090698, 0.42106473445892334, 0.16639307141304016, 0.6763612031936646, 0.10546360909938812]


In [77]:
print(gen_ans)

['mess, muddle, muddle, mudd', 'firm, strong, stout, tough', 'flip, flip-flops, flip-flops, flip', 'fruity, nutty, oaky, sp', 'brass, copper, iron, silver', 'boil, fry, scramble, scram', 'bean, beanbag, beanbag, beanbag', 'fusilli, penne, spaghetti, tag', 'bounce, hush, munch, plush', 'fall, plummet, plunge, slide', 'azure, navy, sapphire, sky', 'eye, hand, leg, mouth', 'cheered, clapped, hooted, roared', 'bird, plane, pilot, pilotess', 'bean, corn, lettuce, tomato', 'cite, mention, raise, recall', 'beams, rays, streams, waves', 'quick, rapid, swift, swiftly', 'call, phone, talk, words', 'miss, no show, pull', 'churn, spindle, turn, wheelbarrow', 'applause, praise, recognition, salute', 'catch, discover, find out,', 'visa, visa, visa, visa', 'kangaroo, rabbit, squirrel,', 'beatles, clash, fleetwood, rolling', 'best, place, rate, time', 'amber, gold, ochre, t', 'Curry, Durant, LeB', 'blue, gray, humpback, sperm', 'cake, fish, lamps, p', 'narrow, pinched, snug', 'ala, ark, cali', 'armoir

score: need semantic meaning, reinforce that four words are not the same.
need to check all four words are real words
words satisfy prompt constraints (ie, begins with)


In [85]:
for i in range(len(gen_ans)):
    print("Generated: ", gen_ans[i], "Real: ", real_ans[i], "Semscore: ", similarities[i])

Generated:  mess, muddle, muddle, mudd Real:  "dump, place, pigpen, sty" Semscore:  0.3435998857021332
Generated:  firm, strong, stout, tough Real:  "firm, solid, sound, stable" Semscore:  0.6570336818695068
Generated:  flip, flip-flops, flip-flops, flip Real:  "hat, shorts, sunglasses, tee" Semscore:  0.16413824260234833
Generated:  fruity, nutty, oaky, sp Real:  "balanced, dry, full, sweet" Semscore:  0.5825481414794922
Generated:  brass, copper, iron, silver Real:  "goldilocks, ironic, leadership, tinder" Semscore:  0.17860861122608185
Generated:  boil, fry, scramble, scram Real:  "boil, fry, poach, scramble" Semscore:  0.8117187023162842
Generated:  bean, beanbag, beanbag, beanbag Real:  "axe, dart, horseshoe, ring" Semscore:  0.26339519023895264
Generated:  fusilli, penne, spaghetti, tag Real:  "bowtie, elbow, tube, wheel" Semscore:  0.2896694242954254
Generated:  bounce, hush, munch, plush Real:  "brain, train, rain, train" Semscore:  0.16583339869976044
Generated:  fall, plummet

In [104]:
prompt_format = """
You will be given a seed word. Your task is to provide four different words that are associated with the seed word.

Provide your answer in the format as follows:
Answer:::

Now here is the seed word.
Seed word: items in classic kids games
"""

pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=100)
result = pipe(f"{prompt_format}", temperature=0.5)
print(result[0]['generated_text'])


You will be given a seed word. Your task is to provide four different words that are associated with the seed word.

Provide your answer in the format as follows:
Answer:::

Now here is the seed word.
Seed word: items in classic kids games

 ### Answer:  ball, bean, beanbag, beanbag, beanbag, beanbag, beanbag, beanbag, beanbag, beanbag, beanbag, bean
