In [None]:
# You only need to run this once per machine
!pip install -q -U bitsandbytes
!pip install -q -U transformers
!pip install -q -U peft
!pip install -q -U accelerate
!pip install -q -U datasets scipy ipywidgets
!pip install -q -U trl

In [None]:
from kaggle_secrets import UserSecretsClient
import wandb

# Read the Hugging Face and Weights & Biases credentials through the Kaggle
# secrets client so that no token is hardcoded in the notebook.
user_secrets = UserSecretsClient()
secret_hf = user_secrets.get_secret("HUGGINGFACE_API")
secret_wandb = user_secrets.get_secret("wandb")

# IPython substitutes $secret_hf with the Python variable before running the command.
# NOTE(review): this puts the token on a shell command line — confirm that is
# acceptable in this environment.
!huggingface-cli login --token $secret_hf

# Authenticate the W&B client used by the logging cells further down.
wandb.login(key = secret_wandb)

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /root/.cache/huggingface/token
Login successful


[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [None]:
import os

from copy import deepcopy
from random import randrange
from functools import partial

import torch
import accelerate
import bitsandbytes as bnb

from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)
from transformers.integrations import WandbCallback
from peft import (
    LoraConfig,
    prepare_model_for_kbit_training,
    get_peft_model,
    PeftModel
)
from trl import SFTTrainer



In [None]:
# Local checkpoint directory passed to `from_pretrained` for both tokenizer and model
# (Mistral 7B v0.1 in HF format, going by the path).
model_name = "/kaggle/input/mistral/pytorch/7b-v0.1-hf/1"

In [None]:
# Load the tokenizer stored alongside the checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Reuse EOS as the padding token
# (presumably because this tokenizer defines no pad token of its own — TODO confirm).
tokenizer.pad_token = tokenizer.eos_token

In [None]:
# bitsandbytes settings used when loading the base model: 4-bit NF4 weights with
# double quantization enabled.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    # NOTE(review): the nvidia-smi output in this notebook shows Tesla T4 GPUs and the
    # TrainingArguments cell sets fp16=True — confirm bfloat16 is the intended compute
    # dtype on this hardware rather than float16.
    bnb_4bit_compute_dtype=torch.bfloat16,
)

In [None]:
# Load the base causal LM from the local checkpoint, quantized per `bnb_config`.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",  # Auto selects device to put model on.
)
# Disable `use_cache` on the model config before training
# (presumably because caching is not useful with gradient checkpointing — TODO confirm).
model.config.use_cache = False

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# Prepare the quantized model for training. Gradient checkpointing is requested here
# via `use_gradient_checkpointing=True` instead of a separate
# `model.gradient_checkpointing_enable()` call.
# NOTE: this rebinds `model`, so the cell is meant to be run once per fresh kernel.
model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=True)  # Explicitly specify!

In [None]:
model

MistralForCausalLM(
  (model): MistralModel(
    (embed_tokens): Embedding(32000, 4096)
    (layers): ModuleList(
      (0-31): 32 x MistralDecoderLayer(
        (self_attn): MistralAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): MistralRotaryEmbedding()
        )
        (mlp): MistralMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): MistralRMSNorm()
        (post_attention_layernorm): MistralRMSNorm()
      )

In [None]:
def find_all_linear_names(model, linear_cls=None):
    """Collect the leaf names of all linear layers of a given class in ``model``.

    The result is intended to be passed as ``target_modules`` to ``LoraConfig``.

    Args:
      model: Object exposing ``named_modules()`` that yields
        ``(qualified_name, module)`` pairs.
      linear_cls: Class (or tuple of classes) identifying the layers to collect.
        Defaults to ``bnb.nn.Linear4bit``; pass e.g. ``bnb.nn.Linear8bitLt`` or
        ``torch.nn.Linear`` for 8-bit or non-quantized models.

    Returns:
      Sorted list of unique leaf module names, with ``lm_head`` excluded.
    """
    if linear_cls is None:
        linear_cls = bnb.nn.Linear4bit

    # Only the last path component matters:
    # "model.layers.0.self_attn.q_proj" -> "q_proj".
    lora_module_names = {
        name.split('.')[-1]
        for name, module in model.named_modules()
        if isinstance(module, linear_cls)
    }

    # lm_head is often excluded; it only shows up when the output head is an
    # instance of `linear_cls` (e.g. for 16-bit models).
    lora_module_names.discard('lm_head')

    # Sorted so the result is stable across runs (set order depends on string hashing).
    return sorted(lora_module_names)


# Layer names that are handed to LoraConfig(target_modules=...) in the next cell.
modules = find_all_linear_names(model)
modules

['q_proj', 'o_proj', 'k_proj', 'v_proj', 'gate_proj', 'down_proj', 'up_proj']

In [None]:
# LoRA adapter configuration: rank-8 adapters (alpha 16, dropout 0.1) on every
# module name collected in `modules`, with no bias parameters trained.
peft_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.1,
    target_modules=modules,
    r=8,
    bias="none",
    task_type="CAUSAL_LM"
)

# Wrap the prepared base model with the adapters.
# NOTE: this rebinds `model`; re-running the cell would wrap an already-wrapped model.
model = get_peft_model(model, peft_config)

In [None]:
# Report how many parameters will actually be updated versus the total parameter count.
trainable, total = model.get_nb_trainable_parameters()
print(f"Trainable: {trainable} | total: {total} | Percentage: {trainable/total*100:.4f}%")

Trainable: 20971520 | total: 7262703616 | Percentage: 0.2888%


In [None]:
!nvidia-smi

Wed Nov 22 02:15:29 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.161.03   Driver Version: 470.161.03   CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   48C    P0    27W /  70W |   3274MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Tesla T4            Off  | 00000000:00:05.0 Off |                    0 |
| N/A   50C    P0    27W /  70W |   4072MiB / 15109MiB |      0%      Default |
|       

In [None]:
# Load the "train" split of the LDJnr/Puffin dataset; each row has an "id" and a
# "conversations" list (see the sample printed in the next cell).
dataset = load_dataset("LDJnr/Puffin", split="train")

Downloading readme:   0%|          | 0.00/2.70k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/14.4M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [None]:
# Pick one row at random to inspect the schema.
# NOTE: no seed is set, so a different row is shown on every run; later cells reuse
# `random_sample`.
random_sample = dataset[randrange(len(dataset))]
print(type(random_sample))
print(random_sample.keys())
print(random_sample['id'])
print(random_sample["conversations"])

<class 'dict'>
dict_keys(['id', 'conversations'])
2820
[{'from': 'human', 'value': 'can you explain this in laymans terms?\n\n(B) Schema of the haplo-cord stem cell transplant. The participant received an allogeneic stem cell transplant per institutional standard care. Conditioning regimen was fludarabine 30 mg/m2 daily on days −7 to −3, melphalan 140 mg/m2 × 1 dose (day −2), and total body irradiation at 400 CGy on days −7 to −6. Haploidentical stem cells were infused on day 0, and CCR5 Δ32/Δ32 cord stem cells were infused on day +1. Graft versus host (GVH) disease prophylaxis included: antithymocyte globulin (ATG) 1.5 mg/kg on days −5, −3, and −1; mycophenolate mofetil (MMF) 1 g three times daily on day −2 through day +28; and tacrolimus from day −2 to day 180 post-transplant.'}, {'from': 'gpt', 'value': 'In simpler terms, this passage is describing a procedure called "haplo-cord stem cell transplant" for a patient. This is a type of transplant where stem cells from a partially match

In [None]:
# Show the number of turns in the sampled conversation and its first two turns;
# each turn is a dict with "from" (speaker) and "value" (message).
print(len(random_sample["conversations"]))
print(random_sample["conversations"][0])
print(random_sample["conversations"][1])

2
{'from': 'human', 'value': 'can you explain this in laymans terms?\n\n(B) Schema of the haplo-cord stem cell transplant. The participant received an allogeneic stem cell transplant per institutional standard care. Conditioning regimen was fludarabine 30 mg/m2 daily on days −7 to −3, melphalan 140 mg/m2 × 1 dose (day −2), and total body irradiation at 400 CGy on days −7 to −6. Haploidentical stem cells were infused on day 0, and CCR5 Δ32/Δ32 cord stem cells were infused on day +1. Graft versus host (GVH) disease prophylaxis included: antithymocyte globulin (ATG) 1.5 mg/kg on days −5, −3, and −1; mycophenolate mofetil (MMF) 1 g three times daily on day −2 through day +28; and tacrolimus from day −2 to day 180 post-transplant.'}
{'from': 'gpt', 'value': 'In simpler terms, this passage is describing a procedure called "haplo-cord stem cell transplant" for a patient. This is a type of transplant where stem cells from a partially matched donor (haploidentical) and cord blood stem cells wit

In [None]:
# Conversations can be multi-turn: they do not necessarily end after the first
# GPT-4 reply. Print every turn of the first conversation that has more than two
# turns (nothing is printed if there is none).
# NOTE(review): turns appear to alternate human -> gpt, giving an even count — verify.
first_multi_turn = next(
    (sample for sample in dataset if len(sample["conversations"]) > 2),
    None,
)
if first_multi_turn is not None:
    for turn in first_multi_turn["conversations"]:
        print(turn)

{'from': 'human', 'value': 'How do I center a text element vertically in a row in jetpack compose? The height of the row is determined by a button next to the text element.'}
{'from': 'gpt', 'value': 'In Jetpack Compose, you can center a text element vertically within a row by using the `Arrangement.Center` parameter for the `verticalAlignment` property in the `Row` composable. Here\'s an example:\n```kotlin\nimport androidx.compose.foundation.layout.Arrangement\nimport androidx.compose.foundation.layout.Row\nimport androidx.compose.material.Button\nimport androidx.compose.material.Text\nimport androidx.compose.runtime.Composable\nimport androidx.compose.ui.Alignment\n\n@Composable\nfun CenteredTextWithButton() {\n    Row(\n        verticalAlignment = Alignment.CenterVertically\n    ) {\n        Text("Centered Text")\n        Button(onClick = { /* Do something on button click */ }) {\n            Text("Button")\n        }\n    }\n}\n```\nIn this example, the `Text` element will be cent

In [None]:
# Start a W&B run that only logs a preview of the raw dataset (closed again by the
# `run.finish()` cell below).
# NOTE(review): job_type is "training" although this run only logs dataset rows —
# confirm that label is intended.
run = wandb.init(
    project="Fine tuning mistral 7B",  # Project name.
    name="log_dataset",          # name of the run within this project.
    config={                     # Configuration dictionary.
        "split": "train"
    },
    group="dataset",             # Group runs. This run belongs in "dataset".
    tags=["dataset"],            # Tags. More dynamic, low-level grouping.
    notes="Logging subset of Puffin dataset.",  # Description about the run.
    job_type="training",
)  # Check out the other parameters in the `wandb.init`!

[34m[1mwandb[0m: Currently logged in as: [33malinourian10[0m ([33msut-ee[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [None]:
# Flatten the first conversations into one table row per turn:
# [conversation id, turn index, speaker, message].
# Clamp to the dataset size so a split with fewer than 1000 rows does not raise
# IndexError.
n_logged = min(1000, len(dataset))  # Log (up to) 1000 instances.
data = []
for i in range(n_logged):
    x = dataset[i]
    id_ = x["id"]
    conversations = x["conversations"]
    for idx, response in enumerate(conversations):
        data.append([id_, idx, response["from"], response["value"]])


table = wandb.Table(data=data, columns=["id", "idx", "from", "value"])
run.log({"first1000_Puffin": table})


In [None]:
run.finish()

Below is a conversation between a user and you.


<human>: <value>
<gpt>: <value>
...


Instruction: Write a response appropriate to the conversation.


In [None]:
def format_prompt(sample):
    """Render a multi-turn conversation as a single prompt string.

    Each turn is written as ``<speaker>: message`` on its own line, and the
    turns are framed by a fixed introduction and a closing instruction.

    Args:
      sample: Dataset row with a "conversations" list whose items are dicts
        holding "from" (speaker) and "value" (message).

    Returns:
      sample: the same dictionary, updated in place with the prompt under "text".
    """
    header = "Below is a conversation between a user and you."
    footer = "Instruction: Write a response appropriate to the conversation."

    # One line per turn; every line keeps its trailing newline.
    turns = "".join(
        f"<{turn['from']}>: " + turn["value"] + "\n"
        for turn in sample["conversations"]
    )

    # Header, turns and footer are separated by blank lines.
    sample["text"] = header + "\n\n" + turns + "\n\n" + footer
    return sample

# Preview the formatted prompt for the sampled row.
# NOTE: format_prompt writes the "text" key into `random_sample` itself; the tokenizer
# cell further down reads random_sample["text"] and therefore depends on this cell
# having been run.
format_prompt(random_sample)["text"]

'Below is a conversation between a user and you.\n\n<human>: can you explain this in laymans terms?\n\n(B) Schema of the haplo-cord stem cell transplant. The participant received an allogeneic stem cell transplant per institutional standard care. Conditioning regimen was fludarabine 30 mg/m2 daily on days −7 to −3, melphalan 140 mg/m2 × 1 dose (day −2), and total body irradiation at 400 CGy on days −7 to −6. Haploidentical stem cells were infused on day 0, and CCR5 Δ32/Δ32 cord stem cells were infused on day +1. Graft versus host (GVH) disease prophylaxis included: antithymocyte globulin (ATG) 1.5 mg/kg on days −5, −3, and −1; mycophenolate mofetil (MMF) 1 g three times daily on day −2 through day +28; and tacrolimus from day −2 to day 180 post-transplant.\n<gpt>: In simpler terms, this passage is describing a procedure called "haplo-cord stem cell transplant" for a patient. This is a type of transplant where stem cells from a partially matched donor (haploidentical) and cord blood ste

In [None]:
def get_max_length(model, default=1024):
    """Look up the maximum sequence length declared on ``model.config``.

    The attributes "n_positions", "max_position_embeddings" and "seq_length"
    are checked in that order; the first one with a truthy value wins.

    Args:
      model: Object with a ``config`` attribute.
      default: Value returned when none of the attributes is set (or all are
        falsy, e.g. 0 or None).

    Returns:
      The configured maximum length, or ``default``.
    """
    config = model.config
    for length_setting in ("n_positions", "max_position_embeddings", "seq_length"):
        max_length = getattr(config, length_setting, None)
        if max_length:
            print(f"Found max length: {max_length}")
            return max_length

    print(f"Using default max length: {default}")
    return default


# Change the max length depending on hardware constraints.
# The value found here is whatever the model config declares (32768 in the recorded
# output) and is later used as the tokenizer truncation limit.
max_length = get_max_length(model)
print(max_length)

Found max length: 32768
32768


In [None]:
# Tokenize the formatted sample prompt, truncating at `max_length` tokens.
# Requires random_sample["text"], which is set by the earlier
# `format_prompt(random_sample)` call.
tokenizer(
    random_sample["text"],
    max_length=max_length,
    truncation=True
)

{'input_ids': [1, 20811, 349, 264, 7114, 1444, 264, 2188, 304, 368, 28723, 13, 13, 28789, 18529, 9670, 541, 368, 7282, 456, 297, 4897, 20661, 3471, 28804, 13, 13, 28732, 28760, 28731, 24051, 302, 272, 295, 377, 731, 28733, 19056, 17854, 3601, 1203, 18071, 28723, 415, 28503, 3874, 396, 544, 17344, 294, 17854, 3601, 1203, 18071, 660, 28211, 4787, 1656, 28723, 28237, 288, 983, 21538, 403, 972, 554, 283, 375, 473, 28705, 28770, 28734, 18144, 28748, 28719, 28750, 6790, 356, 2202, 8798, 28787, 298, 8798, 28770, 28725, 8970, 721, 282, 276, 28705, 28740, 28781, 28734, 18144, 28748, 28719, 28750, 15770, 28705, 28740, 20222, 325, 1466, 8798, 28750, 557, 304, 3102, 2187, 4139, 4306, 6752, 438, 28705, 28781, 28734, 28734, 334, 28777, 28724, 356, 2202, 8798, 28787, 298, 8798, 28784, 28723, 382, 377, 731, 1129, 745, 17854, 8894, 654, 4319, 3436, 356, 1370, 28705, 28734, 28725, 304, 334, 5728, 28782, 28705, 29475, 28770, 28750, 28748, 29475, 28770, 28750, 16732, 17854, 8894, 654, 4319, 3436, 356, 137

In [None]:
# https://github.com/databrickslabs/dolly/blob/master/training/trainer.py
def preprocess_dataset(tokenizer: "PreTrainedTokenizerBase", max_length: int, dataset: "Dataset", seed: int = 42) -> "Dataset":
    """Format, tokenize and shuffle a conversation dataset for training.

    Args:
      tokenizer: Tokenizer called on batches of prompt strings.
      max_length: Maximum number of tokens per example; longer prompts are truncated.
      dataset: Dataset whose rows contain a "conversations" field as expected by
        ``format_prompt``.
      seed: Seed for the final shuffle.

    Returns:
      A new, shuffled dataset holding the tokenizer outputs. The "conversations"
      and "text" columns are removed; any other column is kept.
    """
    # Format each prompt (adds the "text" column).
    print("Preprocessing dataset...")
    dataset = dataset.map(format_prompt)

    # https://blog.ovhcloud.com/fine-tuning-llama-2-models-using-a-single-gpu-qlora-and-ai-notebooks/
    def _tokenize_batch(batch):
        # Over-long prompts are cut by the tokenizer itself (truncation=True), so no
        # separate length filter is applied afterwards.
        return tokenizer(
            batch["text"],
            max_length=max_length,
            truncation=True,
        )

    # Tokenize in batches and drop the raw "conversations" and "text" fields.
    dataset = dataset.map(
        _tokenize_batch,
        batched=True,
        remove_columns=["conversations", "text"],
    )

    # Shuffle dataset.
    return dataset.shuffle(seed=seed)

In [None]:
# Keep a formatted-but-untokenized copy for logging. `Dataset.map` returns a new
# dataset and does not modify `dataset`, so no defensive deepcopy is needed.
formatted_dataset = dataset.map(format_prompt)
# NOTE: this rebinds `dataset` to the tokenized version, which no longer has a
# "conversations" column — reload the raw dataset before re-running this cell.
dataset = preprocess_dataset(tokenizer, max_length, dataset)

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

Preprocessing dataset...


Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

In [None]:
# Start a separate W&B run for logging the formatted prompts and the preprocessed
# dataset artifact; it is finished in the next cell.
run = wandb.init(
    project="Fine tuning mistral 7B preprocessing",  # Project name.
    name="log_prep_dataset",     # name of the run within this project.
    config={                     # Configuration dictionary.
        "split": "train"
    },
    group="dataset",             # Group runs. This run belongs in "dataset".
    tags=["dataset"],            # Tags. More dynamic, low-level grouping.
    notes="Logging preprocessed subset of Puffin dataset."  # Description about the run.
)  # Check out the other parameters in the `wandb.init`!


# One table row per conversation: [id, formatted prompt text].
# Clamp to the dataset size so a split with fewer than 1000 rows does not raise
# IndexError.
n_logged = min(1000, len(formatted_dataset))  # Log (up to) 1000 instances.
data = []
for i in range(n_logged):
    x = formatted_dataset[i]
    id_ = x["id"]
    conversation = x["text"]
    data.append([id_, conversation])


table = wandb.Table(data=data, columns=["id", "value"])
run.log({"first1000_prep_Puffin": table})


In [None]:
# Persist the tokenized dataset locally ...
dataset.save_to_disk("Puffin_prep.hf")

# ... and attach that directory to the current W&B run as a "dataset" artifact,
# stored under the name "train" inside the artifact, then close the run.
artifact = wandb.Artifact(name="Puffin_prep", type="dataset")
artifact.add_dir("./Puffin_prep.hf", name="train")
run.log_artifact(artifact)
run.finish()

Saving the dataset (0/1 shards):   0%|          | 0/3000 [00:00<?, ? examples/s]

[34m[1mwandb[0m: Adding directory to artifact (./Puffin_prep.hf)... Done. 0.1s


In [None]:
# Hyperparameters for a short smoke-test run: 20 optimizer steps with batch size 1,
# logging every step, a checkpoint every 5 steps, metrics reported to W&B.
# NOTE(review): fp16=True here while the BitsAndBytesConfig above uses
# bnb_4bit_compute_dtype=torch.bfloat16 — confirm the mixed dtypes are intended.
training_args = TrainingArguments(
    output_dir="./outputs",
    per_device_train_batch_size=1,  # Best practice: https://huggingface.co/docs/transformers/main/main_classes/quantization#tips-and-best-practices
    gradient_accumulation_steps=1,  # Powers of 2.
    learning_rate=2e-4,
    max_grad_norm=1.0,
    max_steps=20,
    lr_scheduler_type="linear",
    warmup_steps=5,
    fp16=True,
    logging_strategy="steps",
    logging_steps=1,
    save_strategy="steps",
    save_steps=5,
    optim="paged_adamw_8bit",
    report_to="wandb"
)

# training_arguments = TrainingArguments(
#     weight_decay=0.001,
#     bf16=False,
#     warmup_ratio=0.03,
#     group_by_length=True,
# )

In [None]:
# Start the W&B run that receives the training metrics (TrainingArguments sets
# report_to="wandb"); it is finished right after `trainer.train()`.
run = wandb.init(
    project="Fine tuning mistral 7B Main",
    name="train_run0",  # Sometimes I use the run name as short descriptor for the run.
    config={
        "split": "train",
        # Optionally, you can add all hyperparameters and configs here for better reproducibility!
    },
    group="train",
    tags=["train", "AdamW"],  # Add tags for what might characterize this run.
    notes="Initial finetuning."
)

In [None]:
# Plain Hugging Face Trainer over the pre-tokenized dataset. The collator is built
# with mlm=False, i.e. without masked-language-model masking.
# NOTE(review): pad_token was set to eos_token above; check how the collator treats
# pad positions when building labels, since EOS tokens may be affected — TODO confirm.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
    train_dataset=dataset,
    # Leftover options, disabled (presumably from an SFTTrainer experiment — verify):
#     dataset_text_field=dataset["text"]
#     packing=True
)

In [None]:
# Run the training loop, keep its return value in `results`, then close the W&B run.
results = trainer.train()  # Now we just run train()!
run.finish()

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
1,1.3266
2,1.0218
3,1.9415
4,1.0639
5,1.066
6,0.7358
7,0.7766
8,1.2065
9,1.1641
10,0.5098


VBox(children=(Label(value='0.001 MB of 0.026 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=0.050795…

0,1
train/epoch,▁▁▁▁▁▁▁▁▁▁▁▁▁▁███████
train/global_step,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇███
train/learning_rate,▂▄▅▇██▇▇▆▆▅▅▄▄▃▃▂▂▁▁
train/loss,▅▄█▄▄▃▃▅▅▂▃▅▃▂▃▂▁▂▃▃
train/total_flos,▁
train/train_loss,▁
train/train_runtime,▁
train/train_samples_per_second,▁
train/train_steps_per_second,▁

0,1
train/epoch,0.01
train/global_step,20.0
train/learning_rate,0.0
train/loss,0.8933
train/total_flos,1004233574375424.0
train/train_loss,0.90044
train/train_runtime,165.4855
train/train_samples_per_second,0.121
train/train_steps_per_second,0.121


Inference Using Mistral 7B

In [None]:
# Reload the tokenizer and a fresh 4-bit base model for inference, with the same
# checkpoint path and quantization settings as in the training section.
model_name = "/kaggle/input/mistral/pytorch/7b-v0.1-hf/1"


tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token


bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)


# Original note: "You can just use model." — presumably meaning the `model` object
# from the training section can be used directly instead of loading this second
# copy of the base model (verify).
inf_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
)


In [None]:
# Open a short-lived W&B run just to fetch a logged model artifact, then close it.
# `artifact_dir` holds the value returned by `artifact.download()`.
# NOTE(review): the artifact path points at 'vincenttu/finetuning_mistral7b', which
# differs from the project name used for this run — replace it with the artifact
# produced by your own training run.
run = wandb.init(project="Fine tuning mistral 7B Main")  # MAKE SURE TO PASS IN YOUR PROJECT NAME!
artifact = run.use_artifact('vincenttu/finetuning_mistral7b/model-t6rw0dav:v0', type='model')
artifact_dir = artifact.download()
run.finish()

In [None]:
# Attach the downloaded adapter weights to the freshly loaded base model.
# Use the directory returned by `artifact.download()` rather than a hard-coded
# "/content/..." path, which only exists on Colab and not in this Kaggle environment.
model = PeftModel.from_pretrained(inf_model, artifact_dir)

In [None]:
# Generate up to 100 new tokens for a single prompt and print the decoded text.
# NOTE(review): the prompt is passed as a raw question, not wrapped in the
# `format_prompt` template used for the training data — confirm this is intended.
prompt = "What is a neural network??"


# Move the tokenized inputs to the GPU when one is available.
device = "cuda" if torch.cuda.is_available() else "cpu"
model_input = tokenizer(prompt, return_tensors="pt").to(device)


# Switch to eval mode (assigning to `_` keeps the model repr out of the cell output)
# and generate without tracking gradients.
_ = model.eval()
with torch.no_grad():
    out = model.generate(**model_input, max_new_tokens=100)


print(tokenizer.decode(out[0], skip_special_tokens=True))