In [None]:
# Commented out when not using Colab
#!pip install datasets bitsandbytes accelerate flash_attn

Collecting flash_attn
  Downloading flash_attn-2.5.9.post1.tar.gz (2.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.6/2.6 MB[0m [31m23.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting einops (from flash_attn)
  Downloading einops-0.8.0-py3-none-any.whl (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.2/43.2 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: flash_attn
  Building wheel for flash_attn (setup.py) ... [?25l[?25hdone
  Created wheel for flash_attn: filename=flash_attn-2.5.9.post1-cp310-cp310-linux_x86_64.whl size=120889689 sha256=5022ba11d48bf74926da9c16260f4ea2b9bb7f4e29bdb4bd6e1383ad1c55d16f
  Stored in directory: /root/.cache/pip/wheels/cc/ad/f6/7ccf0238790d6346e9fe622923a76ec218e890d356b9a2754a
Successfully built flash_attn
Installing collected packages: einops, flash_attn
Successfully installed einops-0.8.0 flash_attn-2.5.9.post

In [None]:
import os

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, DataCollatorWithPadding, get_scheduler, BitsAndBytesConfig
from datasets import load_dataset
from torch.utils.data import DataLoader
from torch.optim.adamw import AdamW
from tqdm import tqdm

In [None]:
# Training data: the "train" split of the PlantUML generation dataset.
# Append `.select(range(16))` to the call below for a quick smoke run.
raw_datasets = load_dataset("coai/plantuml_generation", "default", split="train")

# The tokenizer gets an explicit "[PAD]" token registered as its pad token,
# which the padding collator below relies on.
tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-1_5")
tokenizer.add_special_tokens({"pad_token": "[PAD]"})

# Pads each batch dynamically when it is collated.
data_collator = DataCollatorWithPadding(tokenizer)


def tokenize_function(example):
    """Tokenize the ``"text"`` field of a batch of dataset rows.

    Args:
        example: Mapping holding the raw strings under the key ``"text"``
            (called with ``batched=True``, so presumably a list of strings).

    Returns:
        The tokenizer's output for those strings, with truncation enabled.
        No explicit ``max_length`` is passed.
    """
    texts = example["text"]
    return tokenizer(texts, truncation=True)


tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
# Drop the raw string column so only the tokenizer's outputs remain, then
# have the dataset hand out torch tensors.
tokenized_datasets = tokenized_datasets.remove_columns(["text"])
tokenized_datasets.set_format(type="torch")

# One example per step, reshuffled every epoch; the collator builds each batch.
train_dataloader = DataLoader(
    tokenized_datasets,
    batch_size=1,
    shuffle=True,
    collate_fn=data_collator,
)

In [None]:
# Quantization settings: 4-bit NF4 weights with double quantization,
# using bfloat16 as the compute dtype.
compute_dtype = torch.bfloat16
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=compute_dtype,
)

# Load phi-1.5 with the quantization config above. The device map sends the
# whole model to device index 0, so a GPU is presumably required here.
model = AutoModelForCausalLM.from_pretrained(
    "microsoft/phi-1_5",
    quantization_config=bnb_config,
    device_map={"": 0},
    torch_dtype="auto",
)

optimizer = AdamW(model.parameters(), lr=3e-5)

# Device used for moving input batches. The model itself is placed by
# `device_map` above, so no explicit `model.to(device)` call is made.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Fine-tuning loop: one pass over `train_dataloader` with a linearly decaying
# learning rate and no warmup.
num_epochs = 1
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

progress_bar = tqdm(range(num_training_steps))

# NOTE(review): the model is created in another cell with a 4-bit quantization
# config. Quantized weights presumably stay frozen, so only the remaining
# floating-point parameters would be updated here -- confirm, and consider
# LoRA-style adapters if full-layer adaptation is intended.
model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}

        # The padding collator yields no labels, and without labels the model
        # returns no loss. For causal-LM training the targets are the input
        # ids themselves (the model shifts them internally); padded positions
        # are set to -100 so the loss ignores them.
        if "labels" not in batch:
            labels = batch["input_ids"].clone()
            if "attention_mask" in batch:
                labels[batch["attention_mask"] == 0] = -100
            batch["labels"] = labels

        outputs = model(**batch)
        loss = outputs.loss
        # Without this backward pass optimizer.step() has no gradients to
        # apply and the weights never change.
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)
        progress_bar.set_postfix(loss=loss.item())

100%|██████████| 1940/1940 [36:15<00:00,  1.17s/it]

In [None]:
# Save the fine-tuned model locally and upload it to the Hugging Face Hub.
save_directory = "./finetuned_phi_15_plantuml_generation"

# Never paste an access token into the notebook: read it from the environment.
# An unset/empty variable becomes None, which presumably lets the library fall
# back to its own stored credentials (e.g. a prior login) -- confirm.
hf_token = os.environ.get("HF_TOKEN") or None

model.save_pretrained(save_directory, push_to_hub=True, token=hf_token)
# The tokenizer was extended with a "[PAD]" token, so store it alongside the
# model to keep the saved checkpoint self-contained.
tokenizer.save_pretrained(save_directory, push_to_hub=True, token=hf_token)

model.safetensors:   0%|          | 0.00/1.04G [00:00<?, ?B/s]

In [None]:
# Smoke test: generate a continuation for a short prompt.
# The training loop leaves the model in train mode; switch to eval mode so
# inference-time behaviour (e.g. dropout disabled) is used.
model.eval()

# Put the prompt tensors on the same device as the model instead of leaving
# them on the CPU.
inputs = tokenizer("Generate a plantuml diagram", return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=200)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

Generate a plantuml diagram for the given number of nodes.
    """
    def generate_node(node: int, parent: int, depth: int) -> str:
        """
        Generate a node for the given node, parent, and depth.
        """
        if depth == 0:
            return f"{node}:{parent}"
        else:
            return f"{node}:{parent}:{generate_node(node, parent, depth-1)}"

    def generate_edges(node: int, parent: int, depth: int) -> List[str]:
        """
        Generate a list of edges for the given node, parent, and depth.
        """
        if depth == 0:
            return [f"{node}:{parent}"]
        else:
            return [f"{node}:{parent}:{generate_edge(node, parent, depth-1
