In [1]:
import time
import pandas as pd
import torch
import torch.nn as nn
from datasets import load_dataset
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, Trainer, TrainingArguments
from huggingface_hub import snapshot_download
from peft import LoraConfig, TaskType, get_peft_model
from pprint import pprint



In [2]:
device = "mps"
model_name = "google/flan-t5-base"
dataset_name = "mbpp"

def load_from_hf():
    dataset = load_dataset(dataset_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name, torch_dtype=torch.float32)
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    return model, dataset, tokenizer

t5_model, dataset, tokenizer = load_from_hf()
t5_model.to(device)



T5ForConditionalGeneration(
  (shared): Embedding(32128, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseGatedActDense(
              (wi_0): Linear(in_features=768, out_features=2048, bias=False)
              (wi_1): Linear(in_features=768, out_features=2048, bias=False)
              (wo):

In [3]:
def print_number_of_trainable_model_parameters(model):
    trainable_model_params = 0
    all_model_params = 0
    for _, param in model.named_parameters():
        all_model_params += param.numel()
        if param.requires_grad:
            trainable_model_params += param.numel()
    return f"trainable model parameters:\t{trainable_model_params:,}\nall model parameters:\t\t{all_model_params:,}\n% of trainable model parameters: {100 * trainable_model_params / all_model_params:.2f}%"


print(print_number_of_trainable_model_parameters(t5_model))


trainable model parameters:	247,577,856
all model parameters:		247,577,856
% of trainable model parameters: 100.00%


In [4]:
df = pd.DataFrame(dataset['train'])
df[['text', 'code']].head(5)



Unnamed: 0,text,code
0,Write a function to find the longest chain whi...,"class Pair(object): \r\n\tdef __init__(self, a..."
1,Write a python function to find the first repe...,"def first_repeated_char(str1):\r\n for index,..."
2,Write a function to get a lucid number smaller...,def get_ludic(n):\r\n\tludics = []\r\n\tfor i ...
3,Write a function to reverse words in a given s...,def reverse_words(s):\r\n return ' '.jo...
4,Write a function to check if the given integer...,def prime_num(num):\r\n if num >=1:\r\n for...


In [4]:
def format_instruction(text, code):
	return f"""### Instruction:
Use the Task below and the Input given to write the Response, which is a programming code that can solve the following Task:

### Task:
{text}

### Response:
{code}
"""



In [6]:
index = 217
instruct = dataset['train'][index]['text']
code = dataset['train'][index]['code']
prompt = format_instruction(dataset['train']['text'][index], dataset['train']['code'][index])
inputs = tokenizer(prompt, return_tensors="pt").to(device)
generated = tokenizer.decode(
    t5_model.generate(
        inputs["input_ids"],
        max_new_tokens=300,
    )[0],
    skip_special_tokens=True,
)

dash_line = '-'.join('' for x in range(100))
print(dash_line)
print(f'INPUT INSTRUCTION:\n{instruct}')
print(dash_line)
print(f'BASELINE CODE:\n{code}\n')
print(dash_line)
print(f'MODEL CODE GENERATION - ZERO SHOT:\n{generated}')



  if unfinished_sequences.max() == 0:


---------------------------------------------------------------------------------------------------
INPUT INSTRUCTION:
Write a python function to count lower case letters in a given string.
---------------------------------------------------------------------------------------------------
BASELINE CODE:
def lower_ctr(str):
      lower_ctr= 0
      for i in range(len(str)):
          if str[i] >= 'a' and str[i] <= 'z': lower_ctr += 1     
      return  lower_ctr

---------------------------------------------------------------------------------------------------
MODEL CODE GENERATION - ZERO SHOT:
if 'a' in str[i]: lower_ctr += 1 else: lower_ctr = 0 for i in range(len(str)): if str[i] >= 'a' and str[i] = 'z': lower_ctr += 1 return lower_ctr


In [5]:
def tokenize_function(example):
    prompt = [format_instruction(t, c) for t, c in zip(example['text'], example['code'])]
    example["input_ids"] = tokenizer(
        prompt, padding="max_length", truncation=True, return_tensors="pt"
    ).input_ids
    return example



In [6]:
tokenized_datasets = dataset.map(tokenize_function, batched=True)
tokenized_datasets = tokenized_datasets.remove_columns(
    [
        "task_id",
        "text",
        "code",
        "test_list",
        "test_setup_code",
        "challenge_test_list"
    ]
)

Map:   0%|          | 0/90 [00:00<?, ? examples/s]

In [7]:
lora_config = LoraConfig(
    r=32,  # Rank
    lora_alpha=32,
    target_modules=["q", "v"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM,  # FLAN-T5
)

In [8]:
output_dir = f"./dialogue-summary-training-{str(int(time.time()))}"

peft_model = get_peft_model(t5_model, lora_config)
print(print_number_of_trainable_model_parameters(peft_model))

trainable model parameters:	3,538,944
all model parameters:		251,116,800
% of trainable model parameters: 1.41%


In [12]:
training_args = TrainingArguments(
    # evaluation_strategy="epoch",
    # save_strategy="epoch",
    # save_total_limit=3,
    # load_best_model_at_end=True,
    # metric_for_best_model="loss",
    # optim="adamw_torch",
    # gradient_checkpointing=True,
    # torch_compile=True, # Test it

    output_dir=output_dir,
    # auto_find_batch_size=True,
    learning_rate=1e-3,  # Wilton
    num_train_epochs=10,
    weight_decay=0.01,
    logging_steps=1,
    max_steps=1,
    # report_to="tensorboard",
    fp16=False,
    bf16=False,
)

trainer = Trainer(
    model=peft_model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
)


trainer.train()

# save model in local
peft_model_path = "./t5-summary-checkpoint-base"
trainer.model.save_pretrained(peft_model_path)
tokenizer.save_pretrained(peft_model_path)

  0%|          | 0/1 [00:00<?, ?it/s]

RuntimeError: MPS backend out of memory (MPS allocated: 17.86 GB, other allocations: 230.71 MB, max allowed: 18.13 GB). Tried to allocate 96.00 MB on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).