In [144]:
!pip install loguru transformers wandb transformers[torch] accelerate datasets



In [145]:
#!/usr/bin/env python3
# Fine tune CodeT5 model on the FStar everest dataset.
from __future__ import absolute_import, division, print_function
import datetime
from typing import *
from loguru import logger
import multiprocessing
from tqdm import tqdm
import sys
from torch.utils.data.distributed import DistributedSampler
from torch.utils.data import DataLoader, Dataset, SequentialSampler, RandomSampler
import torch
import numpy as np
import json
import random
import os
import argparse
from transformers.trainer_utils import EvalPrediction
from transformers import (
    AdamW, get_linear_schedule_with_warmup,
    BertConfig, BertForMaskedLM, BertTokenizer,
    GPT2Config, GPT2LMHeadModel, GPT2Tokenizer,
    OpenAIGPTConfig, OpenAIGPTLMHeadModel, OpenAIGPTTokenizer,
    RobertaConfig, RobertaModel, RobertaTokenizer,
    DistilBertConfig, DistilBertForMaskedLM, DistilBertTokenizer,
)
import wandb
import pandas as pd
from transformers import DataCollatorForLanguageModeling
from transformers import AutoModelForSequenceClassification
from transformers import Seq2SeqTrainer,AutoTokenizer, T5ForConditionalGeneration,EarlyStoppingCallback, Seq2SeqTrainingArguments, AdamW, ProgressCallback

# tokenizer = T5Tokenizer.from_pretrained('t5-small')
# model = T5ForConditionalGeneration.from_pretrained('t5-small')

# https://huggingface.co/transformers/v3.0.2/model_doc/t5.html#t5forconditionalgeneration
# >>> from transformers import T5Tokenizer, T5ForConditionalGeneration
# 
# >>> tokenizer = T5Tokenizer.from_pretrained('t5-small')
# >>> model = T5ForConditionalGeneration.from_pretrained('t5-small')
# >>> input_ids = tokenizer.encode("Hello, my dog is cute", return_tensors="pt")  # Batch size 1
# >>> outputs = model(input_ids=input_ids, decoder_input_ids=input_ids, labels=input_ids)
# >>> loss, prediction_scores = outputs[:2]
# 
# >>> tokenizer = T5Tokenizer.from_pretrained('t5-small')
# >>> model = T5ForConditionalGeneration.from_pretrained('t5-small')
# >>> input_ids = tokenizer.encode("summarize: Hello, my dog is cute", return_tensors="pt")  # Batch size 1
# >>> outputs = model.generate(input_ids)
#

### Tutorial: https://huggingface.co/docs/transformers/main/tasks/masked_language_modeling

In [146]:
model_name='Salesforce/codet5-small'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)
max_model_length = tokenizer.model_max_length #for CodeT5 it is 512
# print(model)

In [147]:
eos_token = tokenizer.eos_token
logger.info(f"model EOS token: {eos_token}")
logger.info(f"special tokens: {tokenizer.all_special_tokens}")
# #updating the tokenizer's vocalblary file with End of Statement <EOS> Special Token:
# # print("Tokenizer's original size:  ",len(tokenizer))
# special_tokens_dict = {'eos_token': '<EOS>'}
# num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)
# # print('\n We have added', num_added_toks, 'token')
# model.resize_token_embeddings(len(tokenizer))
# # print(tokenizer.all_special_tokens)


[32m2023-08-04 22:20:04.611[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m2[0m - [1mmodel EOS token: </s>[0m
[32m2023-08-04 22:20:04.612[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m3[0m - [1mspecial tokens: ['<s>', '</s>', '<unk>', '<pad>', '<mask>', '<extra_id_99>', '<extra_id_98>', '<extra_id_97>', '<extra_id_96>', '<extra_id_95>', '<extra_id_94>', '<extra_id_93>', '<extra_id_92>', '<extra_id_91>', '<extra_id_90>', '<extra_id_89>', '<extra_id_88>', '<extra_id_87>', '<extra_id_86>', '<extra_id_85>', '<extra_id_84>', '<extra_id_83>', '<extra_id_82>', '<extra_id_81>', '<extra_id_80>', '<extra_id_79>', '<extra_id_78>', '<extra_id_77>', '<extra_id_76>', '<extra_id_75>', '<extra_id_74>', '<extra_id_73>', '<extra_id_72>', '<extra_id_71>', '<extra_id_70>', '<extra_id_69>', '<extra_id_68>', '<extra_id_67>', '<extra_id_66>', '<extra_id_65>', '<extra_id_64>', '<extra_id_63>', '<extra_id_62>', '<extra_id_61>', '<extra_id_60>', '<extra_id_59>', '<e

In [148]:
wandb.login()



True

In [149]:
def get_label_ids_sakina(target):
    """
    Why is it correct for the model to produce <EOS> at the end if we have too large a sentence?
    Siddharth does not believe this implementation.
    """
    max_length=tokenizer.model_max_length
    # to train model on End of statement token. Even When model truncates longer code, EOS remain to show model the end of the statement
    # Tokenize the target text without padding to get the tokens
    encoded_tokens = tokenizer.tokenize(target)
    # Check if the total number of tokens is greater than max_length
    if len(encoded_tokens) > max_length:
        # If yes, truncate the tokens while preserving the "<EOS>" at the end
        truncated_tokens = encoded_tokens[:max_length - 1] + [encoded_tokens[-1]]
        # Convert the truncated tokens back to input_ids
        input_ids = tokenizer.convert_tokens_to_ids(truncated_tokens)
    else:
        # If no truncation needed, keep the original tokens with padding
        input_ids = tokenizer(target, max_length=max_length, padding="max_length", truncation=True).input_ids
    # print(input_ids)
    return input_ids


def get_label_ids(target):
    """
    get  {'input_ids': [1, 11351, 4653, 29025, 719, 2616, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}
    for input "foo bar baz quux larp"
    """
    input_ids = tokenizer(target, max_length=max_length, padding="max_length", truncation=True).input_ids
    return input_ids

if True: # testing
    eos_encoded = tokenizer(tokenizer.eos_token)
    logger.debug(f'eos token: {tokenizer.eos_token} | encoded {eos_encoded}')
    logger.debug(f'tokenizer tokenize: {tokenizer.tokenize("foo bar baz quuxlajdasdsadlka")}')
    logger.debug(f'tokenizer funcall: {tokenizer("foo bar baz quux")}')
    logger.debug(f'convert_tokens_to_ids: {tokenizer.convert_tokens_to_ids(tokenizer.tokenize("foo bar baz quux larp"))}')
# tokenize: string -> List[token=str]
# convert_tokens_to_ids: List[token=str] -> List[int]
# tokenizer(...) = convert_tokns_to_ids . tokenize + attention mask.
# 
if True: # testing
    eos_encoded = tokenizer(tokenizer.eos_token)
    logger.debug(f'eos token: {tokenizer.eos_token} | encoded {eos_encoded}')
    logger.debug(f'tokenizer tokenize: {tokenizer.tokenize("foo bar baz quuxlajdasdsadlka")}')
    logger.debug(f'tokenizer funcall: {tokenizer("foo bar baz quux")}')
    logger.debug(f'convert_tokens_to_ids: {tokenizer.convert_tokens_to_ids(tokenizer.tokenize("foo bar baz quux larp"))}')

[32m2023-08-04 22:20:07.399[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36m<module>[0m:[36m33[0m - [34m[1meos token: </s> | encoded {'input_ids': [1, 2, 2], 'attention_mask': [1, 1, 1]}[0m
[32m2023-08-04 22:20:07.401[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36m<module>[0m:[36m34[0m - [34m[1mtokenizer tokenize: ['foo', 'Ġbar', 'Ġbaz', 'Ġqu', 'ux', 'l', 'aj', 'das', 'ds', 'ad', 'l', 'ka'][0m
[32m2023-08-04 22:20:07.402[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36m<module>[0m:[36m35[0m - [34m[1mtokenizer funcall: {'input_ids': [1, 11351, 4653, 29025, 719, 2616, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}[0m
[32m2023-08-04 22:20:07.402[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36m<module>[0m:[36m36[0m - [34m[1mconvert_tokens_to_ids: [11351, 4653, 29025, 719, 2616, 5604, 84][0m
[32m2023-08-04 22:20:07.403[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36m<module>[0m:[36m43[0m - [34m[1meos token: </s> | encoded {'input_ids'

In [150]:
#maping the dataset into batches
Experiment = True # for the first experimental run to get the pipeline going
if Experiment and not os.path.exists("input.txt"):
    # Download Shakespeare
    %time
    !wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
with open('input.txt', 'r', encoding='utf-8') as f:
    text = f.read()
    

In [151]:
import datasets
import tqdm
import json
import pathlib

defs = []

files = set()
for jpath in tqdm.tqdm(glob.glob("dataset/*.json")):
    j = json.loads(open(jpath, "r").read())
    for jdefn in j["defs"]:
        filepath = pathlib.Path(jdefn["file_name"]).name 
        files.add(filepath)
        data = open(f"./raw_dataset/{filepath}").readlines()
        start_line = int(jdefn["start_line"])
        end_line = int(jdefn["end_line"])
        if start_line == 0: continue # start line is zero.
        data = "\n".join(data[start_line-1:end_line-1])
        if data:
            defs.append({"input": data + tokenizer.eos_token})
        
print(defs[:3])
print(f"#defs: {len(defs)}")
files = sorted(list(files))
print(f"grabbed output from #files: {len(files)}")

100%|████████████████████████████████████████████████████████████████████████████████████████████| 2699/2699 [00:32<00:00, 83.56it/s]

[{'input': 'val finv: res:felem -> a:felem -> Stack unit\n\n  (requires fun h ->\n\n    live h a /\\ live h res /\\ eq_or_disjoint a res /\\\n\n    as_nat h a < S.prime)\n\n  (ensures fun h0 _ h1 -> modifies (loc res) h0 h1 /\\\n\n    as_nat h1 res < S.prime /\\\n</s>'}, {'input': 'val fsqrt: res:felem -> a:felem -> Stack unit\n\n  (requires fun h ->\n\n    live h a /\\ live h res /\\ eq_or_disjoint a res /\\\n\n    as_nat h a < S.prime)\n\n  (ensures fun h0 _ h1 -> modifies (loc res) h0 h1 /\\\n\n    as_nat h1 res < S.prime /\\\n</s>'}, {'input': "type t19' =\n\n  | X_a of (squash False)\n</s>"}]
#defs: 49728
grabbed output from #files: 2647





In [152]:
# https://github.com/huggingface/notebooks/blob/main/examples/language_modeling-tf.ipynb
### Tutorial: https://huggingface.co/docs/transformers/main/tasks/masked_language_modeling
def build_huggingface_dataset_from_list_of_defs(defs: List[Dict[str, Any]]) -> datasets.Dataset:
    dataset = datasets.Dataset.from_list(defs)
    dataset = dataset.map(lambda egs : tokenizer(egs["input"]), batched=True, num_proc=4)
    block_size = 128
    def group_texts(examples):
        # Concatenate all texts.
        # concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
        concatenated_examples = examples
        total_length = len(concatenated_examples[list(examples.keys())[0]])
        # We drop the small remainder, though you could add padding instead if the model supports it
        # In this, as in all things, we advise you to follow your heart
        total_length = (total_length // block_size) * block_size
        # Split by chunks of max_len.
        result = {
            k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
            for k, t in concatenated_examples.items()
        }
        result["labels"] = result["input_ids"].copy()
        return result
    dataset = dataset.map(group_texts, batched=True, batch_size=1000, num_proc=4)
    return dataset 

defs = defs[:1000]
train = build_huggingface_dataset_from_list_of_defs(defs[:int(len(defs)*0.8)])
valid = build_huggingface_dataset_from_list_of_defs(lines[int(len(defs)*0.8):int(len(defs)*0.9)])
test = build_huggingface_dataset_from_list_of_defs(lines[int(len(defs)*0.9):])
# logger.info(f"len train: {len(train)} | test: {len(test)} | valid: {len(valid)}")
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

Map (num_proc=4):   0%|          | 0/800 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (985 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (667 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (874 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (559 > 512). Running this sequence through the model will result in indexing errors


Map (num_proc=4):   0%|          | 0/800 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/100 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/100 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/4100 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/4100 [00:00<?, ? examples/s]

In [153]:
def debug():
    train_eg = next(iter(train))
    print(f"train_eg: {train_eg.keys()}")
    train_eg_input = train_eg['input']
    print(f"train_eg input: {type(train_eg_input)} | len : {len(train_eg_input)}")
    print(f"input[0]: {train_eg_input[0][:90]}")
debug()

train_eg: dict_keys(['input', 'input_ids', 'attention_mask', 'labels'])
train_eg input: <class 'list'> | len : 128
input[0]: val finv: res:felem -> a:felem -> Stack unit

  (requires fun h ->

    live h a /\ live h


In [156]:
from transformers import TrainingArguments
training_args = TrainingArguments(
    output_dir="output_dir",
    learning_rate=2e-5, # should I use a much smaller learning rate?
    num_train_epochs=10,
    weight_decay=0.01,
    logging_steps=1,
    evaluation_strategy='steps',
    eval_steps=10
)
training_args = training_args.set_dataloader(train_batch_size=512, eval_batch_size=512)

In [157]:
import time
from transformers import Trainer
run = wandb.init(
    # Set the project where this run will be logged
    project="codet5-finetune-fstar",
    # Track hyperparameters and run metadata
    config={})

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=data_collator,
)
start_time = time.time()
trainer.train()

elapsed_time_secs = time.time() - start_time

VBox(children=(Label(value='0.003 MB of 0.003 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…



0,1
eval/loss,▆▁█
eval/runtime,▁▃█
eval/samples_per_second,█▆▁
eval/steps_per_second,█▇▁
train/epoch,▁▅██
train/global_step,▁▅██
train/total_flos,▁
train/train_loss,▁
train/train_runtime,▁
train/train_samples_per_second,▁

0,1
eval/loss,28.0529
eval/runtime,3.0366
eval/samples_per_second,164.656
eval/steps_per_second,0.329
train/epoch,3.0
train/global_step,24.0
train/total_flos,78354444189696.0
train/train_loss,27.39769
train/train_runtime,388.8744
train/train_samples_per_second,30.858


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016670058597810566, max=1.0…

Trainer is attempting to log a value of "{'summarization': {'early_stopping': True, 'length_penalty': 2.0, 'max_length': 200, 'min_length': 30, 'no_repeat_ngram_size': 3, 'num_beams': 4, 'prefix': 'summarize: '}, 'translation_en_to_de': {'early_stopping': True, 'max_length': 300, 'num_beams': 4, 'prefix': 'translate English to German: '}, 'translation_en_to_fr': {'early_stopping': True, 'max_length': 300, 'num_beams': 4, 'prefix': 'translate English to French: '}, 'translation_en_to_ro': {'early_stopping': True, 'max_length': 300, 'num_beams': 4, 'prefix': 'translate English to Romanian: '}}" for key "task_specific_params" as a parameter. MLflow's log_param() only accepts values no longer than 250 characters so we dropped this attribute. You can use `MLFLOW_FLATTEN_PARAMS` environment variable to flatten the parameters and avoid this message.
You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method 

Step,Training Loss,Validation Loss
10,26.9962,27.893749
20,26.8595,27.339882
30,26.0499,26.000065
40,25.3525,25.531055
50,24.8259,25.31781
60,24.5607,24.696352
70,24.2704,24.83893
80,23.8981,24.354298


In [158]:
import datetime
calc_time = "Total Training Time: '%s' " % datetime.timedelta(elapsed_time_secs)
print(calc_time)   

Total Training Time: '1385 days, 13:45:38.095093' 


In [159]:
model.save_pretrained("./output_dir")

In [177]:
from transformers import T5ForConditionalGeneration

model_gen = T5ForConditionalGeneration.from_pretrained("./output_dir")
# model_gen = T5ForConditionalGeneration.from_pretrained("Salesforce/codet5-small")
outs_ids = model_gen.generate(tokenizer("live h a /\ ", return_tensors="pt").input_ids)
out_str = tokenizer.decode(outs_ids[0], skip_special_tokens=True)
print(out_str)

h a / \
