In [1]:
import json
from datasets import load_dataset, Dataset, DatasetDict
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments, DataCollatorForSeq2Seq

# Load the tokenizer and model
tokenizer = T5Tokenizer.from_pretrained('t5-small')
model = T5ForConditionalGeneration.from_pretrained('t5-small')

# Load data from JSON file
def load_data(file_path):
    with open(file_path, 'r') as f:
        data = json.load(f)
    return data

# Load your data
data = load_data('vba_snippets.json')

# Split data into train, validation, and test sets
train_data = data[:int(0.8*len(data))]
val_data = data[int(0.8*len(data)):int(0.9*len(data))]
test_data = data[int(0.9*len(data)):]

# Convert to Hugging Face Dataset format
def convert_to_dataset(data):
    descriptions = [item['description'] for item in data]
    codes = [item['code'] for item in data]
    return Dataset.from_dict({'description': descriptions, 'code': codes})

train_dataset = convert_to_dataset(train_data)
val_dataset = convert_to_dataset(val_data)
test_dataset = convert_to_dataset(test_data)

# Tokenization
def preprocess_function(examples):
    inputs = ["document: " + code for code in examples['code']]
    model_inputs = tokenizer(inputs, max_length=512, truncation=True)

    # Set up the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples['description'], max_length=128, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_train_dataset = train_dataset.map(preprocess_function, batched=True)
tokenized_val_dataset = val_dataset.map(preprocess_function, batched=True)

# Data Collator
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Training arguments
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer
)

# Train the model
trainer.train()

# Save the model
model.save_pretrained('./model')
tokenizer.save_pretrained('./model')

# Evaluation on the test set
tokenized_test_dataset = test_dataset.map(preprocess_function, batched=True)
metrics = trainer.evaluate(tokenized_test_dataset)
print(metrics)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Map:   0%|          | 0/91 [00:00<?, ? examples/s]



Map:   0%|          | 0/11 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss
1,No log,4.155678
2,No log,3.984515
3,No log,3.94535


Map:   0%|          | 0/12 [00:00<?, ? examples/s]



{'eval_loss': 3.8875091075897217, 'eval_runtime': 2.9971, 'eval_samples_per_second': 4.004, 'eval_steps_per_second': 1.001, 'epoch': 3.0}


In [2]:
# !pip install transformers[torch]

In [4]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Load the trained model and tokenizer
model = T5ForConditionalGeneration.from_pretrained('./model')
tokenizer = T5Tokenizer.from_pretrained('./model')
def generate_description(code_snippet):
    # Clean up the code snippet
    code_snippet = code_snippet.strip()

    # Prepare the input text
    input_text = "document: " + code_snippet

    # Tokenize the input text
    inputs = tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True)

    # Generate the output (description)
    outputs = model.generate(inputs['input_ids'], max_length=128, num_beams=4, early_stopping=True)

    # Decode the generated output
    description = tokenizer.decode(outputs[0], skip_special_tokens=True)

    return description


# Test the function with a new snippet
new_snippet = """
Sub AddSerialNumbers()
    Dim i As Integer
    On Error GoTo Last
    i = InputBox("Enter Value", "Enter Serial Numbers")
    For i = 1 To i
        ActiveCell.Value = i
        ActiveCell.Offset(1, 0).Activate
    Next i
Last:
    Exit Sub
End Sub
"""

generated_description = generate_description(new_snippet)
print("Generated Description:", generated_description)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Generated Description: Sub AddSerialNumbers() Dim i As Integer On Error GoTo Last i = InputBox("Enter Value", "Enter Serial Numbers") For i = 1 To i ActiveCell.Value = i ActiveCell.Offset(1, 0).Activate Next i Last: Exit Sub End Sub End Sub End Sub End Sub End Sub End Sub End Sub End Sub End Sub End Sub End Sub End Sub End Sub End Sub End Sub End
