In [3]:
!pip install transformers > /dev/null 2>&1
!pip install datasets "transformers[sentencepiece]" > /dev/null 2>&1
!pip install sentencepiece > /dev/null 2>&1
!pip install bitsandbytes > /dev/null 2>&1
!pip install accelerate -U > /dev/null 2>&1
!pip install --upgrade jupyterlab ipywidgets > /dev/null 2>&1
!pip install evaluate > /dev/null 2>&1
!pip install optimum

print("Hello, pip installation is done")

Hello, pip installation is done


In [4]:
from datasets import Dataset
import pandas as pd

# Read the natural-language / CLI-command pairs from disk.
csv_path = 'datasets/cli_natural_language_command_dataset.csv'
df = pd.read_csv(csv_path)

# Wrap the frame as a Hugging Face Dataset for the later processing cells.
dataset = Dataset.from_pandas(df)

# Sanity check: dataset summary first, then the first rows of the frame.
for preview in (dataset, df.head()):
    print(preview)



Dataset({
    features: ['input', 'output'],
    num_rows: 900
})
                                   input  \
0    Create a green pyramid at 68 -50 -6   
1      Create a yellow cube at -75 33 41   
2     Create a green sphere at 16 54 -15   
3      Create a red pyramid at 70 90 -46   
4  Create a yellow pyramid at 31 -80 -52   

                                    output  
0    create-object pyramid green 68 -50 -6  
1      create-object cube yellow -75 33 41  
2     create-object sphere green 16 54 -15  
3      create-object pyramid red 70 90 -46  
4  create-object pyramid yellow 31 -80 -52  


In [5]:

from transformers import T5Tokenizer

# Tokenizer for the 't5-small' checkpoint. It is read as a global by
# tokenize_function and by compute_metrics in the following cells.
tokenizer = T5Tokenizer.from_pretrained('t5-small')

def tokenize_function(examples):
    """Tokenize a batch of input/output text pairs.

    The 'input' and 'output' columns of ``examples`` are each truncated and
    padded to 128 tokens with the module-level ``tokenizer``. The encoding of
    'input' is returned with the token ids of 'output' attached under the
    "labels" key.

    NOTE(review): padded label positions keep the tokenizer's pad id rather
    than an ignore index, so they presumably count toward the training loss.
    Confirm this is intended.
    """
    encode_kwargs = dict(max_length=128, truncation=True, padding='max_length')

    encoded = tokenizer(examples['input'], **encode_kwargs)
    # Only the ids of the target text are kept; they become the labels.
    encoded["labels"] = tokenizer(examples['output'], **encode_kwargs)["input_ids"]
    return encoded

# One seed for both the shuffle and the split, so the train/test partition
# is the same on every Restart & Run All.
SPLIT_SEED = 42

# Shuffle the examples before processing.
shuffled_dataset = dataset.shuffle(seed=SPLIT_SEED)

# Tokenize in batches, then drop the raw text columns so that only the
# tokenized fields remain.
tokenized_dataset = shuffled_dataset.map(tokenize_function, batched=True)
tokenized_dataset = tokenized_dataset.remove_columns(['input', 'output'])

# Split the dataset into 80% train and 20% test. The seed is passed explicitly:
# without it the split changes on every run even though the shuffle is seeded.
train_test_split = tokenized_dataset.train_test_split(test_size=0.2, seed=SPLIT_SEED)

# Access the train and test sets
train_dataset = train_test_split['train']
eval_dataset = train_test_split['test']

# Verify the sizes and show one tokenized example
print(f"Train size: {len(train_dataset)}, Test size: {len(eval_dataset)}")
print(tokenized_dataset[0])

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Map:   0%|          | 0/900 [00:00<?, ? examples/s]

Train size: 720, Test size: 180
{'input_ids': [3, 28141, 22734, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'labels': [9268, 18, 30536, 22734, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

In [6]:
from sklearn.metrics import accuracy_score
import numpy as np
import evaluate

# Load the "accuracy" metric through the evaluate library.
# NOTE(review): compute_metrics below calls sklearn's accuracy_score and does
# not reference `metric`; confirm whether this object is still needed.
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Compute exact-match accuracy between decoded predictions and labels.

    Args:
        eval_pred: a ``(predictions, labels)`` pair of token-id arrays.

    Returns:
        dict: ``{"accuracy": acc}`` where ``acc`` is the fraction of decoded
        predictions that are string-equal to their decoded label.
    """
    predictions, labels = eval_pred
    # If the predictions arrive wrapped in a tuple, the token ids are taken
    # from its first element.
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # -100 is the ignore index and is not a valid token id, so it must be
    # swapped for the pad id before decoding. This is a no-op when no -100
    # is present.
    pad_id = tokenizer.pad_token_id
    predictions = np.asarray(predictions)
    labels = np.asarray(labels)
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    # Convert predictions and labels to text
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Log up to the first 5 pairs; slicing avoids an IndexError when fewer
    # than 5 examples are evaluated.
    for pred_text, label_text in zip(decoded_preds[:5], decoded_labels[:5]):
        print(f"Prediction: {pred_text}")
        print(f"Label: {label_text}")
        print('-' * 30)

    # Exact string match per example
    acc = accuracy_score(decoded_labels, decoded_preds)

    return {"accuracy": acc}

In [7]:
from transformers import TrainingArguments,Seq2SeqTrainer, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments

# Start from the pretrained 't5-small' seq2seq checkpoint.
model = AutoModelForSeq2SeqLM.from_pretrained('t5-small')

# Run configuration, grouped by concern.
training_args = Seq2SeqTrainingArguments(
    # -- where outputs and logs are written
    output_dir="test_trainer",
    logging_dir='./logs',
    # -- optimisation
    learning_rate=5e-5,
    weight_decay=0.01,
    num_train_epochs=7,
    fp16=False,
    # -- batching
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # -- evaluation: once per epoch, with generate() producing the predictions
    eval_strategy="epoch",
    predict_with_generate=True,
    # save_total_limit=3,
)

# Wire the model, the settings, both splits and the metric function together.
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

In [None]:
# Run training with the trainer configured in the previous cell
# (7 epochs, evaluated after each epoch per its training arguments).
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.317779,0.0
2,No log,0.055082,0.0
3,No log,0.006726,0.888889
4,No log,0.003187,0.961111




Prediction: 
Label: create-object cube yellow -70 58 31
------------------------------
Prediction: 
Label: delete-object cube
------------------------------
Prediction: 
Label: move-object pyramid 28 88 66
------------------------------
Prediction: 
Label: move-object sphere -19 56 -47
------------------------------
Prediction: 
Label: create-object cube green 95 -55 -77
------------------------------




Prediction: 
Label: create-object cube yellow -70 58 31
------------------------------
Prediction: 
Label: delete-object cube
------------------------------
Prediction: 
Label: move-object pyramid 28 88 66
------------------------------
Prediction: 
Label: move-object sphere -19 56 -47
------------------------------
Prediction: 
Label: create-object cube green 95 -55 -77
------------------------------




Prediction: create-object cube yellow -70 58 31
Label: create-object cube yellow -70 58 31
------------------------------
Prediction: delete-object cube
Label: delete-object cube
------------------------------
Prediction: move-object pyramid 28 88 66
Label: move-object pyramid 28 88 66
------------------------------
Prediction: move-object sphere -19 56 -47
Label: move-object sphere -19 56 -47
------------------------------
Prediction: create-object cube green 95 -55 -77
Label: create-object cube green 95 -55 -77
------------------------------




Prediction: create-object cube yellow -70 58 31
Label: create-object cube yellow -70 58 31
------------------------------
Prediction: delete-object cube
Label: delete-object cube
------------------------------
Prediction: move-object pyramid 28 88 66
Label: move-object pyramid 28 88 66
------------------------------
Prediction: move-object sphere -19 56 -47
Label: move-object sphere -19 56 -47
------------------------------
Prediction: create-object cube green 95 -55 -77
Label: create-object cube green 95 -55 -77
------------------------------


In [None]:
from datetime import datetime

# Timestamp the run so that every save lands in its own directory.
now = datetime.now()

# Rendered as year_month_day_hour_minute, underscore-separated.
formatted_time = f"{now:%Y_%m_%d_%H_%M}"

# The trained model and the tokenizer are written side by side in one folder.
save_dir = f"trained_models/{formatted_time}"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

input_text = "Create a blue pyramid at 25 -31 -79"

model_name_or_path = "trained_models/" + formatted_time #path/to/your/model/or/name/on/hub
device = "cpu" # or "cuda" if you have a GPU

# Reload the saved model and tokenizer from disk to check that the saved
# artifacts work on their own.
trainedModel = AutoModelForSeq2SeqLM.from_pretrained(model_name_or_path).to(device)
trainedTokenizer = AutoTokenizer.from_pretrained(model_name_or_path)

# First check: default decoding. The whole encoding is unpacked into
# generate() so the attention mask is passed along with the padded input_ids.
inputs = trainedTokenizer(input_text, return_tensors='pt', max_length=128, truncation=True, padding='max_length').to(device)
outputs = trainedModel.generate(**inputs, max_new_tokens=128)
print("Answer:{" + trainedTokenizer.decode(outputs[0], skip_special_tokens=True)+"}")

# Second check: beam search on another prompt. The encoding is moved to
# `device` like the first one, so switching to "cuda" works for both calls.
beam_inputs = trainedTokenizer("Create a cube at 10 20 30", return_tensors="pt").to(device)
outputs = trainedModel.generate(
    **beam_inputs,
    num_beams=4,          # Beam search to improve output quality
    early_stopping=True,  # Stop once output is complete
)

# Decode the model's predictions
decoded_output = trainedTokenizer.decode(outputs[0], skip_special_tokens=True)
print(f"Model Output: [{decoded_output}]")

In [None]:
from optimum.onnxruntime import ORTModelForSequenceClassification
from transformers import AutoTokenizer

# The saved checkpoint is a T5 seq2seq model, so it needs the seq2seq ORT
# class rather than the sequence-classification one imported at the top of
# this cell.
from optimum.onnxruntime import ORTModelForSeq2SeqLM

save_directory = "onnx/"

# Load the fine-tuned model from disk and export it to ONNX
ort_model = ORTModelForSeq2SeqLM.from_pretrained(model_name_or_path, export=True)
# Load the tokenizer from the same directory (the previous `model_checkpoint`
# name was never defined). A separate variable is used so the training
# `tokenizer` global is not overwritten.
onnx_tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)

# Save the onnx model and tokenizer
ort_model.save_pretrained(save_directory)
onnx_tokenizer.save_pretrained(save_directory)