In [1]:
!pip install transformers > /dev/null 2>&1
!pip install datasets "transformers[sentencepiece]" > /dev/null 2>&1
!pip install sentencepiece > /dev/null 2>&1
!pip install bitsandbytes > /dev/null 2>&1
!pip install accelerate -U > /dev/null 2>&1
!pip install --upgrade jupyterlab ipywidgets > /dev/null 2>&1
!pip install evaluate > /dev/null 2>&1

print("Hello, pip installation is done")

Hello, pip installation is done


In [44]:
from datasets import Dataset
import pandas as pd

df = pd.read_csv('text_to_command_dataset.csv')

dataset = Dataset.from_pandas(df)

print(dataset)
print(df.head())



Dataset({
    features: ['input_text', 'output_text'],
    num_rows: 500
})
              input_text          output_text
0  create a green sphere  sphere-create green
1      create a red cube      cube-create red
2  create a white sphere  sphere-create white
3   create a yellow cube   cube-create yellow
4    create a red sphere    sphere-create red


In [45]:

from transformers import T5Tokenizer

tokenizer = T5Tokenizer.from_pretrained('t5-small')

def tokenize_function(examples):
    model_inputs = tokenizer(examples['input_text'], max_length=128, truncation=True, padding='max_length')
    labels = tokenizer(examples['output_text'], max_length=128, truncation=True, padding='max_length')
    
    # Store the labels
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Tokenize the dataset
tokenized_dataset = dataset.map(tokenize_function, batched=True)
# Remove non-tokenized columns to clean the dataset
tokenized_dataset = tokenized_dataset.remove_columns(['input_text', 'output_text'])


# Split the dataset into 80% train and 20% test
train_test_split = tokenized_dataset.train_test_split(test_size=0.2)

# Access the train and test sets
train_dataset = train_test_split['train']
eval_dataset = train_test_split['test']

# Verify the sizes
print(f"Train size: {len(train_dataset)}, Test size: {len(eval_dataset)}")
print(tokenized_dataset[0])

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Train size: 400, Test size: 100
{'input_ids': [482, 3, 9, 1442, 3, 9475, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'labels': [3, 9475, 18, 22082, 1442, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [46]:
import numpy as np
import evaluate

metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    return {
        "something": "No compute metrics",
    }

In [47]:
from transformers import TrainingArguments,Seq2SeqTrainer, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments

model = AutoModelForSeq2SeqLM.from_pretrained('t5-small')

training_args = Seq2SeqTrainingArguments(
    output_dir="test_trainer",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=4,
    predict_with_generate=True,
    fp16=False,
    use_mps_device=True
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    # compute_metrics=compute_metrics,
)



In [57]:
trainer.train()

trainer.evaluate()

Epoch,Training Loss,Validation Loss
1,No log,0.354212
2,No log,0.293087
3,No log,0.280995
4,No log,0.278653


{'eval_loss': 0.27865251898765564,
 'eval_runtime': 0.7081,
 'eval_samples_per_second': 141.213,
 'eval_steps_per_second': 9.885,
 'epoch': 4.0}

In [59]:
from datetime import datetime

# Get current date and time
now = datetime.now()

# Format as 'Y-m-d H:i'
formatted_time = now.strftime('%Y_%m_%d_%H_%M')

# Assuming the model and tokenizer are already loaded and trained
model.save_pretrained("trained_models/" + formatted_time)
tokenizer.save_pretrained("trained_models/"  + formatted_time)

('trained_models/2024-10-18_03:15/tokenizer_config.json',
 'trained_models/2024-10-18_03:15/special_tokens_map.json',
 'trained_models/2024-10-18_03:15/spiece.model',
 'trained_models/2024-10-18_03:15/added_tokens.json')

In [56]:
test_input_text='create a blue cube'

test_inputs = tokenizer(test_input_text, max_length=128, truncation=True, padding='max_length')

test_input_ids = test_inputs['input_ids']

test_generated_ids = model.generate(test_input_ids, max_length=128)

# print("Hello?")
# print(generated_ids)
# print(tokenizer.decode(generated_ids[0], skip_special_tokens=True))
# print("Hello?")

[482, 3, 9, 1692, 123, 346, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


AttributeError: 'list' object has no attribute 'shape'