In [1]:
!pip install scikit-learn
!pip install transformers
!pip install sentencepiece
!pip install torch torchvision torchaudio
!pip install accelerate -U
!pip install datasets



In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
from datasets import Dataset

  from .autonotebook import tqdm as notebook_tqdm


In [3]:

# Sample data - in real scenario, this would be much larger and more diverse
data = {
    "natural_language": [
        "Play the latest album by Taylor Swift",
        "Find a playlist for running",
        "Recommend songs similar to 'Shape of You'"
    ],
    "api_call": [
        "GET /v1/search?type=album&query=taylor+swift+latest&limit=1",
        "GET /v1/browse/featured-playlists?limit=1&context=running",
        "GET /v1/recommendations?seed_tracks=7qiZfU4dY1lWllzX7mPBI3"
    ]
}

In [4]:
df = pd.DataFrame(data)

In [5]:
train_df, test_df = train_test_split(df, test_size=0.2)

In [6]:
tokenizer = T5Tokenizer.from_pretrained("t5-small")
model = T5ForConditionalGeneration.from_pretrained("t5-small")

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [7]:
def tokenize_function(examples):
    input_texts = ["translate English to API: " + text for text in examples["natural_language"]]
    model_inputs = tokenizer(input_texts, padding="max_length", truncation=True, max_length=128)
    # Tokenize targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples["api_call"], padding="max_length", truncation=True, max_length=128)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [8]:
train_dataset = Dataset.from_pandas(train_df).map(tokenize_function, batched=True)

Map: 100%|██████████| 2/2 [00:00<00:00, 68.22 examples/s]


In [9]:
test_dataset = Dataset.from_pandas(test_df).map(tokenize_function, batched=True)

Map: 100%|██████████| 1/1 [00:00<00:00, 504.79 examples/s]


In [10]:
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
)

In [11]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)

In [12]:
trainer.train()

100%|██████████| 3/3 [00:06<00:00,  2.13s/it]

{'train_runtime': 6.4122, 'train_samples_per_second': 0.936, 'train_steps_per_second': 0.468, 'train_loss': 9.474090576171875, 'epoch': 3.0}





TrainOutput(global_step=3, training_loss=9.474090576171875, metrics={'train_runtime': 6.4122, 'train_samples_per_second': 0.936, 'train_steps_per_second': 0.468, 'train_loss': 9.474090576171875, 'epoch': 3.0})

In [13]:
model.save_pretrained("./spotify_api_model")