In [1]:
!ls

README.md   data		preprocessing
clap-to-t5  explore_data.ipynb	requirements.txt


In [2]:
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
import matplotlib.pyplot as plt
from google.cloud import storage

In [3]:
# Check if GPU is available, otherwise use CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [4]:
class AudioToTextModel(nn.Module):
    def __init__(self):
        super(AudioToTextModel, self).__init__()
        # Initialize T5 model and tokenizer
        self.t5 = T5ForConditionalGeneration.from_pretrained("t5-small")
        self.tokenizer = T5Tokenizer.from_pretrained("t5-small")

    def forward(self, audio_embeddings, labels=None):
        # Ensure correct shape for inputs_embeds: (batch_size, seq_length, embedding_dim)
        # T5 expects the shape (batch_size, seq_length, embedding_dim)
        projected_embeddings = audio_embeddings.unsqueeze(1)  # Add seq_length dimension (usually 1 for this case)

        # Generate outputs with T5
        outputs = self.t5(
            inputs_embeds=projected_embeddings,
            labels=labels
        )
        return outputs

In [5]:
model = AudioToTextModel()



You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [14]:
model.load_state_dict(torch.load('data/trained_audio_to_text_model.pth'))

# Step 4: Set the model to evaluation mode (if you only need to do inference)
model.eval()

  model.load_state_dict(torch.load('data/trained_audio_to_text_model.pth'))


AudioToTextModel(
  (t5): T5ForConditionalGeneration(
    (shared): Embedding(32128, 512)
    (encoder): T5Stack(
      (embed_tokens): Embedding(32128, 512)
      (block): ModuleList(
        (0): T5Block(
          (layer): ModuleList(
            (0): T5LayerSelfAttention(
              (SelfAttention): T5Attention(
                (q): Linear(in_features=512, out_features=512, bias=False)
                (k): Linear(in_features=512, out_features=512, bias=False)
                (v): Linear(in_features=512, out_features=512, bias=False)
                (o): Linear(in_features=512, out_features=512, bias=False)
                (relative_attention_bias): Embedding(32, 8)
              )
              (layer_norm): T5LayerNorm()
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (1): T5LayerFF(
              (DenseReluDense): T5DenseActDense(
                (wi): Linear(in_features=512, out_features=2048, bias=False)
                (wo): Linear(in_featur

In [46]:
model = model.to(device)

In [33]:
# Load the training data
train_data = torch.load('data/train_data.pt')

  train_data = torch.load('data/train_data.pt')


In [34]:
train_embeddings = torch.tensor(np.array(train_data["embeddings"])).to(device)  # Move to GPU
train_labels = [str(label) for label in train_data["labels"]]

In [37]:
train_embeddings[0].size()

torch.Size([512])

In [44]:
train_embeddings[:2].device

device(type='cuda', index=0)

In [56]:
inputs_embeds = train_embeddings[0].unsqueeze(1).unsqueeze(1)  # Shape becomes (2, 1, 512)
inputs_embeds.size()

torch.Size([512, 1, 1])

In [54]:
# Ensure train_embeddings has the correct shape (batch_size, seq_length, embedding_dim)


# Run inference using generate()
with torch.no_grad():
    generated_ids = model.t5.generate(
        inputs_embeds=inputs_embeds,
        max_length=50,  # Adjust as needed
        num_beams=2,    # Optional: for beam search
        early_stopping=True
    )

# Decode the generated token IDs to text
for i, gen_id in enumerate(generated_ids):
    decoded_text = tokenizer.decode(gen_id, skip_special_tokens=False)
    print(f"Generated text {i+1}: {decoded_text}")

Generated text 1: <pad>
Generated text 2: <pad>
Generated text 3: <pad>
Generated text 4: <pad>
Generated text 5: <pad>
Generated text 6: <pad>
Generated text 7: <pad>
Generated text 8: <pad>
Generated text 9: <pad>
Generated text 10: <pad>
Generated text 11: <pad>
Generated text 12: <pad>
Generated text 13: <pad>
Generated text 14: <pad>
Generated text 15: <pad>
Generated text 16: <pad>
Generated text 17: <pad>
Generated text 18: <pad>
Generated text 19: <pad>
Generated text 20: <pad>
Generated text 21: <pad>
Generated text 22: <pad>
Generated text 23: <pad>
Generated text 24: <pad>
Generated text 25: <pad>
Generated text 26: <pad>
Generated text 27: <pad>
Generated text 28: <pad>
Generated text 29: <pad>
Generated text 30: <pad>
Generated text 31: <pad>
Generated text 32: <pad>
Generated text 33: <pad>
Generated text 34: <pad>
Generated text 35: <pad>
Generated text 36: <pad>
Generated text 37: <pad>
Generated text 38: <pad>
Generated text 39: <pad>
Generated text 40: <pad>
Generated

Generated text 361: <pad>
Generated text 362: <pad>
Generated text 363: <pad>
Generated text 364: <pad>
Generated text 365: <pad>
Generated text 366: <pad>
Generated text 367: <pad>
Generated text 368: <pad>
Generated text 369: <pad>
Generated text 370: <pad>
Generated text 371: <pad>
Generated text 372: <pad>
Generated text 373: <pad>
Generated text 374: <pad>
Generated text 375: <pad>
Generated text 376: <pad>
Generated text 377: <pad>
Generated text 378: <pad>
Generated text 379: <pad>
Generated text 380: <pad>
Generated text 381: <pad>
Generated text 382: <pad>
Generated text 383: <pad>
Generated text 384: <pad>
Generated text 385: <pad>
Generated text 386: <pad>
Generated text 387: <pad>
Generated text 388: <pad>
Generated text 389: <pad>
Generated text 390: <pad>
Generated text 391: <pad>
Generated text 392: <pad>
Generated text 393: <pad>
Generated text 394: <pad>
Generated text 395: <pad>
Generated text 396: <pad>
Generated text 397: <pad>
Generated text 398: <pad>
Generated te