In [1]:
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
import matplotlib.pyplot as plt
from google.cloud import storage

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [3]:
# class AudioToTextSmallModel(nn.Module):
#     def __init__(self):
#         super(AudioToTextSmallModel, self).__init__()
#         # Initialize T5 model and tokenizer
#         self.t5 = T5ForConditionalGeneration.from_pretrained("t5-small")

#     def forward(self, audio_embeddings, labels=None):
#         # Ensure correct shape for inputs_embeds: (batch_size, seq_length, embedding_dim)
#         # T5 expects the shape (batch_size, seq_length, embedding_dim)
#         projected_embeddings = audio_embeddings.unsqueeze(1)  # Add seq_length dimension (usually 1 for this case)

#         # Generate outputs with T5
#         outputs = self.t5(
#             inputs_embeds=projected_embeddings,
#             labels=labels
#         )
#         return outputs
    
class AudioToTextBaseModel(nn.Module):
    def __init__(self):
        super(AudioToTextBaseModel, self).__init__()
        # Initialize T5 model and tokenizer with t5-large
        self.t5 = T5ForConditionalGeneration.from_pretrained("t5-base")
        # Linear layer to project 512-dimensional CLAP embeddings to 1024-dimensional embeddings
        self.projection_layer = nn.Linear(512, 768)

    def forward(self, audio_embeddings, labels=None):
        # Project audio embeddings from 512 to 1024 dimensions
        projected_embeddings = self.projection_layer(audio_embeddings)
        
        # Add seq_length dimension (usually 1 for this case)
        projected_embeddings = projected_embeddings.unsqueeze(1)

        # Generate outputs with T5
        outputs = self.t5(
            inputs_embeds=projected_embeddings,
            labels=labels
        )
        return outputs

tokenizer = T5Tokenizer.from_pretrained("t5-base")
    
# Initialize the model and tokenizer
model = AudioToTextBaseModel().to(device)  # Move the model to GPU

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [4]:
model.load_state_dict(torch.load("../data/weights_10_base.pth"))

# Step 4: Set the model to evaluation mode (if you only need to do inference)
model.eval()

  model.load_state_dict(torch.load("../data/weights_10_base.pth"))


AudioToTextBaseModel(
  (t5): T5ForConditionalGeneration(
    (shared): Embedding(32128, 768)
    (encoder): T5Stack(
      (embed_tokens): Embedding(32128, 768)
      (block): ModuleList(
        (0): T5Block(
          (layer): ModuleList(
            (0): T5LayerSelfAttention(
              (SelfAttention): T5Attention(
                (q): Linear(in_features=768, out_features=768, bias=False)
                (k): Linear(in_features=768, out_features=768, bias=False)
                (v): Linear(in_features=768, out_features=768, bias=False)
                (o): Linear(in_features=768, out_features=768, bias=False)
                (relative_attention_bias): Embedding(32, 12)
              )
              (layer_norm): T5LayerNorm()
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (1): T5LayerFF(
              (DenseReluDense): T5DenseActDense(
                (wi): Linear(in_features=768, out_features=3072, bias=False)
                (wo): Linear(in_f

In [5]:
# Load the training data
train_data = torch.load('../data/train_data.pt')
test_data = torch.load('../data/test_data.pt')

  train_data = torch.load('../data/train_data.pt')
  test_data = torch.load('../data/test_data.pt')


In [6]:
train_embeddings = torch.tensor(np.array(train_data["embeddings"])).to(device)  # Move to GPU
train_labels = [str(label) for label in train_data["labels"]]

test_embeddings = torch.tensor(np.array(test_data["embeddings"])).to(device)  # Move to GPU
test_labels = [str(label) for label in test_data["labels"]]

In [27]:
train_embeddings.size()

torch.Size([3553, 512])

In [32]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def average_pairwise_sim(embeddings):
    pairwise_similarities = cosine_similarity(embeddings)

    # Get the number of embeddings
    num_embeddings = len(embeddings)

    # Extract the upper triangle of the similarity matrix without the diagonal
    upper_triangle_indices = np.triu_indices(num_embeddings, k=1)
    pairwise_values = pairwise_similarities[upper_triangle_indices]

    # Calculate the average pairwise similarity
    average_pairwise_similarity = np.mean(pairwise_values)
    return average_pairwise_similarity

In [33]:
average_pairwise_sim(train_embeddings.cpu())

0.93777186

In [35]:
train_embeddings[0] - train_embeddings[1]

tensor([ 7.9709e-03,  1.4819e-02,  1.5407e-02, -3.9049e-03, -9.9480e-03,
        -6.9047e-03,  3.6069e-02, -1.0557e-02,  1.5398e-02, -7.1141e-03,
        -5.6794e-04,  1.3234e-02, -2.1287e-02,  3.0202e-02,  4.8684e-03,
        -2.1075e-02, -1.1806e-02, -1.3603e-02, -8.3285e-03, -2.3117e-03,
         6.5029e-03, -3.1472e-02, -5.3021e-03, -6.0198e-03,  7.7293e-03,
         6.9554e-02, -2.4552e-02, -1.1826e-02, -3.4772e-02, -1.9395e-02,
        -1.3569e-02,  2.3409e-02,  4.8719e-02,  6.3505e-02, -4.9649e-02,
         1.0479e-02,  9.4825e-03, -7.4857e-03, -7.8678e-03, -1.1279e-02,
        -1.7770e-02,  1.6087e-02,  2.3782e-02, -2.3731e-02,  1.6549e-02,
        -6.6485e-03,  1.0770e-02,  5.4051e-02,  5.1967e-04,  2.1856e-03,
         3.4155e-02, -1.7571e-02, -2.3475e-02, -3.0246e-03,  6.2090e-02,
        -5.8932e-03, -5.5003e-02, -1.3057e-02, -2.1058e-02, -1.6350e-02,
        -4.8270e-03,  1.7186e-02,  4.0746e-02, -7.7797e-04,  3.9195e-02,
        -9.1315e-03, -4.7628e-03,  3.5974e-02, -3.2

In [34]:
average_pairwise_sim(test_embeddings.cpu())

0.93470097

In [7]:
# Ensure all labels are strings
for label in train_labels:
    if label is None or not isinstance(label, str):
        print("Label has an error or is not a string")

In [8]:
# Tokenize the labels (convert them into token IDs) just once
train_tokenized_labels = tokenizer(train_labels, padding=True, truncation=True, return_tensors="pt").input_ids.to(device)  # Move to GPU

test_tokenized_labels = tokenizer(test_labels, padding=True, truncation=True, return_tensors="pt").input_ids.to(device)  # Move to GPU


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [9]:
# Create a DataLoader for your train data
train_dataset = TensorDataset(train_embeddings, train_tokenized_labels)
train_loader = DataLoader(train_dataset, batch_size=1, shuffle=False)

test_dataset = TensorDataset(test_embeddings, test_tokenized_labels)
test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False)

In [10]:
optimizer = optim.AdamW(model.parameters(), lr=1e-5)  # You can adjust the learning rate

In [11]:
def evaluate_final_loss(model, data_loader):
    total_loss = 0
    for i, batch in enumerate(data_loader):
        audio_embeddings, labels = batch

        # Move data to GPU
        audio_embeddings = audio_embeddings.to(device)
        labels = labels.to(device)

        # Zero the gradients
        optimizer.zero_grad()

        # Forward pass
        outputs = model(audio_embeddings, labels=labels)

        # Calculate loss
        loss = outputs.loss
        total_loss += loss.item()

    # Calculate and print the loss for this epoch
    avg_loss = total_loss / len(data_loader)
    return avg_loss

In [12]:
evaluate_final_loss(model, train_loader)

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


0.9146866494589982

In [13]:
evaluate_final_loss(model, test_loader)

1.0046113055013608

In [14]:
model.eval()

AudioToTextBaseModel(
  (t5): T5ForConditionalGeneration(
    (shared): Embedding(32128, 768)
    (encoder): T5Stack(
      (embed_tokens): Embedding(32128, 768)
      (block): ModuleList(
        (0): T5Block(
          (layer): ModuleList(
            (0): T5LayerSelfAttention(
              (SelfAttention): T5Attention(
                (q): Linear(in_features=768, out_features=768, bias=False)
                (k): Linear(in_features=768, out_features=768, bias=False)
                (v): Linear(in_features=768, out_features=768, bias=False)
                (o): Linear(in_features=768, out_features=768, bias=False)
                (relative_attention_bias): Embedding(32, 12)
              )
              (layer_norm): T5LayerNorm()
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (1): T5LayerFF(
              (DenseReluDense): T5DenseActDense(
                (wi): Linear(in_features=768, out_features=3072, bias=False)
                (wo): Linear(in_f

In [15]:
def inference(example_embedding):
    with torch.no_grad():
        input_embeddings = model.projection_layer(example_embedding)
        generated_ids = model.t5.generate(
            inputs_embeds=input_embeddings.view(1, 1, 768),
            max_length=100,  # Adjust as needed
            early_stopping=True
        )
    return tokenizer.decode(generated_ids[0], skip_special_tokens=True)

In [16]:
from tqdm import tqdm

def run_predictions(model, data_loader):
    pred_text = []
    true_text = []
    for i, batch in tqdm(enumerate(data_loader)):
        if i == 50:
            break
        audio_embeddings, labels = batch
        pred = inference(audio_embeddings[0])
        true = tokenizer.decode(labels[0], skip_special_tokens=True)
        pred_text.append(pred)
        true_text.append(true)

    # Calculate and print the loss for this epoch
    return true_text, pred_text

In [17]:
train_true, train_pred = run_predictions(model, train_loader)
for i in range(5):
    print(train_true[i])
    print(train_pred[i])

50it [01:18,  1.56s/it]

The low quality recording features a latin jazz song played in the background over which a drums solo is played. The solo consists of shimmering hi hats, punchy snare and kick hits and low tom rolls, while the latin jazz song consists of groovy piano chords and wooden percussion. It sounds energetic and exciting.
This is a live performance of a classical music piece. The tempo is medium with a groovy bass line, groovy bass line, groovy bass line, groovy bass line, groovy bass line, groovy bassline, groovy bassline, groovy bassline, groovy bassline, groovy bass
This is an amateur recording of a dance performance. There is a zumba dance music version of a movie theme playing in the background. The melody is being played by the strings and the keyboard while there is a loud electronic drum beat for the rhythm. There is a mysterious yet energetic feel to this piece. The recording quality is not that great. However, this piece could still be used to gather samples for beat-making.
This is a




In [18]:
test_true, test_pred = run_predictions(model, test_loader)
for i in range(5):
    print(test_true[i])
    print(test_pred[i])

50it [01:17,  1.55s/it]

This house music features a female voice singing the main melody. This is accompanied by programmed percussion playing a simple beat. The kick is played on every count. Hand claps are played at every alternate count. The bass plays the root notes of the chords. Synth chords are played in the background. This song can be played at a club.
This is a live performance of a jazz song. The tempo is medium with a groovy bass line, groovy bass line, groovy bass line, groovy bass line, groovy bass line, groovy bass line, groovy bass line, groovy bass line, groovy bass line, groovy bass line
This is instrumental Chinese music. The main melody is played by dizi, a Chinese flute. Qin (a Chinese xylophone) is played with syncopation while at the same time carrying the bass notes. There is also a simple acoustic drum beat in the rhythmic background. The piece has a positive, optimistic atmosphere. It could be used in the movies/shows that take place in China. It could also be used in the background 




In [46]:
import random
test_idx = random.randint(0,1000)
print(test_true[test_idx])
print(test_pred[test_idx])

This house music features a female voice singing the main melody. This is accompanied by programmed percussion playing a simple beat. The kick is played on every count. Hand claps are played at every alternate count. The bass plays the root notes of the chords. Synth chords are played in the background. This song can be played at a club.
This is a live performance of a jazz song. The tempo is medium with a groovy bass line, groovy bass line, groovy bass line, groovy bass line, groovy bass line, groovy bass line, groovy bass line, groovy bass line, groovy bass line, groovy bass line


In [19]:
from sentence_transformers import SentenceTransformer
bert_model = SentenceTransformer('bert-base-nli-mean-tokens')

In [20]:

from sklearn.metrics.pairwise import cosine_similarity
from nltk.translate.bleu_score import sentence_bleu

def evaluate_scores(true, pred):
    # Split the strings into tokens
    scores = {}
    scores["bleu"] = []
    scores["bert_sim"] = []
    
    for i in range(len(true)):
        reference = true[i].split()
        candidate = pred[i].split()

        # Calculate the BLEU score
        bleu_score = sentence_bleu([reference], candidate, weights=(0.25, 0.25, 0.25, 0.25))
        scores["bleu"].append(bleu_score)
        
        # Initializing the Sentence Transformer model using BERT with mean-tokens pooling
        

        # Encoding the sentences to obtain their embeddings
        sentence_embeddings = bert_model.encode([true[i], pred[i]])

        # Calculating the cosine similarity between the first sentence embedding and the rest of the embeddings
        # The result will be a list of similarity scores between the first sentence and each of the other sentences
        similarity_score = cosine_similarity([sentence_embeddings[0]], [sentence_embeddings[1]])[0][0]
        
        scores["bert_sim"].append(similarity_score)
    return scores
    

In [23]:
scores = evaluate_scores(train_true, train_pred)

The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, indepe

The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, indepe

The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, indepe

In [26]:
for metric, values in scores.items():
    print(f"{metric}: {np.mean(values)}")
    print(f"{metric}: {np.max(values)}")
    print(f"{metric}: {np.min(values)}")

bleu: 0.00805811755004154
bleu: 0.133922112029015
bleu: 4.0216221822400775e-232
bert_sim: 0.5676581859588623
bert_sim: 0.7076610922813416
bert_sim: 0.3131912648677826


In [38]:
inference(torch.randn(512,).to(device))

'This is a live performance of a folk song. The tempo is fast with a groovy bass line, groovy drumming, groovy bass line, groovy bass line, groovy bass line, groovy bass line, groovy bass line, groovy bass line, groovy bass line, groovy bass'

In [None]:
train_data["filenames"][:5]