In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data as data
import lightning as L
import json
import numpy as np

In [2]:
def transform_dict(list_of_dict):
    transformed_embedding = {}
    for dictionary in list_of_dict:
        transformed_embedding[dictionary['id']] = dictionary['embedding']

    return transformed_embedding

def get_modality_embeddings(track_id, audio_embeddings_dict, image_embeddings_dict, text_embeddings_dict, graph_embeddings_dict):
    audio_embedding = audio_embeddings_dict[track_id]
    graph_embedding = graph_embeddings_dict[track_id]
    image_embedding = image_embeddings_dict[track_id]
    text_embedding = text_embeddings_dict[track_id]
    return audio_embedding, graph_embedding, image_embedding, text_embedding

In [3]:
## Reading data

tracks = json.load(open('fairouz_conf/fairouz/tracks_contextualized.json'))
track_ids = list(tracks.keys())

audio_embeddings = json.load(open('fairouz_conf/fairouz/embeddings/audio/song_audio_vggish_embeddings.json'))
graph_embeddings = json.load(open('fairouz_conf/fairouz/embeddings/graph/karate_club/song_nodes_RandNE_embedding.json'))
image_embeddings = json.load(open('fairouz_conf/fairouz/embeddings/image/album_covers_openclip_embeddings.json'))
text_embeddings = json.load(open('fairouz_conf/fairouz/embeddings/lyrics/song_lyrics_mxbai_embeddings.json'))

audio_embeddings_dict = transform_dict(audio_embeddings)
graph_embeddings_dict = transform_dict(graph_embeddings)
image_embeddings_dict = transform_dict(image_embeddings)
text_embeddings_dict = transform_dict(text_embeddings)

In [4]:
### Creating Tensors
load = json.load(open('fairouz_conf/fairouz/positives_negatives.json'))

def generate_data():
    anchor_audio = []
    anchor_graph = []
    anchor_image = []
    anchor_text  = []

    query_audio = []
    query_graph = []
    query_image = []
    query_text = []
    
    label = []

    for track_id, data in load.items():
        anchor = get_modality_embeddings(track_id, audio_embeddings_dict, image_embeddings_dict, text_embeddings_dict, graph_embeddings_dict)
        for positive in data['positives']:
            audio_embd, graph_embd, img_embd, text_embd = get_modality_embeddings(positive, audio_embeddings_dict, image_embeddings_dict, text_embeddings_dict, graph_embeddings_dict)

            anchor_audio.append(anchor[0])
            anchor_graph.append(anchor[1])
            anchor_image.append(anchor[2])
            anchor_text.append(anchor[3])

            query_audio.append(audio_embd)
            query_graph.append(graph_embd)
            query_image.append(img_embd)
            query_text.append(text_embd)

            label.append(1)
        
        for negative in data['negatives']:
            audio_embd, graph_embd, img_embd, text_embd = get_modality_embeddings(negative, audio_embeddings_dict, image_embeddings_dict, text_embeddings_dict, graph_embeddings_dict)

            anchor_audio.append(anchor[0])
            anchor_graph.append(anchor[1])
            anchor_image.append(anchor[2])
            anchor_text.append(anchor[3])

            query_audio.append(audio_embd)
            query_graph.append(graph_embd)
            query_image.append(img_embd)
            query_text.append(text_embd)

            label.append(0)
        
    return torch.tensor(anchor_audio), torch.tensor(anchor_graph), torch.tensor(anchor_image), torch.tensor(anchor_text), \
           torch.tensor(query_audio), torch.tensor(query_graph), torch.tensor(query_image), torch.tensor(query_text), \
           torch.tensor(label)


In [5]:
anchor_audio, anchor_graph, anchor_image, anchor_text, query_audio, query_graph, query_image, query_text, label = generate_data()

In [6]:
anchor_audio.shape, anchor_graph.shape, anchor_image.shape, anchor_text.shape, query_audio.shape, query_graph.shape, query_image.shape, query_text.shape, label.shape

(torch.Size([6375, 128]),
 torch.Size([6375, 128]),
 torch.Size([6375, 512]),
 torch.Size([6375, 1024]),
 torch.Size([6375, 128]),
 torch.Size([6375, 128]),
 torch.Size([6375, 512]),
 torch.Size([6375, 1024]),
 torch.Size([6375]))

In [10]:
class Encoder(L.LightningModule):
    def __init__(
        self,
        audio_size,
        image_size,
        text_size,
        graph_size,
        expansion_factor,
        contraction_factor,
        embedding_size,
        drop_out=0.2,
    ):
        super().__init__()
        self.embedding_size = embedding_size
        self.audio_encoder = nn.Sequential(
            nn.Linear(audio_size, audio_size // contraction_factor),
            nn.ReLU(),
            nn.Dropout(drop_out),
        )
        self.image_encoder = nn.Sequential(
            nn.Linear(image_size, image_size // contraction_factor),
            nn.ReLU(),
            nn.Dropout(drop_out),
        )
        self.text_encoder = nn.Sequential(
            nn.Linear(text_size, text_size // contraction_factor),
            nn.ReLU(),
            nn.Dropout(drop_out),
        )
        self.graph_encoder = nn.Sequential(
            nn.Linear(graph_size, graph_size // contraction_factor),
            nn.ReLU(),
            nn.Dropout(drop_out),
        )
        self.cat_size = (
            audio_size // contraction_factor
            + image_size // contraction_factor
            + text_size // contraction_factor
            + graph_size // contraction_factor
        )
        self.combined_encoder = nn.Sequential(
            nn.Linear(self.cat_size, self.cat_size * expansion_factor),
            nn.ReLU(),
            nn.Dropout(drop_out),
            nn.Linear(self.cat_size * expansion_factor, embedding_size),
            nn.Dropout(drop_out),
        )
        self.distance_metric = lambda x, y: F.pairwise_distance(x, y, p=2)
        self.margin = 0.5
        self.save_hyperparameters()

    def forward(self, anchor, query, labels):
        anchor_audio, anchor_image, anchor_text, anchor_graph = anchor
        query_audio, query_image, query_text, query_graph = query
        anchor_embedding = self.combined_encoder(
            torch.cat(
                (
                    self.audio_encoder(anchor_audio),
                    self.image_encoder(anchor_image),
                    self.text_encoder(anchor_text),
                    self.graph_encoder(anchor_graph),
                ),
                dim=1,
            )
        )
        query_embedding = self.combined_encoder(
            torch.cat(
                (
                    self.audio_encoder(query_audio),
                    self.image_encoder(query_image),
                    self.text_encoder(query_text),
                    self.graph_encoder(query_graph),
                ),
                dim=1,
            )
        )
        return anchor_embedding, query_embedding, labels

    def training_step(self, batch, batch_idx):
        anchor, query, labels = batch
        anchor_embedding, query_embedding, labels = self(anchor, query, labels)
        distances = self.distance_metric(anchor_embedding, query_embedding)
        losses = 0.5 * labels[0].float() * distances.pow(2) + (
            1 - labels[0]
        ).float() * F.relu(self.margin - distances).pow(2)
        loss = losses.mean()
        self.log("train_loss", loss)
        return loss

    def predict_step(self, audio, image, text, graph):
        audio = self.audio_encoder(audio)
        image = self.image_encoder(image)
        text = self.text_encoder(text)
        graph = self.graph_encoder(graph)
        embedding = self.combined_encoder(torch.cat((audio, image, text, graph), dim=1))
        return embedding

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=1e-3)
        return optimizer

In [11]:
# Define the sizes of your inputs
audio_size = 128
graph_size = 128
image_size = 512
text_size = 1024
batch_size = 32
emb_size = 128
expansion_factor = 2
contraction_factor = 4
drop_out = 0.2

model = Encoder(audio_size, image_size, text_size, graph_size, expansion_factor, contraction_factor, emb_size, drop_out)

# Generate random data for each input type
anchor_audio_data = anchor_audio
anchor_image_data = anchor_image
anchor_text_data = anchor_text
anchor_graph_data = anchor_graph
query_audio_data = query_audio
query_image_data = query_image
query_text_data = query_text
query_graph_data = query_graph
labels_data = label
# Assuming labels are binary

# Combine the data into a single dataset
anchors = data.TensorDataset(anchor_audio_data, anchor_image_data, anchor_text_data, anchor_graph_data)
queries = data.TensorDataset(query_audio_data, query_image_data, query_text_data, query_graph_data)
labels = data.TensorDataset(labels_data)
dataset = data.StackDataset(anchors, queries, labels)

# Create a DataLoader
data_loader = data.DataLoader(dataset, batch_size=batch_size)

In [12]:
print(model)

Encoder(
  (audio_encoder): Sequential(
    (0): Linear(in_features=128, out_features=32, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.2, inplace=False)
  )
  (image_encoder): Sequential(
    (0): Linear(in_features=512, out_features=128, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.2, inplace=False)
  )
  (text_encoder): Sequential(
    (0): Linear(in_features=1024, out_features=256, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.2, inplace=False)
  )
  (graph_encoder): Sequential(
    (0): Linear(in_features=128, out_features=32, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.2, inplace=False)
  )
  (combined_encoder): Sequential(
    (0): Linear(in_features=448, out_features=896, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.2, inplace=False)
    (3): Linear(in_features=896, out_features=128, bias=True)
    (4): Dropout(p=0.2, inplace=False)
  )
)


In [13]:
from lightning.pytorch.loggers import CSVLogger
logger = CSVLogger("logs", name="vggish_randne_openclip_mxbai_contraction_4_expansion2_dropout_uniform")

In [14]:
trainer = L.Trainer(min_epochs=100, max_epochs=200, logger = logger)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [15]:
trainer.fit(model, data_loader)

You are using a CUDA device ('NVIDIA RTX A4000') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
Missing logger folder: logs/vggish_randne_openclip_mxbai_contraction_4_expansion2_dropout_uniform
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name             | Type       | Params
------------------------------------------------
0 | audio_encoder    | Sequential | 4.1 K 
1 | image_encoder    | Sequential | 65.7 K
2 | text_encoder     | Sequential | 262 K 
3 | graph_encoder    | Sequential | 4.1 K 
4 | combined_encoder | Sequential | 517 K 
------------------------------------------------
853 K     Trainable params
0         Non-trainable params
853 K     Total params
3.414     Total estimated model params size (MB)
/workspace/fairo

Epoch 199: 100%|██████████| 200/200 [00:03<00:00, 62.01it/s, v_num=0]

`Trainer.fit` stopped: `max_epochs=200` reached.


Epoch 199: 100%|██████████| 200/200 [00:03<00:00, 60.07it/s, v_num=0]


# Inference

In [None]:
# audio, image, text, graph (model)
# audio_embedding, graph_embedding, image_embedding, text_embedding (get modality embeddings)

In [26]:
model.eval()

Encoder(
  (audio_encoder): Sequential(
    (0): Linear(in_features=128, out_features=32, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.2, inplace=False)
  )
  (image_encoder): Sequential(
    (0): Linear(in_features=512, out_features=128, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.2, inplace=False)
  )
  (text_encoder): Sequential(
    (0): Linear(in_features=1024, out_features=256, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.2, inplace=False)
  )
  (graph_encoder): Sequential(
    (0): Linear(in_features=128, out_features=32, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.2, inplace=False)
  )
  (combined_encoder): Sequential(
    (0): Linear(in_features=448, out_features=896, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.2, inplace=False)
    (3): Linear(in_features=896, out_features=128, bias=True)
    (4): Dropout(p=0.2, inplace=False)
  )
)

In [27]:
embeddings = []
for track_id in track_ids:
    audio, graph, image, text = get_modality_embeddings(track_id, audio_embeddings_dict, image_embeddings_dict, text_embeddings_dict, graph_embeddings_dict)
    embedding = model.predict_step(torch.tensor(audio).unsqueeze(0), torch.tensor(image).unsqueeze(0), torch.tensor(text).unsqueeze(0), torch.tensor(graph).unsqueeze(0))
    embedding = embedding.numpy(force=True).squeeze()
    embeddings.append({'id': track_id, 'embedding': embedding.tolist()})

json.dump(embeddings, open('fairouz_conf/fairouz/embeddings/combined/embeddings_contracted_dropout_all_200_epoch.json', 'w'))

# Visualization

In [28]:
embeddings_array = []
metatadata_array = []
for emb in embeddings:
    id = emb['id']
    emb = np.array(emb['embedding'])
    metadata = tracks[id]
    md = {
        "id": id,
        "track_title": metadata['track_title'],
        "artist_name": metadata['artist_name'],
        "album_name": metadata['album_name'],
        "context": ", ".join(metadata["lyrics"]['context']),
        "summary": metadata["lyrics"]['summary'],
        "emotional": ", ".join(metadata["lyrics"]['emotional']),
        "genre": metadata['genres'][0] if len(metadata['genres']) > 0 else 'None',
        "image": metadata['image'],
        "preview_url": metadata['preview_url'],
    }
    embeddings_array.append(emb)
    metatadata_array.append(md)

In [29]:
from nomic import atlas

dataset = atlas.map_data(
    data=metatadata_array,
    embeddings=np.array(embeddings_array),
    identifier="fairouz_vggish_randne_openclip_mxbai_200_epochs_contracted_dropout_all_euclidian",
)

[32m2024-03-25 20:44:35.817[0m | [1mINFO    [0m | [36mnomic.dataset[0m:[36m_create_project[0m:[36m868[0m - [1mCreating dataset `fairouz-vggish-randne-openclip-mxbai-200-epochs-contracted-dropout-all-euclidian`[0m
[32m2024-03-25 20:44:36.240[0m | [1mINFO    [0m | [36mnomic.atlas[0m:[36mmap_data[0m:[36m108[0m - [1mUploading data to Atlas.[0m
1it [00:01,  1.08s/it]
[32m2024-03-25 20:44:37.331[0m | [1mINFO    [0m | [36mnomic.dataset[0m:[36m_add_data[0m:[36m1537[0m - [1mUpload succeeded.[0m
[32m2024-03-25 20:44:37.336[0m | [1mINFO    [0m | [36mnomic.atlas[0m:[36mmap_data[0m:[36m123[0m - [1m`tyqnology/fairouz-vggish-randne-openclip-mxbai-200-epochs-contracted-dropout-all-euclidian`: Data upload succeeded to dataset`[0m
[32m2024-03-25 20:44:38.840[0m | [1mINFO    [0m | [36mnomic.dataset[0m:[36mcreate_index[0m:[36m1246[0m - [1mCreated map `fairouz_vggish_randne_openclip_mxbai_200_epochs_contracted_dropout_all_euclidian` in dataset `tyq

# Evaluation

In [30]:
import faiss

In [31]:
index = faiss.IndexFlatL2(128)
index.add(np.array(embeddings_array).astype('float32'))

In [32]:
def get_fairouz_embedding(track_id):
    audio, graph, image, text = get_modality_embeddings(track_id, audio_embeddings_dict, image_embeddings_dict, text_embeddings_dict, graph_embeddings_dict)
    embedding = model.predict_step(torch.tensor(audio).unsqueeze(0), torch.tensor(image).unsqueeze(0), torch.tensor(text).unsqueeze(0), torch.tensor(graph).unsqueeze(0))
    embedding = embedding.numpy(force=True).squeeze()
    return embedding

In [33]:
def get_positives(track_id):
    return load[track_id]["positives"]

def get_negatives(track_id):
    return load[track_id]["negatives"]

In [34]:
from torcheval.metrics.functional.ranking import retrieval_precision

In [35]:
def evaluate(track_id, k=10):
    audio, graph, image, text = get_modality_embeddings(track_id, audio_embeddings_dict, image_embeddings_dict, text_embeddings_dict, graph_embeddings_dict)
    embedding = model.predict_step(torch.tensor(audio).unsqueeze(0), torch.tensor(image).unsqueeze(0), torch.tensor(text).unsqueeze(0), torch.tensor(graph).unsqueeze(0))
    embedding = embedding.numpy(force=True)
    D, I = index.search(embedding, k)
    distances = D[0]
    normalized_distances = (distances - np.min(distances)) / (np.max(distances) - np.min(distances))
    m = nn.Softmax(dim=0)
    similarity = m(torch.tensor([1-d for d in normalized_distances]))
    positives = get_positives(track_id)
    ids = [metatadata_array[i]["id"] for i in I[0]]
    target = [1 if id in positives else 0 for id in ids]
    return similarity, torch.tensor(target)

In [37]:
p_at_10 = []
p_at_15 = []
p_at_20 = []
p_at_25 = []
for track_id in track_ids:
    sim, target = evaluate(track_id, 10)
    p_at_10.append(retrieval_precision(sim, target, 10))
    sim, target = evaluate(track_id, 15)
    p_at_15.append(retrieval_precision(sim, target, 15))
    sim, target = evaluate(track_id, 20)
    p_at_20.append(retrieval_precision(sim, target, 20))
    sim, target = evaluate(track_id, 25)
    p_at_25.append(retrieval_precision(sim, target, 25))

print("Precision@10", np.mean(p_at_10))
print("Precision@15", np.mean(p_at_15))
print("Precision@20", np.mean(p_at_20))
print("Precision@25", np.mean(p_at_25))



Precision@10 0.07177616
Precision@15 0.05993512
Precision@20 0.052858885
Precision@25 0.048175182


In [1]:
p_at_4 = []
for track_id in track_ids:
    sim, target = evaluate(track_id, 4)
    p_at_4.append(retrieval_precision(sim, target, 4))

print("Precision@4", np.mean(p_at_4))

NameError: name 'track_ids' is not defined

In [None]:
def distance_between(track_id1, track_id2):
    return F.pairwise_distance(torch.tensor(get_fairouz_embedding(track_id1)).unsqueeze(0), torch.tensor(get_fairouz_embedding(track_id2)).unsqueeze(0), p=2)