In [106]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data as data
import lightning as L
import numpy as np

In [107]:
from model_dropout import Encoder
from utils import *

In [108]:
model = Encoder.load_from_checkpoint("/workspace/fairouz/logs/vggish_randne_openclip_mxbai_contraction_4_expansion2_dropout_modality_encoders/version_0/checkpoints/epoch=199-step=40000.ckpt")

In [109]:
model.device

device(type='cuda', index=0)

In [110]:
## Reading data
import json
tracks = json.load(open('fairouz_conf/fairouz/tracks_contextualized.json'))
track_ids = list(tracks.keys())

audio_embeddings = json.load(open('fairouz_conf/fairouz/embeddings/audio/song_audio_vggish_embeddings.json'))
graph_embeddings = json.load(open('fairouz_conf/fairouz/embeddings/graph/karate_club/song_nodes_RandNE_embedding.json'))
image_embeddings = json.load(open('fairouz_conf/fairouz/embeddings/image/album_covers_openclip_embeddings.json'))
text_embeddings = json.load(open('fairouz_conf/fairouz/embeddings/lyrics/song_lyrics_mxbai_embeddings.json'))

audio_embeddings_dict = transform_dict(audio_embeddings)
graph_embeddings_dict = transform_dict(graph_embeddings)
image_embeddings_dict = transform_dict(image_embeddings)
text_embeddings_dict = transform_dict(text_embeddings)

In [111]:
import faiss

In [112]:
load = json.load(open('fairouz_conf/fairouz/positives_negatives.json'))

In [113]:
embeddings = []
for track_id in track_ids:
    audio, graph, image, text = get_modality_embeddings(
        track_id,
        audio_embeddings_dict,
        image_embeddings_dict,
        text_embeddings_dict,
        graph_embeddings_dict,
    )
    embedding = model.predict_step(
        torch.tensor(audio).unsqueeze(0).to("cuda"),
        torch.tensor(image).unsqueeze(0).to("cuda"),
        torch.tensor(text).unsqueeze(0).to("cuda"),
        torch.tensor(graph).unsqueeze(0).to("cuda"),
    )
    embedding = embedding.numpy(force=True).squeeze()
    embeddings.append({"id": track_id, "embedding": embedding.tolist()})

In [88]:
json.dump(embeddings, open('fairouz_conf/fairouz/embeddings/combined/embeddings_contracted_textimage8_graphaudio4_expansion2_200_epoch.json', 'w'))

In [114]:
embeddings_array = []
metatadata_array = []
for emb in embeddings:
    id = emb['id']
    emb = np.array(emb['embedding'])
    metadata = tracks[id]
    md = {
        "id": id,
        "track_title": metadata['track_title'],
        "artist_name": metadata['artist_name'],
        "album_name": metadata['album_name'],
        "context": ", ".join(metadata["lyrics"]['context']),
        "summary": metadata["lyrics"]['summary'],
        "emotional": ", ".join(metadata["lyrics"]['emotional']),
        "genre": metadata['genres'][0] if len(metadata['genres']) > 0 else 'None',
        "image": metadata['image'],
        "preview_url": metadata['preview_url'],
    }
    embeddings_array.append(emb)
    metatadata_array.append(md)

In [115]:
index = faiss.IndexFlatL2(128)
index.add(np.array(embeddings_array).astype('float32'))

In [116]:
def get_fairouz_embedding(track_id):
    audio, graph, image, text = get_modality_embeddings(track_id, audio_embeddings_dict, image_embeddings_dict, text_embeddings_dict, graph_embeddings_dict)
    embedding = model.predict_step(torch.tensor(audio).unsqueeze(0), torch.tensor(image).unsqueeze(0), torch.tensor(text).unsqueeze(0), torch.tensor(graph).unsqueeze(0))
    embedding = embedding.numpy(force=True).squeeze()
    return embedding

In [117]:
def get_positives(track_id):
    return load[track_id]["positives"]

def get_negatives(track_id):
    return load[track_id]["negatives"]

In [118]:
from torcheval.metrics.functional.ranking import retrieval_precision

In [119]:
def evaluate(track_id, k=10):
    audio, graph, image, text = get_modality_embeddings(
        track_id,
        audio_embeddings_dict,
        image_embeddings_dict,
        text_embeddings_dict,
        graph_embeddings_dict,
    )
    embedding = model.predict_step(
        torch.tensor(audio).unsqueeze(0).to("cuda"),
        torch.tensor(image).unsqueeze(0).to("cuda"),
        torch.tensor(text).unsqueeze(0).to("cuda"),
        torch.tensor(graph).unsqueeze(0).to("cuda"),
    )
    embedding = embedding.numpy(force=True)
    D, I = index.search(embedding, k)
    distances = D[0]
    normalized_distances = (distances - np.min(distances)) / (
        np.max(distances) - np.min(distances)
    )
    m = nn.Softmax(dim=0)
    similarity = m(torch.tensor([1 - d for d in normalized_distances]))
    positives = get_positives(track_id)
    ids = [metatadata_array[i]["id"] for i in I[0]]
    target = [1 if id in positives else 0 for id in ids]
    return similarity, torch.tensor(target)

In [120]:
p_at_10 = []
p_at_15 = []
p_at_20 = []
p_at_25 = []
for track_id in track_ids:
    sim, target = evaluate(track_id, 10)
    p_at_10.append(retrieval_precision(sim, target, 10))
    sim, target = evaluate(track_id, 15)
    p_at_15.append(retrieval_precision(sim, target, 15))
    sim, target = evaluate(track_id, 20)
    p_at_20.append(retrieval_precision(sim, target, 20))
    sim, target = evaluate(track_id, 25)
    p_at_25.append(retrieval_precision(sim, target, 25))

print("Precision@10", np.mean(p_at_10))
print("Precision@15", np.mean(p_at_15))
print("Precision@20", np.mean(p_at_20))
print("Precision@25", np.mean(p_at_25))

Precision@10 0.10437957
Precision@15 0.08759124
Precision@20 0.07585159
Precision@25 0.06583942


In [123]:
p_at_4 = []
for track_id in track_ids:
    sim, target = evaluate(track_id, 2)
    p_at_4.append(retrieval_precision(sim, target, 2))

print("Precision@4", np.mean(p_at_4))

Precision@4 0.13746959


In [105]:
print(model)

Encoder(
  (audio_encoder): Sequential(
    (0): Linear(in_features=128, out_features=32, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.2, inplace=False)
  )
  (image_encoder): Sequential(
    (0): Linear(in_features=512, out_features=128, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.2, inplace=False)
  )
  (text_encoder): Sequential(
    (0): Linear(in_features=1024, out_features=256, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.2, inplace=False)
  )
  (graph_encoder): Sequential(
    (0): Linear(in_features=128, out_features=32, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.2, inplace=False)
  )
  (combined_encoder): Sequential(
    (0): Linear(in_features=448, out_features=896, bias=True)
    (1): ReLU()
    (2): Linear(in_features=896, out_features=128, bias=True)
  )
)


In [86]:
sum(p.numel() for p in model.parameters())
# print as (X.X M)
print(f"{sum(p.numel() for p in model.parameters()) / 1e6:.1f} M")

0.4 M


In [None]:
from nomic import atlas

dataset = atlas.map_data(
    data=metatadata_array,
    embeddings=np.array(embeddings_array),
    identifier="fairouz_vggish_randne_openclip_mxbai_200_epochs_contracted_dropout_euclidian",
)