[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1zP8qlM5UPIxnShqmBOhLzPTwAW0lkoB7?usp=sharing)

# Connect to device

In [None]:
import torch
# Prefer the first CUDA GPU, but fall back to the CPU so the notebook still
# runs on machines without one.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
# Cell output: the GPU's name, or 'cpu' when no CUDA device is available
# (querying the device name without CUDA would raise).
torch.cuda.get_device_name(0) if device.type == 'cuda' else 'cpu'

'NVIDIA GeForce RTX 3080 Ti'

In [None]:
# !pip install datasets

# Image and music similarity

In [None]:
from datasets import load_dataset

# Load the image-to-music dataset by its Hugging Face identifier; a local
# cached copy is reused when one exists.
dataset = load_dataset('AnyaSchen/image2music_abc')

  from .autonotebook import tqdm as notebook_tqdm
Found cached dataset parquet (/home/revolt/.cache/huggingface/datasets/AnyaSchen___parquet/AnyaSchen--image2music_abc-784eee9f15716c2e/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
100%|█████████████████████████████████████████████████| 1/1 [00:00<00:00, 399.99it/s]


In [None]:
import torch
from torchvision.transforms import Compose, Resize, CenterCrop, ToTensor, Normalize
from PIL import Image
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
import pickle

In [None]:
def save_embeddings(embeddings_dict: dict, path: str) -> None:
  """Pickle `embeddings_dict` to `path` and print a confirmation line.

  Missing parent directories of `path` are created first, so saving to a
  location such as './embeddings/x.pkl' works on a fresh checkout.

  Args:
    embeddings_dict: dictionary to serialise (e.g. name -> embedding array).
    path: destination file path; an existing file is overwritten.
  """
  # Local import: keeps the function self-contained without touching the
  # notebook's import cells.
  import os

  parent_dir = os.path.dirname(path)
  if parent_dir:  # empty for a bare filename -> nothing to create
    os.makedirs(parent_dir, exist_ok=True)
  with open(path, 'wb') as f:
    pickle.dump(embeddings_dict, f)
  print(f'Saved {path}')

In [None]:
# Display the dataset summary (splits, feature names, row counts).
dataset

DatasetDict({
    train: Dataset({
        features: ['image', 'music', 'genre'],
        num_rows: 1003
    })
})

In [None]:
# Image encoder, moved to the selected device. Its embeddings are compared
# with the music-text embeddings later in the notebook.
img_model = SentenceTransformer('clip-ViT-B-32').to(device)

# images = [dataset['train'][i]['image'] for i in range(dataset['train'].num_rows)]

# Encode the whole 'image' column of the train split in one call.
image_embedding = img_model.encode(dataset['train']['image'])

In [None]:
# Persist the image embeddings under the 'image' key so they can be reloaded
# without re-encoding.
# NOTE(review): the recorded cell output shows './embeddings_image_music.pkl',
# which does not match this path -- the output looks stale; re-run to confirm.
save_embeddings({'image': image_embedding}, './embeddings/image_music.pkl')

Saved ./embeddings_image_music.pkl


In [None]:
def load_music(path):
  """Read the pickle file at `path` and return the object stored in it.

  Despite the name, nothing here is music-specific: whatever object was
  pickled into the file is returned unchanged.

  Unpickling can execute arbitrary code, so only use this on files you
  created yourself or otherwise trust.
  """
  with open(path, 'rb') as pkl_file:
    return pickle.load(pkl_file)

In [None]:
# Drop the image model; it is not used again in the cells below.
# Re-running this cell alone raises NameError because the name is already gone.
del img_model

# Get embeddings for music

In [None]:
# Load the pre-trained multilingual CLIP text model and move it to the selected device.
# It is used below to embed the music strings (generated and real) that are
# compared with the image embeddings.
model = SentenceTransformer('sentence-transformers/clip-ViT-B-32-multilingual-v1').to(device)

In [None]:
def from_pkl_to_list(data_dict, authors=None):
  """Flatten the per-author entries of `data_dict` into one list.

  The author sequence is split into runs of consecutive equal labels. For
  each run, in order, the items of `data_dict[author]` are appended once.
  Every run is flushed, including the final one.

  Args:
    data_dict: mapping from author label to an iterable of items.
    authors: optional sequence of author labels, one per dataset row. When
      omitted, it is read from the 'author' column of the notebook-level
      `dataset['train']`.

  Returns:
    A list with the items of every run, in order; empty when there are no
    rows.

  Raises:
    KeyError: if an author label is missing from `data_dict`.
  """
  # Local import: keeps the function self-contained without touching the
  # notebook's import cells.
  from itertools import groupby

  if authors is None:
    # NOTE(review): the dataset summary printed in this notebook lists only
    # the features 'image', 'music' and 'genre', so this lookup is expected
    # to fail for it -- pass `authors` explicitly or confirm the column name.
    authors = dataset['train']['author']

  data_list = []
  # groupby yields one key per run of equal neighbours, the last run included.
  for author, _ in groupby(authors):
    data_list.extend(data_dict[author])
  return data_list

In [None]:
# Embed the generated music stored under the 'music' key of the pickle
# produced elsewhere in the project.
generated_music = model.encode(load_music('../generate/music.pkl')['music'])
# Embed the reference music column of the train split with the same model.
real_music_embedding = model.encode(dataset['train']['music'])

In [None]:
# Persist both sets of music embeddings in one pickle:
#   'vit-bart-image2music' -> embeddings of the generated music
#   'real'                 -> embeddings of the dataset's music column
save_embeddings({
    'vit-bart-image2music': generated_music,
    'real': real_music_embedding
    }, './embeddings/music_for_image.pkl')

Saved ./embeddings_music_for_image.pkl


# Cosine similarity

In [None]:
# Reload the saved music embeddings (dict with keys 'vit-bart-image2music' and 'real').
# NOTE(review): `music` is not referenced by the visible cells below, which
# use the in-kernel `real_music_embedding` instead.
music = load_music('./embeddings/music_for_image.pkl')

In [None]:
# Reload the saved image embeddings (dict with key 'image'); `load_music` is a plain pickle loader.
# NOTE(review): `image` is not referenced by the visible cells below, which
# use the in-kernel `image_embedding` instead.
image = load_music('./embeddings/image_music.pkl')

In [None]:
# Rebind `device` to the default CUDA device.
# NOTE(review): the similarity function below moves its inputs to the CPU and
# never reads `device`, so this cell appears to have no effect on the result.
device = torch.device('cuda')

In [None]:
def avg_cosine_similarity(embeddings1, embeddings2):
    """Average best-match cosine similarity from `embeddings1` to `embeddings2`.

    For every row of `embeddings1`, the cosine similarity to each row of
    `embeddings2` is computed and the largest value is kept. The result is
    the mean of those per-row maxima. Rows are not compared index-to-index,
    and the measure is not symmetric in its two arguments.

    Args:
        embeddings1: 2-D array-like (NumPy array, tensor or nested list) of
            shape (n, d).
        embeddings2: 2-D array-like of shape (m, d).

    Returns:
        The mean of the per-row maximum similarities, as a Python float.

    Raises:
        ValueError: if either input is not 2-D, has no rows, or the two
            inputs differ in their number of columns.
    """
    # as_tensor avoids the copy (and the warning) torch.tensor() produces when
    # given an existing tensor. Working in float64 on the CPU keeps the
    # arithmetic independent of the inputs' dtype and device.
    queries = torch.as_tensor(embeddings1).detach().cpu().double()
    candidates = torch.as_tensor(embeddings2).detach().cpu().double()

    if queries.dim() != 2 or candidates.dim() != 2:
        raise ValueError('embeddings1 and embeddings2 must both be 2-D')
    if queries.shape[0] == 0 or candidates.shape[0] == 0:
        raise ValueError('embeddings1 and embeddings2 must both be non-empty')
    if queries.shape[1] != candidates.shape[1]:
        raise ValueError('embeddings1 and embeddings2 must have the same number of columns')

    # Unit-normalise the rows so a single matrix product gives all pairwise
    # cosine similarities; all-zero rows stay zero and score 0 against everything.
    queries = torch.nn.functional.normalize(queries, dim=1)
    candidates = torch.nn.functional.normalize(candidates, dim=1)
    similarities = queries @ candidates.T  # shape (n, m)

    best_per_query = similarities.max(dim=1).values
    return best_per_query.mean().item()

In [None]:
# Report the score for real music vs. images: for each music embedding the
# best-matching image similarity is taken, then averaged over all music rows.
# NOTE(review): this uses the in-kernel `real_music_embedding` / `image_embedding`
# variables, not the `music` / `image` dicts reloaded from disk above.
print(f"The cosine similarity between the image and real_music is: {avg_cosine_similarity(real_music_embedding, image_embedding)}")

The cosine similarity between the image and real_music is: 0.25164726449330804
