[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1V0wgv4lGFMJm-m50UHhTObJg5QmbvE0b?usp=sharing)

# imports

In [None]:
import pandas as pd

In [None]:
from tqdm import tqdm

In [None]:
import pickle

## load real data

In [None]:
real_poetry_df = pd.read_csv('../data/poetry_keywords.csv')

In [None]:
def _keywords_to_text(raw):
  """Turn a stringified keyword list into a plain comma-separated string.

  The quoted items are pulled out by splitting on single quotes and taking
  every second piece. A value that contains no single quote is returned
  unchanged (or as '' when it is blank or '[]'), so re-running this cell on
  an already converted column no longer wipes every value to ''.
  """
  if "'" in raw:
    return ', '.join(raw.split("'")[1:-1:2])
  return '' if raw.strip() in ('', '[]') else raw

real_poetry_df['keywords'] = real_poetry_df['keywords'].apply(_keywords_to_text)

## load generated data

### Medium one-to-many

In [None]:
with open('../generated/key2poetry/medium_all_poets.pkl', 'rb') as f:
    generated_poetry_medium = pickle.load(f)

### large one-to-many

In [None]:
with open('../generated/key2poetry/large_all_poets.pkl', 'rb') as f:
    generated_poetry_large = pickle.load(f)

## from .pkl to dataframe

In [None]:
poets = real_poetry_df['author'].unique()

In [None]:
def from_pkl_to_df(data_pkl: dict):
  """Convert the unpickled generation results into one DataFrame.

  Args:
    data_pkl: mapping indexed by poet name (every name in the notebook-level
      `poets` must be present); each value is an iterable of generated
      strings. A string is split on the 'Ключевые слова:' and 'Поэзия:'
      markers to separate the keyword prompt from the poem.

  Returns:
    DataFrame with columns 'author', 'text' and 'keywords'. As before, rows
    keep their per-poet positional index (it restarts at 0 for each poet).
    With no poets the result is an empty frame with those three columns.
  """
  frames = []

  for poet in tqdm(poets):
    poetry = []
    keywords = []

    for line in data_pkl[poet]:
      # Poem: everything after the last 'Поэзия:' marker.
      poetry.append(line.split('Поэзия:')[-1])
      # Keywords: text between 'Ключевые слова:' and the first 'Поэзия:',
      # with newlines removed.
      prompt = line.split('Поэзия:')[0]
      keywords.append(prompt.split('Ключевые слова:')[-1].replace('\n',''))

    frames.append(pd.DataFrame({
        'author': [poet] * len(poetry),
        'text': poetry,
        'keywords': keywords,
    }))

  if not frames:
    return pd.DataFrame(columns=['author', 'text', 'keywords'])
  # One concat at the end instead of re-copying the accumulated frame on
  # every iteration.
  return pd.concat(frames)

generated_medium_df = from_pkl_to_df(generated_poetry_medium)
generated_large_df = from_pkl_to_df(generated_poetry_large)

100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 363.86it/s]
100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 339.00it/s]


In [None]:
generated_large_df

Unnamed: 0,author,text,keywords
0,Маяковский,А если мама\n не пускает\nв крова...,"спать, журнальчик, заставить, мальчик, мать"
1,Маяковский,"Встал Петр,\n сел у окна,\nсмотри...","вставать, детвора, отец, засыпать, ухо"
2,Маяковский,Вышла\n Керзонова\n школа\nза ...,"мочить, дрематься, выходить, школа, чай"
3,Маяковский,Приказ\n выполнить немедленно!\nПройди...,"магазин, вывеска, прочесть, пошагать, симон"
4,Маяковский,"Вышел\n Номисвыходил,\n ...","номисвыходить, наука, пять, прочесть, третий"
...,...,...,...
2202,Пушкин,Старец шел путем путем.\nПод вечер недалеко\n...,"сон, старец, ангел, плаватель, готовить"
2203,Пушкин,"Я жду тебя; нетерпеньем обуян,\nВ уме всё обд...","утомлённый, сон, ждать, исповедовать, плуг"
2204,Пушкин,"Явись мне, спаситель мира,\nУспокой души моей...","успокоить, надеяться, казнь, творец, вечный"
2205,Пушкин,"И мне надоело жить,\nХочу, чтоб мне надышатьс...","надоесть, охладеть, жизнь, хотеть, молодость"


## use sentence-transformers

In [None]:
!pip install -U sentence-transformers

In [None]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Load the pre-trained Sentence Transformers model
model = SentenceTransformer('bert-base-nli-mean-tokens')

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
def get_cos_sim(row):
  """Return the cosine similarity between a row's poem and its keywords.

  `row` must support `row['text']` and `row['keywords']`. Each value is
  encoded on its own with the notebook-level `model`, and the single entry
  of the resulting 1x1 similarity matrix is returned.
  """
  # Encode the poem first, then the keywords, each as a one-item batch.
  text_vec, keywords_vec = (
      model.encode([row[column]]) for column in ('text', 'keywords')
  )

  similarity_matrix = cosine_similarity(text_vec, keywords_vec)
  return similarity_matrix[0][0]

# Score every poem against its own keywords, one row at a time.
print('Processing real')
real_poetry_df['cos_sim'] = real_poetry_df.apply(get_cos_sim, axis = 1)
print('Processing generated medium')
generated_medium_df['cos_sim'] = generated_medium_df.apply(get_cos_sim, axis = 1)
print('Processing generated large')
generated_large_df['cos_sim'] = generated_large_df.apply(get_cos_sim, axis = 1)
print()

# Report the mean of the per-poem scores. get_cos_sim expects a single row;
# handing it a whole DataFrame makes row['text'] an entire column, so the
# number it returns is not a similarity for any poem.
print(f"The mean cosine similarity between the real poetry and original keywords is: {real_poetry_df['cos_sim'].mean()}")
print(f"The mean cosine similarity between the generated medium poetry and original keywords is: {generated_medium_df['cos_sim'].mean()}")
print(f"The mean cosine similarity between the generated large poetry and original keywords is: {generated_large_df['cos_sim'].mean()}")

The cosine similarity between the real poetry and original keywords is: 0.9467449188232422
The cosine similarity between the generated medium poetry and original keywords is: 0.9614182710647583
The cosine similarity between the generated large poetry and original keywords is: 0.9588751196861267

Processing real
Processing generated medium
Processing generated large


In [None]:
real_poetry_df['cos_sim'].describe().round(3)

count    7755.000
mean        0.915
std         0.025
min         0.680
25%         0.901
50%         0.917
75%         0.932
max         0.983
Name: cos_sim, dtype: float64

In [None]:
generated_medium_df['cos_sim'].describe().round(3)

count    7755.000
mean        0.910
std         0.023
min         0.673
25%         0.897
50%         0.912
75%         0.925
max         0.971
Name: cos_sim, dtype: float64

In [None]:
generated_large_df['cos_sim'].describe().round(3)

count    7755.000
mean        0.910
std         0.024
min         0.488
25%         0.897
50%         0.912
75%         0.925
max         0.979
Name: cos_sim, dtype: float64

In [None]:
%cd generating_poetry

/home/revolt/generating_poetry


# Connect to device

In [None]:
import os 
os.environ['CUDA_VISIBLE_DEVICES'] = '1'
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

In [None]:
import torch
device = torch.device('cuda:0')
torch.cuda.get_device_name(0)

'Quadro RTX 6000'

In [None]:
# !pip install datasets

# image and poetry similarity

In [None]:
from datasets import load_dataset

dataset = load_dataset('AnyaSchen/image2poetry_ru')

Found cached dataset parquet (/home/revolt/.cache/huggingface/datasets/AnyaSchen___parquet/AnyaSchen--image2poetry_ru-bd53c8b353e828ac/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
100%|█████████████████████████████████████████████████| 1/1 [00:00<00:00, 382.06it/s]


In [None]:
import torch
from torchvision.transforms import Compose, Resize, CenterCrop, ToTensor, Normalize
from PIL import Image
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
import pickle

In [None]:
def save_embeddings(embeddings_dict: dict, path:str):
  """Pickle `embeddings_dict` into the file at `path` and print a confirmation.

  The file is opened in binary write mode, so an existing file is overwritten.
  """
  with open(path, 'wb') as sink:
    pickle.Pickler(sink).dump(embeddings_dict)
  print(f'Saved {path}')

In [None]:
dataset

DatasetDict({
    train: Dataset({
        features: ['image', 'poetry', 'author'],
        num_rows: 7755
    })
})

In [None]:
img_model = SentenceTransformer('clip-ViT-B-32').to(device)

image_embedding = img_model.encode(dataset['train']['image'])

Downloading (…)d52eb/.gitattributes: 100%|███████████| 690/690 [00:00<00:00, 247kB/s]
Downloading (…)LIPModel/config.json: 100%|██████| 4.03k/4.03k [00:00<00:00, 1.94MB/s]
Downloading (…)CLIPModel/merges.txt: 100%|█████████| 525k/525k [00:00<00:00, 866kB/s]
Downloading (…)rocessor_config.json: 100%|███████████| 316/316 [00:00<00:00, 318kB/s]
Downloading pytorch_model.bin: 100%|██████████████| 605M/605M [09:14<00:00, 1.09MB/s]
Downloading (…)cial_tokens_map.json: 100%|███████████| 389/389 [00:00<00:00, 196kB/s]
Downloading (…)okenizer_config.json: 100%|███████████| 604/604 [00:00<00:00, 574kB/s]
Downloading (…)CLIPModel/vocab.json: 100%|█████████| 961k/961k [00:01<00:00, 932kB/s]
Downloading (…)859cad52eb/README.md: 100%|██████| 1.88k/1.88k [00:00<00:00, 1.86MB/s]
Downloading (…)ce_transformers.json: 100%|██████████| 116/116 [00:00<00:00, 94.3kB/s]
Downloading (…)cad52eb/modules.json: 100%|██████████| 122/122 [00:00<00:00, 98.0kB/s]


In [None]:
# Write to the location the later cell reads back ('./embeddings_image.pkl').
save_embeddings({'image': image_embedding}, './embeddings_image.pkl')

Saved ./embeddings_image.pkl


In [None]:
del img_model

# get embeddings for poetry

In [None]:
def load_poetry(path):
  """Read the pickle file at `path` and return the unpickled object."""
  # NOTE: unpickling can execute arbitrary code - only load trusted files.
  with open(path, 'rb') as source:
    return pickle.load(source)

In [None]:
# Load the pre-trained CLIP model
model = SentenceTransformer('sentence-transformers/clip-ViT-B-32-multilingual-v1').to(device)

In [None]:
%cd generating_poetry

/home/revolt/generating_poetry


In [None]:
%ls

[0m[01;34manaconda3[0m/            [01;34mgenerating_poetry[0m/  [01;34mimage2poetry_ru_2[0m/   [01;34mpoets[0m/
embeddings_image.pkl  [01;34mimage2music_min[0m/    [01;34mimage2poetry_ru_ft[0m/
[01;34mgenerating_music[0m/     [01;34mimage2poetry_ru[0m/    [01;34mpoetry_large_gpt3[0m/


In [None]:
# Merge the per-poet pickles under ../poets into one {author: poems} mapping.
# Each sub-directory <poet> is expected to hold
# generated_image2poetry_vit_rugpt2_<poet>.pkl; if two files share an author
# key, the one read later wins.
poetry_all = {}
for poet_dir in os.listdir('../poets'):
  per_poet = load_poetry(f'../poets/{poet_dir}/generated_image2poetry_vit_rugpt2_{poet_dir}.pkl')
  poetry_all.update(per_poet)

In [None]:
from tqdm import tqdm

In [None]:
def from_pkl_to_list(data_dict, authors=None):
  """Flatten per-author generated poems into one list in dataset order.

  Args:
    data_dict: mapping from author name to a sequence of generated poems.
    authors: optional sequence of author names, one per dataset row. When
      omitted, the 'author' column of the notebook-level `dataset['train']`
      is used.

  Returns:
    A list holding `data_dict[author]` for every consecutive run of the same
    author, in the order the runs occur - including the final run, which
    used to be dropped because poems were only flushed on an author change.

  Raises:
    KeyError: if an author has no entry in `data_dict`.
  """
  if authors is None:
    # Read the author column in one go instead of indexing row by row
    # (row access presumably also materialises the row's image - TODO confirm).
    authors = dataset['train']['author']

  data_list = []
  previous = object()  # sentinel that never equals a real author name
  for author in authors:
    if author != previous:
      data_list.extend(data_dict[author])
      previous = author

  return data_list

In [None]:
dataset['train']['author']

In [None]:
# Encode every set of generated poems plus the real poetry. The save cell
# that follows stores all five arrays, so each name must be defined here for
# the notebook to run from a fresh kernel.
generated_image2_one_poet = model.encode(from_pkl_to_list(poetry_all))
generated_fp16 = model.encode(from_pkl_to_list(load_poetry('../image2poetry_ru/generated_image2poetry_vit_rugpt2_fune_fp16.pkl')))
generated_fp32 = model.encode(from_pkl_to_list(load_poetry('../image2poetry_ru_2/generated_image2poetry_vit_rugpt2_fn_f32.pkl')))
generated_ft = model.encode(from_pkl_to_list(load_poetry('../image2poetry_ru_ft/generated_image2poetry_vit_rugpt2_ft.pkl')))
real_poetry_embedding = model.encode(dataset['train']['poetry'])

In [None]:
save_embeddings({
    'vit-rugpt3-medium-poet': generated_image2_one_poet,
    'vit-rugpt3-large-poetry-fp32': generated_fp32,
    'vit-rugpt3-large-poetry-fp16': generated_fp16,
    'vit-rugpt3-large-poetry-ft':generated_ft,
    'real': real_poetry_embedding
    }, './embeddings_poetry_for_image.pkl')

In [None]:
# Pull the poetry column in one call rather than indexing the dataset row by row.
real_poetries = list(dataset['train']['poetry'])

# Encode the real (dataset) poetry
poetry_embedding = model.encode(real_poetries)

# image and music similarity

In [None]:
image_embedding = load_poetry('./embeddings_image.pkl')

In [None]:
def avg_cosine_similarity(embeddings1, embeddings2):
    """Average, over rows of `embeddings1`, of the best cosine match in `embeddings2`.

    For every row of `embeddings1` the highest cosine similarity against any
    row of `embeddings2` is taken, and these maxima are averaged.
    NOTE(review): this is a best-match score, not a paired (i-th vs i-th)
    score - confirm that this is the intended metric.

    Args:
        embeddings1: 2-D array-like or tensor, one embedding per row.
        embeddings2: 2-D array-like or tensor with the same embedding width.

    Returns:
        The mean of the per-row maxima as a Python float.

    Raises:
        ValueError: if either input has no rows.
    """
    # as_tensor accepts arrays, lists and tensors without the copy warning
    # that torch.tensor() emits when handed an existing tensor.
    emb1 = torch.as_tensor(embeddings1).detach().cpu()
    emb2 = torch.as_tensor(embeddings2).detach().cpu()

    n = len(emb1)
    if n == 0 or len(emb2) == 0:
        raise ValueError("avg_cosine_similarity needs at least one embedding on each side")

    # Bring both sides to one floating dtype so the matrix product is defined.
    dtype = torch.promote_types(emb1.dtype, emb2.dtype)
    if not dtype.is_floating_point or dtype in (torch.float16, torch.bfloat16):
        dtype = torch.float32
    emb1 = emb1.to(dtype)
    emb2 = emb2.to(dtype)

    # Normalise once up front; the dot product of unit rows is their cosine.
    emb1 = torch.nn.functional.normalize(emb1, dim=1)
    emb2_t = torch.nn.functional.normalize(emb2, dim=1).T

    # Work in row chunks so the full n x m similarity matrix is never held at once.
    chunk_rows = 1024
    total_similarity = 0.0
    for start in range(0, n, chunk_rows):
        similarities = emb1[start:start + chunk_rows] @ emb2_t
        total_similarity += similarities.max(dim=1).values.double().sum().item()

    return total_similarity / n

In [None]:
avg_cosine_similarity(real_poetry_embedding,image_embedding['image'])

0.27993002415164525