In [1]:
import gensim
import nltk
from nltk.translate.bleu_score import sentence_bleu
from scipy.spatial.distance import cosine
import numpy as np
from gensim.models import KeyedVectors
import pandas as pd
import sys
sys.path.append('../')
from lyrics_generator import LyricsGenerator
import pickle
from models.lstm import LSTMModel
import torch
from gensim.models import KeyedVectors
import random
from sys import path
path.append('../')
from utils.lyrics_parser import lyrics2dict
random.seed(563)


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def calculate_similarity_metrics(text1, text2, word2vec_model):
    """
    Calculates cosine similarity, and BLEU score between two texts.
    
    Parameters:
    text1 (str): The first text.
    text2 (str): The second text.
    word2vec_model (gensim.models.KeyedVectors): Pre-trained Word2Vec model.
    
    Returns:
    dict: A dictionary containing the cosine similarity, WMD, and BLEU score.
    """
    # Tokenize and vectorize texts
    tokens1 = text1.lower().split()
    tokens2 = text2.lower().split()
    
    vectors1 = [word2vec_model[word] for word in tokens1 if word in word2vec_model]
    vectors2 = [word2vec_model[word] for word in tokens2 if word in word2vec_model]
    
    # Cosine similarity
    # Avoid division by zero and ensure valid vectors for cosine similarity calculation
    if len(vectors1) > 0 and len(vectors2) > 0:
        mean_vector1 = np.mean(vectors1, axis=0)
        mean_vector2 = np.mean(vectors2, axis=0)
        cosine_sim = 1 - cosine(mean_vector1, mean_vector2)
    else:
        cosine_sim = float('nan')
    

    
    # BLEU score
    # Note: `sentence_bleu` expects a list of reference sentences, where each reference is tokenized
    bleu_score = sentence_bleu([tokens1], tokens2)
    
    return {
        'Cosine Similarity': cosine_sim,
        'BLEU Score': bleu_score
    }

In [3]:
# load midi_embeddings.pkl
with open('../data/midi_embeddings.pkl', 'rb') as f:
    midi_embeddings = pickle.load(f)

# load word2vec model
word2vec_model = KeyedVectors.load('../models/weights/word2vec-google-news-300.model')

# Load the trained LSTM model
input_size = word2vec_model.vector_size + len(list(midi_embeddings.values())[0][0])
lstm_model = LSTMModel(input_dim=input_size, hidden_dim=128, vocab_size=len(word2vec_model), num_layers=2)
lstm_model.load_state_dict(torch.load('../models/weights/lstm_int_128_2_0.001_continued_weights.pth'))

device = 'cuda'

lyrics_generator = LyricsGenerator(lstm_model, word2vec_model, midi_embeddings, device=device)
print("Lyrics Generator Initialized!")



Lyrics Generator Initialized!


In [4]:
# Generate lyrics with a specific song key and seed text
song_key = 'hello adele'
seed_text = 'BOS'
max_length = 100
temperature = 1  # Adjust for creativity

generated_lyrics = lyrics_generator.generate(song_key=song_key, seed_text=seed_text, max_length=max_length, temperature=temperature)
print("Generated Lyrics:\n", generated_lyrics)

Generated Lyrics:
 seem me may not i ve ve tried 
 i ve tried 
 doesn t matter it 
 tell me home 
 for breaking from hard 
 but the get about typical 
 at the both wondering i ve forgotten 
 i call 
 i m us 



In [5]:
# load lyrics dicts
with open('../data/lyrics_dict.pkl', 'rb') as f:
    lyric_dicts = pickle.load(f)


In [19]:
def test_similarity(songs, lyric_dicts, epochs=10):
    """
    Test the lyrics generator on a set of songs, comparing generated lyrics against original lyrics.
    Evaluates the similarity of generated lyrics to the original lyrics based on predefined metrics.
    
    Parameters:
    - songs (list of str): A list of song keys to generate lyrics for.
    - epochs (int, optional): The number of times to generate lyrics for each song. Defaults to 10.
    
    Returns:
    - pd.DataFrame: A DataFrame containing similarity metrics for generated lyrics across all songs and epochs.
    """
    
    # Convert word indices in lyrics to words using the Word2Vec model
    original_lyrics = {
        song: ' '.join([word2vec_model.index_to_key[word_index] for word_index in lyric_dicts[song]])
        .replace('EOF', '').replace('EOS', '\n').replace('BOS', '')
        for song in songs
    }
    
    # Generate lyrics for each song across specified epochs
    generated_lyrics = {
        song: [lyrics_generator.generate(song_key=song, seed_text='BOS', max_length=100, temperature=1.0) for _ in range(epochs)]
        for song in songs
    }
    
    # Calculate similarity metrics for generated lyrics against original lyrics
    similarity_metrics = {
        song: [calculate_similarity_metrics(original_lyrics[song], lyric, word2vec_model) for lyric in generated_lyrics[song]]
        for song in songs
    }
    
    # Flatten the similarity metrics into a DataFrame
    df = pd.DataFrame([
        {'song': song, **metrics}  # Assuming calculate_similarity_metrics returns a dict of metrics
        for song, metrics_list in similarity_metrics.items()
        for metrics in metrics_list
    ])
    
    return df


In [7]:
# load lyrics_test_set.csv
lyrics_test_set = pd.read_csv('../data/lyrics_test_set.csv')

test_dict, _, oov = lyrics2dict(lyrics_test_set)
print("OOV: ", oov)


100%|██████████| 5/5 [00:00<00:00, 989.97it/s]

OOV:  0.0





In [8]:
# combined set of midi and lyric keys
midi_keys = set(midi_embeddings.keys())
lyric_keys = set(lyric_dicts.keys())
test_keys = set(test_dict.keys())
train_songs = midi_keys.intersection(lyric_keys)
test_songs = midi_keys.intersection(test_keys)

In [19]:
songs = random.sample(list(train_songs), 50)
songs

['shoop shoop song cher',
 'even now barry manilow',
 'cruel summer bananarama',
 'kim eminem',
 'the real slim shady eminem',
 'manic monday the bangles',
 'zoot suit riot cherry poppin daddies',
 'brick ben folds five',
 'take it easy the eagles',
 'to love you more celine dion',
 'superstar the carpenters',
 'rock me amadeus falco',
 'how deep is your love bee gees',
 'what can i say boz scaggs',
 'son of a preacher man dusty springfield',
 'immortality celine dion',
 'dammit blink 182',
 'too hot coolio',
 'love gives love takes the corrs',
 'land of confusion genesis',
 'everything i own bread',
 'wrap it up fabulous thunderbirds',
 'attitude dancing carly simon',
 'hungry eyes eric carmen',
 'on the border al stewart',
 'only time will tell asia',
 'sweet love anita baker',
 'at your side the corrs',
 'from the bottom of my broken heart britney spears',
 'dancing with myself billy idol',
 'father of mine everclear',
 'criminal fiona apple',
 'you light up my life debby boone',
 '

In [16]:
df = test_similarity(songs, lyric_dicts=lyric_dicts, epochs=10)

The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


In [18]:
# calculate average and std for each song
df.groupby(df.song).mean().mean(), df.groupby(df.song).std().mean()

(Cosine Similarity    0.906297
 BLEU Score           0.010602
 dtype: float64,
 Cosine Similarity    0.045925
 BLEU Score           0.007836
 dtype: float64)

In [20]:
# test on test
test_df = test_similarity(test_songs, lyric_dicts=test_dict, epochs=10)


The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


In [21]:
# calculate average and std for each song
test_df.groupby(test_df.song).mean().mean(), test_df.groupby(test_df.song).std().mean()

(Cosine Similarity    8.494694e-01
 BLEU Score           4.521129e-80
 dtype: float64,
 Cosine Similarity    2.595883e-02
 BLEU Score           1.264184e-79
 dtype: float64)

In [22]:
test_keys

{'all the small things blink 182',
 'barbie girl aqua',
 'eternal flame the bangles',
 'honesty billy joel',
 'lovefool cardigans'}

In [34]:
# generate lyrics for a song
song = 'eternal flame the bangles'
lyrics = lyrics_generator.generate(song_key=song, seed_text='BOS', max_length=100, temperature=1.0)
print(lyrics)


when fire 
 in with my own by the skin truth 
 while truth like 
 should cares 
 making 
 bed when my own truth again again the baby 
 where you 
 i must t make me with my motherfuckin 
 i hear you mine name inside lives with bed wine s world is i am i ll spend for catch back apart apart fighting my 
 i 
 door 
 i do apart 
 crying who truth you means 
 tell the own bed my own 
 crying i am again 
 where the world words 

