In [1]:
import gensim
import nltk
from nltk.translate.bleu_score import sentence_bleu
from scipy.spatial.distance import cosine
import numpy as np
from gensim.models import KeyedVectors
from gensim.similarities import WmdSimilarity

# Load the pre-trained Word2Vec embeddings from disk (binary word2vec format).
# The result is a gensim KeyedVectors instance, which is the type
# calculate_similarity_metrics documents for its `word2vec_model` argument.
word2vec_model = KeyedVectors.load_word2vec_format(
    'models/GoogleNews-vectors-negative300.bin.gz',
    binary=True,
)

def calculate_similarity_metrics(text1, text2, word2vec_model):
    """
    Compare two texts using cosine similarity, WMD, and BLEU.

    Both texts are lower-cased and split on whitespace. Cosine similarity is
    computed between the mean embedding of each text's in-vocabulary tokens
    and is NaN when either text has no in-vocabulary token. WMD and BLEU are
    computed on the full token lists; for BLEU, ``text1`` is the single
    reference and ``text2`` is the hypothesis.

    Parameters:
    text1 (str): The first text (used as the BLEU reference).
    text2 (str): The second text (used as the BLEU hypothesis).
    word2vec_model (gensim.models.KeyedVectors): Pre-trained Word2Vec model.

    Returns:
    dict: A dictionary with the keys 'Cosine Similarity', 'WMD', and
    'BLEU Score'.
    """
    def embed_known(tokens):
        # Keep only the tokens the model actually has a vector for.
        return [word2vec_model[token] for token in tokens if token in word2vec_model]

    reference_tokens = text1.lower().split()
    candidate_tokens = text2.lower().split()

    reference_vectors = embed_known(reference_tokens)
    candidate_vectors = embed_known(candidate_tokens)

    # Cosine similarity needs at least one embedded token on each side;
    # otherwise there is no mean vector to compare.
    if not reference_vectors or not candidate_vectors:
        cosine_sim = float('nan')
    else:
        reference_centroid = np.mean(reference_vectors, axis=0)
        candidate_centroid = np.mean(candidate_vectors, axis=0)
        # scipy's `cosine` is a distance, so convert it to a similarity.
        cosine_sim = 1 - cosine(reference_centroid, candidate_centroid)

    results = {'Cosine Similarity': cosine_sim}

    # WMD is computed on the raw token lists (not the filtered vectors).
    results['WMD'] = word2vec_model.wmdistance(reference_tokens, candidate_tokens)

    # `sentence_bleu` takes a list of tokenized references plus one tokenized
    # hypothesis, hence the extra list around the reference tokens.
    results['BLEU Score'] = sentence_bleu([reference_tokens], candidate_tokens)

    return results

In [None]:
# Create a new instance of the LyricsGenerator from `lstm_model`, a vocabulary
# built from `stoi`/`itos`, and `device`.
# NOTE(review): `Vocabulary`, `LyricsGenerator`, `stoi`, `itos`, `lstm_model`
# and `device` are not defined in the cells visible here — they must come from
# an earlier cell or import for this to survive Restart & Run All.
# `stoi`/`itos` are presumably token->index and index->token maps — TODO confirm.
vocab = Vocabulary(stoi, itos)
generator = LyricsGenerator(lstm_model, vocab, device)


In [None]:
# Generate a song, seeding the generator with a single starting word.
# NOTE(review): 'BOS' is presumably the begin-of-sequence token — confirm it
# exists in the vocabulary.
# NOTE(review): `song` is later passed to calculate_similarity_metrics, which
# calls `.lower()` on it, so `generate` is assumed to return a str — TODO confirm.
initial_word = 'BOS'  # Starting word for song generation
song = generator.generate(initial_word)

In [None]:
import pandas as pd
# Load the training set from CSV.
train = pd.read_csv('data/lyrics_train_set2.csv')

# Take the 'lyrics' value of the row labelled 0 to use as the comparison text,
# then display it as the cell output.
lyrics = train.loc[0, 'lyrics']
lyrics

In [None]:
# Calculate similarity metrics between a training lyric (`lyrics`) and the
# generated song (`song`).
# The helper requires the embedding model as its third argument and returns a
# dict keyed by metric name, so read the values by key instead of
# tuple-unpacking (unpacking a dict would yield its keys, not its values).
# NOTE(review): assumes `song` is a str, since the helper calls `.lower()` on
# it — confirm against LyricsGenerator.generate.
metrics = calculate_similarity_metrics(lyrics, song, word2vec_model)
cos = metrics['Cosine Similarity']
wmd = metrics['WMD']
bleu = metrics['BLEU Score']
print(f"Cosine Similarity: {cos:.4f}")
print(f"WMD: {wmd:.4f}")
print(f"BLEU Score: {bleu:.4f}")