In [13]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [12]:
import re
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

def lemmatize_text(text):
    return ' '.join(lemmatizer.lemmatize(w) for w in (re.sub(',', '', text).lower()).split())

In [7]:
df = pd.read_csv('../data/songs_lyrics.csv')

In [8]:
df

Unnamed: 0.1,Unnamed: 0,album_title,song_title,lyrics,lemmatize_lyrics
0,0,TaylorSwift,Picture to Burn,"State the obvious, I didnt get my perfect fant...",state the obvious i didnt get my perfect fanta...
1,1,TaylorSwift,Tim McGraw,He said the way my blue eyes shined Put those ...,he said the way my blue eye shined put those g...
2,2,TaylorSwift,The Outside,I didnt know what I would find When I went loo...,i didnt know what i would find when i went loo...
3,3,TaylorSwift,Teardrops On My Guitar,"Drew looks at me I fake a smile, so he wont se...",drew look at me i fake a smile so he wont see ...
4,4,TaylorSwift,Mary's Song (Oh My My My),She said I was seven and you were nine I looke...,she said i wa seven and you were nine i looked...
...,...,...,...,...,...
240,240,THETORTUREDPOETSDEPARTMENT_THEANTHOLOGY,I Can Fix Him (No Really I Can),The smoke cloud billows out his mouth Like a f...,the smoke cloud billow out his mouth like a fr...
241,241,THETORTUREDPOETSDEPARTMENT_THEANTHOLOGY,Whos Afraid of Little Old Me?,"The whos who of ""Whos that?"" is poised for the...","the who who of ""whos that?"" is poised for the ..."
242,242,THETORTUREDPOETSDEPARTMENT_THEANTHOLOGY,Fortnight,I was supposed to be sent away But they forgot...,i wa supposed to be sent away but they forgot ...
243,243,THETORTUREDPOETSDEPARTMENT_THEANTHOLOGY,Fresh Out The Slammer,"Now, pretty baby, Im runnin back home to you F...",now pretty baby im runnin back home to you fre...


In [36]:
# method to find search recommendations
def get_results(user_input):
    # determine the number of words in user search
    search_length = user_input.split()
    n_words = len(search_length)

    # if user is looking for phrase (trigrams to total word-grams)
    if n_words > 2:
        tfv = TfidfVectorizer(ngram_range=(3, n_words), use_idf=False)
        tfv_matrix = tfv.fit_transform(df['lemmatize_lyrics'])

    # if user is just searching for words (unigrams, bigrams)
    else:
        tfv = TfidfVectorizer(ngram_range=(1, n_words), use_idf=False)
        tfv_matrix = tfv.fit_transform(df['lemmatize_lyrics'])
    
    # fit user input 
    tfv_user = tfv.transform([lemmatize_text(user_input)])

    # calculate similarity scores between search and all songs
    cosine_similarities = cosine_similarity(tfv_user, tfv_matrix)
    cosine_similarities = cosine_similarities.flatten()

    # create matrix for all songs that match 
    matches = pd.DataFrame()
    matches['index'] = np.where(cosine_similarities > 0)[0]
    matches['score'] = cosine_similarities[matches['index']]
    matches_sorted = matches.sort_values(by='score', ascending=False)

    # output the list of similarities
    for i in range(len(matches_sorted)):
        song_index = matches_sorted['index'].iloc[i]
        print(f"Song: {df['song_title'][song_index]}, Similarity Score: {matches_sorted['score'].iloc[i]:.4f}")

In [None]:
get_results("casually cruel")

Song: Mr. Perfectly Fine (Taylor's Version) (From The Vault), Similarity Score: 0.0891
Song: Cruel Summer, Similarity Score: 0.0521
Song: All Too Well (Taylor's Version), Similarity Score: 0.0281
Song: Cornelia Street, Similarity Score: 0.0175
Song: Robin, Similarity Score: 0.0170
Song: The Black Dog, Similarity Score: 0.0143
Song: All Too Well (10 Minute Version) (Taylor's Version) (From The Vault), Similarity Score: 0.0133
Song: Eyes Open (Taylor's Version), Similarity Score: 0.0090
Song: New Romantics (Taylor's Version), Similarity Score: 0.0077
Song: Daylight, Similarity Score: 0.0074


In [None]:
# %pip install gensim
from gensim.models import Word2Vec

In [40]:
# w2v model
model_w2v = Word2Vec(sentences=df['lemmatize_lyrics'], vector_size=100, window=5, min_count=1, workers=4)

In [55]:
def get_results2(user_input):
    query_words = user_input.split()

    # Get the average vector of the query words
    query_vector = np.mean([model_w2v.wv[word] for word in query_words if word in model_w2v.wv], axis=0)
    
    # Find the most similar song lyrics based on vector similarity (cosine similarity)
    song_similarities = []
    for idx, lyrics in enumerate(df['lemmatize_lyrics']):
        # Get the average vector for each song
        song_vector = np.mean([model_w2v.wv[word] for word in lyrics if word in model_w2v.wv], axis=0)
        
        # Compute cosine similarity
        similarity = np.dot(query_vector, song_vector) / (np.linalg.norm(query_vector) * np.linalg.norm(song_vector))
        song_similarities.append((df['song_title'][idx], similarity))
    
    # Sort by similarity
    song_similarities.sort(key=lambda x: x[1], reverse=True)
    
    # Return top_n similar songs
    return song_similarities.all()

In [56]:
get_results2("casually cruel")

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()