In [1]:
import pandas as pd

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


In [16]:
import nltk
# English stop-word list passed to the TF-IDF vectorizers below.
try:
    stopwords = nltk.corpus.stopwords.words('english')
except LookupError:
    # The stopwords corpus is not installed with nltk itself; fetch it on
    # first use so the notebook still runs on a fresh environment.
    nltk.download('stopwords')
    stopwords = nltk.corpus.stopwords.words('english')

In [3]:
# import sys
# sys.path.append('../')
from functions.function_store import print_lexic_order_with_first_letter, collect_artist_list, collect_song_list, collect_full_discography


# 1. Choosing the song for comparison

In [4]:
# View the collection: fetch the artist list and print it in
# alphabetical order, grouped under each first letter.
artist_list = collect_artist_list()
print_lexic_order_with_first_letter(artist_list)



B
black_sabbath
blondie
bob_dylan

D
depeche_mode
dio
doors

E
elf

F
florence_the_machine
franz_ferdinand

G
guns_n_roses

I
imagine_dragons

K
kasabian

L
lana_del_rey

M
muse

Q
queen

R
rainbow
roxette

U
u2

W
within_temptation


1

In [5]:
# Choose the artist (must be one of the names listed above).
target_artist = "florence_the_machine"

In [39]:
# Optionally override the artist interactively. Surrounding whitespace is
# stripped, and an empty answer keeps the current `target_artist` so that
# pressing Enter does not replace a valid choice with an unusable one.
print()
print("Enter an artist name from the list above:")
entered_artist = input().strip()
if entered_artist:
    target_artist = entered_artist

florence_the_machine


In [6]:
# View the discography: print the chosen artist's songs in alphabetical
# order, grouped under each first letter.
print_lexic_order_with_first_letter(
    collect_song_list(target_artist)
)


A
Addicted To Love
All This And Heaven Too

B
Between Two Lungs
Blinding
Breaking Down

C
Cosmic Love

D
Dog Days Are Over
Drumming Song

G
Girl With One Eye

H
Heartlines
Hurricane Drunk

I
I'm Not Calling You A Liar

K
Kiss With A Fist

L
Leave My Body
Lover To Lover

M
My Boy Builds Coffins

N
Never Let Me Go
No Light No Light

O
Only If For A Night

R
Rabbit Heart (Raise It Up)

S
Seven Devils
Shake It Out
Spectrum

W
What The Water Gave Me

Y
You've Got The Love


1

In [7]:
# Choose the song that the text-based recommender will compare against.
target_song = "No Light No Light"


In [15]:
# print()
# print("Enter a song name from the list above:")
# target_song = input()

In [8]:
# Favourite songs for the user-profile recommender, as [artist, song] pairs.
target_songs_list = [
    # ["queen", "Don't Stop Me Now"],
    ["queen", "Under Pressure"],
    ["lana_del_rey", "Blue Jeans"],
    ["rainbow", "Stargazer"],
    ["guns_n_roses", "You Could Be Mine"],
]

# 2. Text-based Recommender

In [17]:

def to_recommend_lyrics(df, column, target_key):
    """Rank every row of `df` by text similarity to the row `target_key`.

    The texts in `df[column]` are converted to TF-IDF vectors (stop words
    from the module-level `stopwords` list removed; terms kept only if they
    occur in at least 2 documents and in at most 70% of them). The cosine
    similarity between the target row and every row is then computed.

    Only the target's row of the similarity matrix is computed, directly on
    the sparse TF-IDF matrix, rather than building a dense all-pairs matrix.

    Args:
        df (DataFrame): Frame holding the texts; its index supplies the
            labels of the result.
        column (str): Name of the text column to vectorize.
        target_key: Index label of the reference row. If the label occurs
            more than once, the first occurrence is used.

    Returns:
        Series: Similarity scores indexed like `df` and named `target_key`,
        sorted from highest to lowest. The target row itself is included.

    Raises:
        KeyError: If `target_key` is not present in `df.index`.
    """
    # Instantiate the vectorizer object and transform the text column
    vectorizer = TfidfVectorizer(max_df=0.7, min_df=2, stop_words=stopwords)
    tfidf_matrix = vectorizer.fit_transform(df[column])

    # Locate the target row by position; -1 marks a label that is absent
    positions = df.index.get_indexer_for([target_key])
    if len(positions) == 0 or positions[0] < 0:
        raise KeyError(target_key)
    target_pos = int(positions[0])

    # Slice (rather than index) so the target stays a 2-D, single-row matrix
    target_vector = tfidf_matrix[target_pos:target_pos + 1]
    scores = cosine_similarity(target_vector, tfidf_matrix).ravel()

    # Label the scores and sort them from highest to lowest
    similarities = pd.Series(scores, index=df.index, name=target_key)
    return similarities.sort_values(ascending=False)


In [18]:
def top_recommended_lyrics(artist, song, n):
    """Return the Top-`n` songs whose lyrics are most similar to one song.

    Args:
        artist (str): The artist name of the target song.
        song (str): The song name of the target song.
        n (int): The number of most similar songs.

    Returns:
        DataFrame: Up to `n` rows with the columns `similarity_score`,
        `artist`, `album` and `song`, ordered from most to least similar and
        indexed by rank starting at 1. The target song is never included.

    Raises:
        IndexError: If no song matches `artist` and `song`.
    """
    discography = collect_full_discography()
    matches = discography[
        (discography['artist'] == artist) &
        (discography['song'] == song)
    ].index
    if matches.empty:
        raise IndexError(
            f"song {song!r} by {artist!r} was not found in the discography"
        )
    target_key = matches[0]  # select only one key in case of song duplicates

    scores = to_recommend_lyrics(discography, 'lyrics', target_key)

    # Remove the target by label instead of assuming it sorts first: a song
    # with identical lyrics (or a floating-point tie) can outrank it.
    candidates = scores.rename('similarity_score').drop(target_key).to_frame()

    columns = ['similarity_score', 'artist', 'album', 'song']
    # max(n, 0): a negative n yields no rows rather than "all but the last |n|"
    top_n = candidates.join(discography)[columns].head(max(n, 0))

    # Keep the 1-based rank labels
    top_n.index = range(1, len(top_n) + 1)
    return top_n


In [24]:
# help(top_recommended_lyrics)

In [19]:
# Five songs whose lyrics are closest to the chosen target song.
top_recommended_lyrics(artist=target_artist, song=target_song, n=5)

Unnamed: 0,similarity_score,artist,album,song
1,0.253054,queen,Innuendo,I Want It All
2,0.252719,u2,Passengers: Original Soundtrack,Slug
3,0.242378,bob_dylan,Slow Train Coming,Precious Angel
4,0.215553,bob_dylan,Nashville Skyline,One More Night
5,0.207136,depeche_mode,Black Celebration,If You Want


# 3. User-profile Recommender

In [20]:

def to_recommend_lyrics_by_set(df, column, target_list):
    """Score every row outside `target_list` against a user profile.

    The profile is the mean TF-IDF vector of the rows labelled in
    `target_list`; each remaining row of `df` is compared with it by cosine
    similarity.

    Args:
        df (DataFrame): Frame holding the texts; its index supplies the labels.
        column (str): Name of the text column to vectorize.
        target_list (list): Index labels of the songs the user enjoyed.

    Returns:
        DataFrame: A single `similarity_score` column, indexed by the labels
        that are not in `target_list`, sorted from highest to lowest.
    """
    # TF-IDF weights for every text, one row per entry of `df`
    tfidf_model = TfidfVectorizer(max_df=0.7, min_df=2, stop_words=stopwords)
    weights = tfidf_model.fit_transform(df[column])
    tfidf_df = pd.DataFrame(
        weights.toarray(),
        index=df.index,
        columns=tfidf_model.get_feature_names_out(),
    )

    # User profile: the average TF-IDF vector of the enjoyed songs
    profile = tfidf_df.reindex(target_list).mean()

    # Candidates: every song that is not in target_list
    candidates = tfidf_df.drop(index=target_list)

    # Compare the single profile vector with each candidate row
    scores = cosine_similarity(profile.values.reshape(1, -1), candidates)
    ranking = pd.DataFrame(
        scores.T,
        index=candidates.index,
        columns=["similarity_score"],
    )

    return ranking.sort_values(by="similarity_score", ascending=False)


In [21]:
def top_recommended_lyrics_by_set(user_list, n):
    """Return the Top-`n` songs whose lyrics best match a set of songs.

    Args:
        user_list (list): The user's favourite songs as `[artist, song]`
            pairs, e.g. `[['queen', 'Under Pressure'], ...]`.
        n (int): The number of most similar songs.

    Returns:
        DataFrame: Up to `n` rows with the columns `similarity_score`,
        `artist`, `album` and `song`, ordered from most to least similar.

    Raises:
        ValueError: If `user_list` is empty.
        IndexError: If a pair in `user_list` matches no song.
    """
    if not user_list:
        # Fail early with a clear message; an empty profile would otherwise
        # surface as a NaN error from the similarity computation.
        raise ValueError("user_list must contain at least one [artist, song] pair")

    discography = collect_full_discography()
    favourite_keys = []

    for row in user_list:
        matches = discography[
            (discography['artist'] == row[0]) &
            (discography['song'] == row[1])
        ].index
        if matches.empty:
            raise IndexError(
                f"song {row[1]!r} by {row[0]!r} was not found in the discography"
            )
        favourite_keys.append(matches[0])  # select only one key in case of song duplicates

    # The result already carries a `similarity_score` column, so no rename is needed
    ranked = to_recommend_lyrics_by_set(discography, 'lyrics', favourite_keys)

    columns = ['similarity_score', 'artist', 'album', 'song']
    recommendations = ranked.join(discography).reset_index(drop=True)[columns]

    # max(n, 0): a negative n yields no rows rather than "all but the last |n|"
    return recommendations.head(max(n, 0))


In [22]:
# Five songs whose lyrics best match the user's favourite songs.
top_recommended_lyrics_by_set(user_list=target_songs_list, n=5)

Unnamed: 0,similarity_score,artist,album,song
0,0.421173,queen,Disc: 2,Another One Bites The Dust
1,0.421173,queen,Live At Wembley '86,We Are The Champions
2,0.421173,queen,Greatest Hits,Bicycle Race
3,0.421173,queen,Greatest Hits III,We Will Rock You
4,0.243328,dio,Lock Up The Wolves,My Eyes
