In [1]:
import os
import re
import pandas as pd

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


In [3]:
# import sys
# sys.path.append('../')
from functions.function_store import print_lexic_order_with_first_letter, collect_artist_list, collect_song_list, collect_full_discography


# 1. Choosing the song for comparison

In [4]:
# Show every artist available in the collection, grouped by first letter.
artist_list = collect_artist_list()

print_lexic_order_with_first_letter(artist_list)



B
black_sabbath
blondie
bob_dylan

D
depeche_mode
dio
doors

E
elf

F
florence_the_machine
franz_ferdinand

G
guns_n_roses

I
imagine_dragons

K
kasabian

L
lana_del_rey

M
muse

Q
queen

R
rainbow
roxette

U
u2

W
within_temptation


1

In [5]:
# Pick the artist whose discography will be browsed below.
target_artist = "florence_the_machine"

In [39]:
# Ask for an artist interactively; a non-blank answer replaces `target_artist`.
print()
print("Enter an artist name from the list above:")

# The name is later matched with an exact `==` against the `artist` column,
# so surrounding whitespace from typing/pasting would make the lookup fail.
entered_artist = input().strip()
if entered_artist:
    target_artist = entered_artist
else:
    # A blank answer must not wipe out an artist that was already selected.
    print("No artist entered; keeping the current selection.")

florence_the_machine


In [6]:
# List the chosen artist's songs, grouped by first letter.
print_lexic_order_with_first_letter(
    collect_song_list(target_artist)
)


A
Addicted To Love
All This And Heaven Too

B
Between Two Lungs
Blinding
Breaking Down

C
Cosmic Love

D
Dog Days Are Over
Drumming Song

G
Girl With One Eye

H
Heartlines
Hurricane Drunk

I
I'm Not Calling You A Liar

K
Kiss With A Fist

L
Leave My Body
Lover To Lover

M
My Boy Builds Coffins

N
Never Let Me Go
No Light No Light

O
Only If For A Night

R
Rabbit Heart (Raise It Up)

S
Seven Devils
Shake It Out
Spectrum

W
What The Water Gave Me

Y
You've Got The Love


1

In [7]:
# Pick the song that the recommendations will be compared against.
target_song = "No Light No Light"


In [15]:
# print()
# print("Enter a song name from the list above:")
# target_song = input()

# 2. Text-based Recommender

In [8]:

def to_recommend_lyrics(df, column, target_key):
    """Rank every row of `df` by TF-IDF cosine similarity to one target row.

    Args:
        df (DataFrame): Table whose index holds the row keys.
        column (str): Name of the text column to vectorize.
        target_key: Index label of the row all other rows are compared with.

    Returns:
        Series: Similarity scores indexed like `df` and named `target_key`,
        sorted from highest to lowest. The target row itself is included.

    Raises:
        KeyError: If `target_key` is not present in `df.index`.
        ValueError: If `target_key` occurs more than once in `df.index`.
    """
    # Validate the key before the expensive vectorization step.
    is_target = df.index == target_key
    match_count = int(is_target.sum())
    if match_count == 0:
        raise KeyError(target_key)
    if match_count > 1:
        raise ValueError(
            f"target_key {target_key!r} is ambiguous: it appears "
            f"{match_count} times in the index"
        )
    target_position = int(is_target.argmax())

    # max_df=0.7 ignores terms found in more than 70% of the texts;
    # min_df=2 ignores terms found in fewer than two texts.
    vectorizer = TfidfVectorizer(max_df=0.7, min_df=2)
    tfidf_matrix = vectorizer.fit_transform(df[column])

    # Only one row of the similarity matrix is needed, so compare the target
    # row against all rows instead of building the full N x N matrix. The
    # slice keeps the target as a 2-D (1 x n_terms) sparse matrix.
    target_vector = tfidf_matrix[target_position:target_position + 1]
    scores = cosine_similarity(target_vector, tfidf_matrix)

    # Name the Series after the target key, as callers rely on that name.
    similarities = pd.Series(scores.ravel(), index=df.index, name=target_key)

    return similarities.sort_values(ascending=False)


In [9]:
def top_recommended_lyrics(artist, song, n):
    """Return the Top-`n` songs whose lyrics are most similar to a given song.

    The target song is looked up in the full discography by exact match on
    the `artist` and `song` columns. If several rows match (for example the
    same song on different albums), the first one is used as the target and
    the remaining ones stay eligible as recommendations.

    Args:
        artist (str): The artist name.
        song (str): The song name.
        n (int): The number of most similar songs; values below 1 give an
            empty result.

    Returns:
        DataFrame: Columns `similarity_score`, `artist`, `album`, `song`,
        ordered from most to least similar and indexed by rank from 1.

    Raises:
        IndexError: If no row matches `artist` and `song`.
    """
    discography = collect_full_discography()

    is_target = (discography['artist'] == artist) & (discography['song'] == song)
    matching_keys = discography.index[is_target]
    if len(matching_keys) == 0:
        raise IndexError(
            f"No song {song!r} by artist {artist!r} found in the discography"
        )
    target_key = matching_keys[0]

    similarities = to_recommend_lyrics(discography, 'lyrics', target_key)

    # Remove the target by key, not by position: when another track has
    # identical lyrics the scores tie and the target need not be sorted first.
    top_scores = (
        similarities
        .drop(target_key)
        .head(max(n, 0))  # a negative n would make head() count from the end
        .rename('similarity_score')
        .to_frame()
    )

    ranked = (
        top_scores
        .join(discography[['artist', 'album', 'song']])
        .reset_index(drop=True)
    )
    # Keep the 1-based rank labels callers already see in the output.
    ranked.index = ranked.index + 1

    return ranked


In [10]:
# Five closest songs, by lyrics, to the song selected above.
top_recommended_lyrics(artist=target_artist, song=target_song, n=5)

Unnamed: 0,similarity_score,artist,album,song
1,0.266883,bob_dylan,Nashville Skyline,One More Night
2,0.261312,bob_dylan,Slow Train Coming,Precious Angel
3,0.235382,u2,Fire/R.O.K.,The Ocean (Live From Boston)
4,0.235279,queen,Innuendo,I Want It All
5,0.215884,u2,Passengers: Original Soundtrack,Slug


# 3. User-profile Recommender

In [16]:
# Load the complete discography table and preview its first two rows.
full_discography_df2 = collect_full_discography()
full_discography_df2.head(2)

Unnamed: 0_level_0,artist,album,song,lyrics,link
key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
doors__An_American_Prayer__A_Feast_Of_Friends,doors,An American Prayer,A Feast Of Friends,"Wow, I'm sick of doubt\nLive in the light of c...",https://www.allthelyrics.com/lyrics/doors/a_fe...
doors__Absolutely_Live__A_Little_Game,doors,Absolutely Live,A Little Game,Once I had a little game\nI liked to crawl bac...,https://www.allthelyrics.com/lyrics/doors/a_li...


In [None]:
'''
Define the set of your favourite songs
and
find the most similar songs in the rest of the full discography.
'''

In [79]:
# Each entry is [artist, album, song]; the album slot is left blank here.
my_favourite_songs_list = [
    ["queen", "", "Don't Stop Me Now"],
    ["queen", "", "Under Pressure"],
    ["lana_del_rey", "", "Blue Jeans"],
    ["rainbow", "", "Stargazer"],
    ["guns_n_roses", "", "You Could Be Mine"],
]

In [82]:
# Tabulate the favourites and keep only the artist and song columns;
# the blank album column is labelled and then discarded.
my_favourite_songs_df = (
    pd.DataFrame(my_favourite_songs_list)
    .set_axis(['artist', 'album_0', 'song'], axis=1)
    .drop(columns=['album_0'])
)
my_favourite_songs_df

Unnamed: 0,artist,song
0,queen,Don't Stop Me Now
1,queen,Under Pressure
2,lana_del_rey,Blue Jeans
3,rainbow,Stargazer
4,guns_n_roses,You Could Be Mine


In [102]:
# Look up the favourites in the discography. The frame's own index label is
# carried through the merge as a `key` column instead of being rebuilt from
# the artist/album/song text, so the keys always match the real index.
df3 = (
    full_discography_df2
    .rename_axis('key')
    .reset_index()
    .merge(my_favourite_songs_df, how='inner', on=['artist', 'song'])
    .set_index('key')
)

# Index labels of the favourite songs, in discography order.
my_favourite_songs = list(df3.index)
my_favourite_songs


['guns_n_roses__Use_Your_Illusion_II__You_Could_Be_Mine',
 "queen__Greatest_Hits__Don't_Stop_Me_Now",
 'queen__Innuendo__Under_Pressure',
 'lana_del_rey__-__Blue_Jeans',
 'rainbow__Rising__Stargazer']

In [107]:

def to_recommend_lyrics_by_set(df, target_list):
    """Score all other songs against a profile built from a set of liked songs.

    The profile is the mean TF-IDF vector of the rows listed in
    `target_list`; every remaining row is scored by cosine similarity to it.

    Args:
        df (DataFrame): Table with a `lyrics` column, indexed by song key.
        target_list (list): Index labels of the liked songs.

    Returns:
        DataFrame: One `similarity_score` column, indexed by the keys of the
        rows not in `target_list`, sorted from highest to lowest.

    Raises:
        ValueError: If `target_list` is empty.
        KeyError: If any key in `target_list` is missing from `df.index`.
    """
    # Validate up front: an empty list would give an all-NaN profile, and an
    # unknown key would only surface later when the liked rows are dropped.
    liked_keys = list(target_list)
    if not liked_keys:
        raise ValueError("target_list must contain at least one song key")
    unknown_keys = [key for key in liked_keys if key not in df.index]
    if unknown_keys:
        raise KeyError(f"Keys not found in the index: {unknown_keys}")

    # max_df=0.7 ignores terms found in more than 70% of the texts;
    # min_df=2 ignores terms found in fewer than two texts.
    vectorizer = TfidfVectorizer(max_df=0.7, min_df=2)
    vectorized_data = vectorizer.fit_transform(df['lyrics'])

    # One row per song, one column per term, labelled with the song keys.
    tfidf_df = pd.DataFrame(
        vectorized_data.toarray(),
        index=df.index,
        columns=vectorizer.get_feature_names_out(),
    )

    # User profile: the average term weights over the liked songs.
    user_prof = tfidf_df.loc[liked_keys].mean()

    # Candidates are all songs the user has not already listed.
    tfidf_subset_df = tfidf_df.drop(liked_keys, axis=0)

    # cosine_similarity expects 2-D input, hence the (1, n_terms) reshape.
    similarity_array = cosine_similarity(user_prof.values.reshape(1, -1), tfidf_subset_df)
    similarity_df = pd.DataFrame(
        similarity_array.T,
        index=tfidf_subset_df.index,
        columns=["similarity_score"],
    )

    return similarity_df.sort_values(by="similarity_score", ascending=False)



In [109]:
similar_by_set_df = to_recommend_lyrics_by_set(full_discography_df2, my_favourite_songs)

# Keep only the candidates whose similarity to the user profile reaches the
# cut-off.
# NOTE(review): this replaces `top_of_similarity(similar_by_set_df, 0.3)`, a
# helper that is neither defined nor imported in this notebook, so the cell
# failed on a fresh kernel. It is assumed to have been a score-threshold
# filter - confirm against the original helper.
SIMILARITY_THRESHOLD = 0.3
similar_df_head3 = similar_by_set_df[
    similar_by_set_df['similarity_score'] >= SIMILARITY_THRESHOLD
]

# Attach the song metadata and show a compact, rank-ordered table.
similar_df_head3.join(full_discography_df2).reset_index(drop=True)[['similarity_score','artist','album','song']]


Unnamed: 0,similarity_score,artist,album,song
0,0.40998,queen,Disc: 2,Another One Bites The Dust
1,0.40998,queen,Greatest Hits III,We Will Rock You
2,0.40998,queen,Live At Wembley '86,We Are The Champions
3,0.40998,queen,Greatest Hits,Bicycle Race
4,0.313669,bob_dylan,Knocked Out Loaded,Brownsville Girl
5,0.301823,guns_n_roses,Use Your Illusion I,Coma
