# Artist Recommendation System #

Our objective is to make an artist recommendation system, based on the data available in [Kaggle's Spotify Dataset](https://www.kaggle.com/yamaerenay/spotify-dataset-19212020-160k-tracks?select=data_by_artist.csv).

The user will input an artist, and the program will return the 10 most similar artists. 

**Methodology**:

The recommendation system is based on 3 parts:
1. The genre of the artist
2. The artist's popularity
3. The artist's debut year

**Steps**:

1. We began by filtering the data based on the genre of the artist. All the other artists who don't share any genre in common are eliminated. Similarity is calculated using Jaccard Score.
2. Next, we calculated the similarity of the other artists to the input artist, by calculating the difference in the genre jaccard similarity score, the popularity and the debut years.
3. After scaling these differences (using the Min-Max scaler), we calculated the proximity of each artist to the main artist. The lower the final score, the closer the artist.
4. The top 10 closest artists are returned as the recommendations.


In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
df_full = pd.read_csv(r"data\data.csv", ',')

In [3]:
df_w_genres = pd.read_csv(r"data\data_w_genres.csv", ',')

In [4]:
# A quick look at the data
df_w_genres.head(5)

Unnamed: 0,artists,acousticness,danceability,duration_ms,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,popularity,key,mode,count,genres
0,"""Cats"" 1981 Original London Cast",0.575083,0.44275,247260.0,0.386336,0.022717,0.287708,-14.205417,0.180675,115.9835,0.334433,38.0,5,1,12,['show tunes']
1,"""Cats"" 1983 Broadway Cast",0.862538,0.441731,287280.0,0.406808,0.081158,0.315215,-10.69,0.176212,103.044154,0.268865,33.076923,5,1,26,[]
2,"""Fiddler On The Roof” Motion Picture Chorus",0.856571,0.348286,328920.0,0.286571,0.024593,0.325786,-15.230714,0.118514,77.375857,0.354857,34.285714,0,1,7,[]
3,"""Fiddler On The Roof” Motion Picture Orchestra",0.884926,0.425074,262890.962963,0.24577,0.073587,0.275481,-15.63937,0.1232,88.66763,0.37203,34.444444,0,1,27,[]
4,"""Joseph And The Amazing Technicolor Dreamcoat""...",0.605444,0.437333,232428.111111,0.429333,0.037534,0.216111,-11.447222,0.086,120.329667,0.458667,42.555556,11,1,9,[]


In [5]:
df_full.head()

Unnamed: 0,acousticness,artists,danceability,duration_ms,energy,explicit,id,instrumentalness,key,liveness,loudness,mode,name,popularity,release_date,speechiness,tempo,valence,year
0,0.995,['Carl Woitschach'],0.708,158648,0.195,0,6KbQ3uYMLKb5jDxLF7wYDD,0.563,10,0.151,-12.428,1,Singende Bataillone 1. Teil,0,1928,0.0506,118.469,0.779,1928
1,0.994,"['Robert Schumann', 'Vladimir Horowitz']",0.379,282133,0.0135,0,6KuQTIu1KoTTkLXKrwlLPV,0.901,8,0.0763,-28.454,1,"Fantasiestücke, Op. 111: Più tosto lento",0,1928,0.0462,83.972,0.0767,1928
2,0.604,['Seweryn Goszczyński'],0.749,104300,0.22,0,6L63VW0PibdM1HDSBoqnoM,0.0,5,0.119,-19.924,0,Chapter 1.18 - Zamek kaniowski,0,1928,0.929,107.177,0.88,1928
3,0.995,['Francisco Canaro'],0.781,180760,0.13,0,6M94FkXd15sOAOQYRnWPN8,0.887,1,0.111,-14.734,0,Bebamos Juntos - Instrumental (Remasterizado),0,1928-09-25,0.0926,108.003,0.72,1928
4,0.99,"['Frédéric Chopin', 'Vladimir Horowitz']",0.21,687733,0.204,0,6N6tiFZ9vLTSOIxkj8qKrd,0.908,11,0.098,-16.829,1,"Polonaise-Fantaisie in A-Flat Major, Op. 61",1,1928,0.0424,62.149,0.0693,1928


In [6]:
# Convert genres in the df_w_genres dataframe from list to string 
pattern = re.compile(r"\'(.*?)\'", re.IGNORECASE)
df_w_genres['genres'] = df_w_genres['genres'].map(lambda x: re.findall(pattern, x))

# Change fields of no genres to None
df_w_genres['genres'] = df_w_genres['genres'].map(lambda x: np.nan if len(x) == 0 else x)

In [7]:
# Drop all items that do not have a genre
df_w_genres = df_w_genres.drop(df_w_genres[df_w_genres['genres'].isna()].index)
df_w_genres.reset_index(inplace=True)
df_w_genres.drop('index', axis=1, inplace=True)

In [8]:
# Convert all artists to lowercase. Helps the user's input match the artist record
df_w_genres['artists']= df_w_genres['artists'].map(lambda x: x.lower())

In [9]:
df_w_genres.head(1)

Unnamed: 0,artists,acousticness,danceability,duration_ms,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,popularity,key,mode,count,genres
0,"""cats"" 1981 original london cast",0.575083,0.44275,247260.0,0.386336,0.022717,0.287708,-14.205417,0.180675,115.9835,0.334433,38.0,5,1,12,[show tunes]


In [10]:
def jaccard_score(list_1, list_2):
    '''
    Returns the jaccard score (Intersection/Union) of two iterables
    '''
    set_1 = set(list_1)
    set_2  = set(list_2)
    return len(set_1.intersection(set_2))/len(set_1.union(set_2))

In [11]:
# Can we use cosine similarity to get similiar artists, if the input artist is not in the dataset?
# A 'Did you mean...' of sorts?

In [12]:

def genre_similar_artists(input_artist):
    '''
    Returns artists that have a at least one genre that matches with the input artist 
    Will return a dataframe based on the df_w_genres index 
    '''
    # The genres of the input artist
    try:
        input_artist_genres = df_w_genres[df_w_genres['artists']==input_artist]['genres'].values[0]
    except(IndexError):
        return 'Artist not found'

    # Calculate the jaccard score of every artist, based on its similarity with the input artis
    similarity_scores = df_w_genres['genres'].map(lambda x: jaccard_score(x, input_artist_genres))

    # Filter out the 0 values (i.e. no common genres)
    similarity_scores = similarity_scores[similarity_scores>0]
    similarity_df = pd.DataFrame(similarity_scores.values, columns=['genre_similarity'], index=similarity_scores.index)
    
    return similarity_df


In [13]:
    
def add_debut_years(similarity_df):
    '''
    Returns a pandas series which has the debut year (or the year of the oldest song in the database)
    '''
    # similarity_df = similarity_df.copy()
    debut_years = []
    for i in similarity_df.index:
        artist = df_w_genres.iloc[i]['artists']
        debut = df_full[df_full['artists'].map(lambda x: artist in x)]['year'].min()
        debut_years.append(debut)
    debut_years = pd.Series(debut_years, index=similarity_df.index)
    similarity_df['year'] = debut_years
    
    return similarity_df

In [14]:
def add_popularities(similarity_df):
    '''
    Returns the popularity of the selected similar artist
    '''
    # similarity_df = similarity_df.copy()
    
    popularity = df_w_genres.iloc[similarity_df.index]['popularity']
    similarity_df['popularity'] = popularity
    return similarity_df

In [15]:
def add_artists(similarity_df):
    '''
    Adds the name of the artist to the dataframe, as the index
    '''
    # similarity_df = similarity_df.copy()
    
    similarity_df['artist'] = df_w_genres.iloc[similarity_df.index]['artists']
    similarity_df.set_index('artist', inplace=True)
    return similarity_df

In [16]:
def ranking_system(similarity_df, input_artist, weights=[0.2, 0.4, 0.4]):
    '''
    Creates a ranking system, based on the distance of each artist to the input artist
    '''
    my_artist_specs = similarity_df.loc[input_artist]
    differences = abs(similarity_df - my_artist_specs)
    # Using a min-max scaler, to ensure that all columns are of equal scale
    differences = differences.apply(lambda x: (x-x.min())/(x.max()-x.min()))
    differences = differences*weights
    
    # The score is the sum of all columns. The lowest score is the most similar
    # The input artist will have a score of 0
    differences['score'] = differences.sum(axis=1)
    return differences['score']
    

In [17]:
artist = input('Enter Artist Name: ')
artist = artist.lower()
similarity_df = genre_similar_artists(artist)
similarity_df = add_popularities(similarity_df)
similarity_df = add_debut_years(similarity_df)
similarity_df = add_artists(similarity_df)
final_score = ranking_system(similarity_df, artist)

Enter Artist Name: mudvayne


In [18]:
recommendations = final_score.sort_values().iloc[1:11].index
print(recommendations.map(lambda x:x.title()).values)

['Spineshank' 'Static-X' 'Fear Factory' 'Coal Chamber' 'Mushroomhead'
 '(Hed) P.E.' 'Adem' 'Sevendust' 'Rob Zombie' 'Orgy']
