This Jupyter notebook implements a content-based recommender system. Recommendations are based solely on the genre descriptions of each track's artists; the release year, audio characteristics of the song, etc. are not used.

In [1]:
# import lib
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [2]:
# Load the main track table and discard its `id` column straight away.
data = pd.read_csv('data.csv').drop(columns=['id'])
data.head(5).transpose()

Unnamed: 0,0,1,2,3,4
valence,0.0594,0.963,0.0394,0.165,0.253
year,1921,1921,1921,1921,1921
acousticness,0.982,0.732,0.961,0.967,0.957
artists,"['Sergei Rachmaninoff', 'James Levine', 'Berli...",['Dennis Day'],['KHP Kridhamardawa Karaton Ngayogyakarta Hadi...,['Frank Parker'],['Phil Regan']
danceability,0.279,0.819,0.328,0.275,0.418
duration_ms,831667,180533,500062,210000,166693
energy,0.211,0.341,0.166,0.309,0.193
explicit,0,0,0,0,0
instrumentalness,0.878,0.0,0.913,0.000028,0.000002
key,10,7,3,5,3


In [3]:
# Keep only the first four characters of release_date (the year) and store them as integers.
data['release_date'] = data['release_date'].str[:4].astype('int64')

In [4]:
# data.info()

In [5]:
# data.fillna(0, inplace=True)

In [6]:
# data.shape

In [7]:
# Keep only the first row for every (name, artists) combination.
data = data.drop_duplicates(subset=['name', 'artists'])

In [8]:
# rows and columns left after dropping duplicate (name, artists) pairs
data.shape

(157685, 18)

In [9]:
# data.artists.nunique()

In [10]:
def clear(text):
    '''
    Convert a stringified list (e.g. "['A', 'B']") into a real list of strings.

    Every single quote, double quote and square bracket is removed from the
    input, and what remains is split on ", ".

    Parameter:
        text (string) - string representation of a list

    Returns:
        l (List) - list of the items; an input of "[]" yields ['']

    Note:
        Quotes inside an item are removed as well ("O'Brien" -> "OBrien"), and
        an item that itself contains ", " is split into several entries.
    '''
    # Raw string: "\[" and "\]" are not valid escapes in a normal string literal
    # and trigger a SyntaxWarning on Python 3.12+.
    return re.sub(r"['\"\[\]]", "", text).split(", ")

def clear_v2(text):
    '''
    Remove every single quote, double quote and square bracket from a string.

    The result is NOT split into a list: "['pop', 'dance pop']" becomes the
    single string "pop, dance pop".

    Parameter:
        text (string) - string to clean (e.g. a stringified list or a quoted name)

    Returns:
        l (string) - the input without ', ", [ and ] characters
    '''
    # Raw string: "\[" and "\]" are not valid escapes in a normal string literal
    # and trigger a SyntaxWarning on Python 3.12+.
    return re.sub(r"['\"\[\]]", "", text)

def clear_v1(text):
    '''
    Return only the first item of a stringified list (e.g. "['A', 'B']" -> "A").

    Every single quote, double quote and square bracket is removed from the
    input, the remainder is split on ", " and the first piece is returned.

    Parameter:
        text (string) - string representation of a list

    Returns:
        l (string) - the first item; an input of "[]" yields ''
    '''
    # Raw string: "\[" and "\]" are not valid escapes in a normal string literal
    # and trigger a SyntaxWarning on Python 3.12+.
    cleaned = re.sub(r"['\"\[\]]", "", text)
    # Only the first piece is needed, so stop splitting after one separator.
    return cleaned.split(", ", 1)[0]

Because a single artists cell can contain several artists, they must either be split into separate rows (`clear` followed by `explode`) or reduced to the first artist only with `clear_v1`.

In [11]:
# Turn each stringified artist list into a real Python list.
data['artists'] = data['artists'].map(clear)
data.head(5).transpose()

Unnamed: 0,0,1,2,3,4
valence,0.0594,0.963,0.0394,0.165,0.253
year,1921,1921,1921,1921,1921
acousticness,0.982,0.732,0.961,0.967,0.957
artists,"[Sergei Rachmaninoff, James Levine, Berliner P...",[Dennis Day],[KHP Kridhamardawa Karaton Ngayogyakarta Hadin...,[Frank Parker],[Phil Regan]
danceability,0.279,0.819,0.328,0.275,0.418
duration_ms,831667,180533,500062,210000,166693
energy,0.211,0.341,0.166,0.309,0.193
explicit,0,0,0,0,0
instrumentalness,0.878,0.0,0.913,0.000028,0.000002
key,10,7,3,5,3


In [12]:
# One row per (track, artist): every entry of the artists list gets its own row,
# and the index is renumbered from 0.
data = data.explode("artists",ignore_index=True)
data.head()

Unnamed: 0,valence,year,acousticness,artists,danceability,duration_ms,energy,explicit,instrumentalness,key,liveness,loudness,mode,name,popularity,release_date,speechiness,tempo
0,0.0594,1921,0.982,Sergei Rachmaninoff,0.279,831667,0.211,0,0.878,10,0.665,-20.096,1,"Piano Concerto No. 3 in D Minor, Op. 30: III. ...",4,1921,0.0366,80.954
1,0.0594,1921,0.982,James Levine,0.279,831667,0.211,0,0.878,10,0.665,-20.096,1,"Piano Concerto No. 3 in D Minor, Op. 30: III. ...",4,1921,0.0366,80.954
2,0.0594,1921,0.982,Berliner Philharmoniker,0.279,831667,0.211,0,0.878,10,0.665,-20.096,1,"Piano Concerto No. 3 in D Minor, Op. 30: III. ...",4,1921,0.0366,80.954
3,0.963,1921,0.732,Dennis Day,0.819,180533,0.341,0,0.0,7,0.16,-12.441,1,Clancy Lowered the Boom,5,1921,0.415,60.936
4,0.0394,1921,0.961,KHP Kridhamardawa Karaton Ngayogyakarta Hadini...,0.328,500062,0.166,0,0.913,3,0.101,-14.85,1,Gati Bali,5,1921,0.0339,110.339


In [13]:
# data.info()

In [14]:
# data.artists.tolist()

Load the genres data. In this file, the `genres` column is what we are interested in.

In [15]:
# Load the genres file; only its `genres` and `artists` columns are merged into `data` later.
genres_w = pd.read_csv('data_w_genres.csv')

In [16]:
# Peek at the first rows, transposed so that every column is visible.
genres_w.head().T

Unnamed: 0,0,1,2,3,4
genres,['show tunes'],[],[],[],[]
artists,"""Cats"" 1981 Original London Cast","""Cats"" 1983 Broadway Cast","""Fiddler On The Roof” Motion Picture Chorus","""Fiddler On The Roof” Motion Picture Orchestra","""Joseph And The Amazing Technicolor Dreamcoat""..."
acousticness,0.590111,0.862538,0.856571,0.884926,0.510714
danceability,0.467222,0.441731,0.348286,0.425074,0.467143
duration_ms,250318.555556,287280.0,328920.0,262890.962963,270436.142857
energy,0.394003,0.406808,0.286571,0.24577,0.488286
instrumentalness,0.0114,0.081158,0.024593,0.073587,0.0094
liveness,0.290833,0.315215,0.325786,0.275481,0.195
loudness,-14.448,-10.69,-15.230714,-15.63937,-10.236714
speechiness,0.210389,0.176212,0.118514,0.1232,0.098543


In [17]:
# Strip quote and bracket characters from both text columns.
genres_w = genres_w.assign(
    artists=genres_w['artists'].map(clear_v2),
    genres=genres_w['genres'].map(clear_v2),
)
genres_w.head(5).transpose()

Unnamed: 0,0,1,2,3,4
genres,show tunes,,,,
artists,Cats 1981 Original London Cast,Cats 1983 Broadway Cast,Fiddler On The Roof” Motion Picture Chorus,Fiddler On The Roof” Motion Picture Orchestra,Joseph And The Amazing Technicolor Dreamcoat 1...
acousticness,0.590111,0.862538,0.856571,0.884926,0.510714
danceability,0.467222,0.441731,0.348286,0.425074,0.467143
duration_ms,250318.555556,287280.0,328920.0,262890.962963,270436.142857
energy,0.394003,0.406808,0.286571,0.24577,0.488286
instrumentalness,0.0114,0.081158,0.024593,0.073587,0.0094
liveness,0.290833,0.315215,0.325786,0.275481,0.195
loudness,-14.448,-10.69,-15.230714,-15.63937,-10.236714
speechiness,0.210389,0.176212,0.118514,0.1232,0.098543


Merge data and genres_w file

In [18]:
# Left join on the artist name: every track row is kept, and artists that are
# missing from genres_w end up with NaN in the new `genres` column.
data = data.merge(genres_w[['genres', 'artists']], how='left', on='artists')
data.head().T

Unnamed: 0,0,1,2,3,4
valence,0.0594,0.0594,0.0594,0.963,0.0394
year,1921,1921,1921,1921,1921
acousticness,0.982,0.982,0.982,0.732,0.961
artists,Sergei Rachmaninoff,James Levine,Berliner Philharmoniker,Dennis Day,KHP Kridhamardawa Karaton Ngayogyakarta Hadini...
danceability,0.279,0.279,0.279,0.819,0.328
duration_ms,831667,831667,831667,180533,500062
energy,0.211,0.211,0.211,0.341,0.166
explicit,0,0,0,0,0
instrumentalness,0.878,0.878,0.878,0.0,0.913
key,10,10,10,7,3


In [19]:
# data[data.genres.isnull()]

In [20]:
# Replace missing genres with an empty string so the column is purely text.
# The result is assigned back to the frame: calling fillna(..., inplace=True) on
# the `data.genres` accessor is chained assignment, which is deprecated in
# pandas 2.x and leaves `data` unchanged under Copy-on-Write.
data['genres'] = data['genres'].fillna('')
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 211068 entries, 0 to 211067
Data columns (total 19 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   valence           211068 non-null  float64
 1   year              211068 non-null  int64  
 2   acousticness      211068 non-null  float64
 3   artists           211068 non-null  object 
 4   danceability      211068 non-null  float64
 5   duration_ms       211068 non-null  int64  
 6   energy            211068 non-null  float64
 7   explicit          211068 non-null  int64  
 8   instrumentalness  211068 non-null  float64
 9   key               211068 non-null  int64  
 10  liveness          211068 non-null  float64
 11  loudness          211068 non-null  float64
 12  mode              211068 non-null  int64  
 13  name              211068 non-null  object 
 14  popularity        211068 non-null  int64  
 15  release_date      211068 non-null  int64  
 16  speechiness       21

In [21]:
# The entire dataset cannot be processed at once, so only tracks with
# year >= year_start are used.
year_start = 2014

In [22]:
# Restrict to tracks from `year_start` onwards and renumber the rows from 0.
temp = data.loc[data['year'].ge(year_start)].reset_index(drop=True)
temp.head(5).transpose()

Unnamed: 0,0,1,2,3,4
valence,0.591,0.463,0.51,0.584,0.211
year,2014,2014,2014,2014,2014
acousticness,0.0489,0.301,0.431,0.0751,0.22
artists,Ariana Grande,J. Cole,Vance Joy,J. Cole,Ty Dolla $ign
danceability,0.525,0.692,0.484,0.517,0.805
duration_ms,204093,292987,204280,239320,242983
energy,0.621,0.521,0.731,0.705,0.33
explicit,0,1,0,1,1
instrumentalness,0.0,0.0,0.0,0.0,0.0
key,7,10,1,6,1


In [23]:
# Build the TF-IDF matrix from the genre strings, using n-grams of one to three tokens.
vectorizer = TfidfVectorizer(ngram_range=(1, 3))
# vectorizer = CountVectorizer()
tfidf = vectorizer.fit_transform(temp.genres)
# (number of rows in temp, number of distinct n-gram features)
tfidf.shape

(18601, 9653)

In [24]:
def create_df(vector, x):
    '''
    Build a dense DataFrame from a vectorised matrix, one column per feature.

    Parameter:
        vector - fitted vectorizer that provides the feature names
        x - matrix produced by the vectorizer; must support .toarray()

    Returns:
        df (DataFrame) - dense frame whose column labels are the feature names
    '''
    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement and fall back only on older installations.
    if hasattr(vector, 'get_feature_names_out'):
        feature_names = vector.get_feature_names_out()
    else:
        feature_names = vector.get_feature_names()
    # NOTE: toarray() materialises the full dense matrix, which can be large.
    return pd.DataFrame(x.toarray(), columns=feature_names)


In [25]:
# Dense view of the TF-IDF matrix with the feature names as columns.
# NOTE(review): `df` is not referenced by the cells visible below, and the dense
# copy can be large - confirm it is still needed.
df = create_df(vectorizer, tfidf)



In [26]:
# Pairwise similarity between all rows of the TF-IDF matrix.
# TfidfVectorizer L2-normalises each row by default, so the plain dot product
# computed by linear_kernel is the cosine similarity.
cosine = linear_kernel(tfidf, tfidf)

In [27]:
# cosine[0]

In [28]:
# Display label for every row: "<track name> | <artist>".
titles = temp['name'] + ' | ' + temp['artists']
# Lookup from track name to row position in `temp`.
# A track name can occur on several rows (one row per artist, or different
# tracks sharing a name). With duplicated labels `indices[title]` would return a
# Series instead of a single position, so only the first row per name is kept.
indices = pd.Series(temp.index, index=temp['name'])
indices = indices[~indices.index.duplicated(keep='first')]

In [29]:
def get_recommendations(title, cosine_sim=cosine, indices=indices, top_n=30):
    '''
    Return the tracks most similar to the given track.

    Parameter:
        title (string) - track name to look up in `indices`
        cosine_sim - square similarity matrix; row i holds the scores of row i
            against every other row
        indices (Series) - maps track name to row position
        top_n (int) - maximum number of recommendations to return (default 30)

    Returns:
        (Series) - "name | artist" labels taken from the global `titles`,
            ordered from most to least similar

    Raises:
        KeyError - if `title` is not present in `indices`
    '''
    idx = indices[title]
    # A name that occurs on several rows yields a Series; use its first row.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]

    # sorted() is stable, so rows with equal scores keep their row order.
    ranked = sorted(enumerate(cosine_sim[idx]), key=lambda pair: pair[1], reverse=True)

    # Exclude the query row explicitly. Dropping "the first element" is not
    # reliable because other rows can tie with the query at the top score.
    neighbour_rows = [row for row, _ in ranked if row != idx][:top_n]
    return titles.iloc[neighbour_rows]

In [32]:
# Show the first 20 recommendations for 'Santa Tell Me'.
get_recommendations('Santa Tell Me', cosine).head(20)

25                              Bang Bang | Ariana Grande
42                          One Last Time | Ariana Grande
85                         Love Me Harder | Ariana Grande
90                                Problem | Ariana Grande
114                            Break Free | Ariana Grande
389                           Complicated | Olivia OBrien
617     i hate u, i love u (feat. olivia o'brien) | Ol...
652                              Into You | Ariana Grande
663                       Dangerous Woman | Ariana Grande
677                          Side To Side | Ariana Grande
706                         hate u love u | Olivia OBrien
749                      My Favorite Part | Ariana Grande
827                       Let Me Love You | Ariana Grande
911                      Sign of the Times | Harry Styles
954                         Sweet Creature | Harry Styles
992                                   Kiwi | Harry Styles
1015                                 One Day | Tate McRae
1020          

In [31]:
# Show the first 20 recommendations for 'Am I Wrong'.
get_recommendations('Am I Wrong', cosine).head(20)

13449                           In Your Arms | Nico & Vinz
13653    That's How You Know (feat. Kid Ink & Bebe Rexh...
61                                      Only | Chris Brown
93       Post to Be (feat. Chris Brown & Jhene Aiko) | ...
95            Loyal (feat. Lil Wayne & Tyga) | Chris Brown
206      New Flame (feat. Usher & Rick Ross) | Chris Brown
221              Show Me (feat. Chris Brown) | Chris Brown
490                              Do It Again | Chris Brown
1461               No Guidance (feat. Drake) | Chris Brown
1609                            Easy - Remix | Chris Brown
1686            The Take (feat. Chris Brown) | Chris Brown
1769                                Go Crazy | Chris Brown
2265                                     Ayo | Chris Brown
2276     All Eyes on You (feat. Chris Brown & Nicki Min...
3006       Freaky Friday (feat. Chris Brown) | Chris Brown
3796        Drunk Texting (feat. Jhené Aiko) | Chris Brown
3838          Main Chick (feat. Chris Brown) | Chris Bro