This jupiter notebook has a content recommender system. The system is based only on the description of the artist's genres, without reference to the year of release, song characteristics, etc.

In [1]:
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [2]:
# load data
data = pd.read_csv('data.csv', ).drop('id', axis=1)
data.head().T

Unnamed: 0,0,1,2,3,4
valence,0.0594,0.963,0.0394,0.165,0.253
year,1921,1921,1921,1921,1921
acousticness,0.982,0.732,0.961,0.967,0.957
artists,"['Sergei Rachmaninoff', 'James Levine', 'Berli...",['Dennis Day'],['KHP Kridhamardawa Karaton Ngayogyakarta Hadi...,['Frank Parker'],['Phil Regan']
danceability,0.279,0.819,0.328,0.275,0.418
duration_ms,831667,180533,500062,210000,166693
energy,0.211,0.341,0.166,0.309,0.193
explicit,0,0,0,0,0
instrumentalness,0.878,0.0,0.913,0.000028,0.000002
key,10,7,3,5,3


In [3]:
# cast to date to year
data['release_date'] = [x[:4] for x in data.release_date]
data.release_date = data.release_date.astype('int64')

In [4]:
# data.info()

In [5]:
# data.fillna(0, inplace=True)

In [6]:
data.drop_duplicates(['name', 'artists'], inplace=True)

In [7]:
# data.shape

In [8]:
# data.artists.nunique()

In [9]:
def clear(text):
    '''
    converts string that appears to be list to an actual list after removing ',",[,] symbols and splitting it by ,
    
    Parameter:
        text (string )- string that need to be converted into list
    
    Returns: 
        l (List) - returns list of the string
    
    '''
    # text = re.sub(r" ","_",text)
    text = re.sub("[\'\"\[\]]","",text).split(", ")
    return text

def clear_v2(text):
    '''
    converts string that appears to be list to an actual list after removing ',",[,] symbols and splitting it by ,
    
    Parameter:
        text (string )- string that need to be converted into list
    
    Returns: 
        l (string) - returns string
    
    '''
    # text = re.sub(r" ","_",text)
    # print(text)
    text = re.sub("[\'\"\[\]]","", text)
    return text    

def clear_v1(text):
    '''
    converts string that appears to be list to an actual list after removing ',",[,] symbols and splitting it by ,
    
    Parameter:
        text (string )- string that need to be converted into list
    
    Returns: 
        l (List) - returns list of the string
    
    '''
    # text = re.sub(r" ","_",text)
    text = re.sub("[\'\"\[\]]","",text).split(", ")
    return text[0]

Due to the fact that there can be several authors in the artist cell, it is necessary to separate them into separate cells or use the clear_in function clear_v1

In [10]:
data.artists = data.artists.apply(clear)
data.head().T

Unnamed: 0,0,1,2,3,4
valence,0.0594,0.963,0.0394,0.165,0.253
year,1921,1921,1921,1921,1921
acousticness,0.982,0.732,0.961,0.967,0.957
artists,"[Sergei Rachmaninoff, James Levine, Berliner P...",[Dennis Day],[KHP Kridhamardawa Karaton Ngayogyakarta Hadin...,[Frank Parker],[Phil Regan]
danceability,0.279,0.819,0.328,0.275,0.418
duration_ms,831667,180533,500062,210000,166693
energy,0.211,0.341,0.166,0.309,0.193
explicit,0,0,0,0,0
instrumentalness,0.878,0.0,0.913,0.000028,0.000002
key,10,7,3,5,3


In [11]:
data = data.explode("artists",ignore_index=True)
data.head()

Unnamed: 0,valence,year,acousticness,artists,danceability,duration_ms,energy,explicit,instrumentalness,key,liveness,loudness,mode,name,popularity,release_date,speechiness,tempo
0,0.0594,1921,0.982,Sergei Rachmaninoff,0.279,831667,0.211,0,0.878,10,0.665,-20.096,1,"Piano Concerto No. 3 in D Minor, Op. 30: III. ...",4,1921,0.0366,80.954
1,0.0594,1921,0.982,James Levine,0.279,831667,0.211,0,0.878,10,0.665,-20.096,1,"Piano Concerto No. 3 in D Minor, Op. 30: III. ...",4,1921,0.0366,80.954
2,0.0594,1921,0.982,Berliner Philharmoniker,0.279,831667,0.211,0,0.878,10,0.665,-20.096,1,"Piano Concerto No. 3 in D Minor, Op. 30: III. ...",4,1921,0.0366,80.954
3,0.963,1921,0.732,Dennis Day,0.819,180533,0.341,0,0.0,7,0.16,-12.441,1,Clancy Lowered the Boom,5,1921,0.415,60.936
4,0.0394,1921,0.961,KHP Kridhamardawa Karaton Ngayogyakarta Hadini...,0.328,500062,0.166,0,0.913,3,0.101,-14.85,1,Gati Bali,5,1921,0.0339,110.339


In [12]:
# data.info()

In [13]:
# data.artists.tolist()

In [14]:
# load w_genres file
genres_w = pd.read_csv('data_w_genres.csv')

In [15]:
genres_w.head().T

Unnamed: 0,0,1,2,3,4
genres,['show tunes'],[],[],[],[]
artists,"""Cats"" 1981 Original London Cast","""Cats"" 1983 Broadway Cast","""Fiddler On The Roof” Motion Picture Chorus","""Fiddler On The Roof” Motion Picture Orchestra","""Joseph And The Amazing Technicolor Dreamcoat""..."
acousticness,0.590111,0.862538,0.856571,0.884926,0.510714
danceability,0.467222,0.441731,0.348286,0.425074,0.467143
duration_ms,250318.555556,287280.0,328920.0,262890.962963,270436.142857
energy,0.394003,0.406808,0.286571,0.24577,0.488286
instrumentalness,0.0114,0.081158,0.024593,0.073587,0.0094
liveness,0.290833,0.315215,0.325786,0.275481,0.195
loudness,-14.448,-10.69,-15.230714,-15.63937,-10.236714
speechiness,0.210389,0.176212,0.118514,0.1232,0.098543


In [16]:
genres_w.artists = genres_w.artists.apply(clear_v2)
genres_w.genres = genres_w.genres.apply(clear_v2)
genres_w.head().T

Unnamed: 0,0,1,2,3,4
genres,show tunes,,,,
artists,Cats 1981 Original London Cast,Cats 1983 Broadway Cast,Fiddler On The Roof” Motion Picture Chorus,Fiddler On The Roof” Motion Picture Orchestra,Joseph And The Amazing Technicolor Dreamcoat 1...
acousticness,0.590111,0.862538,0.856571,0.884926,0.510714
danceability,0.467222,0.441731,0.348286,0.425074,0.467143
duration_ms,250318.555556,287280.0,328920.0,262890.962963,270436.142857
energy,0.394003,0.406808,0.286571,0.24577,0.488286
instrumentalness,0.0114,0.081158,0.024593,0.073587,0.0094
liveness,0.290833,0.315215,0.325786,0.275481,0.195
loudness,-14.448,-10.69,-15.230714,-15.63937,-10.236714
speechiness,0.210389,0.176212,0.118514,0.1232,0.098543


Merge data and genres_w file

In [17]:
data = data.merge(genres_w[['genres', 'artists']], how='left', on='artists')
data.head().T

Unnamed: 0,0,1,2,3,4
valence,0.0594,0.0594,0.0594,0.963,0.0394
year,1921,1921,1921,1921,1921
acousticness,0.982,0.982,0.982,0.732,0.961
artists,Sergei Rachmaninoff,James Levine,Berliner Philharmoniker,Dennis Day,KHP Kridhamardawa Karaton Ngayogyakarta Hadini...
danceability,0.279,0.279,0.279,0.819,0.328
duration_ms,831667,831667,831667,180533,500062
energy,0.211,0.211,0.211,0.341,0.166
explicit,0,0,0,0,0
instrumentalness,0.878,0.878,0.878,0.0,0.913
key,10,10,10,7,3


In [18]:
# data[data.genres.isnull()]

In [19]:
# # remove gaps
data.genres.fillna('', inplace=True)
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 211068 entries, 0 to 211067
Data columns (total 19 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   valence           211068 non-null  float64
 1   year              211068 non-null  int64  
 2   acousticness      211068 non-null  float64
 3   artists           211068 non-null  object 
 4   danceability      211068 non-null  float64
 5   duration_ms       211068 non-null  int64  
 6   energy            211068 non-null  float64
 7   explicit          211068 non-null  int64  
 8   instrumentalness  211068 non-null  float64
 9   key               211068 non-null  int64  
 10  liveness          211068 non-null  float64
 11  loudness          211068 non-null  float64
 12  mode              211068 non-null  int64  
 13  name              211068 non-null  object 
 14  popularity        211068 non-null  int64  
 15  release_date      211068 non-null  int64  
 16  speechiness       21

In [20]:
start_year = 2014 

In [21]:
temp = data[data.year >= start_year].reset_index(drop=True)
temp.head().T

Unnamed: 0,0,1,2,3,4
valence,0.591,0.463,0.51,0.584,0.211
year,2014,2014,2014,2014,2014
acousticness,0.0489,0.301,0.431,0.0751,0.22
artists,Ariana Grande,J. Cole,Vance Joy,J. Cole,Ty Dolla $ign
danceability,0.525,0.692,0.484,0.517,0.805
duration_ms,204093,292987,204280,239320,242983
energy,0.621,0.521,0.731,0.705,0.33
explicit,0,1,0,1,1
instrumentalness,0.0,0.0,0.0,0.0,0.0
key,7,10,1,6,1


In [22]:
# search numeric columns
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
num_cols = temp.select_dtypes(include=numerics).columns.tolist()

In [23]:
# use scaler
scaler = StandardScaler()
# scaler = MinMaxScaler()

In [24]:
temp[num_cols] = scaler.fit_transform(temp[num_cols])
temp.head()

Unnamed: 0,valence,year,acousticness,artists,danceability,duration_ms,energy,explicit,instrumentalness,key,liveness,loudness,mode,name,popularity,release_date,speechiness,tempo,genres
0,0.598452,-1.544905,-0.73388,Ariana Grande,-0.66807,-0.203988,0.016145,-0.824991,-0.312036,0.484645,0.693312,-0.001137,0.815546,Santa Tell Me,1.568205,-1.544905,-0.063551,2.311953,"pop, post-teen pop"
1,0.046881,-1.544905,0.18326,J. Cole,0.307359,1.141832,-0.47356,1.212134,-0.312036,1.312028,-0.845995,-0.251229,-1.226173,No Role Modelz,1.436121,-1.544905,1.724558,-0.660965,"conscious hip hop, hip hop, north carolina hip..."
2,0.249411,-1.544905,0.656201,Vance Joy,-0.907546,-0.201156,0.55482,-0.824991,-0.312036,-1.17012,-0.233513,0.151053,0.815546,Riptide,1.039872,-1.544905,-0.716127,-0.621824,"folk-pop, modern rock, pop, pop rock"
3,0.568288,-1.544905,-0.638565,J. Cole,-0.714797,0.329335,0.427497,1.212134,-0.312036,0.208851,-0.382583,-0.19217,-1.226173,Wet Dreamz,1.105914,-1.544905,2.00865,1.79201,"conscious hip hop, hip hop, north carolina hip..."
4,-1.039026,-1.544905,-0.111418,Ty Dolla $ign,0.967379,0.384791,-1.408896,1.212134,-0.312036,-1.17012,-0.531652,-0.307335,-1.226173,"Or Nah (feat. The Weeknd, Wiz Khalifa & DJ Mus...",1.171955,-1.544905,-0.197242,0.038622,"hip hop, pop, pop rap, r&b, trap, trap soul"


In [25]:
# temp.reset_index(inplace=True, drop=True)
titles = temp['name'] + ' | ' + temp['artists']
indices = pd.Series(temp.index, index=temp['name'])

In [26]:
# create tfidf matrix
vectorizer = TfidfVectorizer(ngram_range=(1, 3))
# vectorizer = CountVectorizer()
tfidf = vectorizer.fit_transform(temp.genres)
tfidf.shape

(18601, 9653)

In [27]:
def create_df(vector, x):
  df = pd.DataFrame(x.toarray(), columns=vector.get_feature_names())
  
  return df


In [28]:
df = create_df(vectorizer, tfidf)



In [29]:
temp = pd.concat([temp, df], axis=1)

In [30]:
temp.shape

(18601, 9672)

In [31]:
vec = temp.drop(['artists', 'name', 'genres'], axis=1).to_numpy()

In [32]:
# to estimate distances between vectors
cosine = linear_kernel(vec, vec)

In [33]:
# cosine[0]

In [34]:
def get_recommendations(title, cosine_sim=cosine, indices=indices):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:31]
    movie_indices = [i[0] for i in sim_scores]
    return titles.iloc[movie_indices]

In [37]:
get_recommendations('Santa Tell Me', cosine).head(20)

75                       Drunk On A Plane | Dierks Bentley
17162                         Be Our Guest | Mitchell Hope
17163                           Be Our Guest | Spencer Lee
17164                            Be Our Guest | Kala Balch
17165                     Be Our Guest | Marco Marinangeli
74                                      Animals | Maroon 5
2067                              Salad Days | Mac DeMarco
5721                Gonna Know We Were Here | Jason Aldean
2226                        Day Drinking | Little Big Town
2239             Qué Tal Si Eres Tu | Los Tigres Del Norte
13439                 Pilares de Cristal | Chalino Sanchez
2059                                    Just One Day | BTS
176                         I Wanna Get Better | Bleachers
169                          Leave The Night On | Sam Hunt
15158                 El Caballo De Pepe | Grupo Laberinto
2062     Cecilia And The Satellite | Andrew McMahon in ...
292                      679 (feat. Remy Boyz) | Fetty W

In [36]:
get_recommendations('Am I Wrong', cosine).head(20)

5728                                      Perdoname | DyCy
5729                            Perdoname | Adrian Delgado
13331    Rock Me - Live Version from The Motion Picture...
2226                        Day Drinking | Little Big Town
13368               Sweet Child O' Mine - Acoustic | Slash
13369    Sweet Child O' Mine - Acoustic | Myles Kennedy...
13638                   Hair (feat. Sean Paul) | Sean Paul
13637                  Hair (feat. Sean Paul) | Little Mix
9658      Will The Circle Be Unbroken - Live | Pat Monahan
9648     Will The Circle Be Unbroken - Live | Warren Ha...
9649     Will The Circle Be Unbroken - Live | Derek Trucks
9650     Will The Circle Be Unbroken - Live | Susan Ted...
9651     Will The Circle Be Unbroken - Live | Devon Allman
9652     Will The Circle Be Unbroken - Live | Robert Ra...
9653       Will The Circle Be Unbroken - Live | Jimmy Hall
9654        Will The Circle Be Unbroken - Live | Sam Moore
9655           Will The Circle Be Unbroken - Live | Keb 

as we can see, when using additional parameters for the content recommender system, the results can differ significantly.
for better prediction, it may be worth trying to apply clustering and dimensionality reduction methods.