In [1]:
import pandas as pd
import numpy as np
import seaborn as sb

In [2]:
df_full = pd.read_csv(r'data/data.csv')

In [3]:
df_w_genres = pd.read_csv(r'data/data_w_genres.csv')

In [4]:
df_only_genres = pd.read_csv(r'data/data_by_genres.csv')

In [5]:
df_only_genres.head(1)

Unnamed: 0,genres,acousticness,danceability,duration_ms,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,popularity,key,mode
0,432hz,0.49478,0.299333,1048887.0,0.450678,0.477762,0.131,-16.854,0.076817,120.285667,0.22175,52.166667,5,1


In [6]:
df_w_genres.head(1)

Unnamed: 0,artists,acousticness,danceability,duration_ms,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,popularity,key,mode,count,genres
0,"""Cats"" 1981 Original London Cast",0.575083,0.44275,247260.0,0.386336,0.022717,0.287708,-14.205417,0.180675,115.9835,0.334433,38.0,5,1,12,['show tunes']


In [7]:
# Keep a backup
df_w_genres_raw = df_w_genres.copy()

In [8]:
df_full.head()

Unnamed: 0,acousticness,artists,danceability,duration_ms,energy,explicit,id,instrumentalness,key,liveness,loudness,mode,name,popularity,release_date,speechiness,tempo,valence,year
0,0.995,['Carl Woitschach'],0.708,158648,0.195,0,6KbQ3uYMLKb5jDxLF7wYDD,0.563,10,0.151,-12.428,1,Singende Bataillone 1. Teil,0,1928,0.0506,118.469,0.779,1928
1,0.994,"['Robert Schumann', 'Vladimir Horowitz']",0.379,282133,0.0135,0,6KuQTIu1KoTTkLXKrwlLPV,0.901,8,0.0763,-28.454,1,"Fantasiestücke, Op. 111: Più tosto lento",0,1928,0.0462,83.972,0.0767,1928
2,0.604,['Seweryn Goszczyński'],0.749,104300,0.22,0,6L63VW0PibdM1HDSBoqnoM,0.0,5,0.119,-19.924,0,Chapter 1.18 - Zamek kaniowski,0,1928,0.929,107.177,0.88,1928
3,0.995,['Francisco Canaro'],0.781,180760,0.13,0,6M94FkXd15sOAOQYRnWPN8,0.887,1,0.111,-14.734,0,Bebamos Juntos - Instrumental (Remasterizado),0,1928-09-25,0.0926,108.003,0.72,1928
4,0.99,"['Frédéric Chopin', 'Vladimir Horowitz']",0.21,687733,0.204,0,6N6tiFZ9vLTSOIxkj8qKrd,0.908,11,0.098,-16.829,1,"Polonaise-Fantaisie in A-Flat Major, Op. 61",1,1928,0.0424,62.149,0.0693,1928


In [9]:
# Convert genres from string to list 
import re
pattern = re.compile(r"\'(.*?)\'", re.IGNORECASE)
df_w_genres['genres'] = df_w_genres['genres'].map(lambda x: re.findall(pattern, x))

# Change fields of no genres to None
df_w_genres['genres'] = df_w_genres['genres'].map(lambda x: np.nan if len(x) == 0 else x)

In [10]:
# Drop all items that do not have a genre
df_w_genres = df_w_genres.drop(df_w_genres[df_w_genres['genres'].isna()].index)
df_w_genres.reset_index(inplace=True)
df_w_genres.drop('index', axis=1, inplace=True)

In [11]:
df_w_genres.head(1)

Unnamed: 0,artists,acousticness,danceability,duration_ms,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,popularity,key,mode,count,genres
0,"""Cats"" 1981 Original London Cast",0.575083,0.44275,247260.0,0.386336,0.022717,0.287708,-14.205417,0.180675,115.9835,0.334433,38.0,5,1,12,[show tunes]


In [12]:
df_w_genres['genres'].map(lambda x: len(x)).agg(['max', 'min'])

max    23
min     1
Name: genres, dtype: int64

In [13]:
class Recommendation_System():
    
    def __init__(self, input_artist, df_w_genres, df_full):
        self.input_artist = input_artist
        self.df = df_w_genres
        self.df_full = df_full
    
    @staticmethod
    def IOU(genre_1, genre_2):
        a = set(genre_1)
        b = set(genre_2)
        return len(a.intersection(b))/len(a.union(b))
    
    def same_genre_artists(self):
        '''
        Returns a pandas series with the artists which have a positive jaccard score
        The series will also have the same artist as well
        '''
        genre_to_look = self.df[self.df['artists']==self.input_artist]['genres'].values[0]
        self.jaccard_scores = self.df['genres'].map(lambda x: IOU(x, genre_2= genre_to_look))
        self.jaccard_scores = self.jaccard_scores[self.jaccard_scores>0]
        return self.jaccard_scores

    
    def get_debut(self):
        debut_years = []
        for i in self.jaccard_scores.index:
            artist = df_w_genres.iloc[i]['artists']
            debut = self.df_full[self.df_full['artists'].map(lambda x: artist in x)]['year'].min()
            debut_years.append(debut)
        self.debuts = pd.Series(debut_years, index=self.jaccard_scores.index)
    
    def get_popularities(self):
        self.popularity = df_w_genres.iloc[self.jaccard_scores.index]['popularity']
    
    
    def combine_series(self):
        artists = df_w_genres.iloc[self.jaccard_scores.index]['artists']
        similar_artists = pd.DataFrame([artists, self.debuts, self.popularity, self.jaccard_scores])
        return similar_artists.T
            
    

In [14]:
artist = 'Shakira'

In [16]:
rec = Recommendation_System(artist, df_w_genres, df_full)
rec.same_genre_artists()
rec.get_debut()
rec.get_popularities()

TypeError: IOU() missing 2 required positional arguments: 'genre_1' and 'genre_2'

In [143]:
rec_df = rec.combine_series()

In [144]:
rec_df

Unnamed: 0,artists,Unnamed 0,popularity,genres
8,((( O ))),2017,66,0.125
11,*NSYNC,1980,45.7949,0.285714
23,112,1996,48.3273,0.0666667
24,11:11,2016,57,0.0666667
42,24 Horas,2010,46,0.142857
...,...,...,...,...
17955,mxmtoon,2017,72.75,0.125
17980,t.A.T.u.,2002,52,0.166667
17988,will.i.am,1982,53.2895,0.285714
18002,Ñejo,2007,62.3333,0.1


In [22]:
from sklearn.preprocessing import MinMaxScaler

In [145]:
scaler = MinMaxScaler()
rec_df[['Unnamed 0','popularity','genres']] = scaler.fit_transform(rec_df[['Unnamed 0','popularity','genres']].to_numpy())

In [154]:
rec_df.sort_values(by='diff',ascending=True).head(20)

Unnamed: 0,artists,Unnamed 0,popularity,genres,score,diff,ranks
13986,Shakira,0.747475,0.696446,1.0,0.828073,0.0,1.0
10799,Mike Bahía,0.989899,0.906977,0.46875,0.74827,0.0798029,2.0
10137,Manuel Medrano,0.949495,0.837209,0.46875,0.712283,0.115791,3.0
10139,Manuel Turizo,0.969697,0.883721,0.392857,0.704571,0.123503,4.0
14393,Sofia Reyes,0.959596,0.585271,0.645833,0.684361,0.143712,5.0
1269,Bacilos,0.808081,0.744186,0.544643,0.677148,0.150926,6.0
1160,Ava Max,0.979798,0.907988,0.291667,0.675821,0.152252,7.0
10448,Mau y Ricky,0.979798,0.872093,0.291667,0.661463,0.16661,8.0
11619,Normani,0.979798,0.901993,0.241071,0.653186,0.174888,9.0
876,Ansel Elgort,0.969697,0.784884,0.3625,0.652893,0.175181,10.0


In [146]:
rec_df['score'] = rec_df['Unnamed 0'] * 0.2 + rec_df['popularity'] * 0.4 + rec_df['genres'] * 0.4

In [147]:
i#nput_score = np.array(rec_df[rec_df['artists']==artist][['Unnamed 0','popularity','genres']])

In [150]:
input_score = rec_df[rec_df['artists']==artist]['score'].values[0]

In [151]:
input_score

0.8280733969237082

In [152]:
rec_df['diff'] = abs(input_score - rec_df['score'])

In [153]:
rec_df['ranks'] = rec_df['diff'].rank(ascending=True)

In [126]:
def cos_sim(a, b):
    """Takes 2 vectors a, b and returns the cosine similarity according 
    to the definition of the dot product
    """
    dot_product = np.dot(a, b)
    norm_a = np.linalg.norm(a)
    norm_b = np.linalg.norm(b)
    return dot_product / (norm_a * norm_b)

In [137]:
from sklearn.metrics.pairwise import cosine_similarity

In [139]:
rec_df['cos_sim'] = cosine_similarity(np.array(rec_df[['Unnamed 0','popularity','genres']]), input_score).reshape(-1)

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [156]:
np.array(np.around(rec_df[['Unnamed 0','popularity','genres']],2))

array([[0.9696969696969688, 0.7674418604651163, 0.0703125],
       [0.5959595959595951, 0.5324985092426953, 0.24107142857142855],
       [0.7575757575757578, 0.5619450317124736, 0.008333333333333331],
       ...,
       [0.6161616161616159, 0.6196450428396573, 0.24107142857142855],
       [0.8686868686868685, 0.7248062015503877, 0.04375000000000001],
       [0.8686868686868685, 0.7255813953488371, 0.05555555555555555]],
      dtype=object)

In [None]:
from math import sqrt

In [None]:
def squared_normalization(x):
    return x/sqrt((x**2).sum())

In [None]:
rec_df = rec_df.set_index('artists')

In [None]:
rec_df_normalized = rec_df.apply(squared_normalization)

In [None]:
rec_df_normalized.head(1)

In [None]:
weights = [0.4, 0.4, 0.2]

In [None]:
rec_df_weighted = rec_df_normalized*weights

In [None]:
best_sol = rec_df_weighted.loc[artist]

In [None]:
best_sol

In [None]:
rec_df_weighted.head(2)

In [None]:
best_scores = abs((rec_df_weighted - best_sol).sum(axis=1))

In [None]:
best_scores.sort_values()

In [None]:
best_scores.sort_values().index[-1]

In [None]:
worst_sol = rec_df_weighted.loc[best_scores.sort_values().index[-1]]

In [None]:
worst_scores = abs((rec_df_weighted - worst_sol).sum(axis=1))

In [None]:
all_scores = pd.DataFrame([best_scores, worst_scores]).T.rename(columns={0:'best', 1:'worst'})

In [None]:
all_scores['final'] = all_scores['worst']/(all_scores['worst']+all_scores['best'])

In [None]:
all_scores['ranks'] = all_scores['final'].rank(ascending=False)

In [None]:
all_scores.sort_values('ranks').head(20)

In [None]:
rec_df.loc[artist]

In [None]:
rec_df.loc['Disturbed']

In [None]:
df_w_genres[df_w_genres['artists']=='Disturbed']['genres'].values

In [None]:
df_w_genres[df_w_genres['artists']=='Linkin Park']['genres'].values

In [None]:
rec_df.loc['Flyleaf']

In [None]:
rec_df.loc['Christopher Wilde']

In [None]:
rec_df.loc['Taylor Swift']

In [16]:
def IOU(genre_1, genre_2):
    a = set(genre_1)
    b = set(genre_2)
    return len(a.intersection(b))/len(a.union(b))

In [None]:
def same_genre_artists(artist):
    '''
    Returns a pandas series with the artists which have a positive jaccard score
    The series will also have the same artist as well
    '''
    genre_to_look = df_w_genres[df_w_genres['artists']==artist]['genres'].values[0]
    jaccard_scores = df_w_genres['genres'].map(lambda x: IOU(x, genre_2= genre_to_look))
    return jaccard_scores[jaccard_scores>0]

In [None]:
def get_debut(artist):
    return df_full[df_full['artists'].map(lambda x: artist in x)]['year'].min()

In [None]:
def year_difference(other_artist, input_artist, df_to_look):
    
    input_artist_debut = df_full[df_full['artists'].map(lambda x: input_artist in x)]['year'].min()
    df_to_look['artists'].map(lambda x: get_debut(x))
    df_to_look.map()

In [None]:
next_step['artists'].map(lambda x: get_debut(x))

In [None]:
df_w_genres.iloc[57]

In [None]:
df_full[df_full['artists'].map(lambda x: '38 Special' in x)].min()

In [None]:
next_step = df_w_genres.iloc[same_genre_artists('Arcade Fire').index]

In [None]:
df_full[df_full['artists'].map(lambda x: '311' in x)]['year'].describe()

In [None]:
df_full[df_full['artists'].map(lambda x: 'Arcade Fire' in x)]['year'].describe()

In [None]:
df_full[df_full['artists'].map(lambda x: 'The Beatles' in x)]['year'].describe()

In [None]:
df_full[df_full['artists'].map(lambda x: 'Metallica' in x)]['year'].describe()

In [None]:
type(df_full['artists'][0])

In [None]:
next_step

In [None]:
same_genre_artists('Arcade Fire')

In [None]:
df_w_genres.iloc[17961    ]

In [None]:
df_w_genres['genres'].map(lambda x: IOU(x, genre_2=['dance pop',  'pop', 'post-teen pop'])).iloc[2048]

In [None]:
from sklearn.metrics import jaccard_score

In [None]:
a = set(['dance pop',   'post-teen pop'])
b = set(['dance pop',  'pop', 'post-teen pop'])

In [None]:
df_w_genres[df_w_genres['artists']=='Noah Cyrus']

In [None]:
df_w_genres[df_w_genres['artists']=='Taylor Swift']

In [None]:
df_w_genres[df_w_genres['artists']=='Shreya Ghoshal']['genres'].values[0]

In [None]:
ts_similar = same_genre_artists('Taylor Swift')

In [None]:
from sklearn.metrics import pairwise_distances

In [None]:
similar_artist_set = df_w_genres.iloc[ts_similar.value_counts().index]

## Ideas ##
1. Popularity
2. Similarity of Genres
3. Year

In [None]:
pairwise_distances(similar_artist_set.iloc[0], 

In [None]:
ts_similar

In [None]:
ts_similar.value_counts()

In [None]:
df_w_genres.iloc[ts_similar.value_counts().index]

In [None]:
# Convert artists from string to list 
# Uses same pattern as the one used to convert genres
df_full['artists'] = df_full['artists'].map(lambda x: re.findall(pattern, x))

In [None]:
df_full.head(1)

In [None]:
# Get the number of active years for an artist
# This can also be used as a parameter (we want artists to be in the same year range as the original artist)

# Entries of one artist
df_full[df_full['artists'].map(lambda x: 'GRiZ' in x)]['year'].agg(['min', 'max'])

In [None]:
sb.distplot(df_full[df_full['artists'].map(lambda x: 'GRiZ' in x)]['year'])

In [None]:
df_w_genres[df_w_genres['artists']=='Eminem']

In [None]:
df_full[df_full['artists'].map(lambda x: 'Eminem' in x)]

In [None]:
sb.distplot(df_full[df_full['artists'].map(lambda x: 'The Killers' in x)]['year'])

In [None]:
df_w_genres.iloc[17520]

In [None]:
df_w_genres