In [1]:
import pandas as pd
import sys
from collections import Counter
from tqdm import tqdm

# Append the local package folders to the module search path.
# NOTE(review): the paths are relative, so this presumably relies on the
# notebook being run from the directory that contains these folders — confirm.
recommenders_path = 'Recommenders'
sys.path.append(recommenders_path)

graphs_path = 'Graphs'
sys.path.append(graphs_path)

artist_path = 'Artists'
sys.path.append(artist_path)

graphics_path = 'Graphics'
sys.path.append(graphics_path)

from Recommenders.PopularityRecommender import PopularityRecommender
from Recommenders.StrongerConnectionsRecommender import StrongerConnectionsRecommender
from Graphs.BipartiteGraph import BipartiteGraph
from Graphics.Graphics import Graphics

In [2]:
# Load the artist table from CSV; the bare `df` on the last line shows it as the cell output.
df = pd.read_csv('../data/spotify_artists.csv')
df

Unnamed: 0,external_urls,followers,genres,id,name,popularity,uri,related_artists_ids
0,https://open.spotify.com/artist/4dpARuHxo51G3z...,57657211,"british soul, pop, pop soul, uk pop",4dpARuHxo51G3z768sgnrY,Adele,86,spotify:artist:4dpARuHxo51G3z768sgnrY,"5WUlDfRSoLAfcVSX1WnrxN, 2wY79sveU1sp5g7SokKOiI..."
1,https://open.spotify.com/artist/66CXWjxzNUsdJx...,98814719,pop,66CXWjxzNUsdJxJ2JdwvnR,Ariana Grande,91,spotify:artist:66CXWjxzNUsdJxJ2JdwvnR,"4nDoRrQiYLoBzwC5BhVJzF, 0C8ZW7ezQVs4URX5aX7Kqx..."
2,https://open.spotify.com/artist/13ubrt8QOOCPlj...,14722549,"east coast hip hop, hip hop, rap",13ubrt8QOOCPljQ2FL1Kca,A$AP Rocky,84,spotify:artist:13ubrt8QOOCPljQ2FL1Kca,"5dHt1vcEm9qb8fCyLcB3HL, 2P5sC9cVZDToPxyomzF1UH..."
3,https://open.spotify.com/artist/06HL4z0CvFAxyc...,118971229,pop,06HL4z0CvFAxyc27GXpf02,Taylor Swift,100,spotify:artist:06HL4z0CvFAxyc27GXpf02,"0C8ZW7ezQVs4URX5aX7Kqx, 1McMsnEElThX1knmY4oliG..."
4,https://open.spotify.com/artist/7Ln80lUS6He07X...,25991892,"garage rock, modern rock, permanent wave, rock...",7Ln80lUS6He07XvHI8qqHH,Arctic Monkeys,85,spotify:artist:7Ln80lUS6He07XvHI8qqHH,"77SW9BnxLY8rJ0RciFqkHh, 0epOFNiUfyON9EYx7Tpr6V..."
...,...,...,...,...,...,...,...,...
14783,https://open.spotify.com/artist/7EyzyrMNgqiK8b...,1417807,rap mineiro,7EyzyrMNgqiK8bMrbkOT9l,Sidoka,59,spotify:artist:7EyzyrMNgqiK8bMrbkOT9l,"4oPnjkJcLqOim9KJxvIYMz, 1QBWA6tuiZ0JuDluPqbe71..."
14784,https://open.spotify.com/artist/6gzXCdfYfFe5XK...,2313438,"k-pop, k-pop boy group",6gzXCdfYfFe5XKhPKkYqxV,SUPER JUNIOR,58,spotify:artist:6gzXCdfYfFe5XKhPKkYqxV,"6nVMMEywS5Y4tsHPKx1nIo, 1bkpTEmumLC3xc7HgMsttU..."
14785,https://open.spotify.com/artist/7AAXcP4NpvvLM9...,609413,"sertanejo, sertanejo universitario",7AAXcP4NpvvLM9Xcfy64ij,Hugo Pena & Gabriel,40,spotify:artist:7AAXcP4NpvvLM9Xcfy64ij,"5Dyg6H3QJHQV5c7ojyKWyv, 4ZUahcHoVxr4lsrfdmhjhV..."
14786,https://open.spotify.com/artist/5KvkOKroKLz202...,186296,"folk metal, german metal, melodic death metal,...",5KvkOKroKLz202ioXfGWR2,Equilibrium,49,spotify:artist:5KvkOKroKLz202ioXfGWR2,3t5X2CVDf5mrlIx1SdvWYM


In [3]:
# Lookup table: artist id -> artist name, taken from the 'id' and 'name'
# columns of df in row order (a later row wins if an id repeats).
artists_ids = dict(zip(df['id'], df['name']))

In [4]:
# Build one (artist id, genre, popularity) tuple per artist/genre pair.
# The 'genres' cell is a comma-separated string; rows where it is missing
# (NaN) contribute no edges.
edges = []
for artist_id, genres, popularity in zip(df['id'], df['genres'], df['popularity']):
    # Only a real string can be split into genres. Testing for `str` (instead
    # of "type is not float") also skips NaN values that are not plain Python
    # floats, which would otherwise crash on `.split`.
    if not isinstance(genres, str):
        continue
    # Iterating the columns yields plain Python scalars, so `popularity` is
    # stored in the edge as-is, exactly once per genre of the artist.
    for genre in genres.split(','):
        edges.append((artist_id, genre.strip(), popularity))

In [5]:
# Artist nodes: (artist id, attribute dict) pairs, with popularity forced to
# int and name forced to str.
artists_nodes = [
    (artist_id, dict(popularity=int(popularity), name=str(name)))
    for artist_id, popularity, name in zip(df['id'], df['popularity'], df['name'])
]

# Genre nodes: every distinct, whitespace-stripped genre token found in the
# non-missing 'genres' cells.
genres_nodes = {
    genre.strip()
    for genres in df['genres'].dropna()
    for genre in genres.split(',')
}

In [6]:
# Assemble the bipartite graph: register the artist and genre node sets first,
# then add the (artist id, genre, popularity) edge tuples.
Bipartite_G = BipartiteGraph()
Bipartite_G.add_nodes(artists_nodes, genres_nodes)
Bipartite_G.add_edges(edges)
# NOTE(review): in the cells visible here this is only passed to the
# commented-out Graphics calls; presumably a label for generated figures — confirm.
name_network = 'popularity_recommender'

In [7]:
# NOTE(review): presumably projects the bipartite graph onto the artist ids
# passed in (artists linked through shared genres) — confirm against BipartiteGraph.
G = Bipartite_G.transform_bipartite_into_simple(list(df['id']))

## **Analisando a rede**

### **Informações básicas**

In [8]:
# Print the basic metrics of G, one "label: value" line each, in a fixed order.
# Each entry pairs the printed label with the G method that produces the value.
basic_metrics = [
    ('Quantidade de vértices', G.get_number_of_nodes),
    ('Quantidade de arestas', G.get_number_of_edges),
    ('Quantidade de componentes', G.get_number_connected_components),
    ('Menor grau da rede', G.get_minimum_degree),
    ('Maior grau da rede', G.get_maximum_degree),
    ('Grau médio da rede', G.get_average_degree),
    ('Densidade da rede', G.get_density),
]
for metric_label, compute_metric in basic_metrics:
    print(f'{metric_label}: {compute_metric()}')
#print(f'Distância média: {G.average_shortest_path_length()}')
#average_clustering_coefficient, percentage_of_considered_nodes = G.get_average_clustering_coefficient()
#print(f'Coeficiente de clustering médio: {average_clustering_coefficient} (apenas {percentage_of_considered_nodes:.2%} da rede foi considerada no cálculo, que que possuem grau maior do que um)')

Quantidade de vértices: 14788
Quantidade de arestas: 774610
Quantidade de componentes: 118
Menor grau da rede: 1
Maior grau da rede: 768
Grau médio da rede: 104.76196916418718
Densidade da rede: 0.0070847345076206925


### **Distribuição de graus**

In [9]:
#frequencies_of_degrees, probability_of_degrees, accumulated_probability_of_degress = G.get_degree_distribution()
#Graphics.create_graphic_degrees(data = frequencies_of_degrees, xLabel = 'Grau (K)', yLabel = 'Frequência', title = 'Frequência dos Graus', limit = 5, lines = False, name_network = name_network)
#Graphics.create_graphic_degrees(data = probability_of_degrees, xLabel = 'Grau (K)', yLabel = 'P(k)', title = 'Probabilidade dos Graus', limit = 0.01, lines = False,  name_network = name_network)
#Graphics.create_graphic_degrees(data = accumulated_probability_of_degress, xLabel = 'Grau (k)', yLabel = 'P(>=K)', title = 'Função de Distribuição Cumulativa Complementar', limit = 0.05, lines = True,  name_network = name_network)

### **Centralidades**

#### **Centralidade por Grau**

In [10]:
#degree_centrality=G.get_degree_centrality()
#analysis_centrality=Graphics.create_graphic_centrality(data=degree_centrality, title='Centralidade por Grau', name_network=name_network)
#Graphics.create_ranking_centrality(centrality=degree_centrality, graph=G, title='Centralidade por Grau - Ranking', name_network=name_network)

In [11]:
#eigenvector_centrality=G.get_eigenvector_centrality()
#analysis_centrality=Graphics.create_graphic_centrality(data=eigenvector_centrality, title='Centralidade por Autovetor', name_network=name_network)
#Graphics.create_ranking_centrality(centrality=eigenvector_centrality, graph=G, title='Centralidade por Autovetor - Ranking', name_network=name_network)

### **Comunidades**

In [12]:
#louvain_communities = G.get_louvain_communities()

In [13]:
#for index, community in enumerate(louvain_communities):
#    print(f'Comunidade {index+1:2}:', end=' ')
#    community_list = []
#    for artist_id in community:
#        community_list.append(G.graph.nodes[artist_id]['name'])
#    print(community_list)
#    if len(community_list) == 1: print(G.get_degree(artist_id))

## **Recomendador**

In [42]:
# Experiment 1: compare PopularityRecommender with the related-artists list
# stored in df. For every artist that has related artists, request as many
# recommendations as there are related artists and record the fraction of the
# recommendations that appear in that list.
#   analysis    : artist id -> fraction of recommendations found among the related artists
#   counter_nan : number of rows skipped because 'related_artists_ids' is missing
analysis = {}
counter_nan = 0
for index in tqdm(range(len(df))):
    related_raw = df['related_artists_ids'][index]
    # A missing CSV field is read as NaN; anything that is not a string cannot
    # be split into ids, so count the row as missing and move on.
    if not isinstance(related_raw, str):
        counter_nan += 1
        continue

    related_artists_ids = related_raw.split(', ')
    number_recommendations = len(related_artists_ids)

    current_artist_id = df['id'][index]
    recommender = PopularityRecommender(Bipartite_G, number_recommendations)
    recommended_artists = recommender.make_recommendations(current_artist_id)

    # Guard: an empty recommendation list would make the ratio below divide by
    # zero. Such artists are left out of `analysis` instead of aborting the run.
    if len(recommended_artists) == 0:
        continue

    counter = 0
    for artist_id in recommended_artists:
        if artist_id in related_artists_ids:
            counter += 1
    analysis[current_artist_id] = counter / len(recommended_artists)

100%|██████████| 14788/14788 [00:06<00:00, 2412.34it/s]


In [43]:
# Average of the per-artist scores currently held in `analysis`.
score_sum = sum(analysis.values())
total_correct_mean = score_sum / len(analysis)
print(total_correct_mean)

0.10028152825021336


In [29]:
# Total number of rows in df.
len(df)

14788

In [17]:
counter_nan # number of rows skipped because related_artists_ids was NaN

1228

In [18]:
len(analysis) # total number of artists that received a score

13560

In [19]:
len(df) - (len(analysis) + counter_nan) # rows that were neither skipped as NaN nor scored

0

In [44]:
# Experiment 2: overlap between the two recommenders, measured over the
# PopularityRecommender output. StrongerConnectionsRecommender is asked for 20
# artists; PopularityRecommender is then asked for exactly as many artists as
# were actually returned, and the score is the fraction of popularity
# recommendations that also appear in the stronger-connections list.
#   analysis : artist id -> fraction of popularity recommendations shared by both
analysis = {}
counter_nan = 0  # reset as in the original cell; nothing in this loop increments it
for index in tqdm(range(len(df))):
    current_artist_id = df['id'][index]

    recommender_stronger = StrongerConnectionsRecommender(G, 20)
    recommended_artists_stronger = recommender_stronger.make_recommendations(current_artist_id, collaborations=False)

    number_recommendations = len(recommended_artists_stronger)

    recommender = PopularityRecommender(Bipartite_G, number_recommendations)
    recommended_artists = recommender.make_recommendations(current_artist_id)

    # Guard: an empty recommendation list would make the ratio below divide by
    # zero. Such artists are left out of `analysis` instead of aborting the run.
    if len(recommended_artists) == 0:
        continue

    counter = 0
    for artist_id in recommended_artists:
        if artist_id in recommended_artists_stronger:
            counter += 1
    analysis[current_artist_id] = counter / len(recommended_artists)

100%|██████████| 14788/14788 [00:13<00:00, 1092.71it/s]


In [45]:
# Average of the per-artist scores currently held in `analysis`.
score_sum = sum(analysis.values())
total_correct_mean = score_sum / len(analysis)
print(total_correct_mean)

0.7607199005432571


In [46]:
# Experiment 3: overlap between the two recommenders, measured over the
# StrongerConnectionsRecommender output. PopularityRecommender is asked for 20
# artists; StrongerConnectionsRecommender is then asked for exactly as many
# artists as were actually returned, and the score is the fraction of
# stronger-connections recommendations that also appear in the popularity list.
#   analysis : artist id -> fraction of stronger-connections recommendations shared by both
analysis = {}
counter_nan = 0  # reset as in the original cell; nothing in this loop increments it
for index in tqdm(range(len(df))):
    current_artist_id = df['id'][index]

    recommender = PopularityRecommender(Bipartite_G, 20)
    recommended_artists = recommender.make_recommendations(current_artist_id)

    number_recommendations = len(recommended_artists)

    recommender_stronger = StrongerConnectionsRecommender(G, number_recommendations)
    recommended_artists_stronger = recommender_stronger.make_recommendations(current_artist_id, collaborations=False)

    # Guard: an empty recommendation list would make the ratio below divide by
    # zero. Such artists are left out of `analysis` instead of aborting the run.
    if len(recommended_artists_stronger) == 0:
        continue

    counter = 0
    for artist_id in recommended_artists_stronger:
        if artist_id in recommended_artists:
            counter += 1
    analysis[current_artist_id] = counter / len(recommended_artists_stronger)

  0%|          | 0/14788 [00:00<?, ?it/s]

100%|██████████| 14788/14788 [00:14<00:00, 1051.67it/s]


In [47]:
# Average of the per-artist scores currently held in `analysis`.
score_sum = sum(analysis.values())
total_correct_mean = score_sum / len(analysis)
print(total_correct_mean)

0.7634974303489308


In [None]:
#recommender.convert_recommendations(artist_id, artists_ids)

In [None]:
#recommender.make_recommendations(artist_id, collaborations=True)
#recommender.convert_recommendations(artist_id, artists_ids)