# Análisis Exploratorio de Datos
Esta parte del proyecto consiste en cargar los datasets para poder explorarlos.


In [1]:
import pandas as pd

# Load the four tab-delimited .dat files that live under dataset/.
# `delimiter` is the pandas alias for `sep`.
artists = pd.read_csv('dataset/artists.dat', delimiter='\t')
# tags.dat is the only file read with an explicit ISO-8859-1 encoding.
tags = pd.read_csv('dataset/tags.dat', delimiter='\t', encoding='ISO-8859-1')
user_artists = pd.read_csv('dataset/user_artists.dat', delimiter='\t')
user_tags = pd.read_csv('dataset/user_taggedartists.dat', delimiter='\t')


# Filtrado basado en el contenido: similitud de artistas
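Para esta parte, buscamos hallar los artistas más similares según los tags que tienen asignados. Para esto, creamos un vector para cada artista en el que cada columna es un tag y cada valor es la cantidad de veces que ese tag fue asignado al artista; luego comparamos esos vectores mediante la similitud del coseno.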


In [2]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import normalize

# Build the artist x tag count matrix: one row per artistID, one column per
# tagID, each cell holding how many times that tag was applied to that artist
# (0 where the pair never occurs).
artist_tags = (
    user_tags
    .groupby(['artistID', 'tagID'])
    .size()
    .unstack(fill_value=0)
)
artist_tags.shape

(12523, 9749)

In [3]:
# Normalize the tag counts per artist using the L2 norm.
# NOTE(review): scikit-learn's cosine_similarity is documented to normalize its
# inputs itself, so this step looks redundant — confirm before removing it.
artist_tags_normalized = normalize(
    artist_tags,
    norm='l2',
    axis=1,
)

In [4]:
# Pairwise cosine similarity between every pair of artist tag vectors.
cosine_sim = cosine_similarity(artist_tags_normalized)

# Wrap the raw matrix in a DataFrame labelled by artistID on both axes so a
# similarity can be looked up directly by artist id.
cosine_sim_df = pd.DataFrame(
    data=cosine_sim,
    index=artist_tags.index,
    columns=artist_tags.index,
)

In [5]:
# Preview the first five rows of the artist-by-artist similarity matrix.
cosine_sim_df.head(n=5)

artistID,1,2,3,4,5,6,7,8,9,10,...,18724,18732,18734,18735,18736,18737,18739,18740,18741,18744
artistID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.133333,0.0,0.894586,0.08269,0.033127,0.006786,0.843713,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.133333,1.0,0.0,0.151911,0.744208,0.089443,0.067186,0.008165,0.096077,0.047068,...,0.0,0.0,0.0,0.0,0.04,0.0,0.031623,0.063246,0.0,0.023905
3,0.0,0.0,1.0,0.0,0.0,0.323381,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.894586,0.151911,0.0,1.0,0.094211,0.125809,0.185569,0.837234,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.040032,0.0,0.0,0.0
5,0.08269,0.744208,0.0,0.094211,1.0,0.061633,0.025253,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
# Artist whose most similar neighbours we want to look up.
targetArtist = 'Arctic Monkeys'

# Resolve the name to its id in the artists table: select the rows whose
# `name` matches exactly and take the `id` of the first one.
target_artist_id = (
    artists
    .loc[artists['name'] == targetArtist, 'id']
    .iloc[0]
)
target_artist_id

207

In [7]:
# Number of most-similar artists to report. The original comment and the
# rendered table both say 10, while the code previously kept 20 rows.
TOP_N_SIMILAR = 10

# Take the target artist's column of the similarity matrix, highest score
# first; reset_index() turns the artistID index into a regular column.
similar_artists = cosine_sim_df[target_artist_id].sort_values(ascending=False).reset_index()
similar_artists.columns = ['artistID', 'cosine_similarity']

# Drop the target artist's own row so it is never reported as its own neighbour.
similar_artists = similar_artists[similar_artists['artistID'] != target_artist_id]

# Keep only the top-N neighbours.
similar_artists = similar_artists.head(TOP_N_SIMILAR)

# Attach artist names; the left join keeps every neighbour even if its id has
# no match in the artists table (its name is then NaN).
similar_with_names = similar_artists.merge(
    artists[['id', 'name']],
    left_on='artistID',
    right_on='id',
    how='left',
)

# Display the result.
similar_with_names[['cosine_similarity', 'name', 'artistID']]


Unnamed: 0,cosine_similarity,name,artistID
0,0.972432,Hard-Fi,7187
1,0.955964,Babyshambles,208
2,0.952964,The Last Shadow Puppets,1512
3,0.950545,The Libertines,428
4,0.94238,The Fratellis,1515
5,0.937507,The Zutons,5440
6,0.933403,The Kooks,1048
7,0.929053,The Pigeon Detectives,5030
8,0.928087,Kaiser Chiefs,716
9,0.926427,Maxïmo Park,2815
