In [8]:
import numpy as np
import pandas as pd
from scipy import sparse
from scipy import cluster
from scipy import spatial
from matplotlib import pyplot as plt
import seaborn
import copy
%matplotlib inline

# Clustering by tags

### Data Cleaning

In [12]:
# Load the full artists table from the local data directory.
# low_memory=False asks pandas not to parse the file in internal chunks,
# which avoids mixed-type column inference on large CSVs.
big_set = pd.read_csv('./data/artists.csv', low_memory=False)

In [13]:
# Keep only the two columns used for tag-based clustering:
# the artist name and its ';'-separated tag string.
artist_tags=big_set[['artist_mb', 'tags_mb']]

In [14]:
# Drop artists that have no name or no tags.
# The explicit .copy() makes the result an independent frame, so the column
# assignments in later cells do not write into an object derived from
# `big_set` (the pattern pandas reports as SettingWithCopyWarning).
artist_tags = artist_tags.dropna().copy()
artist_tags

Unnamed: 0,artist_mb,tags_mb
0,Coldplay,rock; pop; alternative rock; british; uk; brit...
1,Radiohead,rock; electronic; alternative rock; british; g...
2,Red Hot Chili Peppers,rock; alternative rock; 80s; 90s; rap; metal; ...
3,Rihanna,pop; dance; hip hop; reggae; contemporary r b;...
4,Eminem,turkish; rap; american; hip-hop; hip hop; hiph...
...,...,...
1466059,Kazushige Kinoshita,japanese; violinist; japan; chamber music; fre...
1466061,水越恵子,likedis auto
1466063,大槻ケンヂ,rock
1466069,孫耀威,chinese


In [15]:
def _split_tags(raw):
    """Split a ';'-separated tag string into an array of stripped tags.

    Parameters
    ----------
    raw : str or any
        The raw value of one `tags_mb` cell.

    Returns
    -------
    numpy.ndarray of str when `raw` is a string; otherwise `raw` unchanged.
    Passing non-strings through makes this cell safe to re-run after the
    column has already been converted (the arrays have no `.split`).
    """
    if not isinstance(raw, str):
        return raw
    return np.array([tag.strip() for tag in raw.split(';')])


# Build a new frame rather than assigning into the existing one, so the
# conversion never writes into a frame derived from another DataFrame.
artist_tags = artist_tags.assign(tags_mb=artist_tags['tags_mb'].apply(_split_tags))
artist_tags

Unnamed: 0,artist_mb,tags_mb
0,Coldplay,"[rock, pop, alternative rock, british, uk, bri..."
1,Radiohead,"[rock, electronic, alternative rock, british, ..."
2,Red Hot Chili Peppers,"[rock, alternative rock, 80s, 90s, rap, metal,..."
3,Rihanna,"[pop, dance, hip hop, reggae, contemporary r b..."
4,Eminem,"[turkish, rap, american, hip-hop, hip hop, hip..."
...,...,...
1466059,Kazushige Kinoshita,"[japanese, violinist, japan, chamber music, fr..."
1466061,水越恵子,[likedis auto]
1466063,大槻ケンヂ,[rock]
1466069,孫耀威,[chinese]


In [16]:
# Record how many tags each artist has.
# NOTE: `artist_tags_len` is a second name for the very same DataFrame object
# (plain assignment does not copy), so `tags_len` appears on `artist_tags` too.
artist_tags['tags_len'] = artist_tags['tags_mb'].map(len)
artist_tags_len = artist_tags

In [17]:
# Pull the per-artist tag arrays out of the frame as a NumPy array
# (one element per artist; each element is that artist's array of tags).
np_artists = artist_tags_len['tags_mb'].to_numpy()

In [18]:
# Inspect the extracted tag arrays.
np_artists

array([array(['rock', 'pop', 'alternative rock', 'british', 'uk', 'britannique',
              'britpop', 'pop rock', 'piano pop', 'piano rock', 'english',
              'parlophone', 'rock and indie', 'ambient pop', 'pop/rock',
              'chapel', 'post-britpop'], dtype='<U16')                          ,
       array(['rock', 'electronic', 'alternative rock', 'british', 'grunge',
              'uk', 'britannique', 'britpop', 'art rock', 'experimental rock',
              'english', 'chamber pop', 'parlophone', 'england', 'melancholic',
              'oxford', 'bootleg', 'rock and indie', 'c’était mieux avant',
              'art pop', 'nude', 'sacred cows'], dtype='<U19')                 ,
       array(['rock', 'alternative rock', '80s', '90s', 'rap', 'metal',
              'american', 'crossover', 'usa', 'funk', 'funk rock', 'alternative',
              'pop rock', 'funk metal', 'rap rock', '00s', 'dvd',
              'pop and chart', '10s', 'funk rock tributo'], dtype='<U17')   

In [19]:
# Build a sparse artist x tag indicator matrix in CSR form.
#   vocabulary : tag -> column index, assigned in first-seen order
#   indices    : column index of every stored entry, row after row
#   indptr     : indptr[i]:indptr[i+1] delimits the entries of row (artist) i
#   data       : the stored values, all 1
indptr = [0]
indices = []
data = []
vocabulary = {}
for d in np_artists:
    # dict.fromkeys drops tags repeated within one artist (keeping order).
    # A repeated tag would otherwise create duplicate column indices in the
    # same CSR row, which is a non-canonical matrix whose dense form holds 2
    # instead of 1 for that tag.
    for term in dict.fromkeys(d):
        index = vocabulary.setdefault(term, len(vocabulary))
        indices.append(index)
        data.append(1)
    indptr.append(len(indices))

# The explicit shape equals what SciPy would infer (rows = artists,
# columns = distinct tags) but also works when there are no entries at all.
tag_matrix = sparse.csr_matrix(
    (data, indices, indptr),
    shape=(len(indptr) - 1, len(vocabulary)),
    dtype=int,
)
# NOTE(review): the dense copy (`tmr = tag_matrix.toarray()`) is deliberately
# not built here. At this size (~120k x ~36k) it needs tens of GB of RAM, and
# the clustering below consumes the sparse matrix directly. `tmr` is created
# for the much smaller top-2000 matrix further down.

In [20]:
# Show the sparse matrix summary (shape, dtype, number of stored entries).
tag_matrix

<119943x36091 sparse matrix of type '<class 'numpy.int32'>'
	with 305601 stored elements in Compressed Sparse Row format>

In [30]:
from scipy.sparse import * 
from matplotlib import pyplot as plt
from scipy.cluster.hierarchy import dendrogram
from sklearn.cluster import AgglomerativeClustering

In [31]:
def plot_dendrogram(model, **kwargs):
    """Draw a dendrogram for a fitted hierarchical clustering model.

    Parameters
    ----------
    model : object
        Fitted estimator exposing `children_` (one row of two child node ids
        per merge), `distances_` (one value per merge) and `labels_` (one
        entry per sample). Presumably an `AgglomerativeClustering` instance.
    **kwargs
        Passed through unchanged to `scipy.cluster.hierarchy.dendrogram`.

    Returns
    -------
    None
    """
    merges = model.children_
    n_leaves = len(model.labels_)

    # Number of original samples underneath each merge node. Node ids below
    # n_leaves are single samples; id n_leaves + k refers to the node created
    # by merge k, whose size has already been computed.
    subtree_sizes = np.zeros(merges.shape[0])
    for row, pair in enumerate(merges):
        subtree_sizes[row] = sum(
            1 if node < n_leaves else subtree_sizes[node - n_leaves]
            for node in pair
        )

    # Columns: child a, child b, merge distance, samples in the merged node.
    stacked = np.column_stack([merges, model.distances_, subtree_sizes])
    linkage_matrix = stacked.astype(float)

    dendrogram(linkage_matrix, **kwargs)

In [40]:
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import KMeans

# Cluster the artists on their sparse tag-indicator rows.
# random_state is fixed so repeated runs give the same assignment.
X = tag_matrix
kmeans = KMeans(n_clusters=200, random_state=0)
kmeans.fit(X)
kmeans.labels_


array([135, 135, 145, ...,  20, 102, 102])

In [41]:
# Cluster index assigned to each artist (one entry per row of X).
kmeans.labels_

array([135, 135, 145, ...,  20, 102, 102])

### Top 2000 By Listeners

In [71]:
# Kept so later cells behave exactly as before: this switches off pandas'
# SettingWithCopyWarning for the rest of the notebook.
pd.options.mode.chained_assignment = None

# First 2000 rows by position (assumes the table is already ordered by
# listeners, as the section title implies - TODO confirm).
# .iloc makes the positional intent explicit, and .copy() gives an independent
# frame so the column added in the next cells is not written into a slice of
# `artist_tags`.
top2k = artist_tags.iloc[:2000].copy()

In [72]:
# Inspect the 2000-artist subset.
top2k

Unnamed: 0,artist_mb,tags_mb,tags_len
0,Coldplay,"[rock, pop, alternative rock, british, uk, bri...",17
1,Radiohead,"[rock, electronic, alternative rock, british, ...",22
2,Red Hot Chili Peppers,"[rock, alternative rock, 80s, 90s, rap, metal,...",20
3,Rihanna,"[pop, dance, hip hop, reggae, contemporary r b...",20
4,Eminem,"[turkish, rap, american, hip-hop, hip hop, hip...",19
...,...,...,...
2213,DJ Jazzy Jeff,"[hip hop, hip hop rnb and dance hall, jazz hop]",3
2214,Big Star,"[power pop, pop rock, folk pop, classic pop an...",5
2215,ABC,"[synthpop, pop, new wave, dance-pop, new roman...",7
2216,Burl Ives,"[american, classic pop and rock]",2


In [73]:
# (Re)compute the number of tags per artist for the subset.
# NOTE: `top2k_tags_len` is just another name for the `top2k` object
# (no copy is made), so the column is written onto `top2k` as well.
top2k['tags_len'] = top2k['tags_mb'].map(len)
top2k_tags_len = top2k

In [76]:
# Per-artist tag arrays for the 2000-artist subset, as a NumPy array.
top2k_artists = top2k_tags_len['tags_mb'].to_numpy()

In [77]:
# Rebuild the artist x tag indicator matrix for the 2000-artist subset.
# The vocabulary is restarted, so column indices here are unrelated to any
# matrix built from a different set of artists.
indptr = [0]
indices = []
data = []
vocabulary = {}
for d in top2k_artists:
    # Visit each distinct tag of this artist once (order preserved). Without
    # this, a tag listed twice would be stored twice in the same CSR row and
    # show up as 2 in the dense array below.
    for term in dict.fromkeys(d):
        index = vocabulary.setdefault(term, len(vocabulary))
        indices.append(index)
        data.append(1)
    indptr.append(len(indices))

# Rows = artists, columns = distinct tags. Passing the shape explicitly
# matches what SciPy infers and avoids an inference error on empty input.
tag_matrix = sparse.csr_matrix(
    (data, indices, indptr),
    shape=(len(indptr) - 1, len(vocabulary)),
    dtype=int,
)
# Dense copy of the matrix; affordable at this size (about 2000 x 2000).
tmr = tag_matrix.toarray()

<2000x1973 sparse matrix of type '<class 'numpy.int32'>'
	with 14965 stored elements in Compressed Sparse Row format>