In [1]:
import os
import json
from pprint import pprint
import datetime
import pytz

import numpy as np
import pandas as pd

from scipy.spatial.distance import cosine, euclidean
from sklearn.preprocessing import StandardScaler

from umap import UMAP

import matplotlib
from matplotlib import pyplot as plt
import plotly.express as px
from plotly.offline import plot
import plotly.graph_objects as go

import seaborn as sb



In [2]:
# Load the processed play log and the per-track feature table, join them on
# the shared 'id' column, and order the result chronologically.
tracks = pd.read_csv('../data/processed/tracks.csv', index_col=0)
features = pd.read_csv('../data/processed/track_features.csv', index_col=0)

data = (
    pd.merge(tracks, features, on='id')
      # parse the play timestamps so they can be sorted / compared as datetimes
      .assign(played_at=lambda df: pd.to_datetime(df.played_at))
      .sort_values('played_at')
      # discard the pre-sort row labels and renumber from 0
      .reset_index(drop=True)
)

In [3]:
# Columns of `data` that are aggregated per artist and fed to the scaler and
# the embedding further down.
audio_features = [
    'acousticness',
    'danceability',
    'energy',
    'instrumentalness',
    'liveness',
    'loudness',
    'speechiness',
    'tempo',
    'valence',
]

In [4]:
# One row per artist: the median of every audio feature over that artist's
# rows in `data`.
artist_audio_features = (
    data.loc[:, ['artist', *audio_features]]
        .groupby('artist')
        .median()
)

In [5]:
# Log-transform instrumentalness; `delta` is added first so that a value of 0
# does not become -inf.
# NOTE(review): 10e-3 is 0.01, not 0.001 -- confirm that 1e-3 was not intended.
# NOTE(review): this cell overwrites its own input, so running it twice applies
# the log twice.
delta = 10e-3
#df.speechiness = (df.speechiness+delta).apply(np.log)
artist_audio_features['instrumentalness'] = np.log(
    artist_audio_features['instrumentalness'] + delta
)

In [6]:
# Remember the row (artist) and column (feature) labels so the frame can be
# rebuilt after scaling, which hands back a bare array.
_idx, _cols = artist_audio_features.index, artist_audio_features.columns

In [7]:
# Standardise every feature column (StandardScaler defaults). The result is a
# NumPy array: the artist index and column names are not carried over.
scaler = StandardScaler()
scaler.fit(artist_audio_features)
artist_audio_features = scaler.transform(artist_audio_features)

In [8]:
# Re-attach the saved artist index and feature names to the scaled values.
artist_audio_features = pd.DataFrame(
    data=artist_audio_features,
    columns=_cols,
    index=_idx,
)

In [9]:
# Number of (non-null) play timestamps recorded for each artist.
artist_play_count = data.groupby('artist')['played_at'].count()

In [10]:
# 2-D UMAP projection of the scaled artist features.
# UMAP is stochastic: without a fixed random_state the layout -- and every
# cluster label derived from it further down -- changes on each
# Restart & Run All. Pinning the seed makes the notebook reproducible.
mapper = UMAP(n_neighbors=15, random_state=42)

In [11]:
# Fit the embedding on the scaled features and wrap the two output
# coordinates in a frame, labelling each row with its artist (same row order
# as artist_audio_features).
embedding = mapper.fit_transform(artist_audio_features)
X = (
    pd.DataFrame(embedding, columns=['x0', 'x1'])
      .assign(artists=artist_audio_features.index)
)

In [12]:
# Attach each artist's play count. The assignment aligns on the artist index,
# after which 'artists' is turned back into a regular (leading) column.
X = (
    X.set_index('artists')
     .assign(play_count=artist_play_count)
     .reset_index()
)

In [13]:
# Embedding of the artists played more than 10 times, one colour per artist.
fig = px.scatter(
    X.loc[X.play_count.gt(10)],
    x='x0',
    y='x1',
    color='artists',
    hover_data=['artists'],
)
plot(fig)
#fig.write_html('./figures/umap_thplayed10_nn15_artists.html')

'temp-plot.html'

In [14]:
# Distinct artists with at least one play on or after 27 May 2020.
# NOTE(review): the datetime is naive, so .astimezone() interprets it in the
# machine's local timezone before converting to UTC -- the cutoff therefore
# depends on where the notebook runs. Confirm the intended zone.
neural_cutoff = datetime.datetime(2020, 5, 27).astimezone(pytz.utc)
neural_artists = data.loc[data.played_at >= neural_cutoff, 'artist'].unique()

In [15]:
# Same scatter as before, but also keeping every artist listed in
# neural_artists regardless of play count.
is_frequent = X.play_count.gt(10)
is_neural = X.artists.isin(neural_artists)
fig = px.scatter(
    X.loc[is_frequent | is_neural],
    x='x0',
    y='x1',
    color='artists',
    hover_data=['artists'],
)
plot(fig)
#fig.write_html('./figures/umap_thplayed10_nn15_artists.html')

'temp-plot.html'

In [16]:
import matplotlib

In [17]:
# Row masks bounding one listening session on 30 May 2020
# (13:53:12 to 16:52:25, both ends inclusive).
# NOTE(review): both datetimes are naive, so .astimezone() reads them in the
# machine's local timezone -- confirm the intended zone.
session_start = datetime.datetime(2020, 5, 30, 13, 53, 12).astimezone(pytz.utc)
session_end = datetime.datetime(2020, 5, 30, 16, 52, 25).astimezone(pytz.utc)
mask1 = data.played_at.ge(session_start)
mask2 = data.played_at.le(session_end)

In [18]:
# Distinct artists played inside the session window, in order of first
# appearance in `data`.
bezerra_radio = data.loc[mask1 & mask2, 'artist'].unique()

In [19]:
# Embedding rows for the artists heard during the session.
# A plain boolean filter on X returns them in X's own row order (alphabetical
# by artist, inherited from the groupby), but later cells connect these rows
# with lines and colour/size them by row position. Order them by first play
# in the session instead: bezerra_radio comes from `data`, which is sorted by
# played_at, and unique() keeps order of first appearance.
radio_rank = {artist: rank for rank, artist in enumerate(bezerra_radio)}
Xradio = (
    X[X.artists.isin(bezerra_radio)]
    .assign(_rank=lambda df: df.artists.map(radio_rank))
    .sort_values('_rank')
    .drop(columns='_rank')
)

In [20]:
# Renumber the session rows 0..n-1; the position is used for colour and
# marker size in the cells that follow.
Xradio = Xradio.reset_index(drop=True)

In [22]:
# One colour per session row, taken from matplotlib's 'hot' colormap and
# spread linearly over the row positions of Xradio.
# matplotlib.cm.get_cmap was deprecated in Matplotlib 3.7 and removed in 3.9;
# pyplot.get_cmap is available in old and new releases alike.
hot_cmap = plt.get_cmap('hot')
norm = matplotlib.colors.Normalize(vmin=Xradio.index.min(), vmax=Xradio.index.max())
# Each entry is an (r, g, b) tuple of floats in [0, 1] (alpha dropped).
hot_rgb = [matplotlib.colors.to_rgb(hot_cmap(norm(i))) for i in Xradio.index]

In [23]:
# Background layer: frequently played artists plus every session artist,
# drawn in grey.
to_plot = X[(X.play_count>10) | (X.artists.isin(bezerra_radio))]
fig = go.Figure(go.Scatter(
                 x=to_plot.x0,
                 y=to_plot.x1,
                 mode='markers',
                 marker_color="grey",
                 text=to_plot.artists))

# Foreground layer: the session artists, connected in Xradio's row order.
# hot_rgb holds matplotlib (r, g, b) floats in [0, 1]; Plotly reads the
# channels of an 'rgb(r, g, b)' string on a 0-255 scale, so formatting the raw
# floats into that string renders almost everything near-black. Hex strings
# are understood identically by both libraries.
radio_colors = [matplotlib.colors.to_hex(c) for c in hot_rgb]

# NOTE(review): marker size is the row position, so the first session row
# (index 0) is drawn with size 0 -- confirm this is intended.
fig.add_trace(go.Scatter(x=Xradio.x0,
                         y=Xradio.x1,
                         mode="markers+lines",
                         marker=dict(
                            color=radio_colors,
                            size=Xradio.index),
                         text=Xradio.artists))

plot(fig)
#fig.write_html('./figures/umap_thplayed10_nn15_artists.html')

'temp-plot.html'

In [25]:
from sklearn.cluster import AffinityPropagation, AgglomerativeClustering, KMeans, DBSCAN

In [26]:
# Agglomerative clustering into a fixed number of groups; every other setting
# is left at the library default.
n_artist_clusters = 10
clust = AgglomerativeClustering(n_clusters=n_artist_clusters)

In [27]:
# Cluster the artists on their two embedding coordinates; `classes` holds one
# integer label per row of X.
classes = clust.fit(X.loc[:, ['x0', 'x1']]).labels_

In [28]:
# Sanity check: the largest cluster label.
np.max(classes)

9

In [29]:
# Store each artist's cluster label next to its embedding coordinates.
X = X.assign(**{'class': classes})

In [30]:
# Display the artists grouped by cluster label (X itself is not modified).
X.sort_values(by='class')

Unnamed: 0,artists,x0,x1,play_count,class
0,...And You Will Know Us by the Trail of Dead,1.544106,3.090335,1,0
312,Haken,1.392223,3.009206,10,0
315,Harold Melvin & The Blue Notes,1.851017,4.189450,1,0
318,Hawkwind,1.911222,3.844090,2,0
322,Helloween,0.876817,2.183632,34,0
...,...,...,...,...,...
513,Porcupine Tree,3.516915,2.138123,1,9
68,Astrix Juno Reactor,3.908526,2.281927,8,9
617,The Who,3.572701,2.421220,1,9
196,DJ Shadow Nils Frahm,3.423581,2.079472,1,9


In [31]:
# Embedding of the artists played more than 10 times, coloured by cluster
# label instead of by artist.
fig = px.scatter(
    X.loc[X.play_count.gt(10)],
    x='x0',
    y='x1',
    color='class',
    hover_data=['artists'],
)
plot(fig)
#fig.write_html('./figures/umap_thplayed10_nn15_artists.html')

'temp-plot.html'