In [1]:
import os
import json
from pprint import pprint
import datetime
import pytz

import numpy as np
import pandas as pd

from scipy.spatial.distance import cosine, euclidean
from sklearn.preprocessing import StandardScaler

from umap import UMAP

import matplotlib
from matplotlib import pyplot as plt
import plotly.express as px
from plotly.offline import plot
import plotly.graph_objects as go

import seaborn as sb



In [2]:
# Load the processed play history and the track-feature table, then join
# them on their shared `id` column.
tracks = pd.read_csv('../data/processed/tracks.csv', index_col=0)
features = pd.read_csv('../data/processed/track_features.csv', index_col=0)

# Build `data` in a single chain instead of mutating it step by step:
#   * parse `played_at` into datetimes so it can be compared with timestamps,
#   * order the rows chronologically,
#   * reset_index(drop=True) renumbers the rows without creating (and then
#     deleting) a temporary 'index' column.
data = (
    pd.merge(tracks, features, on='id')
    .assign(played_at=lambda df: pd.to_datetime(df['played_at']))
    .sort_values('played_at')
    .reset_index(drop=True)
)

In [3]:
# Names of the numeric feature columns that are aggregated per artist below.
audio_features = [
    'acousticness',
    'danceability',
    'energy',
    'instrumentalness',
    'liveness',
    'loudness',
    'speechiness',
    'tempo',
    'valence',
]

In [4]:
# One row per artist: the median of each audio feature over that artist's rows.
artist_audio_features = data.groupby('artist')[audio_features].agg('median')

In [5]:
# Log-transform the per-artist instrumentalness. `delta` (10e-3 == 0.01) is
# added first, presumably to keep the log finite when the value is 0.
# NOTE: this overwrites the column, so re-running the cell applies the log again.
# The same transform for speechiness was left disabled:
#   df.speechiness = (df.speechiness+delta).apply(np.log)
delta = 10e-3
artist_audio_features['instrumentalness'] = np.log(
    artist_audio_features['instrumentalness'] + delta
)

In [6]:
# Remember the row and column labels; the scaler below returns a bare array.
_idx, _cols = artist_audio_features.index, artist_audio_features.columns

In [7]:
# Standardise every feature column: fit the scaler on the per-artist table,
# then replace the table with its scaled values.
scaler = StandardScaler()
scaler.fit(artist_audio_features)
artist_audio_features = scaler.transform(artist_audio_features)

In [8]:
# Re-attach the saved artist index and feature names to the scaled values.
artist_audio_features = pd.DataFrame(
    data=artist_audio_features,
    columns=_cols,
    index=_idx,
)

In [9]:
# Number of non-null `played_at` entries per artist.
artist_play_count = data.groupby('artist')['played_at'].count()

In [10]:
# UMAP is stochastic when no random_state is given. Fix the seed so the
# embedding, and every plot and clustering derived from it, is the same on
# each run of the notebook.
mapper = UMAP(n_neighbors=15, random_state=42)

In [11]:
# Embed the scaled per-artist features and keep the result as a frame with
# columns x0/x1 plus the artist label of each row (assigned positionally).
X = pd.DataFrame(
    mapper.fit_transform(artist_audio_features),
    columns=['x0', 'x1'],
).assign(artists=artist_audio_features.index)

In [12]:
# Attach the play count of every artist. The counts are aligned on the artist
# name, so `artists` is the index while assigning and is turned back into the
# first column afterwards.
X = (
    X.set_index('artists')
    .assign(play_count=artist_play_count)
    .reset_index()
)

In [13]:
# Scatter of the embedding for artists with more than 10 plays, one colour
# per artist. plot() writes the figure to an HTML file.
fig = px.scatter(
    X.loc[X['play_count'] > 10],
    x='x0',
    y='x1',
    color='artists',
    hover_data=['artists'],
)
plot(fig)
#fig.write_html('./figures/umap_thplayed10_nn15_artists.html')

'temp-plot.html'

In [14]:
# Distinct artists played on or after 2020-05-27.
# NOTE(review): the datetime is naive, so .astimezone() interprets it in the
# machine's local time zone before converting to UTC; the cutoff therefore
# depends on where the notebook runs. Confirm that is intended.
neural_artists = data.loc[
    data['played_at'] >= datetime.datetime(2020, 5, 27).astimezone(pytz.utc),
    'artist',
].unique()

In [15]:
# Same scatter as before, but also keeping every artist in `neural_artists`
# even when it has 10 plays or fewer.
fig = px.scatter(
    X.loc[(X['play_count'] > 10) | X['artists'].isin(neural_artists)],
    x='x0',
    y='x1',
    color='artists',
    hover_data=['artists'],
)
plot(fig)
#fig.write_html('./figures/umap_thplayed10_nn15_artists.html')

'temp-plot.html'

In [54]:
import matplotlib

In [16]:
# Row masks bounding a listening window on 2020-05-30 (13:53:12 to 16:52:25).
# NOTE(review): both datetimes are naive, so .astimezone() reads them in the
# machine's local time zone before converting to UTC; confirm that is intended.
mask1 = data['played_at'].ge(
    datetime.datetime(2020, 5, 30, 13, 53, 12).astimezone(pytz.utc)
)
mask2 = data['played_at'].le(
    datetime.datetime(2020, 5, 30, 16, 52, 25).astimezone(pytz.utc)
)

In [17]:
# Distinct artists inside the window, in order of first appearance in `data`.
bezerra_radio = data.loc[mask1 & mask2, 'artist'].unique()

In [18]:
# Embedding rows of the artists heard in the window.
# NOTE(review): the filter keeps the row order of X, not the play order of
# `bezerra_radio`; the later cells use the row index for marker size/colour
# and connect the points with lines, so confirm this order is intended.
Xradio = X.loc[X['artists'].isin(bezerra_radio)]

In [31]:
# Renumber the rows 0..n-1; the index is later used as marker size and colour.
Xradio = Xradio.reset_index(drop=True)

In [89]:
# Background layer: artists with more than 10 plays plus every artist in
# `bezerra_radio`, drawn in grey.
to_plot = X[(X.play_count > 10) | (X.artists.isin(bezerra_radio))]
fig = go.Figure(go.Scatter(
    x=to_plot.x0,
    y=to_plot.x1,
    mode='markers',
    marker_color="grey",
    text=to_plot.artists))

# Colours for the highlighted rows are computed inside this cell so that it
# does not rely on a variable created by a cell further down the notebook.
# Each row position of Xradio is mapped through matplotlib's 'hot' colormap
# and emitted as a '#rrggbb' string: Plotly reads 'rgb(r, g, b)' components
# on a 0-255 scale, so matplotlib's 0-1 floats would all render near-black.
radio_cmap = plt.get_cmap('hot')
radio_norm = matplotlib.colors.Normalize(vmin=Xradio.index.min(),
                                         vmax=Xradio.index.max())
radio_colors = [matplotlib.colors.to_hex(radio_cmap(radio_norm(i)))
                for i in Xradio.index]

# Highlight layer: the Xradio rows joined by a line in row order; marker size
# grows with the row position (row 0 therefore gets size 0).
fig.add_trace(go.Scatter(x=Xradio.x0,
                         y=Xradio.x1,
                         mode="markers+lines",
                         marker=dict(
                            color=radio_colors,
                            size=Xradio.index),
                         text=Xradio.artists))

plot(fig)
#fig.write_html('./figures/umap_thplayed10_nn15_artists.html')

'temp-plot.html'

In [64]:
# Map every row position of Xradio to a colour of matplotlib's 'hot' colormap.
# plt.get_cmap replaces matplotlib.cm.get_cmap, which was deprecated in
# matplotlib 3.7 and removed in 3.9; matplotlib.colors.to_rgb replaces the
# legacy colorConverter alias.
hot_cmap = plt.get_cmap('hot')
norm = matplotlib.colors.Normalize(vmin=Xradio.index.min(), vmax=Xradio.index.max())
# One (r, g, b) tuple of floats in [0, 1] per row, alpha dropped.
hot_rgb = [matplotlib.colors.to_rgb(hot_cmap(norm(i))) for i in Xradio.index]

In [88]:
# Scatter of the Xradio rows only.
fig = go.Figure(go.Scatter(x=Xradio.x0,
                           y=Xradio.x1,
                           mode='markers',
                           text=Xradio.artists))

# The original cell had an empty `color=` argument, which is a SyntaxError.
# Colour each marker with its entry of `hot_rgb`, converted to a '#rrggbb'
# string because the tuples hold 0-1 floats while Plotly's 'rgb(...)'
# notation expects 0-255 components. Marker size grows with the row position.
fig.update_traces(marker=dict(
                              color=[matplotlib.colors.to_hex(c) for c in hot_rgb],
                              size=Xradio.index))

plot(fig)

'temp-plot.html'

In [92]:
from sklearn.cluster import AffinityPropagation, AgglomerativeClustering, KMeans, DBSCAN

In [132]:
# Agglomerative clustering model configured to produce 10 clusters.
clust = AgglomerativeClustering(n_clusters=10)

In [133]:
# Cluster the artists on their 2-D embedding coordinates; one label per row of X.
classes = clust.fit_predict(X.loc[:, ['x0', 'x1']])

In [134]:
# Largest cluster label, shown as the cell output.
classes.max()

9

In [135]:
# Store the cluster label of every row as a new `class` column (mutates X in place).
X['class'] = classes

In [136]:
# Display the rows grouped by cluster label; X itself is left unchanged.
X.sort_values(by='class')

Unnamed: 0,artists,x0,x1,play_count,class
291,Igor Stravinsky St Petersburg Philharmonic Orc...,9.774935,2.108355,13,0
126,Béla Bartók Hungarian National Philharmonic Zo...,9.989212,1.386041,8,0
125,Béla Bartók Frankfurt Radio Symphony Orchestra...,10.342969,0.288844,1,0
295,Itzhak Perlman Jeremy Lubbock John Williams Pi...,9.932921,1.857408,1,0
123,Burt Mitchell,8.935189,2.346592,1,0
...,...,...,...,...,...
378,Mariinsky Orchestra,7.231702,3.129254,2,9
387,Maíra Freitas,7.207127,2.994826,1,9
145,Charles Bolt,8.464114,2.448708,2,9
366,Lucio Yanel Yamandú Costa,7.071073,2.837062,11,9


In [137]:
# Scatter of the embedding for artists with more than 10 plays, coloured by
# cluster label; the artist name is shown on hover.
fig = px.scatter(
    X.loc[X['play_count'] > 10],
    x='x0',
    y='x1',
    color='class',
    hover_data=['artists'],
)
plot(fig)
#fig.write_html('./figures/umap_thplayed10_nn15_artists.html')

'temp-plot.html'