In [87]:
import pandas as pd
import numpy as np
import spotipy as sp
import spotipy.util as util
from spotipy.oauth2 import SpotifyClientCredentials
import json
import os
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
from sklearn.cluster import AgglomerativeClustering
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE


In [68]:
pio.templates.default = "plotly_white"

In [44]:
music_df = pd.read_json('StreamingHistory0.json')
df2 = pd.read_json('StreamingHistory1.json')
df3 = pd.read_json('StreamingHistory2.json')
df4 = pd.read_json('StreamingHistory3.json')
df5 = pd.read_json('StreamingHistory4.json')
music_df = pd.concat([music_df, df2, df3, df4, df5])

In [45]:
music_df.head(10)
#df.tail(10)
#df.info()

Unnamed: 0,endTime,artistName,trackName,msPlayed
0,2022-09-22 16:28,Death Cab for Cutie,Transatlanticism,244250
1,2022-11-02 00:02,Alan Walker,Faded,212626
2,2022-11-02 00:03,Alan Walker,Faded,52928
3,2022-11-02 00:14,The Floating Mind,Brown Noise (200 Hz),12695
4,2022-11-02 00:14,Justin Warnick,Stranger,157499
5,2022-11-02 00:16,Weekend,Coma Summer,11754
6,2022-11-02 00:16,Two Feet,Your Mother Was Cheaper,3249
7,2022-11-02 00:16,King Krule,363N63,121292
8,2022-11-02 00:16,John Mayer,Waiting On the World to Change,2474
9,2022-11-02 00:20,Jeff Rosenstock,We Begged 2 Explode,224149


In [46]:
music_df['endTime']=pd.to_datetime(music_df['endTime'])
music_df['minPlayed']=(music_df['msPlayed']/1000/60)
music_df.drop('msPlayed', axis=1, inplace=True)

In [47]:
music_df.head()

Unnamed: 0,endTime,artistName,trackName,minPlayed
0,2022-09-22 16:28:00,Death Cab for Cutie,Transatlanticism,4.070833
1,2022-11-02 00:02:00,Alan Walker,Faded,3.543767
2,2022-11-02 00:03:00,Alan Walker,Faded,0.882133
3,2022-11-02 00:14:00,The Floating Mind,Brown Noise (200 Hz),0.211583
4,2022-11-02 00:14:00,Justin Warnick,Stranger,2.624983


In [48]:
music_df.tail()

Unnamed: 0,endTime,artistName,trackName,minPlayed
6272,2023-11-02 21:38:00,The War On Drugs,Nothing to Find,3.724133
6273,2023-11-02 22:16:00,James Blake,CMYK,3.654583
6274,2023-11-02 22:17:00,Burial,Kindred,0.125867
6275,2023-11-02 22:22:00,Burial,Ashtray wasp,5.185067
6276,2023-11-02 23:04:00,Burial,Near Dark,0.009233


In [49]:
music_df['dayOfWeek']=music_df['endTime'].dt.day_name()
music_df['dayOfMonth']=music_df['endTime'].dt.day
music_df['timeOfDay']=music_df['endTime'].dt.hour
music_df['date']=music_df['endTime'].dt.date

In [50]:
music_df.head()

Unnamed: 0,endTime,artistName,trackName,minPlayed,dayOfWeek,dayOfMonth,timeOfDay,date
0,2022-09-22 16:28:00,Death Cab for Cutie,Transatlanticism,4.070833,Thursday,22,16,2022-09-22
1,2022-11-02 00:02:00,Alan Walker,Faded,3.543767,Wednesday,2,0,2022-11-02
2,2022-11-02 00:03:00,Alan Walker,Faded,0.882133,Wednesday,2,0,2022-11-02
3,2022-11-02 00:14:00,The Floating Mind,Brown Noise (200 Hz),0.211583,Wednesday,2,0,2022-11-02
4,2022-11-02 00:14:00,Justin Warnick,Stranger,2.624983,Wednesday,2,0,2022-11-02


In [51]:
print("I listened to {} tracks, {:.3f} per day in the last year (crazy!)".format( len(music_df), len(music_df)/365))

I listened to 46277 tracks, 126.786 per day in the last year (crazy!)


In [52]:
print("This amounts to about {} hours total, and {:.3f} hours per day.".format(music_df.minPlayed.sum()/60, music_df.minPlayed.sum()/365/60))

This amounts to about 2228.3570124999997 hours total, and 6.105 hours per day.


In [53]:
print("which then equates to about {:.3f} minutes per song.".format(music_df.minPlayed.sum()/len(music_df)))

which then equates to about 2.889 minutes per song.


In [54]:
artist_track_count = music_df.groupby('artistName')['trackName'].count().reset_index()
artist_track_count.columns = ['artist', 'play counts']

artist_track_count = artist_track_count.sort_values('play counts', ascending = False)
artist_track_count.head()
artist_track_short = artist_track_count[:10]

In [69]:
fig = px.bar(artist_track_short, x='play counts', y='artist', orientation='h',
             labels={'play counts': 'play counts', 'artist': 'artist'},
             title='Track Count for Each Artist')
fig.update_traces(text=artist_track_short.apply(lambda x: f"{x['play counts']}", axis=1),
                  textposition='inside')

fig.show()

In [70]:
track_counts = music_df.groupby('trackName').size().reset_index(name = 'Listen Count')
track_counts = track_counts.sort_values('Listen Count', ascending=False)
track_counts
track_counts.head(20)

Unnamed: 0,trackName,Listen Count
2942,Haldern,281
783,Basketball Shoes,208
1523,Concorde,180
7703,Track X,165
8767,i tried,162
6219,Self Control,159
7382,The Place Where He Inserted the Blade,149
4537,Making The Band (Danity Kane),132
8926,skin in the game,129
8809,kisses,124


In [71]:
track_counts_short = track_counts[:10]
fig = px.bar(track_counts_short, x='Listen Count', y='trackName', orientation='h',
             labels={'Listen Count': 'Listen Count', 'trackName': 'trackName'},
             title='Listen Count for each Song')
fig.update_traces(text=track_counts_short['Listen Count'], textposition='outside')
fig.update_traces(text=track_counts_short.apply(lambda x: f"{x['Listen Count']}", axis=1),
                  textposition='inside')

fig.show()

In [72]:

fig_new = go.Figure(data=go.Heatmap(z=timevsweek.values,
                                    x=timevsweek.columns,
                                    y=timevsweek.index,
                                    colorscale='Blues'))

fig_new.update_layout(
    title='Listening Map',
    xaxis=dict(title='Time of Day'),
    yaxis=dict(title='Day of Week'),
    xaxis_ticksuffix='h',
)

fig_new.show()

In [59]:
SPOTIPY_CLIENT_ID = '2344996eab174070a213ad526d96c0f6'
SPOTIPY_CLIENT_SECRET = 'cdceafd465504a3cad736d27c385f3a4'

In [60]:
client_credentials_manager = SpotifyClientCredentials(client_id=SPOTIPY_CLIENT_ID, client_secret=SPOTIPY_CLIENT_SECRET)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

In [61]:
def get_song_features(track):
    results = sp.search(q='track:{}'.format(track), type='track')

    # Check if there are any search results
    if results['tracks']['items']:
        # Extract the first artist's name
        artist = results['tracks']['items'][0]['artists'][0]['name']
        
        # Search again with both track name and artist
        track_info = sp.search(q='track:{} artist:{}'.format(track, artist), type='track')

        # Check if there are any search results
        if track_info['tracks']['items']:
            # Get the track ID
            track_id = track_info['tracks']['items'][0]['id']

            # Get the audio features for the track
            audio_features = sp.audio_features(track_id)

            return audio_features
        else:
            print("No results found for the track '{}' by the artist '{}'.".format(track, artist))
    else:
        print("No results found for the track '{}'.".format(track))


In [62]:
top_100 = track_counts[:100]
audio_features = []
for track in top_100['trackName']:
    features = get_song_features(track)
    audio_features.append(features)
    
    

In [63]:
audio_features

[[{'danceability': 0.368,
   'energy': 0.318,
   'key': 4,
   'loudness': -11.182,
   'mode': 1,
   'speechiness': 0.0358,
   'acousticness': 0.627,
   'instrumentalness': 0.188,
   'liveness': 0.0804,
   'valence': 0.122,
   'tempo': 81.116,
   'type': 'audio_features',
   'id': '4EVwKp8hDRmxrT6Dzu6aBc',
   'uri': 'spotify:track:4EVwKp8hDRmxrT6Dzu6aBc',
   'track_href': 'https://api.spotify.com/v1/tracks/4EVwKp8hDRmxrT6Dzu6aBc',
   'analysis_url': 'https://api.spotify.com/v1/audio-analysis/4EVwKp8hDRmxrT6Dzu6aBc',
   'duration_ms': 305213,
   'time_signature': 4}],
 [{'danceability': 0.283,
   'energy': 0.451,
   'key': 7,
   'loudness': -8.585,
   'mode': 1,
   'speechiness': 0.0452,
   'acousticness': 0.0203,
   'instrumentalness': 0.00318,
   'liveness': 0.0463,
   'valence': 0.0683,
   'tempo': 91.116,
   'type': 'audio_features',
   'id': '4ShVovnLIlObG43vwaCRkJ',
   'uri': 'spotify:track:4ShVovnLIlObG43vwaCRkJ',
   'track_href': 'https://api.spotify.com/v1/tracks/4ShVovnLIlObG43

In [64]:
numerical_data = []
for inner_list in audio_features:
    # Extract numerical data from each dictionary
    numerical_dict = {key: value for dict_ in inner_list for key, value in dict_.items() if isinstance(value, (int, float))}
    
    # Append the numerical data to the list
    numerical_data.append(numerical_dict)

# Convert the list of dictionaries into a DataFrame
feature_df = pd.DataFrame(numerical_data)

# Display the DataFrame
print(feature_df)

    danceability  energy  key  loudness  mode  speechiness  acousticness  \
0          0.368   0.318    4   -11.182     1       0.0358      0.627000   
1          0.283   0.451    7    -8.585     1       0.0452      0.020300   
2          0.565   0.284    9   -11.881     1       0.0286      0.023000   
3          0.669   0.574    4    -6.442     0       0.0286      0.017500   
4          0.727   0.710    4    -6.142     1       0.0742      0.010400   
..           ...     ...  ...       ...   ...          ...           ...   
95         0.583   0.676    6    -4.440     1       0.0298      0.028900   
96         0.107   0.678    0   -12.594     1       0.0581      0.000019   
97         0.669   0.749    7    -6.811     0       0.1410      0.413000   
98         0.371   0.441    7   -10.936     1       0.0305      0.006270   
99         0.466   0.510    1   -14.373     1       0.0715      0.920000   

    instrumentalness  liveness  valence    tempo  duration_ms  time_signature  
0      

In [65]:
normalized_df=(feature_df-feature_df.min())/(feature_df.max()-feature_df.min())

In [73]:
fig = px.histogram(feature_df, x="danceability")
fig.show()

In [74]:
fig = px.histogram(feature_df, x="instrumentalness")
fig.show()

In [75]:
fig = px.histogram(feature_df, x="acousticness")
fig.show()

In [76]:
fig = px.histogram(feature_df, x="valence")
fig.show()

In [77]:
fig = px.histogram(feature_df, x="tempo")
fig.show()

In [79]:
fig = px.histogram(feature_df, x="speechiness")
fig.show()

In [88]:
scaler = StandardScaler()
scaled_features = scaler.fit_transform(feature_df.drop('Cluster', axis=1))  # Drop the 'Cluster' column before scaling

# Perform hierarchical clustering
# You can adjust the number of clusters and linkage method as needed
clustering = AgglomerativeClustering(n_clusters=5, linkage='ward')
clusters = clustering.fit_predict(scaled_features)

# Add cluster labels to the DataFrame
feature_df['Cluster'] = clusters

# PCA
pca = PCA(n_components=2)
pca_result = pca.fit_transform(scaled_features)

# t-SNE
tsne = TSNE(n_components=2)
tsne_result = tsne.fit_transform(scaled_features)

# Create a DataFrame for PCA and t-SNE results
pca_df = pd.DataFrame(data=pca_result, columns=['PC1', 'PC2'])
tsne_df = pd.DataFrame(data=tsne_result, columns=['TSNE1', 'TSNE2'])
plot_df = pd.concat([pca_df, tsne_df, feature_df['Cluster']], axis=1)

# Plot PCA
fig_pca = px.scatter(plot_df, x='PC1', y='PC2', color='Cluster', title='PCA Plot')
fig_pca.show()

# Plot t-SNE
fig_tsne = px.scatter(plot_df, x='TSNE1', y='TSNE2', color='Cluster', title='t-SNE Plot')
fig_tsne.show()
