# Analysis and clustering of my favorite Spotify songs

I am a huge music fan and, in my opinion, there is no obvious pattern in the types of songs I like. Therefore, I decided to look a little deeper into it, perform an analysis, and try to distinguish some groups.

## Technologies used:
- json
- matplotlib
- seaborn
- numpy
- pandas
- requests
- urllib.parse
- base64
- K-means clustering
- TSNE


## Project plan
1. Data gathering
2. Data preparation and exploratory analysis of my streaming history.
3. Spotify API: Adjusting the class written by XXXX. Adding extra methods for my case.
4. K-Means clustering. Finding the appropriate number of clusters using the elbow method.
5. Cluster visualisation using TSNE.



## 1 Data gathering:
- My streaming history - songs which I listened to during the last 365 days (requested from the Spotify website)
- Spotify API - to get details about each song

## 2  Data preparation and exploratory analysis of my streaming history.

In [2]:
import json
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

import requests
import datetime
from urllib.parse import urlencode
import base64

In [None]:
# Load the three streaming-history exports, one DataFrame per file.
history_files = (
    'StreamingHistory0.json',
    'StreamingHistory1.json',
    'StreamingHistory2.json',
)
df_str0, df_str1, df_str2 = map(pd.read_json, history_files)

In [None]:
# Stack the three history frames into one.  DataFrame.append was removed in
# pandas 2.0, so pd.concat is used instead.  reset_index() (without drop) keeps
# the per-file row number as an 'index' column, exactly as the previous
# append + reset_index(inplace=True) sequence did.
df_temp = pd.concat([df_str0, df_str1, df_str2]).reset_index()

In [None]:
# Preview the first five rows of the combined streaming history.
df_temp.head(n=5)

In [None]:
# Number of streamed entries per artist, most frequent first.
df_temp['artistName'].value_counts()

I will create a data frame which shows how many times I played each song.

In [None]:
# Count how often each track title occurs in the history and turn the
# resulting Series into a two-column frame (title, number of plays).
df_my_songs = df_temp['trackName'].value_counts().to_frame().reset_index()
df_my_songs

Now I will select all the songs which were played more than 5 times, which sounds like a reasonable way to pick songs that I really like.

In [None]:
# Keep only tracks played more than 5 times.  The play count is the second
# column of the value_counts frame, but its label depends on the pandas
# version (older releases label it 'trackName', pandas >= 2.0 labels it
# 'count' and uses 'trackName' for the titles).  Selecting it by position
# works with both and avoids comparing track titles against an integer.
df_my_songs = df_my_songs[df_my_songs.iloc[:, 1] > 5]

In [None]:
# Give the two columns explicit names: the track title and its play count.
df_my_songs.columns = ['trackName','counts']

Finally, I will merge the original streaming dataframe with the track counts df and remove duplicates.

In [None]:
# Attach the play count to every history row of a frequently played track.
# The inner join drops rows whose track is not in df_my_songs.  The join key
# is named explicitly so the merge cannot silently start joining on
# additional columns if the two frames ever share more than 'trackName'.
df_my = df_temp.merge(df_my_songs, how='inner', on='trackName')
df_my

In [None]:
# Collapse the history to one row per track title, keeping the first occurrence.
df_my = df_my.drop_duplicates(subset=['trackName'], keep='first')

In [None]:
# Shape of the array of distinct track titles, i.e. how many tracks remain.
df_my['trackName'].unique().shape

## 3. Spotify API

To get data for my favorite tracks I will use code written by xxxxxx.

I will add a few methods to get the audio features of my songs. This will be done in two steps:

1. Searching for trackID based on artist and track name.
2. Searching for audio features based on trackID

In [18]:
import os

# Spotify API credentials are read from the environment so that secrets are
# never stored in the notebook.  Set SPOTIFY_CLIENT_ID and
# SPOTIFY_CLIENT_SECRET before running this cell; both are None when unset.
# NOTE: the credentials that used to be hard-coded here were exposed in the
# source and should be revoked/rotated in the Spotify developer dashboard.
client_id = os.environ.get("SPOTIFY_CLIENT_ID")
client_secret = os.environ.get("SPOTIFY_CLIENT_SECRET")

In [19]:
class SpotifyAPI(object):
    """Small Spotify Web API client using the client-credentials grant.

    The client exchanges ``client_id``/``client_secret`` for a bearer token at
    ``token_url``, caches it together with its expiry time, and re-authenticates
    when the cached token is missing or expired.
    """

    # Cached bearer token and its expiry; the class-level defaults describe an
    # "expired / not yet authenticated" state.
    access_token = None
    access_token_expires = datetime.datetime.now()
    access_token_did_expire = True
    client_id = None
    client_secret = None
    token_url = "https://accounts.spotify.com/api/token"

    def __init__(self, client_id, client_secret, *args, **kwargs):
        """Store the application credentials used for authentication."""
        super().__init__(*args, **kwargs)
        self.client_id = client_id
        self.client_secret = client_secret

    @staticmethod
    def _is_success(status_code):
        """Return True for any 2xx HTTP status code (200-299 inclusive)."""
        return 200 <= status_code < 300

    def get_client_credentials(self):
        """
        Returns a base64 encoded string of "client_id:client_secret".

        Raises:
            Exception: if either credential is None.
        """
        client_id = self.client_id
        client_secret = self.client_secret
        if client_secret is None or client_id is None:
            raise Exception("You must set client_id and client_secret")
        client_creds = f"{client_id}:{client_secret}"
        client_creds_b64 = base64.b64encode(client_creds.encode())
        return client_creds_b64.decode()

    def get_token_headers(self):
        """Return the HTTP Basic authorization header for the token request."""
        client_creds_b64 = self.get_client_credentials()
        return {
            "Authorization": f"Basic {client_creds_b64}"
        }

    def get_token_data(self):
        """Return the form body of the token request."""
        return {
            "grant_type": "client_credentials"
        }

    def perform_auth(self):
        """Request a new access token and cache it with its expiry time.

        Returns:
            True on success.

        Raises:
            Exception: if the token endpoint does not answer with a 2xx status.
        """
        token_url = self.token_url
        token_data = self.get_token_data()
        token_headers = self.get_token_headers()
        r = requests.post(token_url, data=token_data, headers=token_headers)
        if not self._is_success(r.status_code):
            raise Exception("Could not authenticate client.")
        data = r.json()
        now = datetime.datetime.now()
        access_token = data['access_token']
        expires_in = data['expires_in']  # seconds
        expires = now + datetime.timedelta(seconds=expires_in)
        self.access_token = access_token
        self.access_token_expires = expires
        self.access_token_did_expire = expires < now
        return True

    def get_access_token(self):
        """Return a bearer token, authenticating first if none is cached or
        the cached one has expired."""
        now = datetime.datetime.now()
        if self.access_token is None or self.access_token_expires < now:
            # A single refresh is enough; re-checking recursively could loop
            # forever if the service ever returned an already-expired token.
            self.perform_auth()
        return self.access_token

    def get_resource_header(self):
        """Return the bearer authorization header for API requests."""
        access_token = self.get_access_token()
        headers = {
            "Authorization": f"Bearer {access_token}"
        }
        return headers

    def get_resource(self, lookup_id, resource_type='albums', version='v1'):
        """GET ``/{version}/{resource_type}/{lookup_id}`` from the Web API.

        Returns:
            The decoded JSON body, or an empty dict when the response status
            is not 2xx.
        """
        endpoint = f"https://api.spotify.com/{version}/{resource_type}/{lookup_id}"
        headers = self.get_resource_header()
        r = requests.get(endpoint, headers=headers)
        if not self._is_success(r.status_code):
            return {}
        return r.json()

    def get_album(self, _id):
        """Fetch the 'albums' resource with the given id."""
        return self.get_resource(_id, resource_type='albums')

    def get_artist(self, _id):
        """Fetch the 'artists' resource with the given id."""
        return self.get_resource(_id, resource_type='artists')

    def get_audio_features(self, _id):
        """Fetch the 'audio-features' resource with the given id."""
        return self.get_resource(_id, resource_type='audio-features')

    def base_search(self, query_params):
        """Call the search endpoint with an already URL-encoded query string.

        Returns:
            The decoded JSON body, or an empty dict when the response status
            is not 2xx.
        """
        headers = self.get_resource_header()
        endpoint = "https://api.spotify.com/v1/search"
        lookup_url = f"{endpoint}?{query_params}"
        r = requests.get(lookup_url, headers=headers)
        if not self._is_success(r.status_code):
            return {}
        return r.json()

    def search(self, query=None, operator=None, operator_query=None, search_type='track' ):
        """Build a search query and run it through ``base_search``.

        Args:
            query: a query string, or a dict whose items are joined as
                space-separated ``key:value`` filters.
            operator: optional "or" / "not" (case-insensitive); any other
                value is ignored.
            operator_query: string appended after the operator; ignored
                unless it is a ``str``.
            search_type: item type to search for; lower-cased before sending.

        Raises:
            Exception: if ``query`` is None.
        """
        if query is None:
            raise Exception("A query is required")
        if isinstance(query, dict):
            query = " ".join([f"{k}:{v}" for k, v in query.items()])
        if operator is not None and operator_query is not None:
            if operator.lower() in ("or", "not"):
                operator = operator.upper()
                if isinstance(operator_query, str):
                    query = f"{query} {operator} {operator_query}"
        query_params = urlencode({"q": query, "type": search_type.lower()})
        print(query_params)
        return self.base_search(query_params)

In [20]:
# Client used for all Spotify lookups below.
spotify = SpotifyAPI(client_id=client_id, client_secret=client_secret)

In [None]:
# Renumber rows 0..n-1 so they can be addressed by position with .loc below.
df_my = df_my.reset_index(drop=True)

## 1. Searching for trackID based on artist and track name.
By investigating the output of the search results I managed to find the trackID path. Since not all the songs can be found, to avoid errors I will use try/except and put NaN as the missing ID, which will then be dropped.

In [None]:
# Look up the Spotify track id of every song by artist and track name.
# A song gets NaN as its id when the search yields no items (IndexError) or
# when the request failed and spotify.search returned an empty dict
# (KeyError on 'tracks'); those rows can then be dropped.
track_ids = []
for artist_name, track_name in zip(df_my['artistName'], df_my['trackName']):
    query = {"artist": artist_name, "track": track_name}
    try:
        result = spotify.search(query, search_type="track")
        track_ids.append(result['tracks']['items'][0]['id'])
    except (IndexError, KeyError):
        # np.nan rather than np.NaN: the latter alias was removed in NumPy 2.0.
        track_ids.append(np.nan)

# Assign the whole column at once instead of enlarging the frame cell by cell.
df_my["trackID"] = track_ids

In [None]:
# Drop only the songs for which no trackID was found.  Restricting dropna to
# the 'trackID' column keeps rows that merely have a missing value in some
# other column.
df_my.dropna(subset=['trackID'], inplace=True)
df_my.reset_index(inplace=True, drop=True)
df_my.head(10)

In [None]:
# Fetch the audio features of every track and store each returned field in a
# column of the same name.  An empty response adds nothing for that row.
for row in range(len(df_my)):
    print(row)  # progress indicator
    features = spotify.get_audio_features(df_my.loc[row, "trackID"])
    for name in features:
        df_my.loc[row, str(name)] = features[name]

In [None]:
# df_my.to_csv('my_top_with_features.csv')