In [7]:
import itertools
import os

import pandas as pd
import spotipy

# Authentication with Spotify

In [8]:
# Client Credentials flow - app-level authentication, sufficient for reading
# audio features and audio analysis of tracks.
#
# The credentials are read from the environment so that no secret is stored in
# the notebook. SPOTIPY_CLIENT_ID / SPOTIPY_CLIENT_SECRET are the variable names
# spotipy itself looks for. A missing variable raises KeyError right here.
# NOTE(review): any client secret that was ever committed with this notebook
# should be treated as leaked and rotated in the Spotify developer dashboard.
CLIENT_ID = os.environ["SPOTIPY_CLIENT_ID"]
CLIENT_SECRET = os.environ["SPOTIPY_CLIENT_SECRET"]
token = spotipy.oauth2.SpotifyClientCredentials(client_id=CLIENT_ID, client_secret=CLIENT_SECRET)
sp = spotipy.Spotify(auth_manager=token)

# Load dataset with ratings and features

In [9]:
# Read the top-tracks table (ratings plus audio features) and display it.
df_toptracks_features = pd.read_csv(
    filepath_or_buffer="./dataset/toptracks_ratings_features.csv"
)
df_toptracks_features

Unnamed: 0,id,name,range,rating,id_copy,energy,danceability,loudness,valence,tempo,speechiness,acousticness,instrumentalness,duration_ms,time_signature
0,7L5IwfKB6W0tadcSh9wlyH,Ouverture,short_term,5.393939,7L5IwfKB6W0tadcSh9wlyH,0.3360,0.4960,-19.440,0.1850,129.006,0.0513,0.95500,0.912000,322173,4
1,0tAZi3X7dUdd7m8OXB8pMA,Shadow,short_term,5.272727,0tAZi3X7dUdd7m8OXB8pMA,0.2750,0.0832,-15.256,0.0334,170.316,0.0347,0.88700,0.853000,558267,1
2,1XMDIKQbV30WJPKLMN6MKv,INSTRUCTION,short_term,4.969697,1XMDIKQbV30WJPKLMN6MKv,0.3550,0.4910,-12.480,0.0369,145.405,0.0354,0.38200,0.887000,262733,4
3,1XZdwzd8DTDvkjVc0eJ9BI,Wildlife Analysis,short_term,4.909091,1XZdwzd8DTDvkjVc0eJ9BI,0.0204,0.1550,-31.212,0.2780,79.755,0.0462,0.99500,0.936000,75627,4
4,1f4cKwcKfNiLbQr8x2tZ3C,Melt!,short_term,4.848485,1f4cKwcKfNiLbQr8x2tZ3C,0.9190,0.7850,-13.059,0.4430,131.037,0.0552,0.00772,0.916000,214307,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
233,2OqtZbITDWCFUHAT9fmdin,Choses nouvelles,long_term,4.242424,2OqtZbITDWCFUHAT9fmdin,0.3830,0.6690,-12.993,0.5690,140.011,0.0385,0.46500,0.856000,234600,4
234,6EPRKhUOdiFSQwGBRBbvsZ,Ace of Spades,long_term,4.181818,6EPRKhUOdiFSQwGBRBbvsZ,0.9100,0.3420,-7.748,0.5470,140.452,0.0758,0.00004,0.000121,166360,4
235,4Jj8pWRyVjh0KIJLrcreRa,Here We Go Jack,long_term,4.121212,4Jj8pWRyVjh0KIJLrcreRa,0.3090,0.6790,-10.606,0.4730,163.979,0.0320,0.58500,0.907000,237938,4
236,3v9g8iM3v5irWQHqFWaDSo,Cómo Te Quiero,long_term,4.060606,3v9g8iM3v5irWQHqFWaDSo,0.2770,0.6080,-11.917,0.4370,126.501,0.0282,0.62200,0.896000,242536,4


# Audio analysis

## Define helper functions and variables to gather and process the analysis data

In [10]:
# Audio-analysis features to collect, grouped by the top-level key of the
# analysis response ("feature parent") whose items carry them.
analysis_features_list = dict(
    segments=["pitches", "timbre"],
    sections=["key", "mode"],
)


def gather_analysis_data(track_id, analysis_features_list, audio_analysis):
    """Collect the requested raw feature values from an audio-analysis response.

    Parameters
    ----------
    track_id : str
        Identifier of the track. It is not used by this function and is kept
        only so existing callers keep working.
    analysis_features_list : dict
        Maps a top-level key of ``audio_analysis`` (the "feature parent", e.g.
        ``"segments"``) to the list of feature names to read from each of its
        items.
    audio_analysis : dict
        The analysis response. Every requested feature parent is expected to
        hold a sequence of dicts.

    Returns
    -------
    dict
        One entry per requested feature name; the value is the list of that
        feature's values in response order. A feature whose parent is absent
        from ``audio_analysis`` maps to an empty list.

    Raises
    ------
    KeyError
        If an item under a requested parent lacks one of the requested features.
    """
    analysis_data = {}

    for feature_parent, feature_array in analysis_features_list.items():
        # Direct lookup instead of scanning every key of the response.
        items = audio_analysis.get(feature_parent, [])

        for feature in feature_array:
            analysis_data[feature] = [item[feature] for item in items]

    return analysis_data


def clean_analysis_data(analysis_data, duration_ms):
    """Reduce every raw analysis feature to a single summary value.

    Parameters
    ----------
    analysis_data : dict
        Maps a feature name to the list of raw values gathered for one track.
        Values are either lists of numbers (one vector per item, e.g. pitches
        or timbre) or plain numbers (e.g. key or mode).
    duration_ms : number
        Track duration in milliseconds. Kept for interface compatibility; see
        the note on the ``"key"`` summary below.

    Returns
    -------
    dict
        Same keys as ``analysis_data``; every value is a one-element list so
        the result can be turned into a one-row DataFrame:

        * vector features -> ``[[avg_0, avg_1, ...]]``, the element-wise mean
          over all vectors, each rounded to 2 decimals;
        * ``"key"`` -> ``[100 - 100 / n]`` where ``n`` is the number of runs of
          consecutive equal keys (0 for a single key, approaching 100 as the
          key changes more often);
        * any other scalar feature -> ``[mean]`` rounded to 2 decimals;
        * a feature with no values -> ``[nan]``.

    Notes
    -----
    The ``"key"`` summary was previously computed as
    ``abs((100 / duration_ms) * (duration_ms / n) - 100)``. The duration cancels
    out of that expression, so it is evaluated here directly as
    ``100 - 100 / n``; this also avoids a ZeroDivisionError when
    ``duration_ms`` is 0.
    """
    result = {}

    for key, val in analysis_data.items():
        # Nothing was gathered for this feature: emit a missing value rather
        # than failing on val[0] below.
        if not val:
            result[key] = [float("nan")]
            continue

        if isinstance(val[0], list):
            # Element-wise mean across all vectors:
            # [[1, 2, 3], [3, 4, 5]] -> [2.0, 3.0, 4.0]
            width = len(val[0])
            averages = [
                round(sum(vector[i] for vector in val) / len(val), 2)
                for i in range(width)
            ]
            # Wrapped in an outer list so the whole vector is a single cell.
            result[key] = [averages]
        elif key == "key":
            # Count runs of consecutive equal keys, i.e. collapse repeats.
            run_count = sum(1 for _ in itertools.groupby(val))
            result[key] = [100.0 - 100.0 / run_count]
        else:
            # Plain average of a scalar feature such as mode.
            result[key] = [round(sum(val) / len(val), 2)]

    return result

## Collect, clean, and gather analysis data

In [11]:
# Collect one record per track and build the DataFrame once at the end.
# Calling pd.concat inside the loop would re-copy the accumulated frame on
# every iteration.
analysis_records = []

for track_id, duration_ms in zip(df_toptracks_features["id"], df_toptracks_features["duration_ms"]):
    # Spotify API call: one request per track.
    audio_analysis = sp.audio_analysis(track_id)

    # Gather the raw features, then reduce them to one value per feature.
    analysis_data = gather_analysis_data(track_id, analysis_features_list, audio_analysis)
    analysis_data_cleaned = clean_analysis_data(analysis_data, duration_ms)

    # clean_analysis_data wraps every value in a one-element list; unwrap it
    # so each feature becomes a single cell of this track's row.
    record = {feature: values[0] for feature, values in analysis_data_cleaned.items()}
    # Keep the track id next to the analysis so row alignment can be verified later.
    record["id_copy_2"] = track_id
    analysis_records.append(record)

# Rows are in the same order as df_toptracks_features, with a fresh 0..n-1 index.
# Rename the columns so they describe the summary they hold.
df_analysis_result = pd.DataFrame(analysis_records).rename(
    columns={
        "pitches": "pitch_avg",
        "timbre": "timbre_avg",
        "key": "key_change_percentage",
        "mode": "mode_avg",
    }
)
# Display for a visual check.
df_analysis_result

Unnamed: 0,pitch_avg,timbre_avg,key_change_percentage,mode_avg,id_copy_2
0,"[0.27, 0.58, 0.21, 0.27, 0.18, 0.34, 0.26, 0.0...","[36.4, -140.17, -20.32, -18.11, 13.38, -37.86,...",75.000000,0.14,7L5IwfKB6W0tadcSh9wlyH
1,"[0.18, 0.26, 0.64, 0.5, 0.16, 0.26, 0.16, 0.21...","[40.29, -65.03, 29.6, -16.94, 3.88, -26.49, -1...",93.333333,0.86,0tAZi3X7dUdd7m8OXB8pMA
2,"[0.51, 0.26, 0.22, 0.12, 0.12, 0.2, 0.21, 0.39...","[38.23, -65.25, -37.01, -15.68, 9.7, -36.31, 2...",75.000000,0.36,1XMDIKQbV30WJPKLMN6MKv
3,"[0.12, 0.06, 0.43, 0.1, 0.56, 0.08, 0.08, 0.16...","[28.95, -181.05, 81.47, 3.86, 94.46, -38.57, -...",0.000000,0.33,1XZdwzd8DTDvkjVc0eJ9BI
4,"[0.63, 0.68, 0.49, 0.41, 0.46, 0.5, 0.56, 0.53...","[36.82, 22.73, -62.74, 44.28, 36.1, -56.15, -1...",85.714286,0.56,1f4cKwcKfNiLbQr8x2tZ3C
...,...,...,...,...,...
233,"[0.36, 0.36, 0.31, 0.19, 0.24, 0.25, 0.45, 0.2...","[42.33, -84.86, -38.32, -18.36, -1.64, -23.54,...",90.909091,0.45,2OqtZbITDWCFUHAT9fmdin
234,"[0.31, 0.43, 0.34, 0.66, 0.67, 0.54, 0.46, 0.3...","[51.29, 89.0, 28.24, -10.6, 3.43, -30.31, -2.3...",80.000000,0.50,6EPRKhUOdiFSQwGBRBbvsZ
235,"[0.29, 0.49, 0.36, 0.16, 0.52, 0.21, 0.28, 0.1...","[41.27, -87.94, -54.71, -3.61, 1.01, -23.17, 4...",87.500000,0.75,4Jj8pWRyVjh0KIJLrcreRa
236,"[0.32, 0.22, 0.24, 0.16, 0.39, 0.14, 0.23, 0.4...","[40.47, -62.61, 3.05, -1.31, -12.6, -24.89, -1...",88.888889,0.73,3v9g8iM3v5irWQHqFWaDSo


# Add analysis to toptracks dataset

In [12]:
# Append the analysis columns to the right of the existing track data.
# Rows are matched on the index labels of both frames.
df_toptracks_features_analysis = pd.concat(
    [df_toptracks_features, df_analysis_result],
    axis="columns",
    ignore_index=False,
)
df_toptracks_features_analysis

Unnamed: 0,id,name,range,rating,id_copy,energy,danceability,loudness,valence,tempo,speechiness,acousticness,instrumentalness,duration_ms,time_signature,pitch_avg,timbre_avg,key_change_percentage,mode_avg,id_copy_2
0,7L5IwfKB6W0tadcSh9wlyH,Ouverture,short_term,5.393939,7L5IwfKB6W0tadcSh9wlyH,0.3360,0.4960,-19.440,0.1850,129.006,0.0513,0.95500,0.912000,322173,4,"[0.27, 0.58, 0.21, 0.27, 0.18, 0.34, 0.26, 0.0...","[36.4, -140.17, -20.32, -18.11, 13.38, -37.86,...",75.000000,0.14,7L5IwfKB6W0tadcSh9wlyH
1,0tAZi3X7dUdd7m8OXB8pMA,Shadow,short_term,5.272727,0tAZi3X7dUdd7m8OXB8pMA,0.2750,0.0832,-15.256,0.0334,170.316,0.0347,0.88700,0.853000,558267,1,"[0.18, 0.26, 0.64, 0.5, 0.16, 0.26, 0.16, 0.21...","[40.29, -65.03, 29.6, -16.94, 3.88, -26.49, -1...",93.333333,0.86,0tAZi3X7dUdd7m8OXB8pMA
2,1XMDIKQbV30WJPKLMN6MKv,INSTRUCTION,short_term,4.969697,1XMDIKQbV30WJPKLMN6MKv,0.3550,0.4910,-12.480,0.0369,145.405,0.0354,0.38200,0.887000,262733,4,"[0.51, 0.26, 0.22, 0.12, 0.12, 0.2, 0.21, 0.39...","[38.23, -65.25, -37.01, -15.68, 9.7, -36.31, 2...",75.000000,0.36,1XMDIKQbV30WJPKLMN6MKv
3,1XZdwzd8DTDvkjVc0eJ9BI,Wildlife Analysis,short_term,4.909091,1XZdwzd8DTDvkjVc0eJ9BI,0.0204,0.1550,-31.212,0.2780,79.755,0.0462,0.99500,0.936000,75627,4,"[0.12, 0.06, 0.43, 0.1, 0.56, 0.08, 0.08, 0.16...","[28.95, -181.05, 81.47, 3.86, 94.46, -38.57, -...",0.000000,0.33,1XZdwzd8DTDvkjVc0eJ9BI
4,1f4cKwcKfNiLbQr8x2tZ3C,Melt!,short_term,4.848485,1f4cKwcKfNiLbQr8x2tZ3C,0.9190,0.7850,-13.059,0.4430,131.037,0.0552,0.00772,0.916000,214307,4,"[0.63, 0.68, 0.49, 0.41, 0.46, 0.5, 0.56, 0.53...","[36.82, 22.73, -62.74, 44.28, 36.1, -56.15, -1...",85.714286,0.56,1f4cKwcKfNiLbQr8x2tZ3C
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
233,2OqtZbITDWCFUHAT9fmdin,Choses nouvelles,long_term,4.242424,2OqtZbITDWCFUHAT9fmdin,0.3830,0.6690,-12.993,0.5690,140.011,0.0385,0.46500,0.856000,234600,4,"[0.36, 0.36, 0.31, 0.19, 0.24, 0.25, 0.45, 0.2...","[42.33, -84.86, -38.32, -18.36, -1.64, -23.54,...",90.909091,0.45,2OqtZbITDWCFUHAT9fmdin
234,6EPRKhUOdiFSQwGBRBbvsZ,Ace of Spades,long_term,4.181818,6EPRKhUOdiFSQwGBRBbvsZ,0.9100,0.3420,-7.748,0.5470,140.452,0.0758,0.00004,0.000121,166360,4,"[0.31, 0.43, 0.34, 0.66, 0.67, 0.54, 0.46, 0.3...","[51.29, 89.0, 28.24, -10.6, 3.43, -30.31, -2.3...",80.000000,0.50,6EPRKhUOdiFSQwGBRBbvsZ
235,4Jj8pWRyVjh0KIJLrcreRa,Here We Go Jack,long_term,4.121212,4Jj8pWRyVjh0KIJLrcreRa,0.3090,0.6790,-10.606,0.4730,163.979,0.0320,0.58500,0.907000,237938,4,"[0.29, 0.49, 0.36, 0.16, 0.52, 0.21, 0.28, 0.1...","[41.27, -87.94, -54.71, -3.61, 1.01, -23.17, 4...",87.500000,0.75,4Jj8pWRyVjh0KIJLrcreRa
236,3v9g8iM3v5irWQHqFWaDSo,Cómo Te Quiero,long_term,4.060606,3v9g8iM3v5irWQHqFWaDSo,0.2770,0.6080,-11.917,0.4370,126.501,0.0282,0.62200,0.896000,242536,4,"[0.32, 0.22, 0.24, 0.16, 0.39, 0.14, 0.23, 0.4...","[40.47, -62.61, 3.05, -1.31, -12.6, -24.89, -1...",88.888889,0.73,3v9g8iM3v5irWQHqFWaDSo


In [13]:
# Every row must carry the same track id in all three id columns; otherwise the
# analysis columns were attached to the wrong tracks.
df_check = df_toptracks_features_analysis[["id", "id_copy", "id_copy_2"]]

# Fail loudly instead of relying on a visual check of the truncated display.
assert df_check["id"].eq(df_check["id_copy"]).all(), "id and id_copy are misaligned"
assert df_check["id"].eq(df_check["id_copy_2"]).all(), "id and id_copy_2 are misaligned"

df_check

Unnamed: 0,id,id_copy,id_copy_2
0,7L5IwfKB6W0tadcSh9wlyH,7L5IwfKB6W0tadcSh9wlyH,7L5IwfKB6W0tadcSh9wlyH
1,0tAZi3X7dUdd7m8OXB8pMA,0tAZi3X7dUdd7m8OXB8pMA,0tAZi3X7dUdd7m8OXB8pMA
2,1XMDIKQbV30WJPKLMN6MKv,1XMDIKQbV30WJPKLMN6MKv,1XMDIKQbV30WJPKLMN6MKv
3,1XZdwzd8DTDvkjVc0eJ9BI,1XZdwzd8DTDvkjVc0eJ9BI,1XZdwzd8DTDvkjVc0eJ9BI
4,1f4cKwcKfNiLbQr8x2tZ3C,1f4cKwcKfNiLbQr8x2tZ3C,1f4cKwcKfNiLbQr8x2tZ3C
...,...,...,...
233,2OqtZbITDWCFUHAT9fmdin,2OqtZbITDWCFUHAT9fmdin,2OqtZbITDWCFUHAT9fmdin
234,6EPRKhUOdiFSQwGBRBbvsZ,6EPRKhUOdiFSQwGBRBbvsZ,6EPRKhUOdiFSQwGBRBbvsZ
235,4Jj8pWRyVjh0KIJLrcreRa,4Jj8pWRyVjh0KIJLrcreRa,4Jj8pWRyVjh0KIJLrcreRa
236,3v9g8iM3v5irWQHqFWaDSo,3v9g8iM3v5irWQHqFWaDSo,3v9g8iM3v5irWQHqFWaDSo


# Save to disk

In [14]:
# Write the combined table to disk; the DataFrame index is not written.
df_toptracks_features_analysis.to_csv(
    path_or_buf="./dataset/toptracks_ratings_features_analysis.csv",
    index=False,
)