### Datasets with audio analysis
[Billboard](https://drive.google.com/open?id=10wzAdM6AfbuMdTzA74zjjvDZbcxgTNXG)  
[OCC (Official Charts)](https://drive.google.com/open?id=1FOxNSA7CWhXJbCco4lDnVU5Pwab_d9ep)

In [1]:
import pandas as pd 
import numpy as np 
# Load the Billboard table from a CSV given by a relative path, so the file is
# expected in the notebook's working directory.
billboard = pd.read_csv('billboard_including_features_and_analysis.csv')

In [2]:
# Summarise the loaded frame: row count, column names, non-null counts and dtypes.
billboard.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2837 entries, 0 to 2836
Data columns (total 20 columns):
acousticness        2837 non-null float64
artist              2837 non-null object
audio_analysis      2837 non-null object
danceability        2837 non-null float64
day                 2837 non-null float64
duration_ms         2837 non-null float64
energy              2837 non-null float64
instrumentalness    2837 non-null float64
key                 2837 non-null float64
liveness            2837 non-null float64
loudness            2837 non-null float64
mode                2837 non-null float64
month               2837 non-null float64
position            2837 non-null float64
speechiness         2837 non-null float64
tempo               2837 non-null float64
time_signature      2837 non-null float64
title               2837 non-null object
valence             2837 non-null float64
year                2837 non-null float64
dtypes: float64(17), object(3)
memory usage: 443.4+ KB


In [82]:
from ast import literal_eval
import scipy.stats as stats

def build_timbre_matrix(song_dict):
    """Gather the "timbre" entry of every item in song_dict["segments"].

    The entries are kept in segment order and wrapped in a NumPy array, so
    equal-length timbre vectors yield a 2-D array with one row per segment.
    """
    return np.array([seg["timbre"] for seg in song_dict["segments"]])

def build_beat_list(song_dict):
    """Gather the "duration" of every item in song_dict["beats"].

    The durations are kept in beat order and returned as a NumPy array.
    """
    return np.array([entry["duration"] for entry in song_dict["beats"]])

# Compute per-song summary statistics of the timbre vectors and beat durations.
#
# One plain-dict record is collected per song and the DataFrame is built once
# after the loop. Appending to a DataFrame inside the loop re-copies every
# previous row on each iteration (quadratic cost), and DataFrame.append no
# longer exists in pandas >= 2.0.
song_records = []
total_songs = len(billboard.index)
for index, row in billboard.iterrows():
    # 'audio_analysis' holds the text form of a dict; literal_eval turns it back
    # into Python objects. Despite its name, `segments` is that whole dict: the
    # two helpers index it with "segments" and "beats" respectively.
    segments = literal_eval(row['audio_analysis'])
    timbre_matrix = build_timbre_matrix(segments)
    beats = build_beat_list(segments)

    # Column-wise statistics (axis=0) over the timbre matrix. Each key is a
    # "%d" template that is filled with the 1-based column number further down.
    audio_analysis = {}
    audio_analysis["timbre_%d_mean"] = np.mean(timbre_matrix, axis=0)
    audio_analysis["timbre_%d_std"] = np.std(timbre_matrix, axis=0)
    audio_analysis["timbre_%d_var"] = np.var(timbre_matrix, axis=0)
    audio_analysis["timbre_%d_median"] = np.median(timbre_matrix, axis=0)
    audio_analysis["timbre_%d_max"] = np.max(timbre_matrix, axis=0)
    audio_analysis["timbre_%d_min"] = np.min(timbre_matrix, axis=0)
    audio_analysis["timbre_%d_range"] = np.ptp(timbre_matrix, axis=0)
    audio_analysis["timbre_%d_percentile"] = np.percentile(timbre_matrix, q=80, axis=0)
    audio_analysis["timbre_%d_skewness"] = stats.skew(timbre_matrix, axis=0)
    audio_analysis["timbre_%d_kurtosis"] = stats.kurtosis(timbre_matrix, axis=0)

    # Start from the original row, dropping the raw analysis text so the
    # output table only carries the derived features.
    song = row.to_dict()
    del song['audio_analysis']

    # Scalar statistics over the beat durations.
    song["beatdiff_mean"] = np.mean(beats)
    song["beatdiff_std"] = np.std(beats)
    song["beatdiff_var"] = np.var(beats)
    song["beatdiff_median"] = np.median(beats)
    song["beatdiff_max"] = np.max(beats)
    song["beatdiff_min"] = np.min(beats)
    song["beatdiff_range"] = np.ptp(beats)
    song["beatdiff_percentile"] = np.percentile(beats, q=80)
    song["beatdiff_skewness"] = stats.skew(beats)
    song["beatdiff_kurtosis"] = stats.kurtosis(beats)

    # Flatten every per-column statistic into its own scalar entry,
    # e.g. "timbre_%d_mean" -> "timbre_1_mean", "timbre_2_mean", ...
    for stat, per_column in audio_analysis.items():
        for i, value in enumerate(per_column):
            key = stat % (i + 1)
            song[key] = value

    song_records.append(song)
    # `index` is the 0-based row label, so the last line printed is total - 1.
    print('Computed: %s/%s' % (str(index).rjust(len(str(total_songs))), total_songs), end='\r')

df_with_stats = pd.DataFrame(song_records)
# The recorded info() output lists the columns alphabetically ("acousticness to
# year"), which is what the former row-by-row append produced; keep that order
# so the layout of the exported CSV does not change.
df_with_stats = df_with_stats[sorted(df_with_stats.columns)]



Computed: 2836/2837

In [83]:
# Write the enriched table to disk; index=False leaves the row index out of the file.
df_with_stats.to_csv('billboard_with_stats.csv', index=False)

In [85]:
# Sanity-check the enriched frame: row count, number of columns and dtypes.
df_with_stats.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2837 entries, 0 to 2836
Columns: 149 entries, acousticness to year
dtypes: float64(147), object(2)
memory usage: 3.2+ MB


In [3]:
import pandas as pd 
import numpy as np 
from ast import literal_eval
import scipy.stats as stats
# Load the Official Charts table from a CSV given by a relative path, so the
# file is expected in the notebook's working directory.
officialcharts = pd.read_csv('officialcharts_including_features_and_analysis.csv')

def build_timbre_matrix(song_dict):
    """Gather the "timbre" entry of every item in song_dict["segments"].

    The entries are kept in segment order and wrapped in a NumPy array, so
    equal-length timbre vectors yield a 2-D array with one row per segment.
    """
    return np.array([seg["timbre"] for seg in song_dict["segments"]])

def build_beat_list(song_dict):
    """Gather the "duration" of every item in song_dict["beats"].

    The durations are kept in beat order and returned as a NumPy array.
    """
    return np.array([entry["duration"] for entry in song_dict["beats"]])

# Compute per-song summary statistics of the timbre vectors and beat durations.
#
# One plain-dict record is collected per song and the DataFrame is built once
# after the loop. Appending to a DataFrame inside the loop re-copies every
# previous row on each iteration (quadratic cost), and DataFrame.append no
# longer exists in pandas >= 2.0.
song_records = []
total_songs = len(officialcharts.index)
for index, row in officialcharts.iterrows():
    # 'audio_analysis' holds the text form of a dict; literal_eval turns it back
    # into Python objects. Despite its name, `segments` is that whole dict: the
    # two helpers index it with "segments" and "beats" respectively.
    segments = literal_eval(row['audio_analysis'])
    timbre_matrix = build_timbre_matrix(segments)
    beats = build_beat_list(segments)

    # Column-wise statistics (axis=0) over the timbre matrix. Each key is a
    # "%d" template that is filled with the 1-based column number further down.
    audio_analysis = {}
    audio_analysis["timbre_%d_mean"] = np.mean(timbre_matrix, axis=0)
    audio_analysis["timbre_%d_std"] = np.std(timbre_matrix, axis=0)
    audio_analysis["timbre_%d_var"] = np.var(timbre_matrix, axis=0)
    audio_analysis["timbre_%d_median"] = np.median(timbre_matrix, axis=0)
    audio_analysis["timbre_%d_max"] = np.max(timbre_matrix, axis=0)
    audio_analysis["timbre_%d_min"] = np.min(timbre_matrix, axis=0)
    audio_analysis["timbre_%d_range"] = np.ptp(timbre_matrix, axis=0)
    audio_analysis["timbre_%d_percentile"] = np.percentile(timbre_matrix, q=80, axis=0)
    audio_analysis["timbre_%d_skewness"] = stats.skew(timbre_matrix, axis=0)
    audio_analysis["timbre_%d_kurtosis"] = stats.kurtosis(timbre_matrix, axis=0)

    # Start from the original row, dropping the raw analysis text so the
    # output table only carries the derived features.
    song = row.to_dict()
    del song['audio_analysis']

    # Scalar statistics over the beat durations.
    song["beatdiff_mean"] = np.mean(beats)
    song["beatdiff_std"] = np.std(beats)
    song["beatdiff_var"] = np.var(beats)
    song["beatdiff_median"] = np.median(beats)
    song["beatdiff_max"] = np.max(beats)
    song["beatdiff_min"] = np.min(beats)
    song["beatdiff_range"] = np.ptp(beats)
    song["beatdiff_percentile"] = np.percentile(beats, q=80)
    song["beatdiff_skewness"] = stats.skew(beats)
    song["beatdiff_kurtosis"] = stats.kurtosis(beats)

    # Flatten every per-column statistic into its own scalar entry,
    # e.g. "timbre_%d_mean" -> "timbre_1_mean", "timbre_2_mean", ...
    for stat, per_column in audio_analysis.items():
        for i, value in enumerate(per_column):
            key = stat % (i + 1)
            song[key] = value

    song_records.append(song)
    # `index` is the 0-based row label, so the last line printed is total - 1.
    print('Computed: %s/%s' % (str(index).rjust(len(str(total_songs))), total_songs), end='\r')

df_with_stats = pd.DataFrame(song_records)
# Sort the columns alphabetically. NOTE(review): this presumably matches the
# column order the former row-by-row append produced on the pandas version
# used for the recorded run; confirm if the CSV layout matters downstream.
df_with_stats = df_with_stats[sorted(df_with_stats.columns)]



Computed: 674/675

In [4]:
# Write the enriched table to disk; index=False leaves the row index out of the file.
df_with_stats.to_csv('officialcharts_with_stats.csv', index=False)