In [84]:
import logging
import sys
from copy import deepcopy
from functools import reduce

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
from tqdm import tqdm

sns.set_style('darkgrid')
# NOTE(review): machine-specific absolute path; the project import below only resolves on
# the original author's machine. Presumably the repository root could be referenced
# relative to this notebook instead -- TODO confirm the layout before changing it.
sys.path.append('/Users/alexander_wong/Documents/playlist-success/')

# Project-local helpers; this import relies on the sys.path entry added above.
from src.data_transformations import create_features, classify_success, add_suffixes

# Import from the public IPython.display module: importing `display` from
# IPython.core.display is deprecated.
from IPython.display import display, HTML
# Widen the notebook container so wide DataFrame outputs are not clipped.
display(HTML("<style>.container { width:100% !important; }</style>"))

# Merging playlist and track feature data
This notebook combines each song in all available playlists with the downloaded song features.

In [2]:
# The per-track Spotify features are stored as two parquet halves:
# read each half, then stack them into a single frame keyed by track_id.
feature_half_paths = (
    '../data/track_features/track_features_all_half_1.parquet',
    '../data/track_features/track_features_all_half_2.parquet',
)
features_part_1, features_part_2 = (pd.read_parquet(half_path) for half_path in feature_half_paths)
features_frame = pd.concat([features_part_1, features_part_2])

In [3]:
# Load the frame listing the track_ids contained in every playlist_id.
# Both names refer to the same DataFrame object.
all_tracks_frame = pd.read_parquet("../data/playlist_track_ids/playlist_track_ids_all.parquet")
playlist_frame = all_tracks_frame

In [23]:
# Show the loaded playlist/track-id frame.
playlist_frame

Unnamed: 0,track_id,popularity,user_id,playlist_id
0,5MiZ199HjYwrZfuNWVSxC9,30.0,6b7fbed9edd6418ddd3b555bba441536,6zpLN2wRu8z7fo4q6ouaAh
1,3mqGTIR5BxJ678KkHFT4tg,2.0,6b7fbed9edd6418ddd3b555bba441536,6zpLN2wRu8z7fo4q6ouaAh
2,2pOya9IyPgv2XLR7FfnySh,7.0,6b7fbed9edd6418ddd3b555bba441536,6zpLN2wRu8z7fo4q6ouaAh
3,6SbP5526KNn4z2Dex2uf2B,0.0,6b7fbed9edd6418ddd3b555bba441536,6zpLN2wRu8z7fo4q6ouaAh
4,6GjLKnNDVFsg6oLuplDwKF,0.0,6b7fbed9edd6418ddd3b555bba441536,6zpLN2wRu8z7fo4q6ouaAh
...,...,...,...,...
32,1qavn2qJRP3IoWnLkFzwI8,4.0,055d33003af0de47c79559c56ba6ee5b,6orq0HsuzPGA1yOCNvCLMw
0,6cLgBwRc1LfV0cSoboEfJD,0.0,4672952d42bdd93b9215ce9a40394ea6,6W45lqDBZ1TKma71Uu2F5x
1,4f0ldNhe5ZsIhzene1nup1,32.0,4672952d42bdd93b9215ce9a40394ea6,6W45lqDBZ1TKma71Uu2F5x
2,0DNk7lEqDGCN6nDD0H8emF,0.0,4672952d42bdd93b9215ce9a40394ea6,6W45lqDBZ1TKma71Uu2F5x


In [24]:
# Preview the first ten rows of the concatenated track-feature frame.
features_frame.head(10)

Unnamed: 0,track_id,acousticness,danceability,duration,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time,valence
0,5MiZ199HjYwrZfuNWVSxC9,0.488,0.812,191867.0,0.7,1e-06,7.0,0.363,-3.579,1.0,0.0497,105.023,4.0,0.925
1,3mqGTIR5BxJ678KkHFT4tg,0.345,0.468,167208.0,0.715,0.0,7.0,0.137,-5.671,1.0,0.151,162.317,4.0,0.803
2,2pOya9IyPgv2XLR7FfnySh,0.54,0.636,201853.0,0.783,0.0,11.0,0.301,-3.918,1.0,0.0461,159.097,3.0,0.892
3,6SbP5526KNn4z2Dex2uf2B,0.272,0.645,181360.0,0.465,0.0,11.0,0.166,-6.63,1.0,0.0396,184.906,3.0,0.952
4,6GjLKnNDVFsg6oLuplDwKF,0.821,0.562,175013.0,0.432,6e-06,10.0,0.0935,-4.441,1.0,0.0545,170.406,3.0,0.95
5,38TwIEwC3K5vgkPTrrUvzd,0.285,0.544,223920.0,0.491,3e-06,0.0,0.217,-4.288,1.0,0.0372,155.802,3.0,0.516
6,2Av3o4sAszUWUS6WAEsXij,0.288,0.857,132107.0,0.912,0.00154,0.0,0.553,-4.31,1.0,0.0632,112.379,3.0,0.985
7,2xjxGXr3eASzCvo9MUHk0w,0.702,0.646,204200.0,0.477,0.0,7.0,0.0885,-5.61,0.0,0.0404,153.91,3.0,0.941
8,386yPOHFbMSXiniJ7UOai4,0.73,0.707,148907.0,0.693,1.6e-05,5.0,0.0455,-5.438,1.0,0.045,153.038,3.0,0.988
9,58rgceeiYu1wFneDP2j5LJ,0.524,0.74,144147.0,0.849,0.0,3.0,0.0911,-3.171,1.0,0.113,165.91,3.0,0.964


In [7]:
# Collect the playlist rows whose track_id has no entry in the downloaded feature table.
track_has_features = playlist_frame['track_id'].isin(features_frame["track_id"])
missing_frame = playlist_frame.loc[~track_has_features]

In [19]:
%%time
combined_frame = pd.merge(playlist_frame, features_frame, how='left', on='track_id')
print("Merge Complete!")

Merge Complete!
CPU times: user 13.2 s, sys: 2.62 s, total: 15.8 s
Wall time: 16.6 s


In [20]:
%%time
combined_frame.to_parquet("../data/playlist_track_features.parquet")
print("Save Complete!")

Save Complete!
CPU times: user 13.7 s, sys: 6 s, total: 19.7 s
Wall time: 16.6 s


In [26]:
# Summarise column dtypes and memory usage of the merged frame.
combined_frame.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 16456661 entries, 0 to 16456660
Data columns (total 17 columns):
 #   Column            Dtype  
---  ------            -----  
 0   track_id          object 
 1   popularity        float64
 2   user_id           object 
 3   playlist_id       object 
 4   acousticness      float64
 5   danceability      float64
 6   duration          float64
 7   energy            float64
 8   instrumentalness  float64
 9   key               float64
 10  liveness          float64
 11  loudness          float64
 12  mode              float64
 13  speechiness       float64
 14  tempo             float64
 15  time              float64
 16  valence           float64
dtypes: float64(14), object(3)
memory usage: 2.2+ GB


In [27]:
# Descriptive statistics for the numeric columns of the merged frame.
combined_frame.describe()

Unnamed: 0,popularity,acousticness,danceability,duration,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time,valence
count,16456660.0,16318720.0,16318720.0,16318720.0,16318720.0,16318720.0,16318720.0,16318720.0,16318720.0,16318720.0,16318720.0,16318720.0,16318720.0,16318720.0
mean,23.71073,0.2674312,0.5744729,239814.4,0.6422404,0.1386322,5.240539,0.2016597,-8.060119,0.6554895,0.09400234,121.3427,3.911218,0.4916043
std,25.29908,0.3119884,0.1744586,104296.4,0.2371529,0.2912138,3.574927,0.1739368,4.6845,0.4752084,0.1177563,29.44623,0.4120838,0.2586609
min,0.0,0.0,0.0,1000.0,0.0,0.0,0.0,0.0,-60.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0146,0.463,193347.0,0.49,0.0,2.0,0.095,-9.709,0.0,0.0357,98.121,4.0,0.283
50%,16.0,0.116,0.587,227000.0,0.683,4.1e-05,5.0,0.129,-6.849,1.0,0.049,120.463,4.0,0.487
75%,45.0,0.463,0.701,268853.0,0.833,0.0289,8.0,0.26,-5.016,1.0,0.0943,139.949,4.0,0.699
max,100.0,0.996,0.999,6035552.0,1.0,1.0,11.0,1.0,4.923,1.0,0.971,249.987,5.0,1.0


In [28]:
# Show the merged playlist/track/feature frame.
combined_frame

Unnamed: 0,track_id,popularity,user_id,playlist_id,acousticness,danceability,duration,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time,valence
0,5MiZ199HjYwrZfuNWVSxC9,30.0,6b7fbed9edd6418ddd3b555bba441536,6zpLN2wRu8z7fo4q6ouaAh,0.4880,0.812,191867.0,0.700,0.000001,7.0,0.3630,-3.579,1.0,0.0497,105.023,4.0,0.925
1,3mqGTIR5BxJ678KkHFT4tg,2.0,6b7fbed9edd6418ddd3b555bba441536,6zpLN2wRu8z7fo4q6ouaAh,0.3450,0.468,167208.0,0.715,0.000000,7.0,0.1370,-5.671,1.0,0.1510,162.317,4.0,0.803
2,2pOya9IyPgv2XLR7FfnySh,7.0,6b7fbed9edd6418ddd3b555bba441536,6zpLN2wRu8z7fo4q6ouaAh,0.5400,0.636,201853.0,0.783,0.000000,11.0,0.3010,-3.918,1.0,0.0461,159.097,3.0,0.892
3,6SbP5526KNn4z2Dex2uf2B,0.0,6b7fbed9edd6418ddd3b555bba441536,6zpLN2wRu8z7fo4q6ouaAh,0.2720,0.645,181360.0,0.465,0.000000,11.0,0.1660,-6.630,1.0,0.0396,184.906,3.0,0.952
4,6GjLKnNDVFsg6oLuplDwKF,0.0,6b7fbed9edd6418ddd3b555bba441536,6zpLN2wRu8z7fo4q6ouaAh,0.8210,0.562,175013.0,0.432,0.000006,10.0,0.0935,-4.441,1.0,0.0545,170.406,3.0,0.950
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16456656,1qavn2qJRP3IoWnLkFzwI8,4.0,055d33003af0de47c79559c56ba6ee5b,6orq0HsuzPGA1yOCNvCLMw,0.9930,0.432,113035.0,0.069,0.828000,7.0,0.1400,-19.528,0.0,0.0539,123.090,4.0,0.392
16456657,6cLgBwRc1LfV0cSoboEfJD,0.0,4672952d42bdd93b9215ce9a40394ea6,6W45lqDBZ1TKma71Uu2F5x,0.0919,0.650,217278.0,0.762,0.001000,0.0,0.1070,-5.518,1.0,0.0401,93.032,4.0,0.452
16456658,4f0ldNhe5ZsIhzene1nup1,32.0,4672952d42bdd93b9215ce9a40394ea6,6W45lqDBZ1TKma71Uu2F5x,0.2130,0.436,221064.0,0.778,0.000050,10.0,0.1330,-5.139,0.0,0.1070,190.189,4.0,0.692
16456659,0DNk7lEqDGCN6nDD0H8emF,0.0,4672952d42bdd93b9215ce9a40394ea6,6W45lqDBZ1TKma71Uu2F5x,0.0561,0.623,220291.0,0.656,0.000000,0.0,0.0855,-5.608,0.0,0.0416,95.878,4.0,0.366


In [31]:
# Number of distinct playlist_id values present in the merged frame.
len(combined_frame["playlist_id"].unique())

68352

In [33]:
# Percentage shortfall of 68352 relative to 70399. 68352 is the unique playlist_id count
# shown by the previous cell; 70399 is presumably the total number of playlists that were
# expected -- TODO confirm where that figure comes from.
(70399 - 68352)/70399 * 100

2.9077117572692797

# Feature Engineering

In [2]:
# Reload the merged table from the parquet file written in the previous section.
combined_input_path = "../data/playlist_track_features.parquet"
combined_frame = pd.read_parquet(combined_input_path)

In [3]:
%%time
mean_frame = combined_frame.groupby("playlist_id").mean().reset_index()

CPU times: user 2.4 s, sys: 844 ms, total: 3.24 s
Wall time: 3.24 s


## Percentiles

In [10]:
%%time
perc_frames = {}
percentiles = [0.025] + [round(x ,2) for x in np.arange(0.05, 1, 0.05).tolist()] + [0.975]
for p in tqdm(percentiles):
    perc_frames[p] = combined_frame.groupby("playlist_id").quantile(p).reset_index()

100%|██████████| 21/21 [26:09<00:00, 74.75s/it]

CPU times: user 25min 30s, sys: 37.5 s, total: 26min 7s
Wall time: 26min 9s





In [168]:
# Re-key each quantile frame by a filename-safe label (e.g. 0.025 -> "2p5") and pass it
# through add_suffixes with the matching "_percentile_<label>" suffix.
perc_frames_renamed = {}
for quantile_level, quantile_frame in tqdm(perc_frames.items()):
    percent_text = str(round(quantile_level * 100, 1))
    quantile_key = percent_text.replace('.', 'p')
    perc_frames_renamed[quantile_key] = add_suffixes(quantile_frame, f'_percentile_{quantile_key}')

100%|██████████| 21/21 [00:00<00:00, 41.39it/s]


In [169]:
# Write one parquet file per quantile label.
for quantile_key, renamed_frame in tqdm(perc_frames_renamed.items()):
    output_path = f"../data/track_features_distribution_statistics/track_features_percentile_{quantile_key}.parquet"
    renamed_frame.to_parquet(output_path)

100%|██████████| 21/21 [00:02<00:00,  8.35it/s]


## Percentile Differences

In [228]:
# Interquartile range per playlist: 75th percentile minus 25th percentile, aligned on playlist_id.
upper_quartile = perc_frames[0.75].set_index("playlist_id")
lower_quartile = perc_frames[0.25].set_index("playlist_id")
iqr_frame = (upper_quartile - lower_quartile).reset_index()
iqr_frame_renamed = add_suffixes(iqr_frame, "_iqr")

In [230]:
# Wider spread per playlist: 95th percentile minus 5th percentile, aligned on playlist_id.
upper_tail = perc_frames[0.95].set_index("playlist_id")
lower_tail = perc_frames[0.05].set_index("playlist_id")
iqr_max_frame = (upper_tail - lower_tail).reset_index()
iqr_max_frame_renamed = add_suffixes(iqr_max_frame, "_iqr_max")

In [233]:
# Persist both percentile-difference frames.
iqr_outputs = [
    (iqr_frame_renamed, "../data/track_features_distribution_statistics/track_features_iqr.parquet"),
    (iqr_max_frame_renamed, "../data/track_features_distribution_statistics/track_features_iqr_max.parquet"),
]
for spread_frame, output_path in iqr_outputs:
    spread_frame.to_parquet(output_path)

## Other distribution statistics

### mean, var, std, std_error

In [131]:
# Per-playlist mean of each numeric feature. The string identifier columns are dropped
# first because pandas >= 2.0 raises TypeError rather than silently discarding them.
mean_frame = (
    combined_frame.drop(columns=["track_id", "user_id"])
    .groupby("playlist_id")
    .mean()
    .reset_index()
)

In [45]:
# Per-playlist variance of each numeric feature. The string identifier columns are dropped
# first because pandas >= 2.0 raises TypeError rather than silently discarding them.
var_frame = (
    combined_frame.drop(columns=["track_id", "user_id"])
    .groupby("playlist_id")
    .var()
    .reset_index()
)

In [46]:
# Per-playlist standard deviation of each numeric feature. The string identifier columns are
# dropped first because pandas >= 2.0 raises TypeError rather than silently discarding them.
std_frame = (
    combined_frame.drop(columns=["track_id", "user_id"])
    .groupby("playlist_id")
    .std()
    .reset_index()
)

In [47]:
# Per-playlist standard error of the mean of each numeric feature. The string identifier
# columns are dropped first because pandas >= 2.0 raises TypeError rather than silently
# discarding them.
std_error_frame = (
    combined_frame.drop(columns=["track_id", "user_id"])
    .groupby("playlist_id")
    .sem()
    .reset_index()
)

In [184]:
# Pass each statistic frame through add_suffixes with a suffix naming the statistic.
mean_frame_renamed, var_frame_renamed, std_frame_renamed, std_error_frame_renamed = (
    add_suffixes(stat_frame, stat_suffix)
    for stat_frame, stat_suffix in (
        (mean_frame, "_mean"),
        (var_frame, "_var"),
        (std_frame, "_std"),
        (std_error_frame, "_std_error"),
    )
)

In [185]:
# Persist the renamed mean / variance / standard-deviation / standard-error frames.
basic_stat_outputs = [
    (mean_frame_renamed, "../data/track_features_distribution_statistics/track_features_mean.parquet"),
    (var_frame_renamed, "../data/track_features_distribution_statistics/track_features_var.parquet"),
    (std_frame_renamed, "../data/track_features_distribution_statistics/track_features_std.parquet"),
    (std_error_frame_renamed, "../data/track_features_distribution_statistics/track_features_std_error.parquet"),
]
for stat_frame, output_path in basic_stat_outputs:
    stat_frame.to_parquet(output_path)

## skew, kurtosis

### Unbiased

In [48]:
skew_unbiased_frame = combined_frame.groupby("playlist_id").skew().reset_index()

In [49]:
kurt_unbiased_frame = combined_frame.groupby("playlist_id").apply(pd.DataFrame.kurt)

AttributeError: 'Series' object has no attribute 'columns'

In [149]:
skew_unbiased_frame_pivot = skew_unbiased_frame.reset_index().pivot(index='playlist_id', columns='level_1', values=0).reset_index().drop(columns=["track_id", "user_id"])
skew_unbiased_frame_renamed = add_suffixes(skew_unbiased_frame_pivot, "_skew_unbiased")

In [150]:
kurt_unbiased_frame_pivot = kurt_unbiased_frame.reset_index().pivot(index='playlist_id', columns='level_1', values=0).reset_index().drop(columns=["track_id", "user_id"])
kurt_unbiased_frame_renamed = add_suffixes(kurt_unbiased_frame_pivot, "_kurt_unbiased")

In [156]:
# Persist the unbiased skew / kurtosis frames.
# (The skew variable name was misspelled as `skew_ubbiased_frame_renamed`, a NameError.)
skew_unbiased_frame_renamed.to_parquet("../data/track_features_distribution_statistics/track_features_skew_unbiased.parquet")
kurt_unbiased_frame_renamed.to_parquet("../data/track_features_distribution_statistics/track_features_kurt_unbiased.parquet")

### Biased

In [91]:
# Apply scipy.stats.skew (default arguments) to each playlist's tracks after dropping the
# string identifier columns. Later cells explode the per-playlist result on column 0, so
# each entry is presumably an array with one value per feature column.
# NOTE(review): scipy's default nan_policy propagates NaN, so a playlist containing a track
# with missing features presumably yields NaN for the affected features -- confirm intended.
skew_biased_frame = combined_frame.drop(columns=["track_id","user_id"]).groupby("playlist_id").apply(stats.skew)

In [92]:
# Apply scipy.stats.kurtosis (default arguments) to each playlist's tracks after dropping the
# string identifier columns. Later cells explode the per-playlist result on column 0, so
# each entry is presumably an array with one value per feature column.
# NOTE(review): scipy's default nan_policy propagates NaN, so a playlist containing a track
# with missing features presumably yields NaN for the affected features -- confirm intended.
kurt_biased_frame = combined_frame.drop(columns=["track_id","user_id"]).groupby("playlist_id").apply(stats.kurtosis)

In [110]:
# Numeric feature names, in the column order of combined_frame once the identifier
# columns are removed.
feature_columns = ["popularity", "acousticness", "danceability", "duration", "energy", "instrumentalness", "key" ,
                   "liveness", "loudness", "mode", "speechiness", "tempo", "time", "valence"]

# Labels for the exploded biased statistics: the feature list repeated once per playlist row.
# The row count is taken directly from the frame instead of exploding it and dividing by a
# hard-coded 14. Assumes every playlist entry holds exactly len(feature_columns) values.
explode_feature_col = feature_columns * len(kurt_biased_frame)

In [159]:
def _biased_stat_to_wide(stat_by_playlist):
    """Explode the per-playlist values, label them by feature, and pivot to one column per feature."""
    exploded = stat_by_playlist.reset_index().explode(0)
    labelled = exploded.assign(features=explode_feature_col)
    wide = labelled.pivot(index='playlist_id', columns='features', values=0)
    return wide.reset_index()


skew_biased_frame_pivot = _biased_stat_to_wide(skew_biased_frame)
kurt_biased_frame_pivot = _biased_stat_to_wide(kurt_biased_frame)

In [161]:
# Pass the pivoted biased statistics through add_suffixes with their statistic suffix.
skew_biased_frame_renamed, kurt_biased_frame_renamed = (
    add_suffixes(pivoted_frame, stat_suffix)
    for pivoted_frame, stat_suffix in (
        (skew_biased_frame_pivot, "_skew_biased"),
        (kurt_biased_frame_pivot, "_kurt_biased"),
    )
)

In [162]:
# Persist the biased skew / kurtosis frames.
biased_outputs = [
    (skew_biased_frame_renamed, "../data/track_features_distribution_statistics/track_features_skew_biased.parquet"),
    (kurt_biased_frame_renamed, "../data/track_features_distribution_statistics/track_features_kurt_biased.parquet"),
]
for biased_frame, output_path in biased_outputs:
    biased_frame.to_parquet(output_path)

## Combining All Features

In [244]:
# Every per-playlist statistic frame to be merged into the final feature table:
# the percentile frames first, then the other distribution statistics.
# (`var_frame_renamed` was misspelled as `var_frame_ranamed`, a NameError.)
all_feature_frames = list(perc_frames_renamed.values()) + [
    mean_frame_renamed,
    var_frame_renamed,
    std_frame_renamed,
    std_error_frame_renamed,
    iqr_frame_renamed,
    iqr_max_frame_renamed,
    skew_unbiased_frame_renamed,
    kurt_unbiased_frame_renamed,
    kurt_biased_frame_renamed,
    skew_biased_frame_renamed
]

In [245]:
%%time
all_features = reduce(lambda left,right: pd.merge(left,right,on=['playlist_id'], how='outer'), all_feature_frames)

CPU times: user 2.53 s, sys: 1.55 s, total: 4.08 s
Wall time: 4.09 s


In [246]:
# Write the final frame holding every engineered playlist feature.
all_features_path = "../data/track_features_all.parquet"
all_features.to_parquet(all_features_path)

In [238]:
# analyze_cols = ["n_tracks", "n_artists", "n_albums", "tracks_per_album", "artists_per_album",'popularity', 'acousticness', 'danceability', 'duration', 'energy',
#        'instrumentalness', 'key', 'liveness', 'loudness', 'mode',
#        'speechiness', 'tempo', 'time', 'valence']

# for col in analyze_cols:
#     fig, ax = plt.subplots(figsize=(20, 5))
#     sns.boxplot(x="success_streaming_ratio_users", y=col, data=snapshot, showfliers=False)
#     plt.title(col, size=30)
#     plt.xticks(rotation=90)
#     plt.show()

In [239]:
# analyze_cols = ["n_tracks", "n_artists", "n_albums", "tracks_per_album", "artists_per_album",'popularity', 'acousticness', 'danceability', 'duration', 'energy',
#        'instrumentalness', 'key', 'liveness', 'loudness', 'mode',
#        'speechiness', 'tempo', 'time', 'valence']

# for col in analyze_cols:
#     fig, ax = plt.subplots(figsize=(20, 5))
#     sns.boxplot(x="genre_1", y=col, data=snapshot, order=snapshot["genre_1"].value_counts().index, hue="success_streaming_ratio_users", showfliers=False)
#     plt.title(col, size=30)
#     plt.xticks(rotation=90)
#     plt.show()

In [240]:
# Draw 100 samples from a normal distribution (mean 8, standard deviation 20).
# A fixed random_state makes the draw reproducible across kernel restarts.
rvs5 = stats.norm.rvs(loc=8, scale=20, size=100, random_state=0)

In [242]:
# Inspect the container type returned by the scipy sampler.
type(rvs5)

numpy.ndarray

In [247]:
# List every column of the final feature frame, one name per line.
feature_column_names = all_features.columns
for column_name in feature_column_names:
    print(column_name)

playlist_id
popularity_percentile_2p5
acousticness_percentile_2p5
danceability_percentile_2p5
duration_percentile_2p5
energy_percentile_2p5
instrumentalness_percentile_2p5
key_percentile_2p5
liveness_percentile_2p5
loudness_percentile_2p5
mode_percentile_2p5
speechiness_percentile_2p5
tempo_percentile_2p5
time_percentile_2p5
valence_percentile_2p5
popularity_percentile_5p0
acousticness_percentile_5p0
danceability_percentile_5p0
duration_percentile_5p0
energy_percentile_5p0
instrumentalness_percentile_5p0
key_percentile_5p0
liveness_percentile_5p0
loudness_percentile_5p0
mode_percentile_5p0
speechiness_percentile_5p0
tempo_percentile_5p0
time_percentile_5p0
valence_percentile_5p0
popularity_percentile_10p0
acousticness_percentile_10p0
danceability_percentile_10p0
duration_percentile_10p0
energy_percentile_10p0
instrumentalness_percentile_10p0
key_percentile_10p0
liveness_percentile_10p0
loudness_percentile_10p0
mode_percentile_10p0
speechiness_percentile_10p0
tempo_percentile_10p0
time_