# Wilson's Morning Wake Up Playlist Generator, Feature Engineering

This notebook is broken down into the following tasks:

* Clean and pre-process the data.
* Standardize and normalize numerical variables.
* Define features for harmonic sequencing
* Create train/test `.csv` files that hold the relevant features and class labels for train/test data points

In the _next_ notebook, `Train_Deploy`, we will use the features and `.csv` files created in _this_ notebook to train an estimator in a SageMaker notebook instance.

In [62]:
# Import libraries
import pandas as pd
import numpy as np
import os

The Wilson's Morning Wake Up dataset is summarized in a `.csv` file named `wmw_tracks.csv`, which was created in the Explore notebook. We can read it in using `pandas`.

In [33]:
# Load the track data that the Explore notebook wrote to disk
csv_file = 'data/wmw_tracks.csv'
wmw_df = pd.read_csv(filepath_or_buffer=csv_file)

# Preview the first five rows
wmw_df.head(n=5)

Unnamed: 0,volume,position,track_name,artist_name,danceability,energy,key,loudness,mode,speechiness,...,liveness,valence,tempo,type,id,uri,track_href,analysis_url,duration_ms,time_signature
0,38,1,Finding It There,Goldmund,0.187,0.00257,1,-37.134,1,0.0427,...,0.0915,0.0374,123.707,audio_features,6CnPCuUcM3A5PMP4gUy0vw,spotify:track:6CnPCuUcM3A5PMP4gUy0vw,https://api.spotify.com/v1/tracks/6CnPCuUcM3A5...,https://api.spotify.com/v1/audio-analysis/6CnP...,220120,5
1,38,2,Light Forms,Rohne,0.671,0.545,10,-12.848,0,0.0393,...,0.118,0.284,133.036,audio_features,6MkUPsz5hYeneo0a9H0VT8,spotify:track:6MkUPsz5hYeneo0a9H0VT8,https://api.spotify.com/v1/tracks/6MkUPsz5hYen...,https://api.spotify.com/v1/audio-analysis/6MkU...,265870,4
2,38,3,C-Side,Khruangbin,0.688,0.779,11,-10.129,0,0.0579,...,0.349,0.938,94.073,audio_features,6GvAM8oyVApQHGMgpBt8yl,spotify:track:6GvAM8oyVApQHGMgpBt8yl,https://api.spotify.com/v1/tracks/6GvAM8oyVApQ...,https://api.spotify.com/v1/audio-analysis/6GvA...,283407,4
3,38,4,Didn't I (Dave Allison Rework),Darondo,0.539,0.705,0,-6.729,1,0.0527,...,0.133,0.685,186.033,audio_features,1owjOeZt1BdYWW6T8fIAEe,spotify:track:1owjOeZt1BdYWW6T8fIAEe,https://api.spotify.com/v1/tracks/1owjOeZt1BdY...,https://api.spotify.com/v1/audio-analysis/1owj...,328000,4
4,38,5,Woman Of The Ghetto - Akshin Alizadeh Remix,Marlena Shaw,0.707,0.573,7,-8.403,0,0.0276,...,0.0858,0.189,100.006,audio_features,2h8cQH7zhUWrynZi2MKhhC,spotify:track:2h8cQH7zhUWrynZi2MKhhC,https://api.spotify.com/v1/tracks/2h8cQH7zhUWr...,https://api.spotify.com/v1/audio-analysis/2h8c...,302467,4


In [34]:
# Show duplicated songs: every row whose (track_name, artist_name) pair occurs
# more than once. keep=False flags all occurrences, not just the repeats, so the
# copies can be compared side by side once sorted by track name.
(
    wmw_df
    .loc[wmw_df.duplicated(subset=['track_name', 'artist_name'], keep=False)]
    .sort_values(by='track_name')
)

Unnamed: 0,volume,position,track_name,artist_name,danceability,energy,key,loudness,mode,speechiness,...,liveness,valence,tempo,type,id,uri,track_href,analysis_url,duration_ms,time_signature
525,3,15,4AM,Vessels,0.482,0.794,0,-10.800,0,0.0442,...,0.1070,0.0382,150.036,audio_features,4MCL9tV2X5OiwT1oIl9pqK,spotify:track:4MCL9tV2X5OiwT1oIl9pqK,https://api.spotify.com/v1/tracks/4MCL9tV2X5Oi...,https://api.spotify.com/v1/audio-analysis/4MCL...,395229,4
451,8,15,4AM,Vessels,0.482,0.794,0,-10.800,0,0.0442,...,0.1070,0.0382,150.036,audio_features,77HWONgBl3FZbXarvXfRjc,spotify:track:77HWONgBl3FZbXarvXfRjc,https://api.spotify.com/v1/tracks/77HWONgBl3FZ...,https://api.spotify.com/v1/audio-analysis/77HW...,395229,4
192,26,10,Always Like This - Andhim Remix,HVOB,0.793,0.480,1,-12.183,0,0.0829,...,0.4690,0.2480,122.030,audio_features,7qfLlPC8tlohz2QYwGqkJW,spotify:track:7qfLlPC8tlohz2QYwGqkJW,https://api.spotify.com/v1/tracks/7qfLlPC8tloh...,https://api.spotify.com/v1/audio-analysis/7qfL...,359016,4
207,25,10,Always Like This - Andhim Remix,HVOB,0.793,0.480,1,-12.183,0,0.0829,...,0.4690,0.2480,122.030,audio_features,7qfLlPC8tlohz2QYwGqkJW,spotify:track:7qfLlPC8tlohz2QYwGqkJW,https://api.spotify.com/v1/tracks/7qfLlPC8tloh...,https://api.spotify.com/v1/audio-analysis/7qfL...,359016,4
184,26,2,Anchor (feat. Ishmael),Laurence Guy,0.706,0.640,11,-12.578,1,0.0416,...,0.0915,0.5750,121.024,audio_features,2FVgIrE55fd4ktm86pfhFx,spotify:track:2FVgIrE55fd4ktm86pfhFx,https://api.spotify.com/v1/tracks/2FVgIrE55fd4...,https://api.spotify.com/v1/audio-analysis/2FVg...,287666,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
65,34,6,U,DJ Seinfeld,0.721,0.838,8,-7.579,1,0.0428,...,0.0445,0.4530,125.999,audio_features,2tYctbOGPK2OZ2g2wKipgj,spotify:track:2tYctbOGPK2OZ2g2wKipgj,https://api.spotify.com/v1/tracks/2tYctbOGPK2O...,https://api.spotify.com/v1/audio-analysis/2tYc...,368000,4
339,15,8,Underworld,Edu Imbernon,0.878,0.482,8,-11.166,1,0.0483,...,0.0701,0.2970,123.023,audio_features,2gGeFW6SBAG5K5SNIQ4LW4,spotify:track:2gGeFW6SBAG5K5SNIQ4LW4,https://api.spotify.com/v1/tracks/2gGeFW6SBAG5...,https://api.spotify.com/v1/audio-analysis/2gGe...,425463,4
490,5,11,Underworld,Edu Imbernon,0.862,0.542,8,-10.615,1,0.0449,...,0.1070,0.3230,122.991,audio_features,2T0QyKXxEGsEGkY31deDam,spotify:track:2T0QyKXxEGsEGkY31deDam,https://api.spotify.com/v1/tracks/2T0QyKXxEGsE...,https://api.spotify.com/v1/audio-analysis/2T0Q...,320000,4
183,26,1,Vates,Otto Totland,0.360,0.231,4,-26.701,0,0.0336,...,0.0878,0.2820,104.094,audio_features,5lfBUkuT4mprXGQML18tyq,spotify:track:5lfBUkuT4mprXGQML18tyq,https://api.spotify.com/v1/tracks/5lfBUkuT4mpr...,https://api.spotify.com/v1/audio-analysis/5lfB...,256725,4


In [35]:
# Drop duplicates: keep only the first row seen for each (track_name, artist_name)
# pair. The frame is modified in place, so later cells see the de-duplicated data.
print("Initial shape:", len(wmw_df.index), 'tracks')
wmw_df.drop_duplicates(
    subset=['track_name', 'artist_name'],
    keep='first',
    inplace=True,
)
print("New shape:", len(wmw_df.index), 'tracks')

Initial shape: 554 tracks
New shape: 512 tracks


In [52]:
# Feature list: the audio-feature columns carried forward for modelling
feature_list = [
    'danceability', 'energy', 'loudness', 'speechiness', 'acousticness',
    'instrumentalness', 'liveness', 'valence', 'mode', 'key', 'tempo',
]

# Keep only those columns, in the order listed above
processed_df = wmw_df[feature_list]

processed_df.head()

Unnamed: 0,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,mode,key,tempo
0,0.187,0.00257,-37.134,0.0427,0.994,0.952,0.0915,0.0374,1,1,123.707
1,0.671,0.545,-12.848,0.0393,0.524,0.853,0.118,0.284,0,10,133.036
2,0.688,0.779,-10.129,0.0579,0.0387,0.118,0.349,0.938,0,11,94.073
3,0.539,0.705,-6.729,0.0527,0.00828,0.135,0.133,0.685,1,0,186.033
4,0.707,0.573,-8.403,0.0276,0.025,0.291,0.0858,0.189,0,7,100.006


In [53]:
from sklearn.preprocessing import StandardScaler

# Standardize features
# NOTE(review): the scaler is fit on every row here, before the train/test split
# performed in a later cell, so test-row statistics influence the scaling of the
# training rows - confirm this is acceptable or fit on the training split only.
scaler = StandardScaler()
standard_features = scaler.fit_transform(processed_df)

# fit_transform returns a bare array; restore the original row index and column names
processed_df = pd.DataFrame(
    standard_features,
    index=processed_df.index,
    columns=processed_df.columns,
)

processed_df.head()

Unnamed: 0,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,mode,key,tempo
0,-2.390995,-2.635095,-5.370442,-0.277322,2.096185,0.929695,-0.489837,-1.156919,1.08569,-1.204549,0.233803
1,0.291017,-0.13109,-0.375624,-0.352964,0.685856,0.645904,-0.283988,-0.125409,-0.921073,1.432297,0.682817
2,0.38522,0.949117,0.183583,0.060842,-0.770383,-1.461032,1.510395,2.610226,-0.921073,1.72528,-1.19251
3,-0.440441,0.607513,0.882849,-0.054846,-0.861664,-1.4123,-0.167469,1.551945,1.08569,-1.497532,3.233615
4,0.490506,-0.001835,0.538564,-0.613261,-0.811493,-0.965114,-0.534114,-0.522788,-0.921073,0.553349,-0.906949


In [61]:
# Sanity-check the standardized features: every column's mean should be ~0.
# describe() reports the sample standard deviation (ddof=1) whereas StandardScaler
# scales by the population standard deviation (ddof=0), so with 512 rows the std
# shows up as sqrt(512/511) ~= 1.000978 instead of exactly 1.
processed_df.describe()

Unnamed: 0,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,mode,key,tempo
count,512.0,512.0,512.0,512.0,512.0,512.0,512.0,512.0,512.0,512.0,512.0
mean,2.389582e-16,4.5970170000000004e-17,5.833008e-17,8.364620000000001e-17,9.280771e-17,-1.027824e-16,1.075529e-16,7.741204000000001e-17,5.160802000000001e-17,-1.778092e-17,-5.732177e-16
std,1.000978,1.000978,1.000978,1.000978,1.000978,1.000978,1.000978,1.000978,1.000978,1.000978,1.000978
min,-3.070364,-2.635095,-5.491785,-0.6399586,-0.8864776,-1.799288,-0.9955266,-1.192893,-0.921073,-1.497532,-3.116691
25%,-0.4071927,-0.6134908,-0.3255958,-0.4291622,-0.8346732,-0.8884327,-0.5088682,-0.8835651,-0.921073,-0.9115661,-0.3627442
50%,0.1912728,0.1320371,0.2218371,-0.277322,-0.4934187,0.5455737,-0.3538988,-0.2027936,-0.921073,-0.03261724,0.05612778
75%,0.702462,0.7183036,0.6001607,0.007447745,0.6768539,0.7670169,-0.01987943,0.6829414,1.08569,0.8463316,0.2482667
max,1.82597,1.803127,1.634972,9.852028,2.099185,1.015693,5.91479,2.714799,1.08569,1.72528,3.712663


In [54]:
# Setup input and target
# Inputs: every standardized feature column.
X = processed_df

# Targets: selected by name instead of by position, so the choice no longer
# depends on the column order of feature_list (these are its last three entries).
# NOTE(review): the target columns are also present in X - confirm this is intended.
target_columns = ['mode', 'key', 'tempo']
y = processed_df.loc[:, target_columns]

In [56]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state makes the
# shuffle, and therefore the split, repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [58]:
def make_csv(x, y, filename, data_dir):
    '''Merges features and labels and converts them into one csv file with labels in the first column(s).

       Rows of x and y are paired by position (row i of x with row i of y), so the
       inputs may be DataFrames, Series or plain arrays regardless of their index labels.
       Rows that contain a missing value after merging are left out of the file.
       The file is written without a header row and without an index column.

       :param x: Data features
       :param y: Data labels, in the same row order as x
       :param filename: Name of csv file, ex. 'train.csv'
       :param data_dir: The directory where files will be saved
       '''
    # make data dir, if it does not exist (exist_ok avoids a check-then-create race)
    os.makedirs(data_dir, exist_ok=True)

    # pd.concat(axis=1) aligns on index labels. Without resetting the index, an
    # array-like y (labels 0..n-1) next to a shuffled-index x would be misaligned,
    # producing NaN-filled rows that dropna() below would silently discard.
    labels = pd.DataFrame(y).reset_index(drop=True)
    features = pd.DataFrame(x).reset_index(drop=True)

    # labels first, then features
    data = pd.concat([labels, features], axis=1)
    data = data.dropna()
    data.to_csv(os.path.join(data_dir, filename), header=False, index=False)

    # nothing is returned, but a print statement indicates that the function has run
    print('Path created: '+str(data_dir)+'/'+str(filename))

In [60]:
# can change directory, if you want
data_dir = 'data'

# Write the training split first, then the test split, each to its own csv file
for split_x, split_y, split_filename in (
    (X_train, y_train, 'train.csv'),
    (X_test, y_test, 'test.csv'),
):
    make_csv(split_x, split_y, filename=split_filename, data_dir=data_dir)

Path created: data/train.csv
Path created: data/test.csv
