# Wilson's Morning Wake Up Playlist Generator, Feature Engineering

This notebook is broken down into the following tasks:

* Clean and pre-process the data.
* Standardization and normalization of numerical variables
* Define features for harmonic sequencing
* Create train/test `.csv` files that hold the relevant features and class labels for train/test data points

In the _next_ notebook, Train_Deploy , we will use the features and `.csv` files created in _this_ notebook to train an estimator in a SageMaker notebook instance.

In [1]:
# Import librariess
import pandas as pd
import numpy as np
import os
import pickle

This Wilson Morning Wake Up dataset is summarized in a `.csv` file named `wmw_tracks.csv`, which we can read in using `pandas`. This was created in the Explore notebook.

In [2]:
# Read file
csv_file = 'data/wmw_tracks.csv'
wmw_df = pd.read_csv(csv_file)
wmw_df.head()

Unnamed: 0,volume,position,track_name,artist_name,danceability,energy,key,loudness,mode,speechiness,...,liveness,valence,tempo,type,id,uri,track_href,analysis_url,duration_ms,time_signature
0,38,1,Finding It There,Goldmund,0.187,0.00257,1,-37.134,1,0.0427,...,0.0915,0.0374,123.707,audio_features,6CnPCuUcM3A5PMP4gUy0vw,spotify:track:6CnPCuUcM3A5PMP4gUy0vw,https://api.spotify.com/v1/tracks/6CnPCuUcM3A5...,https://api.spotify.com/v1/audio-analysis/6CnP...,220120,5
1,38,2,Light Forms,Rohne,0.671,0.545,10,-12.848,0,0.0393,...,0.118,0.284,133.036,audio_features,6MkUPsz5hYeneo0a9H0VT8,spotify:track:6MkUPsz5hYeneo0a9H0VT8,https://api.spotify.com/v1/tracks/6MkUPsz5hYen...,https://api.spotify.com/v1/audio-analysis/6MkU...,265870,4
2,38,3,C-Side,Khruangbin,0.688,0.779,11,-10.129,0,0.0579,...,0.349,0.938,94.073,audio_features,6GvAM8oyVApQHGMgpBt8yl,spotify:track:6GvAM8oyVApQHGMgpBt8yl,https://api.spotify.com/v1/tracks/6GvAM8oyVApQ...,https://api.spotify.com/v1/audio-analysis/6GvA...,283407,4
3,38,4,Didn't I (Dave Allison Rework),Darondo,0.539,0.705,0,-6.729,1,0.0527,...,0.133,0.685,186.033,audio_features,1owjOeZt1BdYWW6T8fIAEe,spotify:track:1owjOeZt1BdYWW6T8fIAEe,https://api.spotify.com/v1/tracks/1owjOeZt1BdY...,https://api.spotify.com/v1/audio-analysis/1owj...,328000,4
4,38,5,Woman Of The Ghetto - Akshin Alizadeh Remix,Marlena Shaw,0.707,0.573,7,-8.403,0,0.0276,...,0.0858,0.189,100.006,audio_features,2h8cQH7zhUWrynZi2MKhhC,spotify:track:2h8cQH7zhUWrynZi2MKhhC,https://api.spotify.com/v1/tracks/2h8cQH7zhUWr...,https://api.spotify.com/v1/audio-analysis/2h8c...,302467,4


In [3]:
# Show duplicated songs - woops!
wmw_df[wmw_df.duplicated(['track_name', 'artist_name'], keep=False)].sort_values('track_name')

Unnamed: 0,volume,position,track_name,artist_name,danceability,energy,key,loudness,mode,speechiness,...,liveness,valence,tempo,type,id,uri,track_href,analysis_url,duration_ms,time_signature
525,3,15,4AM,Vessels,0.482,0.794,0,-10.800,0,0.0442,...,0.1070,0.0382,150.036,audio_features,4MCL9tV2X5OiwT1oIl9pqK,spotify:track:4MCL9tV2X5OiwT1oIl9pqK,https://api.spotify.com/v1/tracks/4MCL9tV2X5Oi...,https://api.spotify.com/v1/audio-analysis/4MCL...,395229,4
451,8,15,4AM,Vessels,0.482,0.794,0,-10.800,0,0.0442,...,0.1070,0.0382,150.036,audio_features,77HWONgBl3FZbXarvXfRjc,spotify:track:77HWONgBl3FZbXarvXfRjc,https://api.spotify.com/v1/tracks/77HWONgBl3FZ...,https://api.spotify.com/v1/audio-analysis/77HW...,395229,4
192,26,10,Always Like This - Andhim Remix,HVOB,0.793,0.480,1,-12.183,0,0.0829,...,0.4690,0.2480,122.030,audio_features,7qfLlPC8tlohz2QYwGqkJW,spotify:track:7qfLlPC8tlohz2QYwGqkJW,https://api.spotify.com/v1/tracks/7qfLlPC8tloh...,https://api.spotify.com/v1/audio-analysis/7qfL...,359016,4
207,25,10,Always Like This - Andhim Remix,HVOB,0.793,0.480,1,-12.183,0,0.0829,...,0.4690,0.2480,122.030,audio_features,7qfLlPC8tlohz2QYwGqkJW,spotify:track:7qfLlPC8tlohz2QYwGqkJW,https://api.spotify.com/v1/tracks/7qfLlPC8tloh...,https://api.spotify.com/v1/audio-analysis/7qfL...,359016,4
184,26,2,Anchor (feat. Ishmael),Laurence Guy,0.706,0.640,11,-12.578,1,0.0416,...,0.0915,0.5750,121.024,audio_features,2FVgIrE55fd4ktm86pfhFx,spotify:track:2FVgIrE55fd4ktm86pfhFx,https://api.spotify.com/v1/tracks/2FVgIrE55fd4...,https://api.spotify.com/v1/audio-analysis/2FVg...,287666,4
199,25,2,Anchor (feat. Ishmael),Laurence Guy,0.706,0.640,11,-12.578,1,0.0416,...,0.0915,0.5750,121.024,audio_features,2FVgIrE55fd4ktm86pfhFx,spotify:track:2FVgIrE55fd4ktm86pfhFx,https://api.spotify.com/v1/tracks/2FVgIrE55fd4...,https://api.spotify.com/v1/audio-analysis/2FVg...,287666,4
127,30,6,Anchor (feat. Ishmael),Laurence Guy,0.706,0.640,11,-12.578,1,0.0416,...,0.0915,0.5750,121.024,audio_features,2FVgIrE55fd4ktm86pfhFx,spotify:track:2FVgIrE55fd4ktm86pfhFx,https://api.spotify.com/v1/tracks/2FVgIrE55fd4...,https://api.spotify.com/v1/audio-analysis/2FVg...,287666,4
86,33,9,Apples And Pears,Canyons,0.750,0.936,10,-4.986,0,0.0437,...,0.1180,0.7740,119.511,audio_features,6jbgySyREi0vZIldoca1n5,spotify:track:6jbgySyREi0vZIldoca1n5,https://api.spotify.com/v1/tracks/6jbgySyREi0v...,https://api.spotify.com/v1/audio-analysis/6jbg...,295347,4
474,6,8,Apples And Pears,Canyons,0.750,0.936,10,-4.986,0,0.0437,...,0.1180,0.7740,119.511,audio_features,6jbgySyREi0vZIldoca1n5,spotify:track:6jbgySyREi0vZIldoca1n5,https://api.spotify.com/v1/tracks/6jbgySyREi0v...,https://api.spotify.com/v1/audio-analysis/6jbg...,295347,4
277,19,5,Baile,Wayward,0.668,0.949,9,-7.919,0,0.0868,...,0.1640,0.3690,117.002,audio_features,69PDpbemLTsnwlRFTmKFW1,spotify:track:69PDpbemLTsnwlRFTmKFW1,https://api.spotify.com/v1/tracks/69PDpbemLTsn...,https://api.spotify.com/v1/audio-analysis/69PD...,340297,4


In [4]:
wmw_df.columns

Index(['volume', 'position', 'track_name', 'artist_name', 'danceability',
       'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness',
       'instrumentalness', 'liveness', 'valence', 'tempo', 'type', 'id', 'uri',
       'track_href', 'analysis_url', 'duration_ms', 'time_signature'],
      dtype='object')

In [5]:
# Drop duplicates
print("Initial shape:", wmw_df.shape[0], 'tracks')
wmw_df.drop_duplicates(subset=['track_name', 'artist_name'], inplace=True)
print("New shape:", wmw_df.shape[0], 'tracks')

Initial shape: 554 tracks
New shape: 512 tracks


In [6]:
# Exclude unnecessary columns
exclude_cols = ['track_name', 'artist_name', 'duration_ms', 'track_href', 'uri', 'time_signature', 'id', 'type', 'analysis_url']

wmw_df.drop(columns=exclude_cols, inplace=True)

In [7]:
# Feature list
feature_list =  ['danceability','energy', 'loudness', 'speechiness', 'acousticness',
                 'instrumentalness', 'liveness', 'valence','mode','key','tempo']

In [8]:
from sklearn.preprocessing import StandardScaler

# Standardize features
standard_scaler = StandardScaler()

standard_scaler.fit(wmw_df[feature_list])

standard_features = standard_scaler.transform(wmw_df[feature_list])

processed_df = pd.DataFrame(standard_features, index=wmw_df.index, columns=wmw_df.columns[2:])

wmw_df = pd.concat([wmw_df[['volume', 'position']],processed_df[feature_list]], axis=1)

wmw_df.head()

  return self.partial_fit(X, y)


Unnamed: 0,volume,position,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,mode,key,tempo
0,38,1,-2.390995,-2.635095,-0.277322,0.929695,-0.489837,-1.156919,1.08569,-1.204549,2.096185,-5.370442,0.233803
1,38,2,0.291017,-0.13109,-0.352964,0.645904,-0.283988,-0.125409,-0.921073,1.432297,0.685856,-0.375624,0.682817
2,38,3,0.38522,0.949117,0.060842,-1.461032,1.510395,2.610226,-0.921073,1.72528,-0.770383,0.183583,-1.19251
3,38,4,-0.440441,0.607513,-0.054846,-1.4123,-0.167469,1.551945,1.08569,-1.497532,-0.861664,0.882849,3.233615
4,38,5,0.490506,-0.001835,-0.613261,-0.965114,-0.534114,-0.522788,-0.921073,0.553349,-0.811493,0.538564,-0.906949


In [11]:
#TODO- Save StandarScaler for later us
from pickle import dump

# save the scaler
dump(standard_scaler, open('standard_features.pkl', 'wb'))

In [10]:
def make_csv(x, filename, data_dir):
    '''Merges features and labels and converts them into one csv file with labels in the first column.
       :param x: Data features
       :param y: Data labels
       :param file_name: Name of csv file, ex. 'train.csv'
       :param data_dir: The directory where files will be saved
       '''
    # make data dir, if it does not exist
    if not os.path.exists(data_dir):
        os.makedirs(data_dir)
    
    data = pd.DataFrame(x).dropna()
    data.to_csv(os.path.join(data_dir, filename), index=False)
    
    # nothing is returned, but a print statement indicates that the function has run
    print('Path created: '+str(data_dir)+'/'+str(filename))

In [13]:
# Can change directory, if you want
data_dir = 'data'

make_csv(wmw_df, filename='train.csv', data_dir=data_dir)

Path created: data/train.csv


In [14]:
# Build sequences and targets
def create_playlist_sequences(input_data):
    input_playlists = []
    
    for i in input_data['volume'].unique():
        temp_vol = input_data[input_data['volume'] == i]
        playlist_X = temp_vol.iloc[:, 2:].values
        labels_y = temp_vol.iloc[:, 2:-3].values
        input_playlists.append((playlist_X, labels_y))
        
    return input_playlists

In [15]:
# Gather sequences and targets
processed_data = create_playlist_sequences(wmw_df)

# Take a look at first input sequence and target
processed_data[0][0][0]

array([-2.39099487, -2.63509459, -0.27732204,  0.92969533, -0.48983686,
       -1.15691947,  1.08569029, -1.20454903,  2.09618458, -5.37044178,
        0.23380331])

In [16]:
from unittest.mock import MagicMock, patch

def _print_success_message():
    print('Tests Passed!')

def test_playlist_sequences(input_playlists):
    
    track_features = [-2.39099487, -2.63509459, -0.27732204,  0.92969533, -0.48983686,-1.15691947,  1.08569029, -1.20454903,  2.09618458, -5.37044178, 0.23380331]
    
    track_features_len = 11
    target_features_len = 8
    
    # check shape and equality of first track
    assert len(input_playlists[0][0][0]) == len(track_features), \
        'Number of features in input_playlist features does not match expected number of ' + str(len(track_features))    
    
    # check shape of input and output arrays
    assert input_playlists[0][0].shape[1]==track_features_len, \
        'input_features should have as many columns as selected features, got: {}'.format(train_x.shape[1])
    assert input_playlists[0][1].shape[1]==target_features_len, \
        'target_features should have as many columns as selected features, got: {}'.format(train_x.shape[1])
    
    #TODO: Add more tests
    
    _print_success_message()

In [17]:
# Test processed sequences
test_playlist_sequences(processed_data)

Tests Passed!
