In [1]:
import pickle

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set()

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegressionCV
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [2]:
# Load in the training set
training_set = pd.read_csv("data/mini_data/data/training_set/log_mini.csv")
training_set.hist_user_behavior_reason_start = training_set.hist_user_behavior_reason_start.astype('category')
training_set.hist_user_behavior_reason_end = training_set.hist_user_behavior_reason_end.astype('category')
training_set.context_type = training_set.context_type.astype('category')
training_set.date = training_set.date.apply(pd.to_datetime)

# Load in the track features
track_features = pd.read_pickle("data/pickles/track_features.pkl")

# Using skip_2 as the ground truth
training_set['skipped'] = (training_set.skip_2 | training_set.skip_1).astype('int32')
training_set = training_set.drop(columns=['skip_1','skip_2','skip_3','not_skipped'])

In [3]:
# Drop the acoustic vector information
track_data = track_features.drop(
    columns=[c for c in track_features.columns if 'acoustic_vector' in c]
)
track_data.head()

Unnamed: 0,track_id,duration,release_year,us_popularity_estimate,acousticness,beat_strength,bounciness,danceability,dyn_range_mean,energy,...,key,liveness,loudness,mechanism,mode,organism,speechiness,tempo,time_signature,valence
0,t_a540e552-16d4-42f8-a185-232bd650ea7d,109.706673,1950,99.975414,0.45804,0.519497,0.504949,0.399767,7.51188,0.817709,...,0,0.132124,-11.238,0.3861,1,0.541606,0.079985,166.287003,4,0.935512
1,t_67965da0-132b-4b1e-8a69-0ef99b32287c,187.693329,1950,99.96943,0.916272,0.419223,0.54553,0.491235,9.098376,0.154258,...,0,0.163281,-13.706,0.125,1,0.895874,0.083877,95.261002,3,0.359675
2,t_0614ecd3-a7d5-40a1-816e-156d5872a467,160.839996,1951,99.602549,0.812884,0.42589,0.50828,0.491625,8.36867,0.358813,...,0,0.090115,-10.522,0.200669,0,0.806136,0.038777,105.185997,4,0.726769
3,t_070a63a0-744a-434e-9913-a97b02926a29,175.399994,1951,99.665018,0.396854,0.400934,0.35999,0.552227,5.967346,0.514585,...,0,0.360924,-11.032,0.427152,1,0.492772,0.038337,119.441002,4,0.859075
4,t_d6990e17-9c31-4b01-8559-47d9ce476df1,369.600006,1951,99.991764,0.728831,0.371328,0.335115,0.483044,5.802681,0.721442,...,0,0.189162,-6.836,0.28125,1,0.723808,0.032043,95.261002,4,0.562343


In [4]:
track_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50704 entries, 0 to 50703
Data columns (total 22 columns):
track_id                  50704 non-null object
duration                  50704 non-null float64
release_year              50704 non-null int64
us_popularity_estimate    50704 non-null float64
acousticness              50704 non-null float64
beat_strength             50704 non-null float64
bounciness                50704 non-null float64
danceability              50704 non-null float64
dyn_range_mean            50704 non-null float64
energy                    50704 non-null float64
flatness                  50704 non-null float64
instrumentalness          50704 non-null float64
key                       50704 non-null int64
liveness                  50704 non-null float64
loudness                  50704 non-null float64
mechanism                 50704 non-null float64
mode                      50704 non-null int32
organism                  50704 non-null float64
speechiness     

In [5]:
# Convert the columns to all be consistantly numeric
track_data.iloc[:,1:] = track_data.iloc[:,1:].astype('float32')
track_data.head()

Unnamed: 0,track_id,duration,release_year,us_popularity_estimate,acousticness,beat_strength,bounciness,danceability,dyn_range_mean,energy,...,key,liveness,loudness,mechanism,mode,organism,speechiness,tempo,time_signature,valence
0,t_a540e552-16d4-42f8-a185-232bd650ea7d,109.706673,1950.0,99.975418,0.45804,0.519497,0.504949,0.399767,7.51188,0.817709,...,0.0,0.132124,-11.238,0.3861,1.0,0.541606,0.079985,166.287003,4.0,0.935512
1,t_67965da0-132b-4b1e-8a69-0ef99b32287c,187.693329,1950.0,99.969429,0.916272,0.419223,0.54553,0.491235,9.098376,0.154258,...,0.0,0.163281,-13.706,0.125,1.0,0.895874,0.083877,95.261002,3.0,0.359675
2,t_0614ecd3-a7d5-40a1-816e-156d5872a467,160.839996,1951.0,99.602547,0.812884,0.42589,0.50828,0.491625,8.36867,0.358813,...,0.0,0.090115,-10.522,0.200669,0.0,0.806136,0.038777,105.185997,4.0,0.726769
3,t_070a63a0-744a-434e-9913-a97b02926a29,175.399994,1951.0,99.665016,0.396854,0.400934,0.35999,0.552227,5.967346,0.514585,...,0.0,0.360924,-11.032,0.427152,1.0,0.492772,0.038337,119.441002,4.0,0.859075
4,t_d6990e17-9c31-4b01-8559-47d9ce476df1,369.600006,1951.0,99.99176,0.728831,0.371328,0.335115,0.483044,5.802681,0.721442,...,0.0,0.189162,-6.836,0.28125,1.0,0.723808,0.032043,95.261002,4.0,0.562343


In [6]:
# Join the two together
session_data = (
    training_set[['session_id','session_position','track_id_clean', 'skipped']]
    .merge(
        track_data,
        how='left',
        left_on=['track_id_clean'],
        right_on=['track_id']
    ).drop(columns='track_id_clean')
)
session_data.head()

Unnamed: 0,session_id,session_position,skipped,track_id,duration,release_year,us_popularity_estimate,acousticness,beat_strength,bounciness,...,key,liveness,loudness,mechanism,mode,organism,speechiness,tempo,time_signature,valence
0,0_00006f66-33e5-4de7-a324-2d18e439fc1e,1,0,t_0479f24c-27d2-46d6-a00c-7ec928f2b539,180.066666,2018.0,99.968132,0.015848,0.438551,0.473455,...,1.0,0.678553,-6.577,0.546784,1.0,0.320668,0.069717,134.024994,4.0,0.152255
1,0_00006f66-33e5-4de7-a324-2d18e439fc1e,2,0,t_9099cd7b-c238-47b7-9381-f23f2c1d1043,236.796371,2018.0,99.896729,0.061811,0.654804,0.735661,...,7.0,0.104322,-5.319,0.824766,0.0,0.131391,0.061158,130.037994,4.0,0.337152
2,0_00006f66-33e5-4de7-a324-2d18e439fc1e,3,0,t_fc5df5ba-5396-49a7-8b29-35d0d28249e0,231.266663,2018.0,99.999977,0.354116,0.532155,0.540411,...,10.0,0.135776,-5.843,0.774327,1.0,0.296923,0.045354,145.028,4.0,0.373862
3,0_00006f66-33e5-4de7-a324-2d18e439fc1e,4,0,t_23cff8d6-d874-4b20-83dc-94e450e8aa20,169.826675,2018.0,99.995041,0.769225,0.641756,0.729224,...,1.0,0.103722,-7.756,0.630996,1.0,0.603271,0.229936,111.982002,4.0,0.64942
4,0_00006f66-33e5-4de7-a324-2d18e439fc1e,5,0,t_64f3743c-f624-46bb-a579-0f3f9a07a123,210.545258,2018.0,99.998497,0.006602,0.732428,0.794881,...,8.0,0.120842,-4.919,0.759465,1.0,0.170148,0.24098,147.031006,4.0,0.652921


In [12]:
# Create two dataframes to hold attribute averages for session skips and plays
skip_data = session_data.copy()
play_data = session_data.copy()
for c in session_data.columns:
    skip_data[c] *= session_data.skipped
    play_data[c] *= (session_data.skipped != 1).astype('int32')
skip_data.head(10)

Unnamed: 0,session_id,session_position,skipped,track_id,duration,release_year,us_popularity_estimate,acousticness,beat_strength,bounciness,...,key,liveness,loudness,mechanism,mode,organism,speechiness,tempo,time_signature,valence
0,,0,0,,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,-0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,,0,0,,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,-0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,,0,0,,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,-0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,,0,0,,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,-0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,,0,0,,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,-0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,,0,0,,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,-0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0_00006f66-33e5-4de7-a324-2d18e439fc1e,7,1,t_e23c19f5-4c32-4557-aa44-81372c2e3705,186.453323,2018.0,99.861565,0.449629,0.487708,0.608612,...,6.0,0.111306,-7.808,0.157576,1.0,0.67522,0.409848,104.466003,4.0,0.10942
7,0_00006f66-33e5-4de7-a324-2d18e439fc1e,8,1,t_0be6eced-f56f-48bd-8086-f2e0b760fdee,214.047348,2018.0,97.24382,0.017628,0.649367,0.68261,...,1.0,0.114787,-6.96,0.354167,1.0,0.456843,0.103687,83.036003,4.0,0.389913
8,0_00006f66-33e5-4de7-a324-2d18e439fc1e,9,1,t_f3ecbd3b-9e8e-4557-b8e0-39cfcd7e65dd,218.260818,2018.0,99.998451,0.030127,0.818798,0.879947,...,5.0,0.128244,-6.245,0.857855,0.0,0.102744,0.049853,130.016998,4.0,0.338321
9,0_00006f66-33e5-4de7-a324-2d18e439fc1e,10,1,t_2af4dfa0-7df3-4b7e-b7ab-353ba48237f9,60.0,2018.0,99.957436,0.332467,0.7515,0.843354,...,4.0,0.119036,-8.597,0.622222,0.0,0.355844,0.154609,100.237,4.0,0.257672


In [14]:
# Create cumsum columns by session
skip_data_cumsum = skip_data.drop(columns=['track_id','skipped','session_position']).groupby(['session_id']).cumsum()
play_data_cumsum = play_data.drop(columns=['track_id','skipped','session_position']).groupby(['session_id']).cumsum()
play_data_cumsum.head(30)

Unnamed: 0,duration,release_year,us_popularity_estimate,acousticness,beat_strength,bounciness,danceability,dyn_range_mean,energy,flatness,...,key,liveness,loudness,mechanism,mode,organism,speechiness,tempo,time_signature,valence
0,180.066666,2018.0,99.968132,0.015848,0.438551,0.473455,0.653119,7.660024,0.553465,1.035007,...,1.0,0.678553,-6.577,0.546784,1.0,0.320668,0.069717,134.024994,4.0,0.152255
1,416.863037,4036.0,199.864868,0.077659,1.093355,1.209117,1.530513,18.968775,1.280293,2.060642,...,8.0,0.782875,-11.896,1.37155,1.0,0.45206,0.130874,264.062988,8.0,0.489407
2,648.1297,6054.0,299.864838,0.431775,1.62551,1.749528,2.210232,27.034576,1.843302,3.090108,...,18.0,0.918651,-17.739,2.145877,2.0,0.748983,0.176229,409.091003,12.0,0.863269
3,817.95636,8072.0,399.859863,1.201001,2.267266,2.478752,3.075113,38.322163,2.372786,4.083627,...,19.0,1.022372,-25.494999,2.776873,3.0,1.352254,0.406164,521.072998,16.0,1.512689
4,1028.501587,10090.0,499.858368,1.207603,2.999694,3.273632,3.932891,50.50375,3.022842,5.084198,...,27.0,1.143214,-30.414,3.536339,4.0,1.522401,0.647144,668.104004,20.0,2.16561
5,1245.191162,12108.0,599.855957,1.30594,3.548325,3.931955,4.529371,60.920017,3.579971,6.111518,...,28.0,1.199706,-37.936001,3.921365,4.0,1.962777,0.78073,801.351013,25.0,2.826691
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
# For each cumsum:
# - Subtract the current value to make it a sum for all past session tracks
# - Divide by the session position to make it an average
for c in skip_data_cumsum.columns:
    # Average the columns
    skip_data_cumsum[c] = (skip_data_cumsum[c] - (skip_data[c])) / (session_data.session_position - 1)
    play_data_cumsum[c] = (play_data_cumsum[c] - (play_data[c])) / (session_data.session_position - 1)
    # Clear the first rows
    skip_data_cumsum[c][session_data.session_position == 1] = np.NaN #-999
    play_data_cumsum[c][session_data.session_position == 1] = np.NaN #-999
play_data_cumsum.head(30)

Unnamed: 0,duration,release_year,us_popularity_estimate,acousticness,beat_strength,bounciness,danceability,dyn_range_mean,energy,flatness,...,key,liveness,loudness,mechanism,mode,organism,speechiness,tempo,time_signature,valence
0,,,,,,,,,,,...,,,,,,,,,,
1,180.066666,2018.0,99.96814,0.015848,0.438551,0.473456,0.653119,7.660025,0.553465,1.035007,...,1.0,0.678553,-6.577,0.546784,1.0,0.320668,0.069717,134.024994,4.0,0.152255
2,208.431519,2018.0,99.932434,0.03883,0.546678,0.604558,0.765256,9.484387,0.640146,1.030321,...,4.0,0.391437,-5.948,0.685775,0.5,0.22603,0.065437,132.031494,4.0,0.244703
3,216.043228,2018.0,99.954933,0.143925,0.541837,0.583176,0.736744,9.011525,0.614434,1.030036,...,6.0,0.306217,-5.913,0.715292,0.666667,0.249661,0.058743,136.363663,4.0,0.287756
4,204.489075,2018.0,99.964966,0.30025,0.566817,0.619688,0.768778,9.580541,0.593196,1.020907,...,4.75,0.255593,-6.37375,0.694218,0.75,0.338063,0.101541,130.26825,4.0,0.378172
5,205.700348,2018.0,99.971664,0.241521,0.599939,0.654726,0.786578,10.100749,0.604568,1.01684,...,5.4,0.228643,-6.0828,0.707268,0.8,0.30448,0.129429,133.620804,4.0,0.433122
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
# Prepend the columns as "skip_"
skip_data_cumsum.columns = [f"skip_{c}" for c in skip_data_cumsum.columns]
play_data_cumsum.columns = [f"play_{c}" for c in play_data_cumsum.columns]
play_data_cumsum.head()

Unnamed: 0,play_duration,play_release_year,play_us_popularity_estimate,play_acousticness,play_beat_strength,play_bounciness,play_danceability,play_dyn_range_mean,play_energy,play_flatness,...,play_key,play_liveness,play_loudness,play_mechanism,play_mode,play_organism,play_speechiness,play_tempo,play_time_signature,play_valence
0,,,,,,,,,,,...,,,,,,,,,,
1,180.066666,2018.0,99.96814,0.015848,0.438551,0.473456,0.653119,7.660025,0.553465,1.035007,...,1.0,0.678553,-6.577,0.546784,1.0,0.320668,0.069717,134.024994,4.0,0.152255
2,208.431519,2018.0,99.932434,0.03883,0.546678,0.604558,0.765256,9.484387,0.640146,1.030321,...,4.0,0.391437,-5.948,0.685775,0.5,0.22603,0.065437,132.031494,4.0,0.244703
3,216.043228,2018.0,99.954933,0.143925,0.541837,0.583176,0.736744,9.011525,0.614434,1.030036,...,6.0,0.306217,-5.913,0.715292,0.666667,0.249661,0.058743,136.363663,4.0,0.287756
4,204.489075,2018.0,99.964966,0.30025,0.566817,0.619688,0.768778,9.580541,0.593196,1.020907,...,4.75,0.255593,-6.37375,0.694218,0.75,0.338063,0.101541,130.26825,4.0,0.378172


In [17]:
# Join them together
full_session_data = pd.concat([
    session_data,
    skip_data_cumsum,
    play_data_cumsum
],axis=1)
full_session_data.head(20)

Unnamed: 0,session_id,session_position,skipped,track_id,duration,release_year,us_popularity_estimate,acousticness,beat_strength,bounciness,...,play_key,play_liveness,play_loudness,play_mechanism,play_mode,play_organism,play_speechiness,play_tempo,play_time_signature,play_valence
0,0_00006f66-33e5-4de7-a324-2d18e439fc1e,1,0,t_0479f24c-27d2-46d6-a00c-7ec928f2b539,180.066666,2018.0,99.968132,0.015848,0.438551,0.473455,...,,,,,,,,,,
1,0_00006f66-33e5-4de7-a324-2d18e439fc1e,2,0,t_9099cd7b-c238-47b7-9381-f23f2c1d1043,236.796371,2018.0,99.896729,0.061811,0.654804,0.735661,...,1.0,0.678553,-6.577,0.546784,1.0,0.320668,0.069717,134.024994,4.0,0.152255
2,0_00006f66-33e5-4de7-a324-2d18e439fc1e,3,0,t_fc5df5ba-5396-49a7-8b29-35d0d28249e0,231.266663,2018.0,99.999977,0.354116,0.532155,0.540411,...,4.0,0.391437,-5.948,0.685775,0.5,0.22603,0.065437,132.031494,4.0,0.244703
3,0_00006f66-33e5-4de7-a324-2d18e439fc1e,4,0,t_23cff8d6-d874-4b20-83dc-94e450e8aa20,169.826675,2018.0,99.995041,0.769225,0.641756,0.729224,...,6.0,0.306217,-5.913,0.715292,0.666667,0.249661,0.058743,136.363663,4.0,0.287756
4,0_00006f66-33e5-4de7-a324-2d18e439fc1e,5,0,t_64f3743c-f624-46bb-a579-0f3f9a07a123,210.545258,2018.0,99.998497,0.006602,0.732428,0.794881,...,4.75,0.255593,-6.37375,0.694218,0.75,0.338063,0.101541,130.26825,4.0,0.378172
5,0_00006f66-33e5-4de7-a324-2d18e439fc1e,6,0,t_c815228b-3212-4f9e-9d4f-9cb19b248184,216.689484,2018.0,99.997604,0.098337,0.548632,0.658323,...,5.4,0.228643,-6.0828,0.707268,0.8,0.30448,0.129429,133.620804,4.0,0.433122
6,0_00006f66-33e5-4de7-a324-2d18e439fc1e,7,1,t_e23c19f5-4c32-4557-aa44-81372c2e3705,186.453323,2018.0,99.861565,0.449629,0.487708,0.608612,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0_00006f66-33e5-4de7-a324-2d18e439fc1e,8,1,t_0be6eced-f56f-48bd-8086-f2e0b760fdee,214.047348,2018.0,97.24382,0.017628,0.649367,0.68261,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0_00006f66-33e5-4de7-a324-2d18e439fc1e,9,1,t_f3ecbd3b-9e8e-4557-b8e0-39cfcd7e65dd,218.260818,2018.0,99.998451,0.030127,0.818798,0.879947,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0_00006f66-33e5-4de7-a324-2d18e439fc1e,10,1,t_2af4dfa0-7df3-4b7e-b7ab-353ba48237f9,60.0,2018.0,99.957436,0.332467,0.7515,0.843354,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [19]:
# Examine the columns
full_session_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 167880 entries, 0 to 167879
Data columns (total 67 columns):
session_id                     167880 non-null object
session_position               167880 non-null int64
skipped                        167880 non-null int32
track_id                       167880 non-null object
duration                       167880 non-null float32
release_year                   167880 non-null float32
us_popularity_estimate         167880 non-null float32
acousticness                   167880 non-null float32
beat_strength                  167880 non-null float32
bounciness                     167880 non-null float32
danceability                   167880 non-null float32
dyn_range_mean                 167880 non-null float32
energy                         167880 non-null float32
flatness                       167880 non-null float32
instrumentalness               167880 non-null float32
key                            167880 non-null float32
liveness       

In [18]:
# Split the data:
# df -> X, y -> Train, Test, Val
X = full_session_data.drop(
    columns=["skipped","session_id","track_id"]
).fillna(-9999)
y = full_session_data.skipped
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=2
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=2
)
X.head(20)

Unnamed: 0,session_position,duration,release_year,us_popularity_estimate,acousticness,beat_strength,bounciness,danceability,dyn_range_mean,energy,...,play_key,play_liveness,play_loudness,play_mechanism,play_mode,play_organism,play_speechiness,play_tempo,play_time_signature,play_valence
0,1,180.066666,2018.0,99.968132,0.015848,0.438551,0.473455,0.653119,7.660024,0.553465,...,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0
1,2,236.796371,2018.0,99.896729,0.061811,0.654804,0.735661,0.877393,11.30875,0.726828,...,1.0,0.678553,-6.577,0.546784,1.0,0.320668,0.069717,134.024994,4.0,0.152255
2,3,231.266663,2018.0,99.999977,0.354116,0.532155,0.540411,0.679719,8.065802,0.563009,...,4.0,0.391437,-5.948,0.685775,0.5,0.22603,0.065437,132.031494,4.0,0.244703
3,4,169.826675,2018.0,99.995041,0.769225,0.641756,0.729224,0.864881,11.287586,0.529484,...,6.0,0.306217,-5.913,0.715292,0.666667,0.249661,0.058743,136.363663,4.0,0.287756
4,5,210.545258,2018.0,99.998497,0.006602,0.732428,0.794881,0.857778,12.181586,0.650057,...,4.75,0.255593,-6.37375,0.694218,0.75,0.338063,0.101541,130.26825,4.0,0.378172
5,6,216.689484,2018.0,99.997604,0.098337,0.548632,0.658323,0.59648,10.416269,0.557129,...,5.4,0.228643,-6.0828,0.707268,0.8,0.30448,0.129429,133.620804,4.0,0.433122
6,7,186.453323,2018.0,99.861565,0.449629,0.487708,0.608612,0.55752,9.834414,0.46935,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,8,214.047348,2018.0,97.24382,0.017628,0.649367,0.68261,0.489405,9.900208,0.715155,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,9,218.260818,2018.0,99.998451,0.030127,0.818798,0.879947,0.964383,15.037571,0.55528,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,10,60.0,2018.0,99.957436,0.332467,0.7515,0.843354,0.914082,14.457762,0.239318,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
# Train a Logistic Regression model and score
log = LogisticRegressionCV(
    cv=3
).fit(
    X_train,
    y_train
)

print("Log Train score: %s" % log.score(X_train,y_train))
print("Log Val score:   %s" % log.score(X_val,y_val))
print("Log Test score:  %s" % log.score(X_test,y_test))



Log Train score: 0.9559021993056783
Log Val score:   0.9532779866721268
Log Test score:  0.9540147724565166


In [21]:
for pair in sorted(zip(X.columns,log.coef_[0]),key=lambda p: np.abs(p[1]),reverse=True):
    print("%30s | % .8f" % pair)

             play_release_year | -0.04514727
             skip_release_year |  0.04261874
              session_position | -0.03766055
                dyn_range_mean | -0.01122904
                      loudness | -0.00937482
                 play_duration | -0.00477183
                 skip_duration |  0.00465026
        us_popularity_estimate | -0.00320744
                    play_tempo | -0.00268582
                    skip_tempo |  0.00265281
   skip_us_popularity_estimate |  0.00218954
   play_us_popularity_estimate | -0.00215887
                           key |  0.00209249
                     mechanism | -0.00121136
                      organism |  0.00115356
                 beat_strength | -0.00104794
                    bounciness | -0.00092000
                  danceability | -0.00086975
                  acousticness |  0.00076713
                time_signature | -0.00076004
                         tempo | -0.00064268
              instrumentalness |  0.00059930
          

In [22]:
# Train a Random Forest model and score
rfc = RandomForestClassifier(
    n_estimators=100
).fit(
    X_train,
    y_train
)

print("RFC Train score: %s" % rfc.score(X_train,y_train))
print("RFC Val score:   %s" % rfc.score(X_val,y_val))
print("RFC Test score:  %s" % rfc.score(X_test,y_test))

RFC Train score: 0.9902366836369052
RFC Val score:   0.9509698075276423
RFC Test score:  0.9517810340719561


In [23]:
for pair in sorted(zip(X.columns,rfc.feature_importances_),key=lambda p:p[1],reverse=True):
    print("%30s | % .8f" % pair)

                 play_loudness |  0.08482849
               skip_bounciness |  0.06518750
              skip_speechiness |  0.05749660
                 skip_organism |  0.04973311
           skip_dyn_range_mean |  0.04868662
                   skip_energy |  0.04846352
             skip_release_year |  0.04833270
                skip_mechanism |  0.04465242
             skip_acousticness |  0.04356959
           skip_time_signature |  0.03450351
             skip_danceability |  0.03416862
                 skip_flatness |  0.03231534
                  skip_valence |  0.02488079
                    skip_tempo |  0.02443999
           play_dyn_range_mean |  0.02111339
                 play_flatness |  0.02109008
            skip_beat_strength |  0.01796669
            play_beat_strength |  0.01758818
   skip_us_popularity_estimate |  0.01633566
                 skip_duration |  0.01632145
                play_mechanism |  0.01585438
                    play_tempo |  0.01497735
          

In [24]:
# Train a XGBoost model and score
xgb = XGBClassifier(
    n_estimators=100000
).fit(
    X_train,
    y_train,
    eval_set=[(X_train, y_train),(X_val, y_val)],
    early_stopping_rounds=10,
    verbose=True
)

print("XGB Train score: %s" % xgb.score(X_train,y_train))
print("XGB Val score:   %s" % xgb.score(X_val,y_val))
print("XGB Test score:  %s" % xgb.score(X_test,y_test))

[0]	validation_0-error:0.043623	validation_1-error:0.046499
Multiple eval metrics have been passed: 'validation_1-error' will be used for early stopping.

Will train until validation_1-error hasn't improved in 10 rounds.
[1]	validation_0-error:0.043623	validation_1-error:0.046499
[2]	validation_0-error:0.043977	validation_1-error:0.046722
[3]	validation_0-error:0.043977	validation_1-error:0.046722
[4]	validation_0-error:0.043977	validation_1-error:0.046722
[5]	validation_0-error:0.043977	validation_1-error:0.046722
[6]	validation_0-error:0.043912	validation_1-error:0.046722
[7]	validation_0-error:0.043977	validation_1-error:0.046722
[8]	validation_0-error:0.043912	validation_1-error:0.046722
[9]	validation_0-error:0.043912	validation_1-error:0.046722
[10]	validation_0-error:0.043484	validation_1-error:0.046461
[11]	validation_0-error:0.043781	validation_1-error:0.046796
[12]	validation_0-error:0.043912	validation_1-error:0.046722
[13]	validation_0-error:0.043484	validation_1-error:0.04

In [25]:
for pair in sorted(zip(X.columns,xgb.feature_importances_),key=lambda p:p[1],reverse=True):
    print("%30s | % .8f" % pair)

                 play_loudness |  0.85143971
                 skip_duration |  0.07764345
                 play_duration |  0.04706667
              session_position |  0.02238893
        us_popularity_estimate |  0.00072774
                      loudness |  0.00027970
                        energy |  0.00024767
                     mechanism |  0.00020612
                      duration |  0.00000000
                  release_year |  0.00000000
                  acousticness |  0.00000000
                 beat_strength |  0.00000000
                    bounciness |  0.00000000
                  danceability |  0.00000000
                dyn_range_mean |  0.00000000
                      flatness |  0.00000000
              instrumentalness |  0.00000000
                           key |  0.00000000
                      liveness |  0.00000000
                          mode |  0.00000000
                      organism |  0.00000000
                   speechiness |  0.00000000
          

In [26]:
# Train a LightGBM model and score
lgbm = LGBMClassifier(
    n_estimators=100000
).fit(
    X_train,
    y_train,
    eval_set=[(X_train, y_train),(X_val, y_val)],
    early_stopping_rounds=10,
    verbose=True
)

print()
print("LGBM Train score: %s" % lgbm.score(X_train,y_train))
print("LGBM Val score:   %s" % lgbm.score(X_val,y_val))
print("LGBM Test score:  %s" % lgbm.score(X_test,y_test))

[1]	training's binary_logloss: 0.607724	valid_1's binary_logloss: 0.608031
Training until validation scores don't improve for 10 rounds
[2]	training's binary_logloss: 0.538263	valid_1's binary_logloss: 0.5388
[3]	training's binary_logloss: 0.480348	valid_1's binary_logloss: 0.481164
[4]	training's binary_logloss: 0.431401	valid_1's binary_logloss: 0.432448
[5]	training's binary_logloss: 0.389599	valid_1's binary_logloss: 0.390879
[6]	training's binary_logloss: 0.353579	valid_1's binary_logloss: 0.355069
[7]	training's binary_logloss: 0.322372	valid_1's binary_logloss: 0.324071
[8]	training's binary_logloss: 0.295127	valid_1's binary_logloss: 0.297062
[9]	training's binary_logloss: 0.27126	valid_1's binary_logloss: 0.273416
[10]	training's binary_logloss: 0.25026	valid_1's binary_logloss: 0.252621
[11]	training's binary_logloss: 0.231676	valid_1's binary_logloss: 0.234276
[12]	training's binary_logloss: 0.21523	valid_1's binary_logloss: 0.218002
[13]	training's binary_logloss: 0.200637	

In [27]:
for pair in sorted(zip(X.columns,lgbm.feature_importances_),key=lambda p:p[1],reverse=True):
    print("%30s | % 15.8f" % pair)

        us_popularity_estimate |    202.00000000
              instrumentalness |    180.00000000
                      duration |    159.00000000
                   speechiness |    158.00000000
                      loudness |    156.00000000
                      liveness |    148.00000000
                         tempo |    147.00000000
                       valence |    146.00000000
                        energy |    145.00000000
              session_position |    133.00000000
                      flatness |    123.00000000
                  acousticness |    122.00000000
                     mechanism |    117.00000000
                  danceability |    112.00000000
                 skip_duration |     97.00000000
                 beat_strength |     93.00000000
                      organism |     93.00000000
                dyn_range_mean |     87.00000000
                  release_year |     79.00000000
                 play_duration |     76.00000000
                    

***

That model's performance is _suspiciously_ good. Can we look at the cases it can't predict...

In [28]:
from sklearn.metrics import confusion_matrix

In [29]:
print("Logistic Regression Confusion Matrix")
print(confusion_matrix(
    y_test,
    log.predict(X_test)
))

Logistic Regression Confusion Matrix
[[15438   762]
 [  782 16594]]


In [30]:
print("Random Forest Confusion Matrix")
print(confusion_matrix(
    y_test,
    rfc.predict(X_test)
))

Random Forest Confusion Matrix
[[15462   738]
 [  881 16495]]


In [31]:
print("XGBoost Confusion Matrix")
print(confusion_matrix(
    y_test,
    xgb.predict(X_test)
))

XGBoost Confusion Matrix
[[15471   729]
 [  823 16553]]


In [32]:
print("LightGBM Confusion Matrix")
print(confusion_matrix(
    y_test,
    lgbm.predict(X_test)
))

LightGBM Confusion Matrix
[[15454   746]
 [  829 16547]]
