In [1]:
import pickle

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set()

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegressionCV
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [2]:
# Load in the training set
training_set = pd.read_csv("data/mini_data/data/training_set/log_mini.csv")
training_set.hist_user_behavior_reason_start = training_set.hist_user_behavior_reason_start.astype('category')
training_set.hist_user_behavior_reason_end = training_set.hist_user_behavior_reason_end.astype('category')
training_set.context_type = training_set.context_type.astype('category')
training_set.date = training_set.date.apply(pd.to_datetime)

# Load in the track features
track_features = pd.read_pickle("data/pickles/track_features.pkl")

# Using skip_2 as the ground truth
training_set['skipped'] = (training_set.skip_2 | training_set.skip_1).astype('int32')
training_set = training_set.drop(columns=['skip_1','skip_2','skip_3','not_skipped'])

In [3]:
# Drop the acoustic vector information
track_data = track_features.drop(
    columns=[c for c in track_features.columns if 'acoustic_vector' in c]
)
track_data.head()

Unnamed: 0,track_id,duration,release_year,us_popularity_estimate,acousticness,beat_strength,bounciness,danceability,dyn_range_mean,energy,...,key,liveness,loudness,mechanism,mode,organism,speechiness,tempo,time_signature,valence
0,t_a540e552-16d4-42f8-a185-232bd650ea7d,109.706673,1950,99.975414,0.45804,0.519497,0.504949,0.399767,7.51188,0.817709,...,0,0.132124,-11.238,0.3861,1,0.541606,0.079985,166.287003,4,0.935512
1,t_67965da0-132b-4b1e-8a69-0ef99b32287c,187.693329,1950,99.96943,0.916272,0.419223,0.54553,0.491235,9.098376,0.154258,...,0,0.163281,-13.706,0.125,1,0.895874,0.083877,95.261002,3,0.359675
2,t_0614ecd3-a7d5-40a1-816e-156d5872a467,160.839996,1951,99.602549,0.812884,0.42589,0.50828,0.491625,8.36867,0.358813,...,0,0.090115,-10.522,0.200669,0,0.806136,0.038777,105.185997,4,0.726769
3,t_070a63a0-744a-434e-9913-a97b02926a29,175.399994,1951,99.665018,0.396854,0.400934,0.35999,0.552227,5.967346,0.514585,...,0,0.360924,-11.032,0.427152,1,0.492772,0.038337,119.441002,4,0.859075
4,t_d6990e17-9c31-4b01-8559-47d9ce476df1,369.600006,1951,99.991764,0.728831,0.371328,0.335115,0.483044,5.802681,0.721442,...,0,0.189162,-6.836,0.28125,1,0.723808,0.032043,95.261002,4,0.562343


In [4]:
track_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50704 entries, 0 to 50703
Data columns (total 22 columns):
track_id                  50704 non-null object
duration                  50704 non-null float64
release_year              50704 non-null int64
us_popularity_estimate    50704 non-null float64
acousticness              50704 non-null float64
beat_strength             50704 non-null float64
bounciness                50704 non-null float64
danceability              50704 non-null float64
dyn_range_mean            50704 non-null float64
energy                    50704 non-null float64
flatness                  50704 non-null float64
instrumentalness          50704 non-null float64
key                       50704 non-null int64
liveness                  50704 non-null float64
loudness                  50704 non-null float64
mechanism                 50704 non-null float64
mode                      50704 non-null int32
organism                  50704 non-null float64
speechiness     

In [5]:
# Convert the columns to all be consistantly numeric
track_data.iloc[:,1:] = track_data.iloc[:,1:].astype('float32')
track_data.head()

Unnamed: 0,track_id,duration,release_year,us_popularity_estimate,acousticness,beat_strength,bounciness,danceability,dyn_range_mean,energy,...,key,liveness,loudness,mechanism,mode,organism,speechiness,tempo,time_signature,valence
0,t_a540e552-16d4-42f8-a185-232bd650ea7d,109.706673,1950.0,99.975418,0.45804,0.519497,0.504949,0.399767,7.51188,0.817709,...,0.0,0.132124,-11.238,0.3861,1.0,0.541606,0.079985,166.287003,4.0,0.935512
1,t_67965da0-132b-4b1e-8a69-0ef99b32287c,187.693329,1950.0,99.969429,0.916272,0.419223,0.54553,0.491235,9.098376,0.154258,...,0.0,0.163281,-13.706,0.125,1.0,0.895874,0.083877,95.261002,3.0,0.359675
2,t_0614ecd3-a7d5-40a1-816e-156d5872a467,160.839996,1951.0,99.602547,0.812884,0.42589,0.50828,0.491625,8.36867,0.358813,...,0.0,0.090115,-10.522,0.200669,0.0,0.806136,0.038777,105.185997,4.0,0.726769
3,t_070a63a0-744a-434e-9913-a97b02926a29,175.399994,1951.0,99.665016,0.396854,0.400934,0.35999,0.552227,5.967346,0.514585,...,0.0,0.360924,-11.032,0.427152,1.0,0.492772,0.038337,119.441002,4.0,0.859075
4,t_d6990e17-9c31-4b01-8559-47d9ce476df1,369.600006,1951.0,99.99176,0.728831,0.371328,0.335115,0.483044,5.802681,0.721442,...,0.0,0.189162,-6.836,0.28125,1.0,0.723808,0.032043,95.261002,4.0,0.562343


In [6]:
# Join the two together
session_data = (
    training_set[['session_id','session_position','track_id_clean', 'skipped']]
    .merge(
        track_data,
        how='left',
        left_on=['track_id_clean'],
        right_on=['track_id']
    ).drop(columns='track_id_clean')
)
session_data.head()

Unnamed: 0,session_id,session_position,skipped,track_id,duration,release_year,us_popularity_estimate,acousticness,beat_strength,bounciness,...,key,liveness,loudness,mechanism,mode,organism,speechiness,tempo,time_signature,valence
0,0_00006f66-33e5-4de7-a324-2d18e439fc1e,1,0,t_0479f24c-27d2-46d6-a00c-7ec928f2b539,180.066666,2018.0,99.968132,0.015848,0.438551,0.473455,...,1.0,0.678553,-6.577,0.546784,1.0,0.320668,0.069717,134.024994,4.0,0.152255
1,0_00006f66-33e5-4de7-a324-2d18e439fc1e,2,0,t_9099cd7b-c238-47b7-9381-f23f2c1d1043,236.796371,2018.0,99.896729,0.061811,0.654804,0.735661,...,7.0,0.104322,-5.319,0.824766,0.0,0.131391,0.061158,130.037994,4.0,0.337152
2,0_00006f66-33e5-4de7-a324-2d18e439fc1e,3,0,t_fc5df5ba-5396-49a7-8b29-35d0d28249e0,231.266663,2018.0,99.999977,0.354116,0.532155,0.540411,...,10.0,0.135776,-5.843,0.774327,1.0,0.296923,0.045354,145.028,4.0,0.373862
3,0_00006f66-33e5-4de7-a324-2d18e439fc1e,4,0,t_23cff8d6-d874-4b20-83dc-94e450e8aa20,169.826675,2018.0,99.995041,0.769225,0.641756,0.729224,...,1.0,0.103722,-7.756,0.630996,1.0,0.603271,0.229936,111.982002,4.0,0.64942
4,0_00006f66-33e5-4de7-a324-2d18e439fc1e,5,0,t_64f3743c-f624-46bb-a579-0f3f9a07a123,210.545258,2018.0,99.998497,0.006602,0.732428,0.794881,...,8.0,0.120842,-4.919,0.759465,1.0,0.170148,0.24098,147.031006,4.0,0.652921


In [36]:
session_data[
    (session_data.session_id == "0_00006f66-33e5-4de7-a324-2d18e439fc1e") &
    (session_data.session_position < 5) &
    (session_data.skipped == 0)
].mean()

session_position             2.500000
skipped                      0.000000
duration                   204.489094
release_year              2018.000000
us_popularity_estimate      99.964970
acousticness                 0.300250
beat_strength                0.566817
bounciness                   0.619688
danceability                 0.768778
dyn_range_mean               9.580541
energy                       0.593196
flatness                     1.020907
instrumentalness             0.000873
key                          4.750000
liveness                     0.255593
loudness                    -6.373750
mechanism                    0.694218
mode                         0.750000
organism                     0.338063
speechiness                  0.101541
tempo                      130.268248
time_signature               4.000000
valence                      0.378172
dtype: float64

In [19]:
# Create two dataframes to hold attribute averages for session skips and plays
skip_data = session_data.copy().set_index(['session_id','session_position'])
play_data = session_data.copy().set_index(['session_id','session_position'])
skip_data.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,skipped,track_id,duration,release_year,us_popularity_estimate,acousticness,beat_strength,bounciness,danceability,dyn_range_mean,...,key,liveness,loudness,mechanism,mode,organism,speechiness,tempo,time_signature,valence
session_id,session_position,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
0_00006f66-33e5-4de7-a324-2d18e439fc1e,1,0,t_0479f24c-27d2-46d6-a00c-7ec928f2b539,180.066666,2018.0,99.968132,0.015848,0.438551,0.473455,0.653119,7.660024,...,1.0,0.678553,-6.577,0.546784,1.0,0.320668,0.069717,134.024994,4.0,0.152255
0_00006f66-33e5-4de7-a324-2d18e439fc1e,2,0,t_9099cd7b-c238-47b7-9381-f23f2c1d1043,236.796371,2018.0,99.896729,0.061811,0.654804,0.735661,0.877393,11.30875,...,7.0,0.104322,-5.319,0.824766,0.0,0.131391,0.061158,130.037994,4.0,0.337152
0_00006f66-33e5-4de7-a324-2d18e439fc1e,3,0,t_fc5df5ba-5396-49a7-8b29-35d0d28249e0,231.266663,2018.0,99.999977,0.354116,0.532155,0.540411,0.679719,8.065802,...,10.0,0.135776,-5.843,0.774327,1.0,0.296923,0.045354,145.028,4.0,0.373862
0_00006f66-33e5-4de7-a324-2d18e439fc1e,4,0,t_23cff8d6-d874-4b20-83dc-94e450e8aa20,169.826675,2018.0,99.995041,0.769225,0.641756,0.729224,0.864881,11.287586,...,1.0,0.103722,-7.756,0.630996,1.0,0.603271,0.229936,111.982002,4.0,0.64942
0_00006f66-33e5-4de7-a324-2d18e439fc1e,5,0,t_64f3743c-f624-46bb-a579-0f3f9a07a123,210.545258,2018.0,99.998497,0.006602,0.732428,0.794881,0.857778,12.181586,...,8.0,0.120842,-4.919,0.759465,1.0,0.170148,0.24098,147.031006,4.0,0.652921


In [34]:
skip_data.loc['0_00006f66-33e5-4de7-a324-2d18e439fc1e',1]

skipped                                                        0
track_id                  t_0479f24c-27d2-46d6-a00c-7ec928f2b539
duration                                                 180.067
release_year                                                2018
us_popularity_estimate                                   99.9681
acousticness                                           0.0158484
beat_strength                                           0.438551
bounciness                                              0.473455
danceability                                            0.653119
dyn_range_mean                                           7.66002
energy                                                  0.553465
flatness                                                 1.03501
instrumentalness                                      0.00348447
key                                                            1
liveness                                                0.678553
loudness                 

In [53]:
# from multiprocessing import Pool

# def update_row(df, skip_df, play_df, sid, spos):
#     skip_df.loc[sid,spos] = df[
#         (df.session_id == sid) &
#         (df.session_position < spos) &
#         (df.skipped == 1)
#     ].mean()
#     play_df.loc[sid,spos] = df[
#         (df.session_id == sid) &
#         (df.session_position < spos) &
#         (df.skipped == 0)
#     ].mean()
#     return

# def df_map(index):
#     sid, spos = index
#     update_row(session_data, skip_data, play_data, sid, spos)
#     return

# with Pool() as p:
#     p.map(df_map,session_data[['session_id','session_position']].values)

In [66]:
import tqdm, progressbar


for sid, spos in progressbar.progressbar(session_data[['session_id','session_position']].values):
    skip_data.loc[sid,spos] = session_data[
        (session_data.session_id == sid) &
        (session_data.session_position < spos) &
        (session_data.skipped == 1)
    ].mean()
    play_data.loc[sid,spos] = session_data[
        (session_data.session_id == sid) &
        (session_data.session_position < spos) &
        (session_data.skipped == 0)
    ].mean()
    
skip_data.head(10)

100% (167880 of 167880) |################| Elapsed Time: 4:18:30 Time:  4:18:30


Unnamed: 0_level_0,Unnamed: 1_level_0,skipped,track_id,duration,release_year,us_popularity_estimate,acousticness,beat_strength,bounciness,danceability,dyn_range_mean,...,key,liveness,loudness,mechanism,mode,organism,speechiness,tempo,time_signature,valence
session_id,session_position,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
0_00006f66-33e5-4de7-a324-2d18e439fc1e,1,,,,,,,,,,,...,,,,,,,,,,
0_00006f66-33e5-4de7-a324-2d18e439fc1e,2,,,,,,,,,,,...,,,,,,,,,,
0_00006f66-33e5-4de7-a324-2d18e439fc1e,3,,,,,,,,,,,...,,,,,,,,,,
0_00006f66-33e5-4de7-a324-2d18e439fc1e,4,,,,,,,,,,,...,,,,,,,,,,
0_00006f66-33e5-4de7-a324-2d18e439fc1e,5,,,,,,,,,,,...,,,,,,,,,,
0_00006f66-33e5-4de7-a324-2d18e439fc1e,6,,,,,,,,,,,...,,,,,,,,,,
0_00006f66-33e5-4de7-a324-2d18e439fc1e,7,,,,,,,,,,,...,,,,,,,,,,
0_00006f66-33e5-4de7-a324-2d18e439fc1e,8,1.0,,186.453323,2018.0,99.861565,0.449629,0.487708,0.608612,0.55752,9.834414,...,6.0,0.111306,-7.808,0.157576,1.0,0.67522,0.409848,104.466003,4.0,0.10942
0_00006f66-33e5-4de7-a324-2d18e439fc1e,9,1.0,,200.250336,2018.0,98.552689,0.233629,0.568537,0.645611,0.523463,9.867311,...,3.5,0.113047,-7.384,0.255871,1.0,0.566032,0.256768,93.751007,4.0,0.249667
0_00006f66-33e5-4de7-a324-2d18e439fc1e,10,1.0,,206.25383,2018.0,99.034615,0.165795,0.651958,0.723723,0.670436,11.590732,...,4.0,0.118113,-7.004333,0.456533,0.666667,0.411603,0.187796,105.839668,4.0,0.279218


In [67]:
# Prepend the columns as "skip_"
skip_data.columns = [f"skip_avg_{c}" for c in skip_data.columns]
play_data.columns = [f"play_avg_{c}" for c in play_data.columns]
play_data.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,play_avg_skipped,play_avg_track_id,play_avg_duration,play_avg_release_year,play_avg_us_popularity_estimate,play_avg_acousticness,play_avg_beat_strength,play_avg_bounciness,play_avg_danceability,play_avg_dyn_range_mean,...,play_avg_key,play_avg_liveness,play_avg_loudness,play_avg_mechanism,play_avg_mode,play_avg_organism,play_avg_speechiness,play_avg_tempo,play_avg_time_signature,play_avg_valence
session_id,session_position,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
0_00006f66-33e5-4de7-a324-2d18e439fc1e,1,,,,,,,,,,,...,,,,,,,,,,
0_00006f66-33e5-4de7-a324-2d18e439fc1e,2,0.0,,180.066666,2018.0,99.968132,0.015848,0.438551,0.473455,0.653119,7.660024,...,1.0,0.678553,-6.577,0.546784,1.0,0.320668,0.069717,134.024994,4.0,0.152255
0_00006f66-33e5-4de7-a324-2d18e439fc1e,3,0.0,,208.431519,2018.0,99.932434,0.03883,0.546678,0.604558,0.765256,9.484387,...,4.0,0.391437,-5.948,0.685775,0.5,0.22603,0.065437,132.031494,4.0,0.244703
0_00006f66-33e5-4de7-a324-2d18e439fc1e,4,0.0,,216.043228,2018.0,99.954948,0.143925,0.541837,0.583176,0.736744,9.011525,...,6.0,0.306217,-5.913,0.715292,0.666667,0.249661,0.058743,136.363663,4.0,0.287756
0_00006f66-33e5-4de7-a324-2d18e439fc1e,5,0.0,,204.48909,2018.0,99.964966,0.30025,0.566817,0.619688,0.768778,9.580541,...,4.75,0.255593,-6.37375,0.694218,0.75,0.338063,0.101541,130.26825,4.0,0.378172


In [68]:
skip_data.to_pickle("data/pickles/skip_data.pkl")
play_data.to_pickle("data/pickles/play_data.pkl")

# skip_data = pd.read_pickle("data/pickles/skip_data.pkl")
# play_data = pd.read_pickle("data/pickles/play_data.pkl")

In [70]:
skip_data.shape, play_data.shape

((167880, 23), (167880, 23))

In [75]:
# Join them together
full_session_data = pd.concat([
    session_data,
    skip_data.reset_index(drop=True),
    play_data.reset_index(drop=True)
],axis=1).drop(columns=['skip_avg_track_id','play_avg_track_id'])
full_session_data.head(20)

Unnamed: 0,session_id,session_position,skipped,track_id,duration,release_year,us_popularity_estimate,acousticness,beat_strength,bounciness,...,play_avg_key,play_avg_liveness,play_avg_loudness,play_avg_mechanism,play_avg_mode,play_avg_organism,play_avg_speechiness,play_avg_tempo,play_avg_time_signature,play_avg_valence
0,0_00006f66-33e5-4de7-a324-2d18e439fc1e,1,0,t_0479f24c-27d2-46d6-a00c-7ec928f2b539,180.066666,2018.0,99.968132,0.015848,0.438551,0.473455,...,,,,,,,,,,
1,0_00006f66-33e5-4de7-a324-2d18e439fc1e,2,0,t_9099cd7b-c238-47b7-9381-f23f2c1d1043,236.796371,2018.0,99.896729,0.061811,0.654804,0.735661,...,1.0,0.678553,-6.577,0.546784,1.0,0.320668,0.069717,134.024994,4.0,0.152255
2,0_00006f66-33e5-4de7-a324-2d18e439fc1e,3,0,t_fc5df5ba-5396-49a7-8b29-35d0d28249e0,231.266663,2018.0,99.999977,0.354116,0.532155,0.540411,...,4.0,0.391437,-5.948,0.685775,0.5,0.22603,0.065437,132.031494,4.0,0.244703
3,0_00006f66-33e5-4de7-a324-2d18e439fc1e,4,0,t_23cff8d6-d874-4b20-83dc-94e450e8aa20,169.826675,2018.0,99.995041,0.769225,0.641756,0.729224,...,6.0,0.306217,-5.913,0.715292,0.666667,0.249661,0.058743,136.363663,4.0,0.287756
4,0_00006f66-33e5-4de7-a324-2d18e439fc1e,5,0,t_64f3743c-f624-46bb-a579-0f3f9a07a123,210.545258,2018.0,99.998497,0.006602,0.732428,0.794881,...,4.75,0.255593,-6.37375,0.694218,0.75,0.338063,0.101541,130.26825,4.0,0.378172
5,0_00006f66-33e5-4de7-a324-2d18e439fc1e,6,0,t_c815228b-3212-4f9e-9d4f-9cb19b248184,216.689484,2018.0,99.997604,0.098337,0.548632,0.658323,...,5.4,0.228643,-6.0828,0.707268,0.8,0.30448,0.129429,133.620804,4.0,0.433122
6,0_00006f66-33e5-4de7-a324-2d18e439fc1e,7,1,t_e23c19f5-4c32-4557-aa44-81372c2e3705,186.453323,2018.0,99.861565,0.449629,0.487708,0.608612,...,4.666667,0.199951,-6.322667,0.653561,0.666667,0.32713,0.130122,133.558502,4.166667,0.471115
7,0_00006f66-33e5-4de7-a324-2d18e439fc1e,8,1,t_0be6eced-f56f-48bd-8086-f2e0b760fdee,214.047348,2018.0,97.24382,0.017628,0.649367,0.68261,...,4.666667,0.199951,-6.322667,0.653561,0.666667,0.32713,0.130122,133.558502,4.166667,0.471115
8,0_00006f66-33e5-4de7-a324-2d18e439fc1e,9,1,t_f3ecbd3b-9e8e-4557-b8e0-39cfcd7e65dd,218.260818,2018.0,99.998451,0.030127,0.818798,0.879947,...,4.666667,0.199951,-6.322667,0.653561,0.666667,0.32713,0.130122,133.558502,4.166667,0.471115
9,0_00006f66-33e5-4de7-a324-2d18e439fc1e,10,1,t_2af4dfa0-7df3-4b7e-b7ab-353ba48237f9,60.0,2018.0,99.957436,0.332467,0.7515,0.843354,...,4.666667,0.199951,-6.322667,0.653561,0.666667,0.32713,0.130122,133.558502,4.166667,0.471115


In [77]:
full_session_data.to_pickle("data/pickles/full_session_data.pkl")
# full_session_data = pd.read_pickle("data/pickles/full_session_data.pkl")

In [78]:
# Examine the columns
full_session_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 167880 entries, 0 to 167879
Data columns (total 69 columns):
session_id                         167880 non-null object
session_position                   167880 non-null int64
skipped                            167880 non-null int32
track_id                           167880 non-null object
duration                           167880 non-null float32
release_year                       167880 non-null float32
us_popularity_estimate             167880 non-null float32
acousticness                       167880 non-null float32
beat_strength                      167880 non-null float32
bounciness                         167880 non-null float32
danceability                       167880 non-null float32
dyn_range_mean                     167880 non-null float32
energy                             167880 non-null float32
flatness                           167880 non-null float32
instrumentalness                   167880 non-null float32
key       

In [79]:
# Split the data:
# df -> X, y -> Train, Test, Val
X = full_session_data.drop(
    columns=["skipped","session_id","track_id"]
).fillna(-9999)
y = full_session_data.skipped
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=2
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=2
)
X.head(20)

Unnamed: 0,session_position,duration,release_year,us_popularity_estimate,acousticness,beat_strength,bounciness,danceability,dyn_range_mean,energy,...,play_avg_key,play_avg_liveness,play_avg_loudness,play_avg_mechanism,play_avg_mode,play_avg_organism,play_avg_speechiness,play_avg_tempo,play_avg_time_signature,play_avg_valence
0,1,180.066666,2018.0,99.968132,0.015848,0.438551,0.473455,0.653119,7.660024,0.553465,...,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0
1,2,236.796371,2018.0,99.896729,0.061811,0.654804,0.735661,0.877393,11.30875,0.726828,...,1.0,0.678553,-6.577,0.546784,1.0,0.320668,0.069717,134.024994,4.0,0.152255
2,3,231.266663,2018.0,99.999977,0.354116,0.532155,0.540411,0.679719,8.065802,0.563009,...,4.0,0.391437,-5.948,0.685775,0.5,0.22603,0.065437,132.031494,4.0,0.244703
3,4,169.826675,2018.0,99.995041,0.769225,0.641756,0.729224,0.864881,11.287586,0.529484,...,6.0,0.306217,-5.913,0.715292,0.666667,0.249661,0.058743,136.363663,4.0,0.287756
4,5,210.545258,2018.0,99.998497,0.006602,0.732428,0.794881,0.857778,12.181586,0.650057,...,4.75,0.255593,-6.37375,0.694218,0.75,0.338063,0.101541,130.26825,4.0,0.378172
5,6,216.689484,2018.0,99.997604,0.098337,0.548632,0.658323,0.59648,10.416269,0.557129,...,5.4,0.228643,-6.0828,0.707268,0.8,0.30448,0.129429,133.620804,4.0,0.433122
6,7,186.453323,2018.0,99.861565,0.449629,0.487708,0.608612,0.55752,9.834414,0.46935,...,4.666667,0.199951,-6.322667,0.653561,0.666667,0.32713,0.130122,133.558502,4.166667,0.471115
7,8,214.047348,2018.0,97.24382,0.017628,0.649367,0.68261,0.489405,9.900208,0.715155,...,4.666667,0.199951,-6.322667,0.653561,0.666667,0.32713,0.130122,133.558502,4.166667,0.471115
8,9,218.260818,2018.0,99.998451,0.030127,0.818798,0.879947,0.964383,15.037571,0.55528,...,4.666667,0.199951,-6.322667,0.653561,0.666667,0.32713,0.130122,133.558502,4.166667,0.471115
9,10,60.0,2018.0,99.957436,0.332467,0.7515,0.843354,0.914082,14.457762,0.239318,...,4.666667,0.199951,-6.322667,0.653561,0.666667,0.32713,0.130122,133.558502,4.166667,0.471115


In [80]:
from sklearn.preprocessing import StandardScaler

In [81]:
# Train a Logistic Regression model and score
scaler = StandardScaler().fit(X_train)
sX_train = scaler.transform(X_train)
sX_val = scaler.transform(X_val)
sX_test = scaler.transform(X_test)

log = LogisticRegressionCV(
    cv=3
).fit(
    sX_train,
    y_train
)

print("Log Train score: %s" % log.score(sX_train,y_train))
print("Log Val score:   %s" % log.score(sX_val,y_val))
print("Log Test score:  %s" % log.score(sX_test,y_test))



Log Train score: 0.5934774717757323
Log Val score:   0.5907449462045344
Log Test score:  0.5911365260900643


In [82]:
for pair in sorted(zip(X.columns,log.coef_[0]),key=lambda p: np.abs(p[1]),reverse=True):
    print("%30s | % .8f" % pair)

                skip_avg_tempo |  9.00873257
             play_avg_duration |  5.54394207
             play_avg_loudness |  3.88471600
             skip_avg_loudness |  2.78298511
         play_avg_release_year |  2.55162743
       play_avg_dyn_range_mean |  1.65893523
             skip_avg_duration |  1.50163127
         skip_avg_release_year |  1.34864555
         skip_avg_acousticness | -1.13568717
         play_avg_acousticness | -1.12572997
             play_avg_organism | -1.12197754
             skip_avg_organism | -1.08294282
     play_avg_instrumentalness | -1.01297564
                  skip_avg_key | -1.01251903
     skip_avg_instrumentalness | -0.97667881
                 skip_avg_mode | -0.95894330
             skip_avg_liveness | -0.95311140
              skip_avg_valence | -0.91209853
                 play_avg_mode | -0.90536696
             play_avg_liveness | -0.88474023
              skip_avg_skipped | -0.87414382
                  play_avg_key | -0.86781896
          

In [83]:
# Train a Random Forest model and score
rfc = RandomForestClassifier(
    n_estimators=100
).fit(
    X_train,
    y_train
)

print("RFC Train score: %s" % rfc.score(X_train,y_train))
print("RFC Val score:   %s" % rfc.score(X_val,y_val))
print("RFC Test score:  %s" % rfc.score(X_test,y_test))

RFC Train score: 0.9899853876008674
RFC Val score:   0.6966978146755519
RFC Test score:  0.6971646414105314


In [84]:
for pair in sorted(zip(X.columns,rfc.feature_importances_),key=lambda p:p[1],reverse=True):
    print("%30s | % .8f" % pair)

               skip_avg_energy |  0.02108318
                skip_avg_tempo |  0.02074732
        us_popularity_estimate |  0.02024197
             skip_avg_loudness |  0.02001695
                play_avg_tempo |  0.01967850
            skip_avg_mechanism |  0.01942243
                  play_avg_key |  0.01928399
             skip_avg_liveness |  0.01922174
             skip_avg_duration |  0.01897020
                      duration |  0.01829254
                  skip_avg_key |  0.01814704
             play_avg_liveness |  0.01811061
         play_avg_acousticness |  0.01804260
                      liveness |  0.01795387
             play_avg_organism |  0.01792131
         skip_avg_acousticness |  0.01787204
                   speechiness |  0.01782226
                      loudness |  0.01777418
                         tempo |  0.01773300
          skip_avg_speechiness |  0.01745422
                       valence |  0.01743127
              play_avg_valence |  0.01738857
          

In [85]:
# Train a XGBoost model and score
xgb = XGBClassifier(
    n_estimators=100000
).fit(
    X_train,
    y_train,
    eval_set=[(X_train, y_train),(X_val, y_val)],
    early_stopping_rounds=10,
    verbose=True
)

print("XGB Train score: %s" % xgb.score(X_train,y_train))
print("XGB Val score:   %s" % xgb.score(X_val,y_val))
print("XGB Test score:  %s" % xgb.score(X_test,y_test))

[0]	validation_0-error:0.384632	validation_1-error:0.389896
Multiple eval metrics have been passed: 'validation_1-error' will be used for early stopping.

Will train until validation_1-error hasn't improved in 10 rounds.
[1]	validation_0-error:0.378443	validation_1-error:0.383456
[2]	validation_0-error:0.368121	validation_1-error:0.375302
[3]	validation_0-error:0.366008	validation_1-error:0.373032
[4]	validation_0-error:0.365738	validation_1-error:0.373032
[5]	validation_0-error:0.365115	validation_1-error:0.372175
[6]	validation_0-error:0.365077	validation_1-error:0.371766
[7]	validation_0-error:0.364053	validation_1-error:0.371803
[8]	validation_0-error:0.364053	validation_1-error:0.37117
[9]	validation_0-error:0.363532	validation_1-error:0.370612
[10]	validation_0-error:0.362723	validation_1-error:0.370649
[11]	validation_0-error:0.36276	validation_1-error:0.370984
[12]	validation_0-error:0.362834	validation_1-error:0.370537
[13]	validation_0-error:0.36276	validation_1-error:0.37038

In [86]:
for pair in sorted(zip(X.columns,xgb.feature_importances_),key=lambda p:p[1],reverse=True):
    print("%30s | % .8f" % pair)

                 skip_avg_mode |  0.10525854
                skip_avg_tempo |  0.09254078
             skip_avg_loudness |  0.07906241
                 play_avg_mode |  0.06369763
                  play_avg_key |  0.05403186
     play_avg_instrumentalness |  0.05038163
                  skip_avg_key |  0.03550071
             skip_avg_liveness |  0.03530635
     skip_avg_instrumentalness |  0.03030147
             play_avg_liveness |  0.02397620
         play_avg_acousticness |  0.02175436
             skip_avg_duration |  0.02101089
             play_avg_organism |  0.02057321
              session_position |  0.01770537
         skip_avg_acousticness |  0.01714750
          skip_avg_speechiness |  0.01612241
             skip_avg_organism |  0.01415796
            skip_avg_mechanism |  0.01369866
                play_avg_tempo |  0.01367942
       play_avg_time_signature |  0.01280065
        us_popularity_estimate |  0.01218331
       play_avg_dyn_range_mean |  0.01099710
play_avg_u

In [87]:
# Train a LightGBM model and score
lgbm = LGBMClassifier(
    n_estimators=100000
).fit(
    X_train,
    y_train,
    eval_set=[(X_train, y_train),(X_val, y_val)],
    early_stopping_rounds=10,
    verbose=True
)

print()
print("LGBM Train score: %s" % lgbm.score(X_train,y_train))
print("LGBM Val score:   %s" % lgbm.score(X_val,y_val))
print("LGBM Test score:  %s" % lgbm.score(X_test,y_test))

[1]	training's binary_logloss: 0.682082	valid_1's binary_logloss: 0.682631
Training until validation scores don't improve for 10 rounds
[2]	training's binary_logloss: 0.67344	valid_1's binary_logloss: 0.674418
[3]	training's binary_logloss: 0.666317	valid_1's binary_logloss: 0.667731
[4]	training's binary_logloss: 0.660262	valid_1's binary_logloss: 0.662037
[5]	training's binary_logloss: 0.655219	valid_1's binary_logloss: 0.657349
[6]	training's binary_logloss: 0.650868	valid_1's binary_logloss: 0.653369
[7]	training's binary_logloss: 0.647033	valid_1's binary_logloss: 0.649838
[8]	training's binary_logloss: 0.643778	valid_1's binary_logloss: 0.646973
[9]	training's binary_logloss: 0.640926	valid_1's binary_logloss: 0.644518
[10]	training's binary_logloss: 0.638403	valid_1's binary_logloss: 0.642308
[11]	training's binary_logloss: 0.636198	valid_1's binary_logloss: 0.640398
[12]	training's binary_logloss: 0.634204	valid_1's binary_logloss: 0.638694
[13]	training's binary_logloss: 0.632

In [88]:
for pair in sorted(zip(X.columns,lgbm.feature_importances_),key=lambda p:p[1],reverse=True):
    print("%30s | % 15.8f" % pair)

     play_avg_instrumentalness |    245.00000000
        us_popularity_estimate |    217.00000000
     skip_avg_instrumentalness |    217.00000000
                skip_avg_tempo |    215.00000000
             skip_avg_liveness |    207.00000000
play_avg_us_popularity_estimate |    204.00000000
             play_avg_duration |    202.00000000
          skip_avg_speechiness |    198.00000000
                  play_avg_key |    198.00000000
             skip_avg_duration |    195.00000000
              play_avg_valence |    187.00000000
         skip_avg_acousticness |    183.00000000
             play_avg_loudness |    183.00000000
skip_avg_us_popularity_estimate |    182.00000000
                  skip_avg_key |    177.00000000
             play_avg_liveness |    172.00000000
             skip_avg_loudness |    170.00000000
              skip_avg_valence |    170.00000000
         play_avg_acousticness |    166.00000000
              session_position |    158.00000000
          play_avg

***

That model's performance is _suspiciously_ good. Can we look at the cases it can't predict...

In [None]:
from sklearn.metrics import confusion_matrix

In [None]:
print("Logistic Regression Confusion Matrix")
print(confusion_matrix(
    y_test,
    log.predict(X_test)
))

In [None]:
print("Random Forest Confusion Matrix")
print(confusion_matrix(
    y_test,
    rfc.predict(X_test)
))

In [None]:
print("XGBoost Confusion Matrix")
print(confusion_matrix(
    y_test,
    xgb.predict(X_test)
))

In [None]:
print("LightGBM Confusion Matrix")
print(confusion_matrix(
    y_test,
    lgbm.predict(X_test)
))