In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import copy
import random

from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

import faiss
from faiss import normalize_L2

# Render every column when a wide DataFrame is displayed.
pd.set_option('display.max_columns', None)

# Folder containing the CSV inputs, and the optional filename suffixes that
# are spliced into every file name below (both empty here).
path = '/Users/aaronng/Downloads/Thesis/'
dataset_suffix, type_suffix = '', ''

# Seed NumPy's global random number generator.
seed = 123
np.random.seed(seed)

In [2]:
# Read the audio-state table of each split (states-<split><suffixes>.csv).
train, val, test = (
    pd.read_csv(path + 'states-{}{}{}.csv'.format(split, dataset_suffix, type_suffix))
    for split in ('train', 'val', 'test')
)

train.shape, val.shape, test.shape

((1620370, 21), (405382, 21), (402976, 21))

In [3]:
# Names of the audio feature columns, kept in alphabetical order; the position
# of a name in this list is used as its column index further down.
audio_cols = sorted([
    'acousticness', 'beat_strength', 'bounciness', 'danceability',
    'dyn_range_mean', 'energy', 'flatness', 'instrumentalness', 'key',
    'liveness', 'loudness', 'mechanism', 'mode', 'organism', 'speechiness',
    'tempo', 'time_signature', 'valence',
])

In [4]:
# Size of the leading window of each session: the cells below select it with
# `session_position <= N`.
N=5

In [5]:
# Read the raw train/val/test tables (<split><suffixes>.csv).
train_raw, val_raw, test_raw = (
    pd.read_csv(path + '{}{}{}.csv'.format(split, dataset_suffix, type_suffix))
    for split in ('train', 'val', 'test')
)

# Encode `mode` as a binary flag: 1 for 'major', 0 for anything else.
train_raw['mode'] = (train_raw['mode'] == 'major').astype(int)
val_raw['mode'] = (val_raw['mode'] == 'major').astype(int)
test_raw['mode'] = (test_raw['mode'] == 'major').astype(int)

train_raw.shape, val_raw.shape, test_raw.shape

((1620370, 51), (405382, 51), (402976, 51))

In [6]:
def _with_scaled_audio(raw, scaled_values):
    """Return `raw` with its audio columns swapped for `scaled_values`.

    The scaled columns are appended to the right of the remaining columns.
    """
    scaled = pd.DataFrame(scaled_values, columns=audio_cols)
    return pd.concat([raw.drop(audio_cols, axis=1), scaled], axis=1)


# Min-max scale the audio features to [0, 1]; the scaler is fitted on the
# training split and only applied to the validation and test splits.
scaler = MinMaxScaler(feature_range=(0,1))
train_values = _with_scaled_audio(train_raw, scaler.fit_transform(train_raw[audio_cols]))
val_values = _with_scaled_audio(val_raw, scaler.transform(val_raw[audio_cols]))
test_values = _with_scaled_audio(test_raw, scaler.transform(test_raw[audio_cols]))

train_values.shape, val_values.shape, test_values.shape

((1620370, 51), (405382, 51), (402976, 51))

In [7]:
def _load_topN(split):
    """Read topN-features-<split>.csv and drop sessions whose `top1` is 'NONE'."""
    topN = pd.read_csv(path + 'topN-features-{}{}{}.csv'.format(split, dataset_suffix, type_suffix))
    return topN[topN['top1'] != 'NONE'].reset_index(drop=True)


# Top-feature tables per split, restricted to sessions that have a top feature.
train_topN = _load_topN('train')
val_topN = _load_topN('val')
test_topN = _load_topN('test')

train_topN

Unnamed: 0,session_id,top1,top2,top3,top4,top5,top6,top7,top8,top9,top10,top11,top12,top13,top14,top15,top16,top17,top18
0,0_00010fc5-b79e-4cdf-bc4c-f140d0f99a3a,energy,valence,bounciness,dyn_range_mean,key,mode,speechiness,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE
1,0_00064a92-1353-417a-82e2-6f5e361bebff,key,beat_strength,bounciness,danceability,dyn_range_mean,energy,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE
2,0_000cec57-79f9-4acf-87a1-9bb9063d012d,acousticness,organism,speechiness,liveness,energy,loudness,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE
3,0_00127e4f-400b-4284-8e2c-e9accbb9c54b,danceability,organism,bounciness,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE
4,0_0012dae0-48c3-4820-a719-1793f7a7f7ee,organism,mechanism,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
88499,7_ffee2528-16a6-4bc9-a7f0-384fdebf9247,bounciness,danceability,beat_strength,dyn_range_mean,flatness,liveness,mechanism,valence,mode,organism,speechiness,NONE,NONE,NONE,NONE,NONE,NONE,NONE
88500,7_fff02c3d-178f-4bea-9f6d-1069de5b3213,mode,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE
88501,7_fff288e4-04db-4403-afe9-1765516a6f53,tempo,liveness,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE
88502,7_fffed5c4-67d3-4b59-b9e8-6a6c67414380,loudness,time_signature,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE


In [8]:
# How often each value occurs in the `top1` column of the training sessions.
train_top1_feature_count = train_topN['top1'].value_counts()
train_top1_feature_count

mode                10130
acousticness         7507
speechiness          7220
liveness             5710
tempo                5661
beat_strength        5408
key                  5295
flatness             4945
energy               4925
valence              4634
mechanism            4632
dyn_range_mean       4498
loudness             4127
danceability         3955
bounciness           3352
organism             2578
time_signature       2249
instrumentalness     1678
Name: top1, dtype: int64

In [9]:
# Rows (sessions) left in each split after dropping those whose top1 is 'NONE'.
train_topN.shape, val_topN.shape, test_topN.shape

((88504, 19), (22158, 19), (22180, 19))

In [10]:
# Keep only the tracks of sessions that are still present in the matching
# top-feature table (nothing is merged here; this is a pure row filter).
train = train.loc[train['session_id'].isin(train_topN['session_id'])].reset_index(drop=True)
val = val.loc[val['session_id'].isin(val_topN['session_id'])].reset_index(drop=True)
test = test.loc[test['session_id'].isin(test_topN['session_id'])].reset_index(drop=True)

train.shape, val.shape, test.shape

((1454387, 21), (364270, 21), (362880, 21))

## Generate features

### 1) No. of tracks in each state for each feature

In [11]:
# One-hot encode the per-track state of every audio feature.
#
# The encoder is fitted on the training split only and then *applied* to the
# validation and test splits, so all three share one category -> column
# mapping. Re-fitting on each split (the previous behaviour) re-derives the
# categories from that split, which can shift columns under the fixed names
# below whenever a state is absent from, or extra in, that split.
# With the encoder's default settings, `transform` raises on a state that was
# never seen during fitting instead of silently mis-encoding it.
enc = OneHotEncoder(sparse=False)

# Column names are hard-coded to three states (0, 1, 2) per feature.
audio_cols_onehot = [col + '_' + str(cat) for col in audio_cols for cat in range(0,3)]


def _one_hot_states(frame, encoded):
    """Wrap an encoded state matrix in a DataFrame led by session_id / session_position."""
    states = pd.DataFrame(encoded, columns=audio_cols_onehot)
    states['session_id'] = list(frame['session_id'])
    states['session_position'] = list(frame['session_position'])
    return states[['session_id','session_position'] + audio_cols_onehot]


train_states = _one_hot_states(train, enc.fit_transform(train[audio_cols]))
val_states = _one_hot_states(val, enc.transform(val[audio_cols]))
test_states = _one_hot_states(test, enc.transform(test[audio_cols]))

train_states.shape, val_states.shape, test_states.shape

((1454387, 56), (364270, 56), (362880, 56))

In [12]:
# Per session, add up the one-hot rows: the result counts how many tracks of
# the session fall into each (feature, state) column.
train_states_sum, val_states_sum, test_states_sum = (
    states.drop(columns='session_position').groupby('session_id').sum().reset_index()
    for states in (train_states, val_states, test_states)
)

train_states_sum.shape, val_states_sum.shape, test_states_sum.shape

((88504, 55), (22158, 55), (22180, 55))

In [13]:
# Display the per-session sums of the one-hot state columns (training split).
train_states_sum

Unnamed: 0,session_id,acousticness_0,acousticness_1,acousticness_2,beat_strength_0,beat_strength_1,beat_strength_2,bounciness_0,bounciness_1,bounciness_2,danceability_0,danceability_1,danceability_2,dyn_range_mean_0,dyn_range_mean_1,dyn_range_mean_2,energy_0,energy_1,energy_2,flatness_0,flatness_1,flatness_2,instrumentalness_0,instrumentalness_1,instrumentalness_2,key_0,key_1,key_2,liveness_0,liveness_1,liveness_2,loudness_0,loudness_1,loudness_2,mechanism_0,mechanism_1,mechanism_2,mode_0,mode_1,mode_2,organism_0,organism_1,organism_2,speechiness_0,speechiness_1,speechiness_2,tempo_0,tempo_1,tempo_2,time_signature_0,time_signature_1,time_signature_2,valence_0,valence_1,valence_2
0,0_00010fc5-b79e-4cdf-bc4c-f140d0f99a3a,20.0,0.0,0.0,11.0,0.0,9.0,8.0,12.0,0.0,20.0,0.0,0.0,9.0,0.0,11.0,12.0,8.0,0.0,9.0,11.0,0.0,20.0,0.0,0.0,10.0,10.0,0.0,20.0,0.0,0.0,20.0,0.0,0.0,20.0,0.0,0.0,12.0,8.0,0.0,20.0,0.0,0.0,16.0,4.0,0.0,20.0,0.0,0.0,20.0,0.0,0.0,11.0,9.0,0.0
1,0_00064a92-1353-417a-82e2-6f5e361bebff,17.0,0.0,0.0,9.0,8.0,0.0,9.0,8.0,0.0,9.0,8.0,0.0,11.0,0.0,6.0,6.0,11.0,0.0,17.0,0.0,0.0,17.0,0.0,0.0,10.0,7.0,0.0,17.0,0.0,0.0,12.0,5.0,0.0,17.0,0.0,0.0,17.0,0.0,0.0,17.0,0.0,0.0,17.0,0.0,0.0,17.0,0.0,0.0,17.0,0.0,0.0,17.0,0.0,0.0
2,0_000cec57-79f9-4acf-87a1-9bb9063d012d,1.0,9.0,0.0,10.0,0.0,0.0,10.0,0.0,0.0,10.0,0.0,0.0,10.0,0.0,0.0,5.0,5.0,0.0,10.0,0.0,0.0,10.0,0.0,0.0,10.0,0.0,0.0,5.0,5.0,0.0,7.0,0.0,3.0,10.0,0.0,0.0,10.0,0.0,0.0,1.0,9.0,0.0,6.0,4.0,0.0,10.0,0.0,0.0,10.0,0.0,0.0,10.0,0.0,0.0
3,0_00127e4f-400b-4284-8e2c-e9accbb9c54b,15.0,5.0,0.0,20.0,0.0,0.0,10.0,0.0,10.0,12.0,8.0,0.0,20.0,0.0,0.0,10.0,10.0,0.0,13.0,0.0,7.0,20.0,0.0,0.0,20.0,0.0,0.0,20.0,0.0,0.0,20.0,0.0,0.0,8.0,12.0,0.0,0.0,10.0,10.0,12.0,8.0,0.0,15.0,5.0,0.0,20.0,0.0,0.0,0.0,20.0,0.0,7.0,13.0,0.0
4,0_0012dae0-48c3-4820-a719-1793f7a7f7ee,11.0,0.0,0.0,11.0,0.0,0.0,11.0,0.0,0.0,11.0,0.0,0.0,11.0,0.0,0.0,11.0,0.0,0.0,11.0,0.0,0.0,11.0,0.0,0.0,11.0,0.0,0.0,11.0,0.0,0.0,11.0,0.0,0.0,5.0,6.0,0.0,11.0,0.0,0.0,6.0,5.0,0.0,11.0,0.0,0.0,6.0,5.0,0.0,11.0,0.0,0.0,11.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
88499,7_ffee2528-16a6-4bc9-a7f0-384fdebf9247,20.0,0.0,0.0,12.0,0.0,8.0,7.0,13.0,0.0,9.0,11.0,0.0,13.0,0.0,7.0,20.0,0.0,0.0,7.0,13.0,0.0,20.0,0.0,0.0,20.0,0.0,0.0,12.0,0.0,8.0,20.0,0.0,0.0,7.0,13.0,0.0,0.0,10.0,10.0,12.0,8.0,0.0,12.0,0.0,8.0,20.0,0.0,0.0,20.0,0.0,0.0,10.0,10.0,0.0
88500,7_fff02c3d-178f-4bea-9f6d-1069de5b3213,20.0,0.0,0.0,20.0,0.0,0.0,20.0,0.0,0.0,20.0,0.0,0.0,20.0,0.0,0.0,20.0,0.0,0.0,20.0,0.0,0.0,20.0,0.0,0.0,20.0,0.0,0.0,20.0,0.0,0.0,20.0,0.0,0.0,20.0,0.0,0.0,11.0,0.0,9.0,20.0,0.0,0.0,20.0,0.0,0.0,20.0,0.0,0.0,20.0,0.0,0.0,20.0,0.0,0.0
88501,7_fff288e4-04db-4403-afe9-1765516a6f53,13.0,0.0,0.0,13.0,0.0,0.0,13.0,0.0,0.0,13.0,0.0,0.0,13.0,0.0,0.0,13.0,0.0,0.0,13.0,0.0,0.0,13.0,0.0,0.0,13.0,0.0,0.0,11.0,0.0,2.0,13.0,0.0,0.0,13.0,0.0,0.0,13.0,0.0,0.0,13.0,0.0,0.0,13.0,0.0,0.0,6.0,7.0,0.0,13.0,0.0,0.0,13.0,0.0,0.0
88502,7_fffed5c4-67d3-4b59-b9e8-6a6c67414380,10.0,0.0,0.0,10.0,0.0,0.0,10.0,0.0,0.0,10.0,0.0,0.0,10.0,0.0,0.0,10.0,0.0,0.0,10.0,0.0,0.0,10.0,0.0,0.0,10.0,0.0,0.0,10.0,0.0,0.0,3.0,7.0,0.0,10.0,0.0,0.0,10.0,0.0,0.0,10.0,0.0,0.0,10.0,0.0,0.0,10.0,0.0,0.0,4.0,0.0,6.0,10.0,0.0,0.0


### 2) No. of state transitions for each feature

In [14]:
def count_transitions(data, feature_cols=None):
    """Count, per session, how often each audio feature changes state.

    A transition is a track whose state for a feature differs from the state
    of the track immediately before it (in row order) within the same session.

    Parameters
    ----------
    data : pandas.DataFrame
        Track-level rows indexed by session id, containing the feature columns.
    feature_cols : list of str, optional
        Columns to analyse. Defaults to the notebook-level `audio_cols`.

    Returns
    -------
    pandas.DataFrame
        One row per session (in order of first appearance in `data`) and one
        `<feature>_transitions` column per feature. A session consisting of a
        single track gets 0 for every feature.
    """
    cols = audio_cols if feature_cols is None else list(feature_cols)
    session_ids = []
    transitions_raw = []

    # One groupby pass hands over a DataFrame per session. Unlike a
    # `data.loc[sess_id]` lookup it never collapses a one-row session into a
    # Series, and it avoids one label lookup per session.
    for i, (sess_id, session) in enumerate(data.groupby(level=0, sort=False)):
        states = session[cols].values
        # Compare each track with its predecessor, then count changes per column.
        changed = states[1:] != states[:-1]
        transitions_raw.append(changed.sum(axis=0).tolist())
        session_ids.append(sess_id)

        if i%100 == 0:
            print('Processed {} sessions'.format(i))

    return pd.DataFrame(
        transitions_raw,
        columns=[col+'_transitions' for col in cols],
        index=pd.Index(session_ids, name=data.index.name)
    )

In [None]:
# State-transition counts per session for every split (index: session_id).
train_transitions, val_transitions, test_transitions = (
    count_transitions(split.set_index('session_id')) for split in (train, val, test)
)
train_transitions

### 3) No. of relevant tracks in first N tracks

In [16]:
# Sum of `relevance` over the tracks with session_position <= N, per session.
train_firstN_rel = train.loc[train['session_position'] <= N].groupby('session_id')['relevance'].sum()
val_firstN_rel = val.loc[val['session_position'] <= N].groupby('session_id')['relevance'].sum()
test_firstN_rel = test.loc[test['session_position'] <= N].groupby('session_id')['relevance'].sum()

train_firstN_rel

session_id
0_00010fc5-b79e-4cdf-bc4c-f140d0f99a3a    3
0_00064a92-1353-417a-82e2-6f5e361bebff    3
0_000cec57-79f9-4acf-87a1-9bb9063d012d    1
0_00127e4f-400b-4284-8e2c-e9accbb9c54b    2
0_0012dae0-48c3-4820-a719-1793f7a7f7ee    1
                                         ..
7_ffee2528-16a6-4bc9-a7f0-384fdebf9247    3
7_fff02c3d-178f-4bea-9f6d-1069de5b3213    1
7_fff288e4-04db-4403-afe9-1765516a6f53    2
7_fffed5c4-67d3-4b59-b9e8-6a6c67414380    2
7_ffffc201-3d16-48ea-8b8a-aa89b4d06f83    4
Name: relevance, Length: 88504, dtype: int64

### 4) No. of features that take more than one state within the session

In [17]:
# Number of audio features that take more than one distinct state in a session.
# Comparing the per-session distinct-state count with 1 gives the same result
# as the former replace(1,0).replace(2,1).replace(3,1) chain for up to three
# states, stays correct for features with four or more states (which the chain
# would have added as 4, 5, ... instead of 1), and avoids a row-wise apply.
train_stateful_feats = (train.groupby('session_id')[audio_cols].nunique() > 1).sum(axis=1)
val_stateful_feats = (val.groupby('session_id')[audio_cols].nunique() > 1).sum(axis=1)
test_stateful_feats = (test.groupby('session_id')[audio_cols].nunique() > 1).sum(axis=1)


### 5) No. of state transitions that coincide with skip/non-skip transitions in first N tracks for each feature

In [18]:
def count_flip_transitions(data, feature_cols=None):
    """Flag, for every state transition, whether `relevance` flipped with it.

    For each session and feature, every track whose state differs from the
    previous track's state is a transition. Each transition is recorded as 1
    when the `relevance` value also differs between those two tracks, else 0.

    Parameters
    ----------
    data : pandas.DataFrame
        Track-level rows indexed by session id, containing the feature columns
        and a `relevance` column.
    feature_cols : list of str, optional
        Columns to analyse. Defaults to the notebook-level `audio_cols`.

    Returns
    -------
    pandas.DataFrame
        One row per session (in order of first appearance) and one
        `<feature>_transitions` column per feature; every cell holds the list
        of 0/1 flags, one per transition (empty when there is none, which
        includes single-track sessions).
    """
    cols = audio_cols if feature_cols is None else list(feature_cols)
    session_ids = []
    transitions_raw = []

    # One groupby pass always yields a DataFrame per session, so a one-row
    # session no longer breaks the per-column handling below.
    for i, (sess_id, session) in enumerate(data.groupby(level=0, sort=False)):
        # Computed once per session rather than once per transition.
        relevance = session['relevance'].values
        rel_flipped = relevance[1:] != relevance[:-1]

        sess_rel_transitions = []
        for audio_col in cols:
            states = session[audio_col].values
            state_changed = states[1:] != states[:-1]
            # Keep the relevance-flip flag only at positions where the state changed.
            sess_rel_transitions.append(rel_flipped[state_changed].astype(int).tolist())

        transitions_raw.append(sess_rel_transitions)
        session_ids.append(sess_id)

        if i%100 == 0:
            print('Processed {} sessions'.format(i))

    return pd.DataFrame(
        transitions_raw,
        columns=[col+'_transitions' for col in cols],
        index=pd.Index(session_ids, name=data.index.name)
    )

In [None]:
def _firstN_flip_features(split):
    """Return (raw flag lists, flip counts, transition counts) for the first N tracks.

    The raw frame comes from `count_flip_transitions` on the rows with
    session_position <= N; the flip count is the sum of each flag list and the
    transition count is its length.
    """
    full = count_flip_transitions(split[split['session_position']<=N].set_index('session_id'))
    flips = full.applymap(np.sum).astype(int)
    totals = full.applymap(len).astype(int)
    flips.columns = [col+'_flip_transitions' for col in audio_cols]
    totals.columns = [col+'_firstN_transitions' for col in audio_cols]
    return full, flips, totals


train_firstN_transitions_full, train_flip_transitions, train_firstN_transitions = _firstN_flip_features(train)
val_firstN_transitions_full, val_flip_transitions, val_firstN_transitions = _firstN_flip_features(val)
test_firstN_transitions_full, test_flip_transitions, test_firstN_transitions = _firstN_flip_features(test)

train_flip_transitions

### 6) Score each feature on the first N tracks, and count the remaining tracks that are in each feature's best-scoring state

In [20]:
def calc_firstN_score(states, K=10, N=5, topN=3, max_states=5, feature_cols=None):
    """Score every feature on a session's first N tracks.

    Within the first N tracks (session_position <= N) each track scores +1
    when its `relevance` is 1 and -1 when it is 0. For every feature the
    scores are summed per state; the feature's score is the highest of those
    sums and its "best state" is the state that achieved it. The function also
    counts how many of the remaining tracks (session_position > N) are in that
    best state.

    Parameters
    ----------
    states : pandas.DataFrame
        Track-level rows indexed by session id, with `session_position`,
        `relevance` and the feature columns.
    K, topN, max_states :
        Not used by the computation; kept so existing calls stay valid.
    N : int
        Size of the leading window.
    feature_cols : list of str, optional
        Columns to score. Defaults to the notebook-level `audio_cols`.

    Returns
    -------
    pandas.DataFrame
        One row per session with `session_id`, then `<feature>_firstN_score`
        for every feature, then `<feature>_rem_best_state_count` for every
        feature.

    Raises
    ------
    ValueError
        If a session has no track with session_position <= N.
    """
    cols = audio_cols if feature_cols is None else list(feature_cols)
    metrics = []
    count=0

    # One groupby pass replaces a `states.loc[session_id]` lookup per session
    # and also copes with sessions that consist of a single row.
    for session_id, session_states in states.groupby(level=0, sort=False):
        session_firstN = session_states[session_states['session_position'] <= N]
        session_remaining = session_states[session_states['session_position'] > N]

        if session_firstN.empty:
            raise ValueError(
                'Session {} has no tracks with session_position <= {}'.format(session_id, N)
            )

        # Non-relevant tracks (0) score -1, relevant tracks (1) keep a score of 1.
        session_firstN = session_firstN.assign(score=session_firstN['relevance'].replace(0,-1))

        firstN_feat_scores = []
        rem_best_state_count = []

        for feat in cols:
            # Summed score per state, best state first.
            state_scores = session_firstN[[feat, 'score']].groupby(feat).sum()\
            .sort_values('score', ascending=False)['score']
            firstN_feat_scores.append(np.max(state_scores.values))

            best_state = state_scores.index[0]
            rem_best_state_count.append((session_remaining[feat] == best_state).sum())

        metrics.append([session_id] + list(firstN_feat_scores) + list(rem_best_state_count))

        count+=1
        if count%10 == 0:
            print('Processed {} sessions'.format(count))

    return pd.DataFrame(metrics, columns=['session_id'] + [col+'_firstN_score' for col in cols]+ [col+'_rem_best_state_count' for col in cols])

In [None]:
# First-N feature scores and remaining best-state counts for the training
# split; calc_firstN_score iterates over the sessions one by one.
train_firstN_metrics = calc_firstN_score(train.set_index('session_id'))
train_firstN_metrics

In [None]:
# Same first-N metrics for the validation and test splits.
val_firstN_metrics, test_firstN_metrics = (
    calc_firstN_score(split.set_index('session_id')) for split in (val, test)
)

In [24]:
# Concatenate all features into session vector for all sessions
def _build_session_vectors(states_sum, transitions, firstN_rel, stateful_feats, tracks,
                           flip_transitions, firstN_transitions, firstN_metrics):
    """Assemble one feature row per session, aligning every source on session_id.

    Returns `(session_vectors, int_columns)`, where `int_columns` are the
    columns present before the first-N metrics are merged in; those are the
    ones cast to int.
    """
    comb = states_sum.copy()
    comb = comb.merge(transitions.reset_index(), on='session_id')

    # Look the per-session values up by session_id instead of assigning them
    # positionally: a differing row order or session set can then no longer
    # shift values onto the wrong session. A session missing from one of the
    # sources becomes NaN and makes the int cast below fail loudly.
    session_ids = comb['session_id']
    comb['relevant_firstN'] = session_ids.map(firstN_rel)
    comb['stateful_feats'] = session_ids.map(stateful_feats)
    comb['session_length'] = session_ids.map(tracks.groupby('session_id')['session_position'].max())

    comb = comb.merge(flip_transitions.reset_index(), on='session_id')
    comb = comb.merge(firstN_transitions.reset_index(), on='session_id')
    int_columns = [col for col in comb if col != 'session_id']
    comb = comb.merge(firstN_metrics, on='session_id')
    comb[int_columns] = comb[int_columns].astype(int)
    return comb, int_columns


train_comb, feat_columns = _build_session_vectors(
    train_states_sum, train_transitions, train_firstN_rel, train_stateful_feats, train,
    train_flip_transitions, train_firstN_transitions, train_firstN_metrics)

val_comb = _build_session_vectors(
    val_states_sum, val_transitions, val_firstN_rel, val_stateful_feats, val,
    val_flip_transitions, val_firstN_transitions, val_firstN_metrics)[0]

test_comb = _build_session_vectors(
    test_states_sum, test_transitions, test_firstN_rel, test_stateful_feats, test,
    test_flip_transitions, test_firstN_transitions, test_firstN_metrics)[0]

train_comb

Unnamed: 0,session_id,acousticness_0,acousticness_1,acousticness_2,beat_strength_0,beat_strength_1,beat_strength_2,bounciness_0,bounciness_1,bounciness_2,danceability_0,danceability_1,danceability_2,dyn_range_mean_0,dyn_range_mean_1,dyn_range_mean_2,energy_0,energy_1,energy_2,flatness_0,flatness_1,flatness_2,instrumentalness_0,instrumentalness_1,instrumentalness_2,key_0,key_1,key_2,liveness_0,liveness_1,liveness_2,loudness_0,loudness_1,loudness_2,mechanism_0,mechanism_1,mechanism_2,mode_0,mode_1,mode_2,organism_0,organism_1,organism_2,speechiness_0,speechiness_1,speechiness_2,tempo_0,tempo_1,tempo_2,time_signature_0,time_signature_1,time_signature_2,valence_0,valence_1,valence_2,acousticness_transitions,beat_strength_transitions,bounciness_transitions,danceability_transitions,dyn_range_mean_transitions,energy_transitions,flatness_transitions,instrumentalness_transitions,key_transitions,liveness_transitions,loudness_transitions,mechanism_transitions,mode_transitions,organism_transitions,speechiness_transitions,tempo_transitions,time_signature_transitions,valence_transitions,relevant_firstN,stateful_feats,session_length,acousticness_flip_transitions,beat_strength_flip_transitions,bounciness_flip_transitions,danceability_flip_transitions,dyn_range_mean_flip_transitions,energy_flip_transitions,flatness_flip_transitions,instrumentalness_flip_transitions,key_flip_transitions,liveness_flip_transitions,loudness_flip_transitions,mechanism_flip_transitions,mode_flip_transitions,organism_flip_transitions,speechiness_flip_transitions,tempo_flip_transitions,time_signature_flip_transitions,valence_flip_transitions,acousticness_firstN_transitions,beat_strength_firstN_transitions,bounciness_firstN_transitions,danceability_firstN_transitions,dyn_range_mean_firstN_transitions,energy_firstN_transitions,flatness_firstN_transitions,instrumentalness_firstN_transitions,key_firstN_transitions,liveness_firstN_transitions,loudness_firstN_transitions,mechanism_firstN_transitions,mode_firstN_t
ransitions,organism_firstN_transitions,speechiness_firstN_transitions,tempo_firstN_transitions,time_signature_firstN_transitions,valence_firstN_transitions,acousticness_firstN_score,beat_strength_firstN_score,bounciness_firstN_score,danceability_firstN_score,dyn_range_mean_firstN_score,energy_firstN_score,flatness_firstN_score,instrumentalness_firstN_score,key_firstN_score,liveness_firstN_score,loudness_firstN_score,mechanism_firstN_score,mode_firstN_score,organism_firstN_score,speechiness_firstN_score,tempo_firstN_score,time_signature_firstN_score,valence_firstN_score,acousticness_rem_best_state_count,beat_strength_rem_best_state_count,bounciness_rem_best_state_count,danceability_rem_best_state_count,dyn_range_mean_rem_best_state_count,energy_rem_best_state_count,flatness_rem_best_state_count,instrumentalness_rem_best_state_count,key_rem_best_state_count,liveness_rem_best_state_count,loudness_rem_best_state_count,mechanism_rem_best_state_count,mode_rem_best_state_count,organism_rem_best_state_count,speechiness_rem_best_state_count,tempo_rem_best_state_count,time_signature_rem_best_state_count,valence_rem_best_state_count
0,0_00010fc5-b79e-4cdf-bc4c-f140d0f99a3a,20,0,0,11,0,9,8,12,0,20,0,0,9,0,11,12,8,0,9,11,0,20,0,0,10,10,0,20,0,0,20,0,0,20,0,0,12,8,0,20,0,0,16,4,0,20,0,0,20,0,0,11,9,0,0,9,6,0,7,12,13,0,11,0,0,0,10,0,7,0,0,13,3,9,20,0,0,0,0,0,3,2,0,1,0,0,0,1,0,3,0,0,2,0,0,0,0,0,3,2,0,1,0,0,0,1,0,3,0,0,2,1,1,1,1,1,2,2,1,1,1,1,1,1,1,1,1,1,2,15,6,7,15,6,10,5,15,9,15,15,15,11,15,13,15,15,9
1,0_00064a92-1353-417a-82e2-6f5e361bebff,17,0,0,9,8,0,9,8,0,9,8,0,11,0,6,6,11,0,17,0,0,17,0,0,10,7,0,17,0,0,12,5,0,17,0,0,17,0,0,17,0,0,17,0,0,17,0,0,17,0,0,17,0,0,0,8,8,8,12,12,0,0,14,0,10,0,0,0,0,0,0,0,3,7,17,0,1,1,1,1,1,0,0,2,0,1,0,0,0,0,0,0,0,0,2,2,2,2,2,0,0,4,0,2,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,12,7,7,7,5,5,12,12,7,12,4,12,12,12,12,12,12,12
2,0_000cec57-79f9-4acf-87a1-9bb9063d012d,1,9,0,10,0,0,10,0,0,10,0,0,10,0,0,5,5,0,10,0,0,10,0,0,10,0,0,5,5,0,7,0,3,10,0,0,10,0,0,1,9,0,6,4,0,10,0,0,10,0,0,10,0,0,1,0,0,0,0,3,0,0,0,5,3,0,0,1,4,0,0,0,1,6,10,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,2,0,0,0,1,0,0,0,-3,-3,-3,-3,-3,-1,-3,-3,-3,-1,-1,-3,-3,-3,-1,-3,-3,-3,4,5,5,5,5,2,5,5,5,2,2,5,5,4,1,5,5,5
3,0_00127e4f-400b-4284-8e2c-e9accbb9c54b,15,5,0,20,0,0,10,0,10,12,8,0,20,0,0,10,10,0,13,0,7,20,0,0,20,0,0,20,0,0,20,0,0,8,12,0,0,10,10,12,8,0,15,5,0,20,0,0,0,20,0,7,13,0,8,0,10,12,0,12,10,0,0,0,0,11,9,10,5,0,0,7,2,10,20,1,0,1,1,0,1,1,0,0,0,0,1,0,1,0,0,0,0,3,0,2,3,0,3,2,0,0,0,0,4,1,4,1,0,0,0,0,-1,1,1,-1,0,1,-1,-1,-1,-1,0,1,0,0,-1,-1,-1,3,15,7,9,15,8,6,15,15,15,15,10,9,10,11,15,15,8
4,0_0012dae0-48c3-4820-a719-1793f7a7f7ee,11,0,0,11,0,0,11,0,0,11,0,0,11,0,0,11,0,0,11,0,0,11,0,0,11,0,0,11,0,0,11,0,0,5,6,0,11,0,0,6,5,0,11,0,0,6,5,0,11,0,0,11,0,0,0,0,0,0,0,0,0,0,0,0,0,7,0,7,0,6,0,0,1,3,11,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,3,0,3,0,0,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-1,-3,-1,-3,-1,-3,-3,6,6,6,6,6,6,6,6,6,6,6,3,6,2,6,2,6,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
88499,7_ffee2528-16a6-4bc9-a7f0-384fdebf9247,20,0,0,12,0,8,7,13,0,9,11,0,13,0,7,20,0,0,7,13,0,20,0,0,20,0,0,12,0,8,20,0,0,7,13,0,0,10,10,12,8,0,12,0,8,20,0,0,20,0,0,10,10,0,0,12,8,8,11,0,12,0,0,12,0,8,13,9,11,0,0,10,3,11,20,0,2,2,2,2,0,1,0,0,1,0,1,1,2,2,0,0,1,0,3,2,2,3,0,3,0,0,3,0,2,2,3,3,0,0,3,1,2,3,3,2,1,1,1,1,1,1,2,2,1,1,1,1,1,15,6,10,8,5,15,10,15,15,9,15,9,6,9,9,15,15,7
88500,7_fff02c3d-178f-4bea-9f6d-1069de5b3213,20,0,0,20,0,0,20,0,0,20,0,0,20,0,0,20,0,0,20,0,0,20,0,0,20,0,0,20,0,0,20,0,0,20,0,0,11,0,9,20,0,0,20,0,0,20,0,0,20,0,0,20,0,0,0,0,0,0,0,0,0,0,0,0,0,0,12,0,0,0,0,0,1,1,20,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,0,-3,-3,-3,-3,-3,15,15,15,15,15,15,15,15,15,15,15,15,7,15,15,15,15,15
88501,7_fff288e4-04db-4403-afe9-1765516a6f53,13,0,0,13,0,0,13,0,0,13,0,0,13,0,0,13,0,0,13,0,0,13,0,0,13,0,0,11,0,2,13,0,0,13,0,0,13,0,0,13,0,0,13,0,0,6,7,0,13,0,0,13,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,7,0,0,2,2,13,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,1,0,0,-1,-1,-1,-1,-1,-1,-1,-1,-1,0,-1,-1,-1,-1,-1,1,-1,-1,8,8,8,8,8,8,8,8,8,7,8,8,8,8,8,5,8,8
88502,7_fffed5c4-67d3-4b59-b9e8-6a6c67414380,10,0,0,10,0,0,10,0,0,10,0,0,10,0,0,10,0,0,10,0,0,10,0,0,10,0,0,10,0,0,3,7,0,10,0,0,10,0,0,10,0,0,10,0,0,10,0,0,4,0,6,10,0,0,0,0,0,0,0,0,0,0,0,0,5,0,0,0,0,0,7,0,2,2,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,2,0,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,0,-1,-1,-1,-1,-1,0,-1,5,5,5,5,5,5,5,5,5,5,3,5,5,5,5,5,2,5


### 7) Find top audio attributes of nearest sessions

In [9]:
# Every column of `train` except the two session identifiers.
feat_columns = [col for col in train.columns if col not in ('session_position', 'session_id')]

# Column-wise sums per session, as float32.
train_gp, val_gp, test_gp = (
    split.groupby('session_id').sum().astype('float32') for split in (train, val, test)
)

# C-contiguous matrices of those sums. Note that these assignments rebind
# train_raw / val_raw / test_raw, which held the raw DataFrames further up.
train_raw, val_raw, test_raw = (
    np.ascontiguousarray(grouped.values) for grouped in (train_gp, val_gp, test_gp)
)

# L2-normalise the rows in place.
normalize_L2(train_raw)
normalize_L2(val_raw)
normalize_L2(test_raw)

In [10]:
# Inner-product index over the training session vectors.
N_NEIGHBOURS = 10
index = faiss.IndexFlatIP(len(train_gp.columns))
print(index.is_trained)
index.add(train_raw)
print(index.ntotal)

# Every split is searched for one hit more than needed.
train_dists, train_idxes = index.search(train_raw, N_NEIGHBOURS + 1)
val_dists, val_idxes = index.search(val_raw, N_NEIGHBOURS + 1)
test_dists, test_idxes = index.search(test_raw, N_NEIGHBOURS + 1)

# Training queries are themselves stored in the index, so their first hit is
# dropped (normally the query's own entry).
train_idxes = train_idxes[:,1:]
# Validation / test queries are not in the index: their first hit is a real
# training neighbour, so keep it and drop the surplus last hit instead.
val_idxes = val_idxes[:,:N_NEIGHBOURS]
test_idxes = test_idxes[:,:N_NEIGHBOURS]

True
214452


In [11]:
# `top1` values of `train`, looked up by row position.
# NOTE(review): the neighbour indices are row positions in the faiss index built
# from `train_gp`; this presumes `train` holds one row per session in that same
# order - confirm.
top1_feats = train['top1'].values


def _neighbour_top1(neighbour_idxes):
    """Replace every neighbour index by the `top1` value stored at that position."""
    return np.array([[top1_feats[idx] for idx in row] for row in neighbour_idxes])


train_topN_feats = _neighbour_top1(train_idxes)
val_topN_feats = _neighbour_top1(val_idxes)
test_topN_feats = _neighbour_top1(test_idxes)
train_topN_feats.shape, val_topN_feats.shape, test_topN_feats.shape

((214452, 10), (53805, 10), (52940, 10))

In [12]:
def audio_col_count(x):
    """Tally how often each audio feature name occurs in `x`.

    `x` is a Series of feature names. The result is a float Series with one
    entry per name in `audio_cols`, in the same order as that list.
    """
    tally = np.zeros(len(audio_cols))
    for name in x.values:
        tally[audio_cols.index(name)] += 1
    return pd.Series(tally)

In [13]:
def _nearest_session_counts(neighbour_feats, session_ids):
    """Count, per row, how often each audio feature appears among its neighbours' top1 values."""
    counts = pd.DataFrame(neighbour_feats).apply(audio_col_count,axis=1).astype(int)
    counts.columns = [col+'_nearest_sess' for col in audio_cols]
    counts.index = session_ids
    return counts


train_col_count = _nearest_session_counts(train_topN_feats, train['session_id'])
val_col_count = _nearest_session_counts(val_topN_feats, val['session_id'])
test_col_count = _nearest_session_counts(test_topN_feats, test['session_id'])

train_col_count

Unnamed: 0_level_0,acousticness_nearest_sess,beat_strength_nearest_sess,bounciness_nearest_sess,danceability_nearest_sess,dyn_range_mean_nearest_sess,energy_nearest_sess,flatness_nearest_sess,instrumentalness_nearest_sess,key_nearest_sess,liveness_nearest_sess,loudness_nearest_sess,mechanism_nearest_sess,mode_nearest_sess,organism_nearest_sess,speechiness_nearest_sess,tempo_nearest_sess,time_signature_nearest_sess,valence_nearest_sess
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
22_0000f2fc-143f-41ab-a46e-d24f8de9af52,0,0,1,0,0,0,0,0,0,2,1,0,2,2,0,2,0,0
22_00029b9f-b730-4975-acab-4820cb2d5718,0,2,0,0,1,0,1,0,1,0,1,1,1,2,0,0,0,0
22_00099b7f-e596-40ef-a571-723e3773468a,1,1,0,2,1,0,1,0,1,1,0,0,0,0,0,2,0,0
22_0015cbbb-46c3-42f8-a61d-63fc4737efce,0,0,1,0,1,0,0,0,0,1,1,1,3,0,1,0,1,0
22_001771bd-b921-4539-b066-3b88bd04e3b4,1,0,0,1,0,1,1,0,0,3,2,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56_ffc6f689-349a-4183-a73d-aba116f31147,0,0,1,0,1,1,0,0,1,0,0,1,2,0,1,0,2,0
56_ffcb13d3-9914-4a24-97c0-bdf4d7b3d1ac,0,1,1,0,2,0,0,0,1,1,0,0,2,0,1,1,0,0
56_ffdca406-7575-486d-8157-ac384234ca1b,1,1,0,0,1,0,1,0,1,2,1,1,0,0,1,0,0,0
56_fff6830f-e50a-4566-9301-51c298119917,0,0,0,0,0,0,0,0,0,0,0,3,3,1,1,0,0,2


In [14]:
# Join the nearest-session counts onto the session vectors by session_id.
train_comb = pd.merge(train_comb, train_col_count.reset_index(), on='session_id')
val_comb = pd.merge(val_comb, val_col_count.reset_index(), on='session_id')
test_comb = pd.merge(test_comb, test_col_count.reset_index(), on='session_id')

## Prediction Model

In [25]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MultiLabelBinarizer
import lightgbm as lgbm
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import f1_score, accuracy_score, jaccard_score

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [34]:
## Grid search on n_estimators
# For every candidate number of trees, fit a LightGBM classifier on `train_tmp`, take the
# three most probable classes per session, and record soft accuracy / soft weighted F1 at
# top-1, top-2 and top-3 for both the training and the validation frame.
metrics = []
top3_cols = ['top{}'.format(j) for j in range(1,4)]
feat_columns = [col for col in train_tmp if col != 'session_id' and col != 'top1']


def _soft_accuracy(truth, pred_labels):
    """Return the fraction of rows whose true label is among that row's predicted labels."""
    return np.sum([1 if tru in pred else 0 for tru, pred in zip(truth, pred_labels)]) / len(truth)


def _soft_labels(truth, pred_labels):
    """Return, per row, the true label if it was predicted, else the top-ranked prediction."""
    return [tru if tru in pred else pred[0] for tru, pred in zip(truth, pred_labels)]


for N_est in [20,50,100,150,200,250]:
    model = lgbm.LGBMClassifier(
        n_estimators=N_est,
        n_jobs=10,
        random_state=seed,
    )

    model.fit(train_tmp[feat_columns], train_tmp['top1'].values.ravel())

    # predict_proba columns follow model.classes_, so labels are looked up there rather
    # than in audio_cols (which only matches when every feature occurs as a training label
    # and the list happens to be in the same sorted order).
    class_labels = np.asarray(model.classes_)

    # No np.squeeze: the (n_rows, 3) shape must survive even for a single-row frame.
    train_proba = model.predict_proba(train_tmp[feat_columns])
    train_pred_idx = np.argsort(train_proba)[:,::-1][:,:3]
    train_preds = pd.DataFrame(class_labels[train_pred_idx], columns=top3_cols)
    val_proba = model.predict_proba(val[feat_columns])
    val_pred_idx = np.argsort(val_proba)[:,::-1][:,:3]
    val_preds = pd.DataFrame(class_labels[val_pred_idx], columns=top3_cols)

    # Validation truth is read from `val` throughout -- the same frame the probabilities
    # were computed on -- so predictions and labels stay row-aligned.
    train_true = train_tmp['top1'].values
    val_true = val['top1'].values

    acc_row, f1_row = [], []
    for k in (1, 2, 3):
        train_topk = train_preds[top3_cols[:k]].values
        val_topk = val_preds[top3_cols[:k]].values

        acc_row += [_soft_accuracy(train_true, train_topk), _soft_accuracy(val_true, val_topk)]
        f1_row += [
            f1_score(train_true, _soft_labels(train_true, train_topk), average='weighted'),
            f1_score(val_true, _soft_labels(val_true, val_topk), average='weighted'),
        ]

    # Row layout: N_est, then (train, val) soft accuracy for k=1..3, then (train, val) soft F1 for k=1..3.
    metrics.append([N_est] + acc_row + f1_row)

pd.DataFrame(metrics, columns=['N_est','Train Soft Acc Top1','Val Soft Acc Top1','Train Soft Acc Top2','Val Soft Acc Top2','Train Soft Acc Top3','Val Soft Acc Top3',' Train F1 Top 1', 'Val F1 Top 1',' Train F1 Top 2', 'Val F1 Top 2',' Train F1 Top 3', 'Val F1 Top 3'])

Unnamed: 0,N_est,Train Soft Acc Top1,Val Soft Acc Top1,Train Soft Acc Top2,Val Soft Acc Top2,Train Soft Acc Top3,Val Soft Acc Top3,Train F1 Top 1,Val F1 Top 1,Train F1 Top 2,Val F1 Top 2,Train F1 Top 3,Val F1 Top 3
0,20,0.435948,0.408791,0.661164,0.63864,0.793586,0.773608,0.426981,0.398172,0.655418,0.63167,0.789737,0.768868
1,50,0.474181,0.428064,0.696846,0.656166,0.82168,0.789481,0.470211,0.423134,0.694693,0.653102,0.820251,0.787395
2,100,0.512926,0.432599,0.733348,0.661556,0.852181,0.793662,0.510289,0.428972,0.731957,0.659059,0.851248,0.791802
3,150,0.545828,0.432227,0.763672,0.661388,0.875497,0.792919,0.543764,0.429096,0.762665,0.659088,0.874852,0.791188
4,200,0.576684,0.431912,0.790923,0.660849,0.895678,0.79199,0.574975,0.428941,0.790143,0.658809,0.895212,0.790287
5,250,0.605329,0.430908,0.814047,0.659809,0.911519,0.792027,0.603929,0.428005,0.813455,0.65776,0.91117,0.790466


In [82]:
# Share of each label among the true `top1` values of `train`, next to the share of each
# label among the top-ranked predictions currently held in `train_preds`.
# Both columns are aligned on the label index of the 'true' column.
train_dist = (
    train['top1'].value_counts().div(len(train)).to_frame(name='true')
    .assign(preds=train_preds['top1'].value_counts().div(len(train_preds)))
)
train_dist

Unnamed: 0,true,preds
mode,0.086157,0.062436
acousticness,0.086157,0.10101
speechiness,0.083481,0.078982
liveness,0.067905,0.067449
tempo,0.066467,0.062973
beat_strength,0.063815,0.082114
key,0.061863,0.07321
flatness,0.057271,0.061676
energy,0.056605,0.060905
valence,0.054279,0.050879


In [83]:
# Share of each label among the true `top1` values of `val`, next to the share of each
# label among the top-ranked predictions currently held in `val_preds`.
# Both columns are aligned on the label index of the 'true' column.
val_dist = (
    val['top1'].value_counts().div(len(val)).to_frame(name='true')
    .assign(preds=val_preds['top1'].value_counts().div(len(val_preds)))
)
val_dist

Unnamed: 0,true,preds
mode,0.115843,0.068241
acousticness,0.085087,0.114895
speechiness,0.081022,0.076145
liveness,0.066299,0.06657
tempo,0.064673,0.06237
beat_strength,0.060699,0.091591
key,0.059525,0.073616
valence,0.055144,0.048866
flatness,0.054241,0.060112
dyn_range_mean,0.053654,0.047241


## Make test predictions

In [26]:
# Train with N_est that gives best F1 score and predict test set
# model = RandomForestClassifier(random_state=seed, n_jobs=10, n_estimators=500, class_weight='balanced')
model = lgbm.LGBMClassifier(
    n_estimators=100,
    n_jobs=10,
    random_state=seed,
)
# model = MultiOutputClassifier(model)
model.fit(train[feat_columns], train['top1'].values.ravel())

# predict_proba columns follow model.classes_, so the label lookup uses that array instead
# of audio_cols; the two only coincide when every feature occurs as a training label and
# audio_cols is in the same sorted order.
class_labels = np.asarray(model.classes_)

test_proba = model.predict_proba(test[feat_columns])
# Column indices of the three highest probabilities per row, most probable first.
# No np.squeeze: the (n_rows, 3) shape must survive even for a single-row test frame.
test_pred_idx = np.argsort(test_proba)[:,::-1][:,:3]
test_preds = pd.DataFrame(class_labels[test_pred_idx], columns=top3_cols)
# Rows of test_preds are in the same order as `test`, so ids are attached positionally.
test_preds['session_id'] = list(test['session_id'])
test_preds = test_preds[['session_id'] + top3_cols]
test_preds

Unnamed: 0,session_id,top1,top2,top3
0,36_00042572-ee58-48eb-a6fc-675bda0bbb98,acousticness,beat_strength,mode
1,36_0007583d-ca87-4edc-8282-987bd32c95be,tempo,mechanism,acousticness
2,36_0009b5af-9798-4567-a7e7-03a8c701655f,acousticness,mode,speechiness
3,36_0009bf71-73ae-41bf-bc2a-84acfbb7cab9,speechiness,mode,acousticness
4,36_000c1684-8086-4151-b660-be5100368dc2,acousticness,speechiness,organism
...,...,...,...,...
52935,58_ffdd827f-1e2e-4351-8c3e-f2e444232abc,energy,mode,acousticness
52936,58_ffe22e73-179a-49a9-8ea8-c7568021fe91,loudness,acousticness,key
52937,58_fff1b8f2-9398-4bbd-8f7d-13e838fe9570,dyn_range_mean,bounciness,speechiness
52938,58_fff6508f-57f9-4f5a-848a-cfe768b01933,valence,danceability,energy


In [56]:
# Write the test-set top-3 predictions to a CSV under `path`, without the index column
test_preds_file = path + 'top{}-features-test.csv'.format(3)
test_preds.to_csv(test_preds_file, index=False)

In [29]:
# Soft accuracy for test set: a row counts as a hit when its true `top1` label appears
# among the first k predicted labels (k = 1, 2, 3)
test_soft_acc1, test_soft_acc2, test_soft_acc3 = [
    np.sum([int(tru in pred) for tru, pred in zip(test['top1'].values, test_preds[cols].values)]) / len(test)
    for cols in (['top1'], ['top1','top2'], ['top1','top2','top3'])
]
test_soft_acc1, test_soft_acc2, test_soft_acc3

(0.42553834529656215, 0.6512655836796373, 0.7843407631280696)

In [39]:
# Soft F1 for test set: for each k, a row is scored with its true label when that label is
# among the first k predictions, otherwise with the top-ranked prediction
test_soft_1, test_soft_2, test_soft_3 = [
    [pred[0] if tru not in pred else tru for tru, pred in zip(test['top1'].values, test_preds[cols].values)]
    for cols in (['top1'], ['top1','top2'], ['top1','top2','top3'])
]
test_soft_f1_1, test_soft_f1_2, test_soft_f1_3 = [
    f1_score(test['top1'].values, soft_labels, average='weighted')
    for soft_labels in (test_soft_1, test_soft_2, test_soft_3)
]
test_soft_f1_1, test_soft_f1_2, test_soft_f1_3

(0.42225252174929295, 0.6490705270307066, 0.7828327278184543)

In [30]:
# Weighted F1 of the top-ranked prediction against the true `top1` label on the test set
test_f1 = f1_score(
    y_true=test['top1'].values,
    y_pred=test_preds['top1'].values,
    average='weighted',
)
test_f1

0.42225252174929295

In [27]:
# train_firstN_metrics.to_csv(path + 'train_firstN_metrics{}{}.csv'.format(dataset_suffix, type_suffix),index=False)
# val_firstN_metrics.to_csv(path + 'val_firstN_metrics{}{}.csv'.format(dataset_suffix, type_suffix),index=False)
# test_firstN_metrics.to_csv(path + 'test_firstN_metrics{}{}.csv'.format(dataset_suffix, type_suffix),index=False)

In [29]:
# train_N.to_csv(path + 'train_N{}{}.csv'.format(dataset_suffix, type_suffix),index=False)
# val_N.to_csv(path + 'val_N{}{}.csv'.format(dataset_suffix, type_suffix),index=False)
# test_comb.to_csv(path + 'test_N{}{}.csv'.format(dataset_suffix, type_suffix),index=False)