In [3]:
import pandas as pd
import numpy as np
import math
import pickle
import torch
import random
import torch.nn as nn
import torch.optim as optim
import time
from torch.utils.data import Dataset, DataLoader
from models.BertSeqTransformer import StandardTransformer
from datasets.SpotifyDataset import SpotifyDataset, bert_collate_fn, custom_collate_fn
torch.manual_seed(1)
EPOCHS = 1

N=100000
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("You are using device: %s" % device)

You are using device: cpu


In [30]:
def accuracy(output, target):
        
    seq_len = target.shape[1]
    correct = output.eq(target)
    correct = correct.sum(axis=1) * 1.0
    acc = correct / seq_len
    return acc

def accuracy_at_k(output, target):
    
    output = output.to(device)
    target = target.to(device)
    
    T = output.shape[1]
    batch_size = target.shape[0]
    acc = torch.zeros(T)

    for i in range(T):        
        acc[i] = torch.mean(accuracy(output[:,i].reshape(batch_size,1), target[:,i].reshape(batch_size,1)))
        
    return acc

In [2]:
print("READING THE DATA")
with open("data/all_session_tracks_train.pkl", 'rb') as f:
    train_tracks = pickle.load(f)
    train_tracks = train_tracks[0:N]

with open("data/all_session_skips_train.pkl", 'rb') as f:
    train_skips = pickle.load(f)
    train_skips = train_skips[0:N]

with open("data/all_session_tracks_test.pkl", 'rb') as f:
    test_tracks = pickle.load(f)
    test_tracks = test_tracks[0:N]

with open("data/all_session_skips_test.pkl", 'rb') as f:
    test_skips = pickle.load(f)
    test_skips = test_skips[0:N]

with open("data/track_vocabs.pkl", 'rb') as f:
    track_vocab = pickle.load(f)

READING THE DATA


In [45]:
bert_aug_seq_preds = np.load('output/transformer_bert_aug_seq_preds_v2.npy').astype(int)
bert_aug_seq_labels = np.load('output/transformer_bert_aug_seq_labels.npy').astype(int)

seq_preds = np.load('output/transformer_seq_preds_v2.npy')
seq_labels = np.load('output/transformer_seq_labels.npy').astype(int)

skip_preds = np.load('output/transformer_skip_preds_v2.npy')
skip_labels = np.load('output/transformer_skip_labels.npy')
skip_preds = np.argmax(skip_preds, axis=2)

In [46]:
seq_preds

array([[ 12168,  81694,  28681, ...,  27147,  65175,  17010],
       [ 14657,   9756,  99171, ...,  64317,  23134,  42242],
       [ 39370,   2339,  64515, ...,  10186,  79031,  52474],
       ...,
       [ 75651,  71519,  36036, ...,  98671,  99672,  22421],
       [ 68105,  76688,  81105, ...,  89613,  60948,  11095],
       [  3744,  80300,  91501, ...,   9235, 101661,   9235]])

In [47]:
bert_aug_seq_preds

array([[12562, 52180, 62182, ..., 65033, 75651, 12168],
       [14657, 38284, 83850, ..., 83850, 38284, 83850],
       [39370, 43012, 28185, ..., 24596, 28185, 43012],
       ...,
       [75651, 71519, 36036, ..., 98671, 99672, 22421],
       [99652, 99652, 99652, ...,   230, 20159, 20159],
       [27147, 24049, 24049, ..., 24049, 24049, 24049]])

In [18]:
skip_preds

array([[1, 1, 1, ..., 1, 1, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 0, 1, ..., 0, 0, 0],
       ...,
       [0, 0, 1, ..., 0, 1, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [1, 1, 0, ..., 0, 0, 0]])

In [35]:
print("BERT AUG LOCATION ACCURACY")
accuracy_at_k(torch.Tensor(bert_aug_seq_preds), torch.Tensor(bert_aug_seq_labels))

BERT AUG LOCATION ACCURACY


tensor([0.2457, 0.2168, 0.1947, 0.1788, 0.1647, 0.1522, 0.1404, 0.1301, 0.1183,
        0.1091])

In [37]:
print("SEQ TRANSFORMER LOCATION ACCURACY")
accuracy_at_k(torch.Tensor(seq_preds), torch.Tensor(seq_labels))

SEQ TRANSFORMER LOCATION ACCURACY


tensor([0.2774, 0.2451, 0.2172, 0.2007, 0.1826, 0.1682, 0.1556, 0.1431, 0.1310,
        0.1222])

In [39]:
print("SKIP TRANSFORMER LOCATION ACCURACY")
accuracy_at_k(torch.Tensor(skip_preds), torch.Tensor(skip_labels))

SKIP TRANSFORMER LOCATION ACCURACY


tensor([0.5289, 0.5296, 0.5367, 0.5367, 0.5453, 0.5352, 0.5449, 0.5336, 0.5275,
        0.5075])

In [67]:
#number of songs correct in the whole sequence
x = np.sum(bert_aug_seq_preds==seq_labels, axis=1)
unique, counts = np.unique(x, return_counts=True)
counts/100000*100

array([67.777,  9.52 ,  3.637,  2.123,  1.586,  1.383,  1.171,  1.092,
        1.066,  1.005,  9.64 ])

In [66]:
#number of songs correct in the whole sequence
x = np.sum(seq_preds==seq_labels, axis=1)
unique, counts = np.unique(x, return_counts=True)
counts/100000*100

array([66.502,  8.448,  3.407,  2.236,  1.823,  1.611,  1.362,  1.275,
        1.193,  1.086, 11.057])

In [117]:
seq_preds_sort = np.sort(seq_preds,axis=1)
unique_per_session = (seq_preds_sort[:,1:] != seq_preds_sort[:,:-1]).sum(axis=1)+1
correct_per_session = np.sum(seq_preds==seq_labels, axis=1)
df = pd.DataFrame({'unique_per_session':unique_per_session, 'correct_per_session':correct_per_session})
df.groupby('unique_per_session').sum('correct_per_session')/df.groupby('unique_per_session').count()

Unnamed: 0_level_0,correct_per_session
unique_per_session,Unnamed: 1_level_1
1,0.478357
2,0.299649
3,0.324548
4,0.492754
5,0.494466
6,0.582177
7,0.706208
8,0.91757
9,0.773691
10,2.683918


In [116]:
seq_preds_sort = np.sort(bert_aug_seq_preds,axis=1)
unique_per_session = (seq_preds_sort[:,1:] != seq_preds_sort[:,:-1]).sum(axis=1)+1
correct_per_session = np.sum(seq_preds==seq_labels, axis=1)
df = pd.DataFrame({'unique_per_session':unique_per_session, 'correct_per_session':correct_per_session})
df.groupby('unique_per_session').sum('correct_per_session')/df.groupby('unique_per_session').count()

Unnamed: 0_level_0,correct_per_session
unique_per_session,Unnamed: 1_level_1
1,0.382192
2,0.468513
3,0.441935
4,0.562171
5,0.600872
6,0.730182
7,0.921245
8,1.437368
9,1.787886
10,6.176684


In [121]:
#number of skips correct in the whole sequence
x = np.sum(skip_preds==skip_labels, axis=1)
unique, counts = np.unique(x, return_counts=True)
counts/100000*100

array([ 3.201,  5.576,  7.796,  9.618, 11.939, 12.827, 13.124, 12.477,
       10.906,  7.995,  4.541])

In [132]:
number_of_skips = np.sum(skip_preds, axis=1)
correct_per_session = np.sum(skip_preds==skip_labels, axis=1)
df = pd.DataFrame({'number_of_skips':unique_per_session, 'accuracy_per_session':correct_per_session})
df = df.groupby('number_of_skips').sum('correct_per_session')/df.groupby('number_of_skips').count()
df = df/10
df

Unnamed: 0_level_0,accuracy_per_session
number_of_skips,Unnamed: 1_level_1
1,0.565702
2,0.553062
3,0.549985
4,0.547071
5,0.53256
6,0.545497
7,0.534895
8,0.534553
9,0.53157
10,0.525982
