### Load Libraries

In [11]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
import scipy.sparse as sp

In [3]:
from utilities import utils

In [4]:
# Load the interaction matrix from SciPy's .npz sparse format.
# Its rows are mapped to `pid` and its columns to `tid` further down in this notebook.
sparse_matrix = sp.load_npz('./data/subset_mpdNDB_sparse_matrix.npz')

In [5]:
# Display the matrix summary: shape, dtype, stored-element count and storage format.
sparse_matrix

<171069x170117 sparse matrix of type '<class 'numpy.int8'>'
	with 18378358 stored elements in Compressed Sparse Row format>

In [7]:
# U = number of rows, I = number of columns, R = number of explicitly stored entries.
U, I = sparse_matrix.shape
R = sparse_matrix.nnz

In [8]:
# Sparsity is the fraction of the U x I possible cells that hold no stored entry,
# i.e. one minus the density R / (I * U).
density = R / (I * U)
test_Sparsity = 1 - density
print(f'Sparsity level of Rating Matrix, {test_Sparsity:1.6f}')

Sparsity level of Rating Matrix, 0.999368


### Convert Sparse Matrix (CSR) to DataFrame

In [9]:
# Build a long-format frame with one row per stored matrix entry:
#   pid  <- row index, tid <- column index, rate <- stored value.
# The COO conversion is done once and reused; calling .tocoo() separately for
# row, col and data would repeat the same conversion of the whole matrix three times.
coo = sparse_matrix.tocoo()
data = pd.DataFrame({'pid': coo.row, 'tid': coo.col, 'rate': coo.data})

In [10]:
# Preview the long-format frame (columns pid, tid, rate).
data

Unnamed: 0,pid,tid,rate
0,0,0,1
1,0,1,1
2,0,2,1
3,0,3,1
4,0,4,1
...,...,...,...
18378353,171068,145822,1
18378354,171068,148898,1
18378355,171068,153234,1
18378356,171068,154997,1


### Train/Test Split, Stratified by Playlist ID (pid)

In [12]:
# 70/30 split of the interaction rows. Stratifying on pid keeps each playlist's
# rows divided between the two sets in roughly that proportion, and the fixed
# random_state makes the split repeatable.
train_data, test_data = train_test_split(
    data,
    test_size=0.3,
    random_state=42,
    stratify=data['pid'],
)

In [10]:
# Put the training rows back into ascending index order.
# A sort_values(by=['pid']) placed before sort_index() has no effect on the result:
# sort_index() reorders every row by its (unique) index label regardless of the
# order it receives, so the extra sort only costs a full pass over the frame.
train_data = train_data.sort_index()

In [11]:
# Discard the existing index (drop=True) and renumber the rows 0..n-1.
train_data = train_data.reset_index(drop=True)

In [12]:
# (rows, columns) of the training frame.
train_data.shape

(12864850, 3)

In [13]:
# Number of distinct pid values in the training split.
# If this equals the matrix's row count, every row of the matrix is represented in training.
train_data['pid'].nunique()

171069

In [14]:
# Number of distinct tid values in the training split.
# A value below the matrix's column count means some tid columns have no training rows.
train_data['tid'].nunique()

170068

In [15]:
# Show the last five rows of the training frame.
train_data.tail()

Unnamed: 0,pid,tid,rate
12864845,171068,125417,1
12864846,171068,129942,1
12864847,171068,153234,1
12864848,171068,154997,1
12864849,171068,160312,1


In [16]:
# Report the training frame's memory footprint via the project helper.
# NOTE(review): the unit of the returned number is not visible here (presumably MB); confirm in utilities.utils.
utils.memory_usage(train_data)

110.42

### Save Train/Test Data in DataFrame and Sparse Matrix Formats

In [17]:
# Write train_data to a gzip-compressed pickle.
# The number printed is len(train_data), i.e. the row count of the frame
# (one row per pid/tid pair), not a count of distinct playlists.
# NOTE(review): compression is given explicitly; the '.compress' suffix is presumably
# not one pandas can infer a codec from, so pass compression='gzip' when reading back too.
print(f'Export DF {len(train_data)} train_data_subset_mpdNDB playlist to Pickle')
train_data.to_pickle('./data/train_data_subset_mpdNDB.compress', compression='gzip')

Export DF 12864850 train_data_subset_mpdNDB playlist to Pickle


In [18]:
# Dimensions of the source matrix, reused as the shape of the train and test matrices.
row_size, col_size = sparse_matrix.shape

In [19]:
# Pull out the three columns used to build the sparse matrix:
# stored values (rate), row ids (pid) and column ids (tid).
tr_data, tr_rows, tr_cols = (train_data[name] for name in ('rate', 'pid', 'tid'))

In [21]:
# Build the training matrix from (value, (row, col)) triplets.
# The shape is passed explicitly so the result keeps the source matrix's dimensions
# even when the highest pid or tid does not occur in this split.
train_subset_mpdNDB_sparse_matrix = sp.csr_matrix(
    (tr_data, (tr_rows, tr_cols)),
    shape=(row_size, col_size),
)

In [26]:
# Display the training matrix summary (shape, dtype, stored-element count, format).
train_subset_mpdNDB_sparse_matrix

<171069x170117 sparse matrix of type '<class 'numpy.int8'>'
	with 12864850 stored elements in Compressed Sparse Row format>

In [24]:
# Write the training matrix to disk in SciPy's .npz sparse format.
sp.save_npz('./data/train_subset_mpdNDB_sparse_matrix.npz', train_subset_mpdNDB_sparse_matrix)

In [27]:
# Put the test rows back into ascending index order.
# A sort_values(by=['pid']) placed before sort_index() has no effect on the result:
# sort_index() reorders every row by its (unique) index label regardless of the
# order it receives, so the extra sort only costs a full pass over the frame.
test_data = test_data.sort_index()

In [28]:
# Discard the existing index (drop=True) and renumber the rows 0..n-1.
test_data = test_data.reset_index(drop=True)

In [29]:
# (rows, columns) of the test frame.
test_data.shape

(5513508, 3)

In [30]:
# Number of distinct pid values in the test split.
# If this equals the matrix's row count, every row of the matrix is represented in the test set.
test_data['pid'].nunique()

171069

In [31]:
# Number of distinct tid values in the test split.
# A value below the matrix's column count means some tid columns have no test rows.
test_data['tid'].nunique()

167581

In [32]:
# Show the last five rows of the test frame.
test_data.tail()

Unnamed: 0,pid,tid,rate
5513503,171068,108885,1
5513504,171068,115068,1
5513505,171068,119735,1
5513506,171068,145822,1
5513507,171068,148898,1


In [33]:
# Report the test frame's memory footprint via the project helper.
# NOTE(review): the unit of the returned number is not visible here (presumably MB); confirm in utilities.utils.
utils.memory_usage(test_data)

47.32

In [34]:
# Write test_data to a gzip-compressed pickle.
# The number printed is len(test_data), i.e. the row count of the frame
# (one row per pid/tid pair), not a count of distinct tracks.
# NOTE(review): compression is given explicitly; the '.compress' suffix is presumably
# not one pandas can infer a codec from, so pass compression='gzip' when reading back too.
print(f'Export DF {len(test_data)} test_data_subset_mpdNDB tracks to Pickle')
test_data.to_pickle('./data/test_data_subset_mpdNDB.compress', compression='gzip')

Export DF 5513508 test_data_subset_mpdNDB tracks to Pickle


In [35]:
# Pull out the three columns used to build the sparse matrix:
# stored values (rate), row ids (pid) and column ids (tid).
te_data, te_rows, te_cols = (test_data[name] for name in ('rate', 'pid', 'tid'))

In [36]:
# Build the test matrix from (value, (row, col)) triplets.
# The shape is passed explicitly so the result keeps the source matrix's dimensions
# even when the highest pid or tid does not occur in this split.
test_subset_mpdNDB_sparse_matrix = sp.csr_matrix(
    (te_data, (te_rows, te_cols)),
    shape=(row_size, col_size),
)

In [37]:
# Display the test matrix summary (shape, dtype, stored-element count, format).
test_subset_mpdNDB_sparse_matrix

<171069x170117 sparse matrix of type '<class 'numpy.int8'>'
	with 5513508 stored elements in Compressed Sparse Row format>

In [38]:
# Write the test matrix to disk in SciPy's .npz sparse format.
sp.save_npz('./data/test_subset_mpdNDB_sparse_matrix.npz', test_subset_mpdNDB_sparse_matrix)