In [1]:
%%time

import numpy as np
from scipy import sparse
import pandas as pd
from sklearn.model_selection import train_test_split
import tarfile as tar
import gzip as gz
import shutil
import glob
import os

CPU times: user 1.24 s, sys: 1.01 s, total: 2.25 s
Wall time: 1.16 s


In [2]:
%%time

with tar.open("Webscope_R2-1.tgz", "r") as tar_ref:
    tar_ref.extractall()
with tar.open("Webscope_R2-2.tgz", "r") as tar_ref:
    tar_ref.extractall()

CPU times: user 1min 50s, sys: 15.8 s, total: 2min 6s
Wall time: 3min 5s


In [3]:
%%time

with open('train_tmp.txt','wb') as wfd:
    for f in glob.glob("ydata-ymusic-user-song-ratings-meta-v1_0/train*.txt"):
        with open(f,'rb') as fd:
            shutil.copyfileobj(fd, wfd)

CPU times: user 4.35 s, sys: 25.1 s, total: 29.5 s
Wall time: 1min 32s


In [4]:
%%time

with open('test_tmp.txt','wb') as wfd:
    for f in glob.glob("ydata-ymusic-user-song-ratings-meta-v1_0/test*.txt"):
        with open(f,'rb') as fd:
            shutil.copyfileobj(fd, wfd)

CPU times: user 145 ms, sys: 899 ms, total: 1.04 s
Wall time: 30.5 s


In [5]:
%%time

shutil.rmtree("ydata-ymusic-user-song-ratings-meta-v1_0/")

CPU times: user 0 ns, sys: 2.76 s, total: 2.76 s
Wall time: 2.76 s


In [6]:
m = 1_823_179
n = 136_736

nnz_train = 699_640_226
nnz_test = 18_231_790

train_data_file = "train_tmp.txt"
test_data_file = "test_tmp.txt"

In [7]:
print("preparing training data")
%time test_data = pd.read_csv(test_data_file, delimiter = "\t",header=None)
test_user = test_data[0].values
test_item = test_data[1].values
test_rating = test_data[2].values

print("preparing test data")
%time train_data = pd.read_csv(train_data_file, delimiter = "\t",header=None)
train_user = train_data[0].values
train_item = train_data[1].values
train_rating = train_data[2].values

preparing training data
CPU times: user 7.58 s, sys: 1.1 s, total: 8.68 s
Wall time: 8.75 s
preparing test data
CPU times: user 4min 42s, sys: 42 s, total: 5min 24s
Wall time: 5min 29s


In [8]:
%%time

os.remove("test_tmp.txt")
os.remove("train_tmp.txt")

CPU times: user 0 ns, sys: 3.2 s, total: 3.2 s
Wall time: 3.22 s


In [9]:
%%time

print(test_user)
print(test_item)
print(test_rating)
print("")
print(np.max(test_user))
print(np.max(test_item))
print(np.max(test_rating))
print("")
print(np.min(test_user))
print(np.min(test_item))
print(np.min(test_rating))
print("")
print(np.unique(test_user).size)
print(np.unique(test_item).size)
print(np.unique(test_rating).size)
print("")
print(test_user.size)

assert test_user.size == nnz_test

[      0       0       0 ... 1823178 1823178 1823178]
[  7171   8637  21966 ...  81597 116524 118873]
[5 4 4 ... 1 3 2]

1823178
136735
5

0
0
1

1823179
136735
5

18231790
CPU times: user 3.61 s, sys: 111 ms, total: 3.72 s
Wall time: 3.73 s


In [10]:
%%time

print(train_user)
print(train_item)
print(train_rating)
print("")
print(np.max(train_user))
print(np.max(train_item))
print(np.max(train_rating))
print("")
print(np.min(train_user))
print(np.min(train_item))
print(np.min(train_rating))
print("")
print(np.unique(train_user).size)
print(np.unique(train_item).size)
print(np.unique(train_rating).size)
print("")
print(train_user.size)

assert train_user.size == nnz_train
assert np.max(train_user)+1 == m  # ids start with 0
assert np.max(train_item)+1 == n  # ids start with 0

[      0       0       0 ... 1823178 1823178 1823178]
[   166   2245   3637 ... 125420 128016 135359]
[5 4 4 ... 1 5 5]

1823178
136735
5

0
0
1

1823179
136736
5

699640226
CPU times: user 2min 23s, sys: 10.1 s, total: 2min 33s
Wall time: 2min 33s


In [11]:
%%time

R_test_coo = sparse.coo_matrix((test_rating, (test_user, test_item)))
assert R_test_coo.nnz == nnz_test

# 0-based to 1-based
outfile_test = open("test.txt", 'w')
for i in range(nnz_test):
    outfile_test.write(str(test_user[i] + 1) + " " + str(test_item[i] + 1) + " " + str(test_rating[i]) + "\n")
outfile_test.close()

CPU times: user 2min 47s, sys: 506 ms, total: 2min 47s
Wall time: 2min 47s


In [12]:
%%time
# for test data, we need COO format to calculate test RMSE

R_test_coo.data.astype(np.float32).tofile('R_test_coo.data.bin')
R_test_coo.row.tofile('R_test_coo.row.bin')
R_test_coo.col.tofile('R_test_coo.col.bin')

test_data = np.fromfile('R_test_coo.data.bin', dtype=np.float32)
test_row = np.fromfile('R_test_coo.row.bin', dtype=np.int32)
test_col = np.fromfile('R_test_coo.col.bin', dtype=np.int32)

CPU times: user 34.8 ms, sys: 330 ms, total: 365 ms
Wall time: 362 ms


In [13]:
%%time

print(R_test_coo.data)
print(R_test_coo.row)
print(R_test_coo.col)
print("")
print(test_data)
print(test_row)
print(test_col)

[5 4 4 ... 1 3 2]
[      0       0       0 ... 1823178 1823178 1823178]
[  7171   8637  21966 ...  81597 116524 118873]

[5. 4. 4. ... 1. 3. 2.]
[      0       0       0 ... 1823178 1823178 1823178]
[  7171   8637  21966 ...  81597 116524 118873]
CPU times: user 1.75 ms, sys: 108 µs, total: 1.86 ms
Wall time: 1.53 ms


In [14]:
%%time

print(np.max(R_test_coo.data))
print(np.max(R_test_coo.row))
print(np.max(R_test_coo.col))
print("")
print(np.min(R_test_coo.data))
print(np.min(R_test_coo.row))
print(np.min(R_test_coo.col))
print("")
print(np.unique(test_user).size)
print(np.unique(R_test_coo.row).size)
print(np.unique(test_item).size)
print(np.unique(R_test_coo.col).size)

5
1823178
136735

1
0
0

1823179
1823179
136735
136735
CPU times: user 5.07 s, sys: 115 ms, total: 5.18 s
Wall time: 5.17 s


In [15]:
%%time

R_train_coo = sparse.coo_matrix((train_rating, (train_user, train_item)))
assert R_train_coo.nnz == nnz_train

# 0-based to 1-based
outfile_train = open("train.txt", 'w')
for i in range(nnz_train):
    outfile_train.write(str(train_user[i] + 1) + " " + str(train_item[i] + 1) + " " + str(train_rating[i]) + "\n")
outfile_train.close()

CPU times: user 1h 48min 56s, sys: 44.8 s, total: 1h 49min 41s
Wall time: 1h 49min 41s


In [16]:
%%time

# for training data, we need COO format to calculate training RMSE
# we need CSR format R when calculate X from \Theta
# we need CSC format of R when calculating \Theta from X
R_train_coo.data.astype(np.float32).tofile('R_train_coo.data.bin')
R_train_coo.row.tofile('R_train_coo.row.bin')
R_train_coo.col.tofile('R_train_coo.col.bin')

R_train_csr = R_train_coo.tocsr()
R_train_csc = R_train_coo.tocsc()

R_train_csr.data.astype(np.float32).tofile('R_train_csr.data.bin')
R_train_csr.indices.tofile('R_train_csr.indices.bin')
R_train_csr.indptr.tofile('R_train_csr.indptr.bin')
R_train_csc.data.astype(np.float32).tofile('R_train_csc.data.bin')
R_train_csc.indices.tofile('R_train_csc.indices.bin')
R_train_csc.indptr.tofile('R_train_csc.indptr.bin')

CPU times: user 1min 27s, sys: 1min 10s, total: 2min 38s
Wall time: 3min 4s


In [17]:
%%time

train_data = np.fromfile('R_train_coo.data.bin', dtype=np.float32)
train_row = np.fromfile('R_train_coo.row.bin', dtype=np.int32)
train_col = np.fromfile('R_train_coo.col.bin', dtype=np.int32)

train_csc_data = np.fromfile('R_train_csc.data.bin', dtype=np.float32)
train_csc_indices = np.fromfile('R_train_csc.indices.bin', dtype=np.int32)
train_csc_indptr = np.fromfile('R_train_csc.indptr.bin', dtype=np.int32)

train_csr_data = np.fromfile('R_train_csr.data.bin', dtype=np.float32)
train_csr_indices = np.fromfile('R_train_csr.indices.bin', dtype=np.int32)
train_csr_indptr = np.fromfile('R_train_csr.indptr.bin', dtype=np.int32)

CPU times: user 1.53 s, sys: 38.5 s, total: 40.1 s
Wall time: 2min 38s


In [18]:
%%time

print(R_train_coo.data)
print(R_train_coo.row)
print(R_train_coo.col)
print("")
print(train_data)
print(train_row)
print(train_col)
print("")
print(R_train_csr.data)
print(R_train_csr.indices)
print(R_train_csr.indptr)
print("")
print(train_csr_data)
print(train_csr_indices)
print(train_csr_indptr)
print("")
print(R_train_csc.data)
print(R_train_csc.indices)
print(R_train_csc.indptr)
print("")
print(train_csc_data)
print(train_csc_indices)
print(train_csc_indptr)

[5 4 4 ... 1 5 5]
[      0       0       0 ... 1823178 1823178 1823178]
[   166   2245   3637 ... 125420 128016 135359]

[5. 4. 4. ... 1. 5. 5.]
[      0       0       0 ... 1823178 1823178 1823178]
[   166   2245   3637 ... 125420 128016 135359]

[5 4 4 ... 1 5 5]
[   166   2245   3637 ... 125420 128016 135359]
[        0        34       421 ... 699635967 699640164 699640226]

[5. 4. 4. ... 1. 5. 5.]
[   166   2245   3637 ... 125420 128016 135359]
[        0        34       421 ... 699635967 699640164 699640226]

[5 5 3 ... 5 3 5]
[    354    1051    2549 ... 1822688 1822731 1822735]
[        0      2005      3351 ... 699625566 699630220 699640226]

[5. 5. 3. ... 5. 3. 5.]
[    354    1051    2549 ... 1822688 1822731 1822735]
[        0      2005      3351 ... 699625566 699630220 699640226]
CPU times: user 7.8 ms, sys: 3.99 ms, total: 11.8 ms
Wall time: 147 ms


In [19]:
%%time

print(np.max(R_train_coo.data))
print(np.max(R_train_coo.row))
print(np.max(R_train_coo.col))
print("")
print(np.min(R_train_coo.data))
print(np.min(R_train_coo.row))
print(np.min(R_train_coo.col))
print("")
print(np.unique(train_user).size)
print(np.unique(R_train_coo.row).size)
print(np.unique(train_item).size)
print(np.unique(R_train_coo.col).size)

5
1823178
136735

1
0
0

1823179
1823179
136736
136736
CPU times: user 3min 25s, sys: 56.4 s, total: 4min 21s
Wall time: 5min 43s


In [20]:
%%time
#NNZ by cols
print(len(R_train_csr.getnnz(axis=0)))
print(R_train_csr.getnnz(axis=0))
print("")
print(np.min(R_train_csr.getnnz(axis=0)))
print(np.max(R_train_csr.getnnz(axis=0)))
print("")
print(np.mean(R_train_csr.getnnz(axis=0)))

136736
[ 2005  1346  1835 ...  1092  4654 10006]

929
323512

5116.722925930259
CPU times: user 33.8 s, sys: 35.4 s, total: 1min 9s
Wall time: 1min 9s


In [21]:
%%time
#NNZ by rows
print(len(R_train_csr.getnnz(axis=1)))
print(R_train_csr.getnnz(axis=1))
print("")
print(np.min(R_train_csr.getnnz(axis=1)))
print(np.max(R_train_csr.getnnz(axis=1)))
print("")
print(np.mean(R_train_csr.getnnz(axis=1)))

1823179
[  34  387  358 ...  445 4197   62]

20
131523

383.7474137207592
CPU times: user 27.4 ms, sys: 6.68 ms, total: 34 ms
Wall time: 83.4 ms


In [22]:
%%time

print("write extra meta file")

outfile_meta = open("meta_modified_all", 'w')
outfile_meta.write(str(m) + " " + str(n) + "\n" + str(nnz_train) + "\n")
outfile_meta.write("""R_train_coo.data.bin
R_train_coo.row.bin
R_train_coo.col.bin
R_train_csr.indptr.bin
R_train_csr.indices.bin
R_train_csr.data.bin
R_train_csc.indptr.bin
R_train_csc.indices.bin
R_train_csc.data.bin
""")
outfile_meta.write(str(nnz_test) + "\n")
outfile_meta.write("""R_test_coo.data.bin
R_test_coo.row.bin
R_test_coo.col.bin
""")
outfile_meta.close()

write extra meta file
CPU times: user 1.12 ms, sys: 3 µs, total: 1.12 ms
Wall time: 970 µs


In [23]:
%%time

print("writing extra meta file")

outfile_meta = open("meta", 'w')
outfile_meta.write(str(m) + " " + str(n) + "\n")
outfile_meta.write(str(nnz_train) + " " + "train.txt\n")
outfile_meta.write(str(nnz_test) + " " + "test.txt\n")
outfile_meta.close()

writing extra meta file
CPU times: user 1.32 ms, sys: 24 µs, total: 1.35 ms
Wall time: 915 µs
