In [1]:
%%time

import numpy as np
from scipy import sparse
import pandas as pd
from sklearn.model_selection import train_test_split
import tarfile as tar
import gzip as gz
import shutil
import glob
import os

CPU times: user 998 ms, sys: 124 ms, total: 1.12 s
Wall time: 1.12 s


In [2]:
%%time

with tar.open("Webscope_R2-1.tgz", "r") as tar_ref:
    tar_ref.extractall()
with tar.open("Webscope_R2-2.tgz", "r") as tar_ref:
    tar_ref.extractall()

CPU times: user 1min 50s, sys: 19.2 s, total: 2min 9s
Wall time: 3min 20s


In [3]:
%%time

with open('train_tmp.txt','wb') as wfd:
    for f in glob.glob("ydata-ymusic-user-song-ratings-meta-v1_0/train*.txt"):
        with open(f,'rb') as fd:
            shutil.copyfileobj(fd, wfd)

CPU times: user 3.19 s, sys: 24.8 s, total: 28 s
Wall time: 46.4 s


In [4]:
%%time

with open('test_tmp.txt','wb') as wfd:
    for f in glob.glob("ydata-ymusic-user-song-ratings-meta-v1_0/test*.txt"):
        with open(f,'rb') as fd:
            shutil.copyfileobj(fd, wfd)

CPU times: user 83.9 ms, sys: 745 ms, total: 829 ms
Wall time: 38.5 s


In [5]:
%%time

shutil.rmtree("ydata-ymusic-user-song-ratings-meta-v1_0/")

CPU times: user 73 µs, sys: 3.46 s, total: 3.46 s
Wall time: 4.54 s


In [6]:
m = 1_823_179
n = 136_736

nnz_train = 699_640_226
nnz_test = 18_231_790

train_data_file = "train_tmp.txt"
test_data_file = "test_tmp.txt"

In [7]:
print("preparing training data")
%time test_data = pd.read_csv(test_data_file, delimiter = "\t",header=None)
test_user = test_data[0].values
test_item = test_data[1].values
test_rating = test_data[2].values

print("preparing test data")
%time train_data = pd.read_csv(train_data_file, delimiter = "\t",header=None)
train_user = train_data[0].values
train_item = train_data[1].values
train_rating = train_data[2].values

preparing training data
CPU times: user 8.39 s, sys: 1.01 s, total: 9.4 s
Wall time: 7.82 s
preparing test data
CPU times: user 4min 40s, sys: 54.8 s, total: 5min 34s
Wall time: 4min 45s


In [8]:
%%time

os.remove("test_tmp.txt")
os.remove("train_tmp.txt")

CPU times: user 0 ns, sys: 3.43 s, total: 3.43 s
Wall time: 3.46 s


In [9]:
%%time

print(test_user)
print(test_item)
print(test_rating)
print("")
print(np.max(test_user))
print(np.max(test_item))
print(np.max(test_rating))
print("")
print(np.min(test_user))
print(np.min(test_item))
print(np.min(test_rating))
print("")
print(np.unique(test_user).size)
print(np.unique(test_item).size)
print(np.unique(test_rating).size)
print("")
print(test_user.size)

assert test_user.size == nnz_test

[ 400000  400000  400000 ... 1399999 1399999 1399999]
[  2971   8102  30706 ...  99354 119511 123117]
[4 5 5 ... 3 3 3]

1823178
136735
5

0
0
1

1823179
136735
5

18231790
CPU times: user 6.62 s, sys: 205 ms, total: 6.83 s
Wall time: 3.74 s


In [10]:
%%time

print(train_user)
print(train_item)
print(train_rating)
print("")
print(np.max(train_user))
print(np.max(train_item))
print(np.max(train_rating))
print("")
print(np.min(train_user))
print(np.min(train_item))
print(np.min(train_rating))
print("")
print(np.unique(train_user).size)
print(np.unique(train_item).size)
print(np.unique(train_rating).size)
print("")
print(train_user.size)

assert train_user.size == nnz_train
assert np.max(train_user)+1 == m  # ids start with 0
assert np.max(train_item)+1 == n  # ids start with 0

[1600000 1600000 1600000 ...  399999  399999  399999]
[  2896   4468   8791 ... 114925 116754 125579]
[4 1 5 ... 5 1 2]

1823178
136735
5

0
0
1

1823179
136736
5

699640226
CPU times: user 2min 35s, sys: 36.8 s, total: 3min 12s
Wall time: 2min 44s


In [11]:
%%time

R_test_coo = sparse.coo_matrix((test_rating, (test_user, test_item)))
assert R_test_coo.nnz == nnz_test

# 0-based to 1-based
outfile_test = open("test.txt", 'w')
for i in range(nnz_test):
    outfile_test.write(str(test_user[i] + 1) + " " + str(test_item[i] + 1) + " " + str(test_rating[i]) + "\n")

CPU times: user 2min 46s, sys: 2.89 s, total: 2min 49s
Wall time: 2min 51s


In [12]:
%%time
# for test data, we need COO format to calculate test RMSE

R_test_coo.data.astype(np.float32).tofile('R_test_coo.data.bin')
R_test_coo.row.tofile('R_test_coo.row.bin')
R_test_coo.col.tofile('R_test_coo.col.bin')

test_data = np.fromfile('R_test_coo.data.bin', dtype=np.float32)
test_row = np.fromfile('R_test_coo.row.bin', dtype=np.int32)
test_col = np.fromfile('R_test_coo.col.bin', dtype=np.int32)

CPU times: user 50.8 ms, sys: 679 ms, total: 729 ms
Wall time: 762 ms


In [13]:
print(R_test_coo.data)
print(R_test_coo.row)
print(R_test_coo.col)
print("")
print(test_data)
print(test_row)
print(test_col)

[4 5 5 ... 3 3 3]
[ 400000  400000  400000 ... 1399999 1399999 1399999]
[  2971   8102  30706 ...  99354 119511 123117]

[4. 5. 5. ... 3. 3. 3.]
[ 400000  400000  400000 ... 1399999 1399999 1399999]
[  2971   8102  30706 ...  99354 119511 123117]


In [14]:
%%time

print(np.max(R_test_coo.data))
print(np.max(R_test_coo.row))
print(np.max(R_test_coo.col))
print("")
print(np.min(R_test_coo.data))
print(np.min(R_test_coo.row))
print(np.min(R_test_coo.col))
print("")
print(np.unique(test_user).size)
print(np.unique(R_test_coo.row).size)
print(np.unique(test_item).size)
print(np.unique(R_test_coo.col).size)

5
1823178
136735

1
0
0

1823179
1823179
136735
136735
CPU times: user 9.08 s, sys: 612 ms, total: 9.69 s
Wall time: 5.43 s


In [15]:
%%time

R_train_coo = sparse.coo_matrix((train_rating, (train_user, train_item)))
assert R_train_coo.nnz == nnz_train

# 0-based to 1-based
outfile_train = open("train.txt", 'w')
for i in range(nnz_train):
    outfile_train.write(str(train_user[i] + 1) + " " + str(train_item[i] + 1) + " " + str(train_rating[i]) + "\n")

CPU times: user 1h 45min 44s, sys: 2min 20s, total: 1h 48min 4s
Wall time: 1h 48min 48s


In [16]:
%%time

# for training data, we need COO format to calculate training RMSE
# we need CSR format R when calculate X from \Theta
# we need CSC format of R when calculating \Theta from X
R_train_coo.data.astype(np.float32).tofile('R_train_coo.data.bin')
R_train_coo.row.tofile('R_train_coo.row.bin')
R_train_coo.col.tofile('R_train_coo.col.bin')

R_train_csr = R_train_coo.tocsr()
R_train_csc = R_train_coo.tocsc()

R_train_csr.data.astype(np.float32).tofile('R_train_csr.data.bin')
R_train_csr.indices.tofile('R_train_csr.indices.bin')
R_train_csr.indptr.tofile('R_train_csr.indptr.bin')
R_train_csc.data.astype(np.float32).tofile('R_train_csc.data.bin')
R_train_csc.indices.tofile('R_train_csc.indices.bin')
R_train_csc.indptr.tofile('R_train_csc.indptr.bin')

CPU times: user 2min 29s, sys: 1min 13s, total: 3min 43s
Wall time: 4min 52s


In [17]:
%%time

train_data = np.fromfile('R_train_coo.data.bin', dtype=np.float32)
train_row = np.fromfile('R_train_coo.row.bin', dtype=np.int32)
train_col = np.fromfile('R_train_coo.col.bin', dtype=np.int32)

train_csc_data = np.fromfile('R_train_csc.data.bin', dtype=np.float32)
train_csc_indices = np.fromfile('R_train_csc.indices.bin', dtype=np.int32)
train_csc_indptr = np.fromfile('R_train_csc.indptr.bin', dtype=np.int32)

train_csr_data = np.fromfile('R_train_csr.data.bin', dtype=np.float32)
train_csr_indices = np.fromfile('R_train_csr.indices.bin', dtype=np.int32)
train_csr_indptr = np.fromfile('R_train_csr.indptr.bin', dtype=np.int32)

CPU times: user 1.19 s, sys: 35.5 s, total: 36.7 s
Wall time: 3min 48s


In [18]:
print(R_train_coo.data)
print(R_train_coo.row)
print(R_train_coo.col)
print("")
print(train_data)
print(train_row)
print(train_col)
print("")
print(R_train_csr.data)
print(R_train_csr.indices)
print(R_train_csr.indptr)
print("")
print(train_csr_data)
print(train_csr_indices)
print(train_csr_indptr)
print("")
print(R_train_csc.data)
print(R_train_csc.indices)
print(R_train_csc.indptr)
print("")
print(train_csc_data)
print(train_csc_indices)
print(train_csc_indptr)

[4 1 5 ... 5 1 2]
[1600000 1600000 1600000 ...  399999  399999  399999]
[  2896   4468   8791 ... 114925 116754 125579]

[4. 1. 5. ... 5. 1. 2.]
[1600000 1600000 1600000 ...  399999  399999  399999]
[  2896   4468   8791 ... 114925 116754 125579]

[5 4 4 ... 1 5 5]
[   166   2245   3637 ... 125420 128016 135359]
[        0        34       421 ... 699635967 699640164 699640226]

[5. 4. 4. ... 1. 5. 5.]
[   166   2245   3637 ... 125420 128016 135359]
[        0        34       421 ... 699635967 699640164 699640226]

[5 5 3 ... 5 3 5]
[    354    1051    2549 ... 1822688 1822731 1822735]
[        0      2005      3351 ... 699625566 699630220 699640226]

[5. 5. 3. ... 5. 3. 5.]
[    354    1051    2549 ... 1822688 1822731 1822735]
[        0      2005      3351 ... 699625566 699630220 699640226]


In [19]:
%%time

print(np.max(R_train_coo.data))
print(np.max(R_train_coo.row))
print(np.max(R_train_coo.col))
print("")
print(np.min(R_train_coo.data))
print(np.min(R_train_coo.row))
print(np.min(R_train_coo.col))
print("")
print(np.unique(train_user).size)
print(np.unique(R_train_coo.row).size)
print(np.unique(train_item).size)
print(np.unique(R_train_coo.col).size)

5
1823178
136735

1
0
0

1823179
1823179
136736
136736
CPU times: user 3min 36s, sys: 1min 13s, total: 4min 49s
Wall time: 12min 38s


In [20]:
%%time
print("write extra meta file")

outfile_meta = open("meta_modified_all", 'w')
outfile_meta.write(str(m) + " " + str(n) + "\n" + str(nnz_train) + "\n")
outfile_meta.write("""R_train_coo.data.bin
R_train_coo.row.bin
R_train_coo.col.bin
R_train_csr.indptr.bin
R_train_csr.indices.bin
R_train_csr.data.bin
R_train_csc.indptr.bin
R_train_csc.indices.bin
R_train_csc.data.bin
""")
outfile_meta.write(str(nnz_test) + " " + "test.txt\n")

write extra meta file
CPU times: user 0 ns, sys: 1.92 ms, total: 1.92 ms
Wall time: 24.8 ms


In [21]:
%%time
print("writing extra meta file")

outfile_meta = open("meta", 'w')
outfile_meta.write(str(m) + " " + str(n) + "\n")
outfile_meta.write(str(nnz_train) + " " + "train.txt\n")
outfile_meta.write(str(nnz_test) + " " + "test.txt\n")

writing extra meta file
CPU times: user 0 ns, sys: 2.97 ms, total: 2.97 ms
Wall time: 103 ms
