In [1]:
%%time

import numpy as np
from scipy import sparse

CPU times: user 61.1 ms, sys: 8.17 ms, total: 69.3 ms
Wall time: 67.3 ms


In [2]:
%%time

# Expected layout of netflix_mm (training) and netflix_mme (test):
# one whitespace-separated "user item rating" triplet per line, e.g.
#
#   1 1  3
#   2 1  5
#   3 1  4
#   5 1  3
#   6 1  3
#   7 1  4
#   8 1  3

# Problem dimensions: m users (rows), n items (columns), and the number of
# ratings expected in the training and test files respectively.
m, n = 480_189, 17_770
nnz_train, nnz_test = 99_072_112, 1_408_395

train_data_file, test_data_file = "netflix_mm", "netflix_mme"

# Each file is parsed into an (nnz, 3) int32 table; transposing it lets the
# three columns be unpacked into separate user / item / rating arrays.
print("preparing test data")
test_user, test_item, test_rating = np.loadtxt(test_data_file, dtype=np.int32).T

print("preparing training data")
train_user, train_item, train_rating = np.loadtxt(train_data_file, dtype=np.int32).T

preparing test data
preparing training data
CPU times: user 6min 55s, sys: 1.89 s, total: 6min 56s
Wall time: 7min 4s


In [3]:
%%time

# Sanity report for the raw test triplets. Each group below is followed by a
# blank line: the arrays themselves, their maxima, their minima, and the
# number of distinct values per column.
print(test_user, test_item, test_rating, "", sep="\n")
print(test_user.max(), test_item.max(), test_rating.max(), "", sep="\n")
print(test_user.min(), test_item.min(), test_rating.min(), "", sep="\n")
print(
    np.unique(test_user).size,
    np.unique(test_item).size,
    np.unique(test_rating).size,
    "",
    sep="\n",
)
print(test_user.size)

# The file must contain exactly the advertised number of test ratings.
assert nnz_test == test_user.size

[     4     48     60 ... 174372 206132 221852]
[    1     1     1 ... 17770 17770 17770]
[4 4 3 ... 2 3 3]

480093
17770
5

1
1
1

462858
16938
5

1408395
CPU times: user 149 ms, sys: 8.04 ms, total: 157 ms
Wall time: 151 ms


In [4]:
%%time

# Sanity report for the raw training triplets. Each group below is followed by
# a blank line: the arrays themselves, their maxima, their minima, and the
# number of distinct values per column.
print(train_user, train_item, train_rating, "", sep="\n")
print(train_user.max(), train_item.max(), train_rating.max(), "", sep="\n")
print(train_user.min(), train_item.min(), train_rating.min(), "", sep="\n")
print(
    np.unique(train_user).size,
    np.unique(train_item).size,
    np.unique(train_rating).size,
    "",
    sep="\n",
)
print(train_user.size)

# The training file must contain the advertised number of ratings, and its
# largest user / item ids must match the declared dimensions m and n.
assert nnz_train == train_user.size
assert m == train_user.max()
assert n == train_item.max()

[     1      2      3 ... 432954 440851 451200]
[    1     1     1 ... 17770 17770 17770]
[3 5 4 ... 3 2 3]

480189
17770
5

1
1
1

480189
17770
5

99072112
CPU times: user 8.72 s, sys: 820 ms, total: 9.54 s
Wall time: 9.52 s


In [5]:
%%time

# Build the sparse test matrix. Ids in the input are 1-based, so shift them to
# 0-based row/column indices. The shape is given explicitly: without it scipy
# infers the shape from the largest index present, and the test split does not
# necessarily contain the last user, which would yield a matrix smaller than
# m x n. An out-of-range id now raises instead of silently growing the matrix.
R_test_coo = sparse.coo_matrix(
    (test_rating, (test_user - 1, test_item - 1)),
    shape=(m, n),
)
assert R_test_coo.nnz == nnz_test

# Dump the test triplets as plain text, one "user item rating" line per rating,
# keeping the ids exactly as loaded (still 1-based). The context manager
# guarantees the file is flushed and closed before any later step reads it.
with open("test.txt", 'w') as outfile_test:
    for i in range(nnz_test):
        outfile_test.write(str(test_user[i]) + " " + str(test_item[i]) + " " + str(test_rating[i]) + "\n")

CPU times: user 3.29 s, sys: 44.2 ms, total: 3.33 s
Wall time: 3.28 s


In [6]:
%%time

# For the test data only the COO format is needed (used to calculate the test
# RMSE). Ratings are stored as float32. The index arrays are cast to int32
# explicitly so the on-disk layout always matches the np.fromfile(dtype=int32)
# reads below, whatever index dtype scipy picked for the matrix.
R_test_coo.data.astype(np.float32).tofile('R_test_coo.data.bin')
R_test_coo.row.astype(np.int32, copy=False).tofile('R_test_coo.row.bin')
R_test_coo.col.astype(np.int32, copy=False).tofile('R_test_coo.col.bin')

# Read the files back to verify the round trip.
test_data = np.fromfile('R_test_coo.data.bin', dtype=np.float32)
test_row = np.fromfile('R_test_coo.row.bin', dtype=np.int32)
test_col = np.fromfile('R_test_coo.col.bin', dtype=np.int32)

# A dtype or length mismatch would otherwise only show up as garbage indices
# in whatever consumes these files.
assert test_data.size == test_row.size == test_col.size == R_test_coo.nnz
assert np.array_equal(test_row, R_test_coo.row)
assert np.array_equal(test_col, R_test_coo.col)

CPU times: user 0 ns, sys: 25.8 ms, total: 25.8 ms
Wall time: 22.9 ms


In [7]:
# Visual round-trip check: first the in-memory COO arrays, then a blank line,
# then the arrays that were read back from the .bin files.
print(R_test_coo.data, R_test_coo.row, R_test_coo.col, "", sep="\n")
print(test_data, test_row, test_col, sep="\n")

[4 4 3 ... 2 3 3]
[     3     47     59 ... 174371 206131 221851]
[    0     0     0 ... 17769 17769 17769]

[4. 4. 3. ... 2. 3. 3.]
[     3     47     59 ... 174371 206131 221851]
[    0     0     0 ... 17769 17769 17769]


In [8]:
%%time

# Value ranges of the COO arrays: maxima, blank line, minima, blank line.
print(R_test_coo.data.max(), R_test_coo.row.max(), R_test_coo.col.max(), "", sep="\n")
print(R_test_coo.data.min(), R_test_coo.row.min(), R_test_coo.col.min(), "", sep="\n")

# Distinct-id counts, raw input next to the matrix indices: users vs. rows,
# then items vs. columns.
print(
    np.unique(test_user).size,
    np.unique(R_test_coo.row).size,
    np.unique(test_item).size,
    np.unique(R_test_coo.col).size,
    sep="\n",
)

5
480092
17769

1
0
0

462858
462858
16938
16938
CPU times: user 3.42 s, sys: 7.66 ms, total: 3.43 s
Wall time: 370 ms


In [9]:
%%time

# Build the sparse training matrix. Ids in the input are 1-based, so shift them
# to 0-based row/column indices. The shape is given explicitly so the matrix is
# always m x n rather than whatever the largest index present implies, and an
# out-of-range id raises instead of silently growing the matrix.
R_train_coo = sparse.coo_matrix(
    (train_rating, (train_user - 1, train_item - 1)),
    shape=(m, n),
)
assert R_train_coo.nnz == nnz_train

# Dump the training triplets as plain text, one "user item rating" line per
# rating, keeping the ids exactly as loaded (still 1-based). The context
# manager guarantees the tail of this very large file is flushed and the
# handle closed before any later step reads it.
with open("train.txt", 'w') as outfile_train:
    for i in range(nnz_train):
        outfile_train.write(str(train_user[i]) + " " + str(train_item[i]) + " " + str(train_rating[i]) + "\n")

CPU times: user 3min 58s, sys: 3.11 s, total: 4min 1s
Wall time: 3min 55s


In [10]:
%%time

# Persist the training matrix in three layouts:
#   * COO - needed to calculate the training RMSE
#   * CSR - needed when calculating X from \Theta
#   * CSC - needed when calculating \Theta from X
# Ratings are stored as float32. Every index array is cast to int32 explicitly
# so the on-disk layout always matches readers that use
# np.fromfile(dtype=np.int32), whatever index dtype scipy picked.
R_train_coo.data.astype(np.float32).tofile('R_train_coo.data.bin')
R_train_coo.row.astype(np.int32, copy=False).tofile('R_train_coo.row.bin')
R_train_coo.col.astype(np.int32, copy=False).tofile('R_train_coo.col.bin')

R_train_csr = R_train_coo.tocsr()
R_train_csc = R_train_coo.tocsc()

# Converting from COO sums duplicate (row, col) entries. If the input held any,
# the compressed matrices would have fewer stored values than nnz_train and
# disagree with the count written to the meta files, so fail loudly here.
assert R_train_csr.nnz == nnz_train
assert R_train_csc.nnz == nnz_train

R_train_csr.data.astype(np.float32).tofile('R_train_csr.data.bin')
R_train_csr.indices.astype(np.int32, copy=False).tofile('R_train_csr.indices.bin')
R_train_csr.indptr.astype(np.int32, copy=False).tofile('R_train_csr.indptr.bin')
R_train_csc.data.astype(np.float32).tofile('R_train_csc.data.bin')
R_train_csc.indices.astype(np.int32, copy=False).tofile('R_train_csc.indices.bin')
R_train_csc.indptr.astype(np.int32, copy=False).tofile('R_train_csc.indptr.bin')

CPU times: user 7.65 s, sys: 3.18 s, total: 10.8 s
Wall time: 10.8 s


In [11]:
%%time

# Read every training array back from disk: float32 for the rating values,
# int32 for all index / pointer arrays.

# COO triplets
train_data, train_row, train_col = (
    np.fromfile('R_train_coo.data.bin', np.float32),
    np.fromfile('R_train_coo.row.bin', np.int32),
    np.fromfile('R_train_coo.col.bin', np.int32),
)

# CSC layout
train_csc_data, train_csc_indices, train_csc_indptr = (
    np.fromfile('R_train_csc.data.bin', np.float32),
    np.fromfile('R_train_csc.indices.bin', np.int32),
    np.fromfile('R_train_csc.indptr.bin', np.int32),
)

# CSR layout
train_csr_data, train_csr_indices, train_csr_indptr = (
    np.fromfile('R_train_csr.data.bin', np.float32),
    np.fromfile('R_train_csr.indices.bin', np.int32),
    np.fromfile('R_train_csr.indptr.bin', np.int32),
)

CPU times: user 0 ns, sys: 1.45 s, total: 1.45 s
Wall time: 1.45 s


In [12]:
# Visual round-trip check. For each layout the in-memory arrays are printed
# first, then the arrays read back from the .bin files; groups are separated
# by a blank line.

# COO: in memory, then from disk
print(R_train_coo.data, R_train_coo.row, R_train_coo.col, "", sep="\n")
print(train_data, train_row, train_col, "", sep="\n")

# CSR: in memory, then from disk
print(R_train_csr.data, R_train_csr.indices, R_train_csr.indptr, "", sep="\n")
print(train_csr_data, train_csr_indices, train_csr_indptr, "", sep="\n")

# CSC: in memory, then from disk
print(R_train_csc.data, R_train_csc.indices, R_train_csc.indptr, "", sep="\n")
print(train_csc_data, train_csc_indices, train_csc_indptr, sep="\n")

[3 5 4 ... 3 2 3]
[     0      1      2 ... 432953 440850 451199]
[    0     0     0 ... 17769 17769 17769]

[3. 5. 4. ... 3. 2. 3.]
[     0      1      2 ... 432953 440850 451199]
[    0     0     0 ... 17769 17769 17769]

[3 4 2 ... 5 5 4]
[    0     7    16 ... 17761 17761 17763]
[       0     2202     2344 ... 99072110 99072111 99072112]

[3. 4. 2. ... 5. 5. 4.]
[    0     7    16 ... 17761 17761 17763]
[       0     2202     2344 ... 99072110 99072111 99072112]

[3 5 4 ... 3 2 3]
[     0      1      2 ... 432953 440850 451199]
[       0      524      659 ... 99064534 99071199 99072112]

[3. 5. 4. ... 3. 2. 3.]
[     0      1      2 ... 432953 440850 451199]
[       0      524      659 ... 99064534 99071199 99072112]


In [13]:
%%time

# Value ranges of the COO arrays: maxima, blank line, minima, blank line.
print(R_train_coo.data.max(), R_train_coo.row.max(), R_train_coo.col.max(), "", sep="\n")
print(R_train_coo.data.min(), R_train_coo.row.min(), R_train_coo.col.min(), "", sep="\n")

# Distinct-id counts, raw input next to the matrix indices: users vs. rows,
# then items vs. columns.
print(
    np.unique(train_user).size,
    np.unique(R_train_coo.row).size,
    np.unique(train_item).size,
    np.unique(R_train_coo.col).size,
    sep="\n",
)

5
480188
17769

1
0
0

480189
480189
17770
17770
CPU times: user 17.9 s, sys: 1.48 s, total: 19.4 s
Wall time: 12.4 s


In [14]:
%%time

print("write extra meta file")

# Meta file describing the binary training dumps. Layout:
#   line 1      : "<m> <n>"
#   line 2      : "<nnz_train>"
#   lines 3-11  : names of the COO / CSR / CSC .bin files
#   last line   : "<nnz_test> test.txt"
# The context manager makes sure the content is flushed and the handle closed
# when the cell finishes, so the file is complete for whatever reads it next.
with open("meta_modified_all", 'w') as outfile_meta:
    outfile_meta.write(str(m) + " " + str(n) + "\n" + str(nnz_train) + "\n")
    outfile_meta.write("""R_train_coo.data.bin
R_train_coo.row.bin
R_train_coo.col.bin
R_train_csr.indptr.bin
R_train_csr.indices.bin
R_train_csr.data.bin
R_train_csc.indptr.bin
R_train_csc.indices.bin
R_train_csc.data.bin
""")
    outfile_meta.write(str(nnz_test) + " " + "test.txt\n")

write extra meta file
CPU times: user 544 µs, sys: 10 µs, total: 554 µs
Wall time: 368 µs


In [15]:
%%time

print("writing extra meta file")

# Meta file describing the plain-text dumps. Layout:
#   line 1 : "<m> <n>"
#   line 2 : "<nnz_train> train.txt"
#   line 3 : "<nnz_test> test.txt"
# The context manager makes sure the content is flushed and the handle closed
# when the cell finishes, so the file is complete for whatever reads it next.
with open("meta", 'w') as outfile_meta:
    outfile_meta.write(str(m) + " " + str(n) + "\n")
    outfile_meta.write(str(nnz_train) + " " + "train.txt\n")
    outfile_meta.write(str(nnz_test) + " " + "test.txt\n")

writing extra meta file
CPU times: user 2.68 ms, sys: 0 ns, total: 2.68 ms
Wall time: 442 ms
