In [1]:
%%time

import numpy as np
from scipy import sparse
import pandas as pd
from sklearn.model_selection import train_test_split

CPU times: user 426 ms, sys: 39.7 ms, total: 466 ms
Wall time: 464 ms


In [2]:
%%time

data_file = "ydatamusic.txt"

m = 1948882
n = 98213
nnz_train = 103723717
nnz_test = 11524858

%time test_data = pd.read_csv(data_file, delimiter = "\t",header=None)
user = test_data[0].values
item = test_data[1].values
rating = test_data[2].values

CPU times: user 30.3 s, sys: 14.2 s, total: 44.5 s
Wall time: 24.3 s
CPU times: user 30.3 s, sys: 14.2 s, total: 44.5 s
Wall time: 24.3 s


In [3]:
# Raw columns (NumPy abbreviates the long arrays).
print(user, item, rating, sep="\n")
print("")
# Minimum of each column.
print(np.min(user), np.min(item), np.min(rating), sep="\n")
print("")
# Maximum of each column.
print(np.max(user), np.max(item), np.max(rating), sep="\n")
print("")
# Number of distinct values in each column.
print(np.unique(user).size, np.unique(item).size, np.unique(rating).size, sep="\n")
print("")
# Total number of rating rows.
print(user.size)

# The declared dimensions and split sizes must agree with the loaded data.
assert np.max(user) == m
assert np.max(item) == n
assert user.size == nnz_train + nnz_test

[      1       1       1 ... 1948882 1948882 1948882]
[   1    2    3 ...  178  729 2191]
[ 90 100  90 ...  90  90  90]

1
1
0

1948882
98213
100

1947156
98213
101

115248575


In [4]:
%%time

user_item = np.vstack((user, item))

user_item_train, user_item_test, rating_train, rating_test = train_test_split(user_item.T,
                                                                              rating,
                                                                              test_size=nnz_test,
                                                                              random_state=42)

CPU times: user 17.9 s, sys: 7.21 s, total: 25.1 s
Wall time: 19.5 s


In [5]:
%%time

#1-based to 0-based
R_test_coo = sparse.coo_matrix((rating_test, (user_item_test[:, 0] - 1, user_item_test[:, 1] - 1)))
assert R_test_coo.nnz == nnz_test

outfile_test = open("test.txt", 'w')
for i in range(nnz_test):
    outfile_test.write(str((user_item_test[i, 0])) + " " + str((user_item_test[i, 1])) + " " + str(rating_test[i]) + "\n")

CPU times: user 33.9 s, sys: 628 ms, total: 34.5 s
Wall time: 34.3 s


In [6]:
%%time

# for test data, we need COO format to calculate test RMSE

R_test_coo.data.astype(np.float32).tofile('R_test_coo.data.bin')
R_test_coo.row.tofile('R_test_coo.row.bin')
R_test_coo.col.tofile('R_test_coo.col.bin')

test_data = np.fromfile('R_test_coo.data.bin', dtype=np.float32)
test_row = np.fromfile('R_test_coo.row.bin', dtype=np.int32)
test_col = np.fromfile('R_test_coo.col.bin', dtype=np.int32)

CPU times: user 11.7 ms, sys: 247 ms, total: 259 ms
Wall time: 257 ms


In [7]:
# In-memory COO arrays ...
print(R_test_coo.data, R_test_coo.row, R_test_coo.col, sep="\n")
print("")
# ... followed by the arrays read back from the binary files, for comparison.
print(test_data, test_row, test_col, sep="\n")

[90 90 90 ... 30 70  0]
[1257724 1264325 1454022 ... 1034777  645041  271612]
[ 363  260  105 ...  150 3038  835]

[90. 90. 90. ... 30. 70.  0.]
[1257724 1264325 1454022 ... 1034777  645041  271612]
[ 363  260  105 ...  150 3038  835]


In [8]:
%%time

print(np.max(R_test_coo.data))
print(np.max(R_test_coo.row))
print(np.max(R_test_coo.col))
print("")
print(np.min(R_test_coo.data))
print(np.min(R_test_coo.row))
print(np.min(R_test_coo.col))
print("")
print(np.unique(user).size)
print(np.unique(R_test_coo.row).size)
print(np.unique(item).size)
print(np.unique(R_test_coo.col).size)

100
1948879
98200

0
0
0

1947156
1429064
98213
38805
CPU times: user 22.1 s, sys: 4.52 s, total: 26.7 s
Wall time: 10.8 s


In [9]:
%%time

#1-based to 0-based
R_train_coo = sparse.coo_matrix((rating_train, (user_item_train[:, 0] - 1, user_item_train[:, 1] - 1)))
assert R_train_coo.nnz == nnz_train

outfile_train = open("train.txt", 'w')
for i in range(nnz_train):
    outfile_train.write(str((user_item_train[i, 0])) + " " + str((user_item_train[i, 1])) + " " + str(rating_train[i]) + "\n")

CPU times: user 4min 43s, sys: 5.52 s, total: 4min 49s
Wall time: 4min 46s


In [10]:
%%time

# for training data, we need COO format to calculate training RMSE
# we need CSR format R when calculate X from \Theta
# we need CSC format of R when calculating \Theta from X
R_train_coo.data.astype(np.float32).tofile('R_train_coo.data.bin')
R_train_coo.row.tofile('R_train_coo.row.bin')
R_train_coo.col.tofile('R_train_coo.col.bin')

R_train_csr = R_train_coo.tocsr()
R_train_csc = R_train_coo.tocsc()

R_train_csr.data.astype(np.float32).tofile('R_train_csr.data.bin')
R_train_csr.indices.tofile('R_train_csr.indices.bin')
R_train_csr.indptr.tofile('R_train_csr.indptr.bin')
R_train_csc.data.astype(np.float32).tofile('R_train_csc.data.bin')
R_train_csc.indices.tofile('R_train_csc.indices.bin')
R_train_csc.indptr.tofile('R_train_csc.indptr.bin')

CPU times: user 32.6 s, sys: 4.5 s, total: 37.1 s
Wall time: 37.1 s


In [11]:
%%time

train_data = np.fromfile('R_train_coo.data.bin', dtype=np.float32)
train_row = np.fromfile('R_train_coo.row.bin', dtype=np.int32)
train_col = np.fromfile('R_train_coo.col.bin', dtype=np.int32)

train_csc_data = np.fromfile('R_train_csc.data.bin', dtype=np.float32)
train_csc_indices = np.fromfile('R_train_csc.indices.bin', dtype=np.int32)
train_csc_indptr = np.fromfile('R_train_csc.indptr.bin', dtype=np.int32)

train_csr_data = np.fromfile('R_train_csr.data.bin', dtype=np.float32)
train_csr_indices = np.fromfile('R_train_csr.indices.bin', dtype=np.int32)
train_csr_indptr = np.fromfile('R_train_csr.indptr.bin', dtype=np.int32)

CPU times: user 0 ns, sys: 1.74 s, total: 1.74 s
Wall time: 1.74 s


In [12]:
# Each in-memory group is followed by the same arrays as read back from disk,
# so the two can be compared by eye.

# COO: in memory, then from file.
print(R_train_coo.data, R_train_coo.row, R_train_coo.col, sep="\n")
print("")
print(train_data, train_row, train_col, sep="\n")
print("")
# CSR: in memory, then from file.
print(R_train_csr.data, R_train_csr.indices, R_train_csr.indptr, sep="\n")
print("")
print(train_csr_data, train_csr_indices, train_csr_indptr, sep="\n")
print("")
# CSC: in memory, then from file.
print(R_train_csc.data, R_train_csc.indices, R_train_csc.indptr, sep="\n")
print("")
print(train_csc_data, train_csc_indices, train_csc_indptr, sep="\n")

[90  0 50 ... 60 80  0]
[ 494892 1656417 1924655 ...  958739  956552 1106630]
[  452 49583   790 ...   214   339   410]

[90.  0. 50. ... 60. 80.  0.]
[ 494892 1656417 1924655 ...  958739  956552 1106630]
[  452 49583   790 ...   214   339   410]

[ 90 100 100 ...  90  90  90]
[   0    1    3 ...  728  822 2190]
[        0        35        40 ... 103723709 103723711 103723717]

[ 90. 100. 100. ...  90.  90.  90.]
[   0    1    3 ...  728  822 2190]
[        0        35        40 ... 103723709 103723711 103723717]

[90 90 90 ...  0 30 50]
[      0       4       9 ... 1899090 1899090 1920258]
[        0    154739    228698 ... 103723715 103723716 103723717]

[90. 90. 90. ...  0. 30. 50.]
[      0       4       9 ... 1899090 1899090 1920258]
[        0    154739    228698 ... 103723715 103723716 103723717]


In [13]:
%%time

print(np.max(R_train_coo.data))
print(np.max(R_train_coo.row))
print(np.max(R_train_coo.col))
print("")
print(np.min(R_train_coo.data))
print(np.min(R_train_coo.row))
print(np.min(R_train_coo.col))
print("")
print(np.unique(user).size)
print(np.unique(R_train_coo.row).size)
print(np.unique(item).size)
print(np.unique(R_train_coo.col).size)

100
1948881
98212

0
0
0

1947156
1926462
98213
94105
CPU times: user 34.6 s, sys: 14.7 s, total: 49.3 s
Wall time: 24.7 s


In [14]:
%%time

print("writing extra meta_modified_all file")

outfile_meta = open("meta_modified_all", 'w')
outfile_meta.write(str(m) + " " + str(n) + "\n" + str(nnz_train) + "\n")
outfile_meta.write("""R_train_coo.data.bin
R_train_coo.row.bin
R_train_coo.col.bin
R_train_csr.indptr.bin
R_train_csr.indices.bin
R_train_csr.data.bin
R_train_csc.indptr.bin
R_train_csc.indices.bin
R_train_csc.data.bin
""")
outfile_meta.write(str(nnz_test) + " " + "test.txt\n")

writing extra meta_modified_all file
CPU times: user 883 µs, sys: 106 µs, total: 989 µs
Wall time: 586 µs


In [15]:
%%time

print("writing extra meta file")

outfile_meta = open("meta", 'w')
outfile_meta.write(str(m) + " " + str(n) + "\n")
outfile_meta.write(str(nnz_train) + " " + "train.txt\n")
outfile_meta.write(str(nnz_test) + " " + "test.txt\n")

writing extra meta file
CPU times: user 998 µs, sys: 0 ns, total: 998 µs
Wall time: 749 µs
