In [1]:
%%time

import numpy as np
from scipy import sparse
import pandas as pd
from sklearn.model_selection import train_test_split
import tarfile as tar
import gzip as gz
import shutil
import os

CPU times: user 1.74 s, sys: 2.73 s, total: 4.48 s
Wall time: 666 ms


In [2]:
%%time

# Unpack every member of the Webscope R1 archive into the current working
# directory ("." is tarfile's default extraction path, spelled out here).
with tar.open(name="Webscope_R1.tgz", mode="r") as archive:
    archive.extractall(path=".")

CPU times: user 909 ms, sys: 380 ms, total: 1.29 s
Wall time: 1.29 s


In [3]:
%%time

# Decompress the gzipped ratings file next to it. shutil.copyfileobj copies
# chunk by chunk between the two binary file objects.
with gz.open("ydata-ymusic-user-artist-ratings-v1_0.txt.gz", "r") as packed, \
        open('ydata-ymusic-user-artist-ratings-v1_0.txt', 'wb') as unpacked:
    shutil.copyfileobj(packed, unpacked)

CPU times: user 9.98 s, sys: 1.6 s, total: 11.6 s
Wall time: 11.7 s


In [4]:
%%time

# Cleanup of the original Yahoo dataset (whitespace-separated: user id, item id, score).
#
# * Drops ratings equal to 255 (which means "never play again"); the remaining
#   ratings range from 0 to 100.
# * Drops every row whose item id is not strictly positive. This removes the
#   placeholder items -100 and -99 (Unknown Artist and Not Applicable).
# * Number of entries is reduced from 115,579,440 to 115,248,575 and then to
#   115,248,573.
# * "Normalizes" item ids and user ids: each distinct id is replaced by a dense
#   1-based integer assigned in order of first appearance among the kept rows.
#   NOTE(review): an earlier version of this comment gave 98,213 as the largest
#   item id, but the notebook later asserts 98,211 — confirm which is current.
#
# Output: "ydatamusic.txt", one "<user>\t<item>\t<score>" line per kept row.

map_items = {}  # original item id (str) -> dense 1-based item id
map_users = {}  # original user id (str) -> dense 1-based user id
n = 1  # next unused dense item id
m = 1  # next unused dense user id

# Context managers guarantee both handles are closed (and the output flushed)
# even if a malformed line raises in the middle of the loop. The input is only
# read and the output only written, so plain "r" / "w" modes are sufficient.
with open("ydata-ymusic-user-artist-ratings-v1_0.txt", "r") as fileIN, \
        open("ydatamusic.txt", "w") as fileOUT:
    for line in fileIN:
        token = line.split()
        if not token:
            # Blank line (e.g. a trailing newline at end of file): nothing to parse.
            continue

        uid = token[0]
        iid = token[1]
        score = token[2]

        # Same filter as before, written as a guard clause.
        if int(score) == 255 or int(iid) <= 0:
            continue

        real_uid = map_users.get(uid)
        if real_uid is None:
            real_uid = map_users[uid] = m
            m += 1

        real_iid = map_items.get(iid)
        if real_iid is None:
            real_iid = map_items[iid] = n
            n += 1

        # The score is written back verbatim (it is still the original string).
        fileOUT.write(f"{real_uid}\t{real_iid}\t{score}\n")

CPU times: user 5min 15s, sys: 2.8 s, total: 5min 17s
Wall time: 5min 18s


In [5]:
%%time

# Cleaned, tab-separated ratings file with no header row.
data_file = "ydatamusic.txt"

# Expected dimensions of the cleaned dataset; a later cell asserts them against
# the loaded arrays (m == largest user id, n == largest item id, and
# nnz_train + nnz_test == number of rows).
m, n = 1_947_156, 98_211
nnz_train, nnz_test = 103_723_717, 11_524_856

# Columns are labelled 0, 1, 2 because header=None; pull each one out as a
# NumPy array: user ids, item ids and ratings, in file order.
all_data = pd.read_csv(data_file, sep="\t", header=None)
user, item, rating = (all_data[position].values for position in (0, 1, 2))

CPU times: user 21.7 s, sys: 4.58 s, total: 26.2 s
Wall time: 26.2 s


In [6]:
%%time

# Delete these six files from the working directory, in the listed order.
# os.remove raises if a file is missing, so this cell expects all of them to exist.
for leftover in (
    "ydata-ymusic-user-artist-ratings-v1_0.txt.gz",
    "ydata-ymusic-user-artist-ratings-v1_0.txt",
    "ydata-ymusic-artist-names-v1_0.txt.gz",
    "ydatamusic.txt",
    "WebscopeReadMe.txt",
    "readme.txt",
):
    os.remove(leftover)

CPU times: user 706 µs, sys: 448 ms, total: 449 ms
Wall time: 450 ms


In [7]:
%%time

# Sanity report on the three parallel arrays. For each summary (raw preview,
# minimum, maximum, number of distinct values) print one line per array,
# followed by a blank separator line; finish with the total number of rows.
columns = (user, item, rating)
summaries = (
    lambda values: values,
    np.min,
    np.max,
    lambda values: np.unique(values).size,
)
for summarize in summaries:
    for column in columns:
        print(summarize(column))
    print("")
print(user.size)

# The hard-coded dimensions must agree with what was actually loaded.
assert np.max(user) == m
assert np.max(item) == n
assert user.size == nnz_train + nnz_test

[      1       1       1 ... 1947156 1947156 1947156]
[   1    2    3 ...  178  729 2191]
[ 90 100  90 ...  90  90  90]

1
1
0

1947156
98211
100

1947156
98211
101

115248573
CPU times: user 12.5 s, sys: 1.39 s, total: 13.9 s
Wall time: 13.9 s


In [8]:
%%time

# Stack the id arrays into a 2 x N matrix; its transpose has one (user, item)
# pair per row, which is the sample layout train_test_split expects.
user_item = np.vstack([user, item])
pairs = user_item.T

# test_size is an absolute row count here (nnz_test), and the fixed
# random_state makes the shuffle reproducible between runs.
split = train_test_split(pairs, rating, test_size=nnz_test, random_state=42)
user_item_train, user_item_test, rating_train, rating_test = split

CPU times: user 12.9 s, sys: 3.97 s, total: 16.9 s
Wall time: 16.9 s


In [9]:
%%time

# Build the test ratings matrix in COO form, converting ids from 1-based to 0-based.
#
# The shape is passed explicitly: without it scipy infers the dimensions from
# the largest row/column index that happens to be in this split, which is
# smaller than (m, n) whenever the highest-numbered users/items were sampled
# into the other split. With shape=(m, n) an out-of-range index also raises
# instead of silently enlarging the matrix.
R_test_coo = sparse.coo_matrix(
    (rating_test, (user_item_test[:, 0] - 1, user_item_test[:, 1] - 1)),
    shape=(m, n),
)
assert R_test_coo.nnz == nnz_test

# Plain-text dump, one "user item rating" line per test entry. The ids written
# here are the original 1-based ones (no "- 1" is applied).
# The context manager closes the file even if a write fails part-way.
with open("test.txt", 'w') as outfile_test:
    for i in range(nnz_test):
        outfile_test.write(f"{user_item_test[i, 0]} {user_item_test[i, 1]} {rating_test[i]}\n")

CPU times: user 31.6 s, sys: 446 ms, total: 32.1 s
Wall time: 31.9 s


In [10]:
%%time

# For test data we need COO format to calculate the test RMSE.
#
# The raw binary files carry no dtype information, so the on-disk element types
# are pinned explicitly: float32 for the ratings and int32 for the indices.
# This matches the dtypes used to read the files back below. copy=False avoids
# a copy when scipy already stores the indices as int32.
R_test_coo.data.astype(np.float32).tofile('R_test_coo.data.bin')
R_test_coo.row.astype(np.int32, copy=False).tofile('R_test_coo.row.bin')
R_test_coo.col.astype(np.int32, copy=False).tofile('R_test_coo.col.bin')

# Read the files back so later cells can compare them with the in-memory matrix.
test_data = np.fromfile('R_test_coo.data.bin', dtype=np.float32)
test_row = np.fromfile('R_test_coo.row.bin', dtype=np.int32)
test_col = np.fromfile('R_test_coo.col.bin', dtype=np.int32)

CPU times: user 3.31 ms, sys: 248 ms, total: 251 ms
Wall time: 249 ms


In [11]:
%%time

# Visual round-trip check: the in-memory COO arrays first, then (after a blank
# line) the same three arrays as reloaded from the .bin files.
in_memory = (R_test_coo.data, R_test_coo.row, R_test_coo.col)
from_disk = (test_data, test_row, test_col)

for values in in_memory:
    print(values)
print("")
for values in from_disk:
    print(values)

[90 82 90 ...  0  0  0]
[1256592 1545375 1452700 ... 1370209 1711701 1690355]
[ 336   54  310 ... 5898  355  828]

[90. 82. 90. ...  0.  0.  0.]
[1256592 1545375 1452700 ... 1370209 1711701 1690355]
[ 336   54  310 ... 5898  355  828]
CPU times: user 1.42 ms, sys: 65 µs, total: 1.49 ms
Wall time: 1.17 ms


In [12]:
%%time

# Range of the test split: maxima of data/row/col, a blank line, then minima
# and another blank line.
test_parts = (R_test_coo.data, R_test_coo.row, R_test_coo.col)
for reducer in (np.max, np.min):
    for part in test_parts:
        print(reducer(part))
    print("")

# Coverage: distinct ids in the full dataset versus distinct ids that appear
# in the test split, first for users (rows) and then for items (columns).
for everything, held_out in ((user, R_test_coo.row), (item, R_test_coo.col)):
    print(np.unique(everything).size)
    print(np.unique(held_out).size)

100
1947153
98205

0
0
0

1947156
1428803
98211
38802
CPU times: user 10.6 s, sys: 1.25 s, total: 11.8 s
Wall time: 11.8 s


In [13]:
%%time

# Build the training ratings matrix in COO form, converting ids from 1-based to 0-based.
#
# The shape is passed explicitly: without it scipy infers the dimensions from
# the largest row/column index present in this split. The CSR/CSC conversions
# derived from this matrix size their indptr arrays from that shape, so an
# inferred shape would disagree with the m and n written to the meta files
# whenever the highest-numbered user or item landed only in the test split.
# With shape=(m, n) an out-of-range index also raises instead of being accepted.
R_train_coo = sparse.coo_matrix(
    (rating_train, (user_item_train[:, 0] - 1, user_item_train[:, 1] - 1)),
    shape=(m, n),
)
assert R_train_coo.nnz == nnz_train

# Plain-text dump, one "user item rating" line per training entry. The ids
# written here are the original 1-based ones (no "- 1" is applied).
# The context manager closes the file even if a write fails part-way.
with open("train.txt", 'w') as outfile_train:
    for i in range(nnz_train):
        outfile_train.write(f"{user_item_train[i, 0]} {user_item_train[i, 1]} {rating_train[i]}\n")

CPU times: user 4min 50s, sys: 5.66 s, total: 4min 56s
Wall time: 4min 54s


In [14]:
%%time

# For training data:
#   * COO format is needed to calculate the training RMSE,
#   * CSR format of R is needed when calculating X from \Theta,
#   * CSC format of R is needed when calculating \Theta from X.
#
# The raw binary files carry no dtype information, so every array is written
# with an explicit element type: float32 for ratings, int32 for all index and
# pointer arrays. These are the dtypes the files are read back with later.
# copy=False avoids a copy when scipy already stores the indices as int32.
R_train_coo.data.astype(np.float32).tofile('R_train_coo.data.bin')
R_train_coo.row.astype(np.int32, copy=False).tofile('R_train_coo.row.bin')
R_train_coo.col.astype(np.int32, copy=False).tofile('R_train_coo.col.bin')

R_train_csr = R_train_coo.tocsr()
R_train_csc = R_train_coo.tocsc()

R_train_csr.data.astype(np.float32).tofile('R_train_csr.data.bin')
R_train_csr.indices.astype(np.int32, copy=False).tofile('R_train_csr.indices.bin')
R_train_csr.indptr.astype(np.int32, copy=False).tofile('R_train_csr.indptr.bin')
R_train_csc.data.astype(np.float32).tofile('R_train_csc.data.bin')
R_train_csc.indices.astype(np.int32, copy=False).tofile('R_train_csc.indices.bin')
R_train_csc.indptr.astype(np.int32, copy=False).tofile('R_train_csc.indptr.bin')

CPU times: user 29.4 s, sys: 4.37 s, total: 33.8 s
Wall time: 34.1 s


In [15]:
%%time

# Reload every training .bin file so it can be compared with the in-memory
# matrices. Ratings are read as float32, index/pointer arrays as int32.
# Read order is unchanged: COO, then CSC, then CSR.
train_data, train_row, train_col = (
    np.fromfile(path, dtype=element_type)
    for path, element_type in (
        ('R_train_coo.data.bin', np.float32),
        ('R_train_coo.row.bin', np.int32),
        ('R_train_coo.col.bin', np.int32),
    )
)

train_csc_data, train_csc_indices, train_csc_indptr = (
    np.fromfile(path, dtype=element_type)
    for path, element_type in (
        ('R_train_csc.data.bin', np.float32),
        ('R_train_csc.indices.bin', np.int32),
        ('R_train_csc.indptr.bin', np.int32),
    )
)

train_csr_data, train_csr_indices, train_csr_indptr = (
    np.fromfile(path, dtype=element_type)
    for path, element_type in (
        ('R_train_csr.data.bin', np.float32),
        ('R_train_csr.indices.bin', np.int32),
        ('R_train_csr.indptr.bin', np.int32),
    )
)

CPU times: user 3.47 ms, sys: 1.63 s, total: 1.63 s
Wall time: 1.63 s


In [16]:
%%time

# Visual round-trip check for the training data. Groups alternate between the
# in-memory arrays and the arrays reloaded from disk, for COO, CSR and CSC in
# that order; a blank line separates consecutive groups.
groups = (
    (R_train_coo.data, R_train_coo.row, R_train_coo.col),
    (train_data, train_row, train_col),
    (R_train_csr.data, R_train_csr.indices, R_train_csr.indptr),
    (train_csr_data, train_csr_indices, train_csr_indptr),
    (R_train_csc.data, R_train_csc.indices, R_train_csc.indptr),
    (train_csc_data, train_csc_indices, train_csc_indptr),
)
for position, group in enumerate(groups):
    if position > 0:
        print("")
    for values in group:
        print(values)

[70  0 90 ... 60 80  0]
[ 644479  271389  494478 ...  957893  955709 1105636]
[3038  835  452 ...  214  339  410]

[70.  0. 90. ... 60. 80.  0.]
[ 644479  271389  494478 ...  957893  955709 1105636]
[3038  835  452 ...  214  339  410]

[ 90 100 100 ...  90  90  90]
[   0    1    3 ...  728  822 2190]
[        0        35        40 ... 103723709 103723711 103723717]

[ 90. 100. 100. ...  90.  90.  90.]
[   0    1    3 ...  728  822 2190]
[        0        35        40 ... 103723709 103723711 103723717]

[90 90 90 ...  0 30 50]
[      0       4       9 ... 1897411 1897411 1918556]
[        0    154716    228738 ... 103723715 103723716 103723717]

[90. 90. 90. ...  0. 30. 50.]
[      0       4       9 ... 1897411 1897411 1918556]
[        0    154716    228738 ... 103723715 103723716 103723717]
CPU times: user 2.96 ms, sys: 3.96 ms, total: 6.92 ms
Wall time: 5.8 ms


In [17]:
%%time

# Range of the training split: maxima of data/row/col, a blank line, then
# minima and another blank line.
train_parts = (R_train_coo.data, R_train_coo.row, R_train_coo.col)
for reducer in (np.max, np.min):
    for part in train_parts:
        print(reducer(part))
    print("")

# Coverage: distinct ids in the full dataset versus distinct ids that appear
# in the training split, first for users (rows) and then for items (columns).
for everything, kept in ((user, R_train_coo.row), (item, R_train_coo.col)):
    print(np.unique(everything).size)
    print(np.unique(kept).size)

100
1947155
98210

0
0
0

1947156
1926443
98211
94086
CPU times: user 22.4 s, sys: 2.25 s, total: 24.6 s
Wall time: 24.6 s


In [18]:
%%time
# NNZ by cols: number of stored entries in each column of the training matrix
# (stored zeros, i.e. ratings of 0, are counted too).
per_column = R_train_csr.getnnz(axis=0)

print(len(per_column))
print(per_column)
print("")
print(np.min(per_column))
print(np.max(per_column))
print("")
print(np.mean(per_column))

98211
[154716  74022 166262 ...      1      1      1]

0
703731

1056.1313600309538
CPU times: user 1.47 s, sys: 1.65 s, total: 3.12 s
Wall time: 3.11 s


In [19]:
%%time
# NNZ by rows: number of stored entries in each row of the training matrix
# (stored zeros, i.e. ratings of 0, are counted too).
per_row = R_train_csr.getnnz(axis=1)

print(len(per_row))
print(per_row)
print("")
print(np.min(per_row))
print(np.max(per_row))
print("")
print(np.mean(per_row))

1947156
[35  5  1 ... 39  2  6]

0
88144

53.269341028659234
CPU times: user 23.5 ms, sys: 7.99 ms, total: 31.5 ms
Wall time: 28.3 ms


In [20]:
%%time

print("write extra meta file")

# Meta file describing the binary dumps. Layout:
#   line 1: "<m> <n>"
#   line 2: number of training entries
#   next 9 lines: training .bin file names (COO, CSR, CSC)
#   next line: number of test entries
#   last 3 lines: test .bin file names (COO)
# The context manager closes the file even if one of the writes raises.
with open("meta_modified_all", 'w') as outfile_meta:
    outfile_meta.write(str(m) + " " + str(n) + "\n" + str(nnz_train) + "\n")
    outfile_meta.write("""R_train_coo.data.bin
R_train_coo.row.bin
R_train_coo.col.bin
R_train_csr.indptr.bin
R_train_csr.indices.bin
R_train_csr.data.bin
R_train_csc.indptr.bin
R_train_csc.indices.bin
R_train_csc.data.bin
""")
    outfile_meta.write(str(nnz_test) + "\n")
    outfile_meta.write("""R_test_coo.data.bin
R_test_coo.row.bin
R_test_coo.col.bin
""")

write extra meta file
CPU times: user 1.14 ms, sys: 0 ns, total: 1.14 ms
Wall time: 926 µs


In [21]:
%%time

print("writing extra meta file")

# Meta file for the plain-text dumps. Layout:
#   line 1: "<m> <n>"
#   line 2: "<number of training entries> train.txt"
#   line 3: "<number of test entries> test.txt"
# The context manager closes the file even if one of the writes raises.
with open("meta", 'w') as outfile_meta:
    outfile_meta.write(str(m) + " " + str(n) + "\n")
    outfile_meta.write(str(nnz_train) + " " + "train.txt\n")
    outfile_meta.write(str(nnz_test) + " " + "test.txt\n")

writing extra meta file
CPU times: user 968 µs, sys: 45 µs, total: 1.01 ms
Wall time: 826 µs
