In [1]:
%%time

import sys
from urllib.request import urlretrieve
import os
import numpy as np
import pandas as pd
from scipy import sparse
from sklearn.model_selection import train_test_split
import zipfile as zf

CPU times: user 1.62 s, sys: 2.84 s, total: 4.46 s
Wall time: 639 ms


In [2]:
%%time

# Download the data.
# Base URL of the MovieLens archives; maybe_download() appends the file name to it.
url = 'http://files.grouplens.org/datasets/movielens/'


def reporthook(blocknum, blocksize, totalsize):
    """Progress callback for ``urlretrieve``; writes a one-line status to stderr.

    Args:
        blocknum: Number of blocks transferred so far.
        blocksize: Size of one block in bytes.
        totalsize: Total size of the download in bytes, or a value <= 0 when
            the size is unknown.
    """
    readsofar = blocknum * blocksize
    if totalsize > 0:
        # The last block is usually only partially filled, so the raw product
        # can overshoot the real size; clamp it so the display never exceeds
        # 100% or reports more bytes than the file contains.
        readsofar = min(readsofar, totalsize)
        percent = readsofar * 1e2 / totalsize
        # "\r" rewinds to the start of the line so the status updates in place.
        s = "\r%5.1f%% %*d / %d" % (
            percent, len(str(totalsize)), readsofar, totalsize)
        sys.stderr.write(s)
        if readsofar >= totalsize:  # download finished: terminate the status line
            sys.stderr.write("\n")
    else:  # total size is unknown
        sys.stderr.write("read %d\n" % (readsofar,))

def maybe_download(filename, expected_bytes):
    """Fetch ``filename`` unless it already exists, then check its size.

    A missing file is retrieved from the module-level ``url`` joined with
    ``filename``, reporting progress through ``reporthook``.

    Args:
        filename: Local path of the file; also appended to ``url`` to fetch it.
        expected_bytes: Size in bytes the file on disk must have.

    Returns:
        The path of the verified file.

    Raises:
        Exception: If the size on disk differs from ``expected_bytes``.
    """
    if not os.path.exists(filename):
        filename, _ = urlretrieve(url + filename, filename, reporthook)

    actual_bytes = os.stat(filename).st_size
    if actual_bytes != expected_bytes:
        # Show the size that was actually found before failing.
        print(actual_bytes)
        raise Exception('Failed to verify ' + filename + '. Can you get to it with a browser?')

    print('Found and verified', filename)
    return filename


# Fetch the archive unless it is already on disk; fails unless the file is exactly 65566137 bytes.
data_file = maybe_download('ml-10m.zip', 65566137)

Found and verified ml-10m.zip
CPU times: user 56 µs, sys: 0 ns, total: 56 µs
Wall time: 58.7 µs


In [3]:
%%time

# Unpack the verified archive into the current working directory.
# Reuse the path returned by maybe_download() instead of repeating the file
# name, so the archive that was size-checked is the one that gets extracted.
with zf.ZipFile(data_file, "r") as zip_ref:
    zip_ref.extractall()

CPU times: user 1.32 s, sys: 219 ms, total: 1.54 s
Wall time: 1.55 s


In [4]:
%%time

# Re-index the raw user and item ids to dense, 1-based ids (in order of first
# appearance) and write the result to "ratings.dat" as "user,item,rating"
# lines. The fourth field of each input line is dropped.
#
# file should look like
#   1::122::5::838985046
#   1::185::5::838983525
#   1::231::5::838983392
#   1::292::5::838983421
#   1::316::5::838983392
#   1::329::5::838983392
#   1::355::5::838984474
#   1::356::5::838983653
#   1::362::5::838984885
#   1::364::5::838983707

map_items = {}  # raw item id (str) -> dense item id
map_users = {}  # raw user id (str) -> dense user id
n = 1  # next dense item id to hand out
m = 1  # next dense user id to hand out

fRatings = "ml-10M100K/ratings.dat"

# `with` closes both files (flushing the output) even if a malformed line
# raises part-way through the loop.
with open(fRatings, "r") as fileIN, open("ratings.dat", "w") as fileOUT:
    for line in fileIN:
        if not line.strip():
            # A blank (e.g. trailing) line carries no rating; skip it instead
            # of failing with an IndexError below.
            continue

        token = line.strip().split('::')
        uid = token[0]
        iid = token[1]
        score = token[2]

        if uid in map_users:
            real_uid = map_users[uid]
        else:
            map_users[uid] = m
            real_uid = m
            m += 1

        if iid in map_items:
            real_iid = map_items[iid]
        else:
            map_items[iid] = n
            real_iid = n
            n += 1

        fileOUT.write(str(real_uid) + "," + str(real_iid) + "," + str(score) + "\n")

CPU times: user 19.4 s, sys: 232 ms, total: 19.6 s
Wall time: 19.8 s


In [5]:
%%time

# Expected dimensions of the re-indexed data set: m is the largest user id,
# n the largest item id, and nnz_train + nnz_test the total number of ratings.
m, n = 69_878, 10_677
nnz_train, nnz_test = 9_000_048, 1_000_006

data_filename = 'ratings.dat'

# The file has no header row; columns are user id, item id, rating.
data = pd.read_csv(
    data_filename,
    header=None,
    usecols=[0, 1, 2],
    dtype={0: 'int32', 1: 'int32', 2: 'float32'},
)

# Pull the three columns out as plain NumPy arrays.
user, item, rating = (data[column].values for column in (0, 1, 2))

CPU times: user 1.75 s, sys: 200 ms, total: 1.95 s
Wall time: 1.95 s


In [6]:
%%time

# Print the three columns, then their minima, maxima and distinct-value
# counts, one value per line with a blank line between groups.
print(user, item, rating, sep="\n")
print("")
print(np.min(user), np.min(item), np.min(rating), sep="\n")
print("")
print(np.max(user), np.max(item), np.max(rating), sep="\n")
print("")
print(np.unique(user).size, np.unique(item).size, np.unique(rating).size, sep="\n")
print("")
print(user.size)

# The loaded data must match the hard-coded dimensions.
assert np.max(user) == m
assert np.max(item) == n
assert user.size == nnz_train + nnz_test

[    1     1     1 ... 69878 69878 69878]
[   1    2    3 ...  538  542 1672]
[5. 5. 5. ... 5. 2. 2.]

1
1
0.5

69878
10677
5.0

69878
10677
10

10000054
CPU times: user 1.05 s, sys: 84.9 ms, total: 1.13 s
Wall time: 1.12 s


In [7]:
%%time

# Stack the two id columns as rows; the transpose passed below has one
# (user, item) pair per row, aligned with `rating`.
user_item = np.vstack((user, item))

# test_size is an absolute number of rows; random_state fixes the shuffle.
(user_item_train,
 user_item_test,
 rating_train,
 rating_test) = train_test_split(
    user_item.T, rating, test_size=nnz_test, random_state=42)

CPU times: user 772 ms, sys: 180 ms, total: 952 ms
Wall time: 1e+03 ms


In [8]:
%%time

#1-based to 0-based
# Pass the shape explicitly: without it scipy infers the dimensions from the
# largest index present, and a random split is not guaranteed to contain the
# highest user or item id.
R_test_coo = sparse.coo_matrix(
    (rating_test, (user_item_test[:, 0] - 1, user_item_test[:, 1] - 1)),
    shape=(m, n))
assert R_test_coo.nnz == nnz_test

# test.txt keeps the 1-based ids: one "user item rating" triple per line.
# `with` closes (and flushes) the file even if a write fails.
with open("test.txt", 'w') as outfile_test:
    for i in range(nnz_test):
        outfile_test.write(str(user_item_test[i, 0]) + " " + str(user_item_test[i, 1]) + " " + str(rating_test[i]) + "\n")

CPU times: user 2.53 s, sys: 22 ms, total: 2.56 s
Wall time: 2.53 s


In [9]:
%%time

# for test data, we need COO format to calculate test RMSE

# The .bin files are raw dumps with no dtype header, so pin the dtypes on
# write to exactly those used to read them back: float32 values and int32
# indices. (The index dtype of a scipy sparse matrix is chosen by scipy and
# should not be relied on.)
R_test_coo.data.astype(np.float32).tofile('R_test_coo.data.bin')
R_test_coo.row.astype(np.int32).tofile('R_test_coo.row.bin')
R_test_coo.col.astype(np.int32).tofile('R_test_coo.col.bin')

test_data = np.fromfile('R_test_coo.data.bin', dtype=np.float32)
test_row = np.fromfile('R_test_coo.row.bin', dtype=np.int32)
test_col = np.fromfile('R_test_coo.col.bin', dtype=np.int32)

# Round-trip check: what was read back must equal what was written.
assert np.array_equal(test_data, R_test_coo.data.astype(np.float32))
assert np.array_equal(test_row, R_test_coo.row)
assert np.array_equal(test_col, R_test_coo.col)

CPU times: user 3.2 ms, sys: 19.9 ms, total: 23.1 ms
Wall time: 62.1 ms


In [10]:
%%time

# In-memory COO arrays first, then the copies read back from disk; the two
# groups should print identically.
print(R_test_coo.data, R_test_coo.row, R_test_coo.col, sep="\n")
print("")
print(test_data, test_row, test_col, sep="\n")

[2.  3.5 5.  ... 5.  4.5 3. ]
[  912 39343 67757 ... 39442 35599  5790]
[235   9 610 ... 381 389 348]

[2.  3.5 5.  ... 5.  4.5 3. ]
[  912 39343 67757 ... 39442 35599  5790]
[235   9 610 ... 381 389 348]
CPU times: user 3.2 ms, sys: 0 ns, total: 3.2 ms
Wall time: 2.78 ms


In [11]:
%%time

# Maxima, then minima, of the test split's values and 0-based indices.
print(np.max(R_test_coo.data), np.max(R_test_coo.row), np.max(R_test_coo.col), sep="\n")
print("")
print(np.min(R_test_coo.data), np.min(R_test_coo.row), np.min(R_test_coo.col), sep="\n")
print("")
# Distinct users / items overall versus those that occur in the test split.
print(
    np.unique(user).size,
    np.unique(R_test_coo.row).size,
    np.unique(item).size,
    np.unique(R_test_coo.col).size,
    sep="\n",
)

5.0
69877
10669

0.5
0
0

69878
68541
10677
9784
CPU times: user 780 ms, sys: 48 ms, total: 828 ms
Wall time: 822 ms


In [12]:
%%time

#1-based to 0-based
# Pass the shape explicitly: without it scipy infers the dimensions from the
# largest index present, so a split that happens to miss the highest user or
# item id would yield a smaller matrix (and shorter CSR/CSC indptr arrays)
# than the m x n recorded in the meta files.
R_train_coo = sparse.coo_matrix(
    (rating_train, (user_item_train[:, 0] - 1, user_item_train[:, 1] - 1)),
    shape=(m, n))
assert R_train_coo.nnz == nnz_train

# train.txt keeps the 1-based ids: one "user item rating" triple per line.
# `with` closes (and flushes) the file even if a write fails.
with open("train.txt", 'w') as outfile_train:
    for i in range(nnz_train):
        outfile_train.write(str(user_item_train[i, 0]) + " " + str(user_item_train[i, 1]) + " " + str(rating_train[i]) + "\n")

CPU times: user 22.9 s, sys: 342 ms, total: 23.3 s
Wall time: 23.2 s


In [13]:
%%time

# for training data, we need COO format to calculate training RMSE
# we need CSR format R when calculate X from \Theta
# we need CSC format of R when calculating \Theta from X
#
# The .bin files are raw dumps with no dtype header. Values are written as
# float32 and every index array as int32, matching the dtypes used to read
# them back; the index dtype of a scipy sparse matrix is chosen by scipy and
# should not be relied on.
R_train_coo.data.astype(np.float32).tofile('R_train_coo.data.bin')
R_train_coo.row.astype(np.int32).tofile('R_train_coo.row.bin')
R_train_coo.col.astype(np.int32).tofile('R_train_coo.col.bin')

R_train_csr = R_train_coo.tocsr()
R_train_csc = R_train_coo.tocsc()

R_train_csr.data.astype(np.float32).tofile('R_train_csr.data.bin')
R_train_csr.indices.astype(np.int32).tofile('R_train_csr.indices.bin')
R_train_csr.indptr.astype(np.int32).tofile('R_train_csr.indptr.bin')
R_train_csc.data.astype(np.float32).tofile('R_train_csc.data.bin')
R_train_csc.indices.astype(np.int32).tofile('R_train_csc.indices.bin')
R_train_csc.indptr.astype(np.int32).tofile('R_train_csc.indptr.bin')

CPU times: user 1.55 s, sys: 273 ms, total: 1.82 s
Wall time: 2.02 s


In [14]:
%%time

# Read every training dump back from disk: values as float32, index arrays
# as int32.

# COO triplets
train_data = np.fromfile('R_train_coo.data.bin', dtype=np.float32)
train_row, train_col = (
    np.fromfile(path, dtype=np.int32)
    for path in ('R_train_coo.row.bin', 'R_train_coo.col.bin')
)

# CSC arrays
train_csc_data = np.fromfile('R_train_csc.data.bin', dtype=np.float32)
train_csc_indices, train_csc_indptr = (
    np.fromfile(path, dtype=np.int32)
    for path in ('R_train_csc.indices.bin', 'R_train_csc.indptr.bin')
)

# CSR arrays
train_csr_data = np.fromfile('R_train_csr.data.bin', dtype=np.float32)
train_csr_indices, train_csr_indptr = (
    np.fromfile(path, dtype=np.int32)
    for path in ('R_train_csr.indices.bin', 'R_train_csr.indptr.bin')
)

CPU times: user 0 ns, sys: 140 ms, total: 140 ms
Wall time: 139 ms


In [15]:
%%time

# For each format the in-memory arrays are printed first, followed by the
# copies read back from disk; each pair of groups should print identically.

# COO
print(R_train_coo.data, R_train_coo.row, R_train_coo.col, sep="\n")
print("")
print(train_data, train_row, train_col, sep="\n")
print("")
# CSR
print(R_train_csr.data, R_train_csr.indices, R_train_csr.indptr, sep="\n")
print("")
print(train_csr_data, train_csr_indices, train_csr_indptr, sep="\n")
print("")
# CSC
print(R_train_csc.data, R_train_csc.indices, R_train_csc.indptr, sep="\n")
print("")
print(train_csc_data, train_csc_indices, train_csc_indptr, sep="\n")

[2.5 3.  3.  ... 5.  4.5 4. ]
[28591 31467 61778 ... 29956 45749 44857]
[ 679  618   19 ... 2151 1954 1578]

[2.5 3.  3.  ... 5.  4.5 4. ]
[28591 31467 61778 ... 29956 45749 44857]
[ 679  618   19 ... 2151 1954 1578]

[5. 5. 5. ... 1. 1. 1.]
[   0    1    3 ... 3066 3448 5330]
[      0      18      35 ... 8999952 8999999 9000048]

[5. 5. 5. ... 1. 1. 1.]
[   0    1    3 ... 3066 3448 5330]
[      0      18      35 ... 8999952 8999999 9000048]

[5.  3.  2.5 ... 2.  4.  4. ]
[    0   128   136 ... 65888 67546 69154]
[      0    2158   15637 ... 9000046 9000047 9000048]

[5.  3.  2.5 ... 2.  4.  4. ]
[    0   128   136 ... 65888 67546 69154]
[      0    2158   15637 ... 9000046 9000047 9000048]
CPU times: user 11.8 ms, sys: 3.99 ms, total: 15.8 ms
Wall time: 14.5 ms


In [16]:
%%time

# Maxima, then minima, of the training split's values and 0-based indices.
print(np.max(R_train_coo.data), np.max(R_train_coo.row), np.max(R_train_coo.col), sep="\n")
print("")
print(np.min(R_train_coo.data), np.min(R_train_coo.row), np.min(R_train_coo.col), sep="\n")
print("")
# Distinct users / items overall versus those that occur in the training split.
print(
    np.unique(user).size,
    np.unique(R_train_coo.row).size,
    np.unique(item).size,
    np.unique(R_train_coo.col).size,
    sep="\n",
)

5.0
69877
10676

0.5
0
0

69878
69878
10677
10667
CPU times: user 1.73 s, sys: 99.5 ms, total: 1.83 s
Wall time: 1.82 s


In [17]:
%%time
#NNZ by cols
# Count the stored entries per column once and reuse the result, rather than
# recomputing it for every statistic.
nnz_per_col = R_train_csr.getnnz(axis=0)
print(len(nnz_per_col))
print(nnz_per_col)
print("")
print(np.min(nnz_per_col))
print(np.max(nnz_per_col))
print("")
print(np.mean(nnz_per_col))

10677
[ 2158 13479 16029 ...     1     1     1]

0
31307

842.9379039055915
CPU times: user 123 ms, sys: 127 ms, total: 251 ms
Wall time: 248 ms


In [18]:
%%time
#NNZ by rows
# Count the stored entries per row once and reuse the result, rather than
# recomputing it for every statistic.
nnz_per_row = R_train_csr.getnnz(axis=1)
print(len(nnz_per_row))
print(nnz_per_row)
print("")
print(np.min(nnz_per_row))
print(np.max(nnz_per_row))
print("")
print(np.mean(nnz_per_row))

69878
[ 18  17  28 ... 128  47  49]

11
6619

128.79658833967773
CPU times: user 3.11 ms, sys: 0 ns, total: 3.11 ms
Wall time: 2.12 ms


In [19]:
%%time

print("write extra meta file")

# Layout of "meta_modified_all":
#   "<m> <n>", "<nnz_train>", the nine training .bin file names,
#   "<nnz_test>", the three test .bin file names.
# `with` closes (and flushes) the file even if a write fails.
with open("meta_modified_all", 'w') as outfile_meta:
    outfile_meta.write(str(m) + " " + str(n) + "\n" + str(nnz_train) + "\n")
    outfile_meta.write("""R_train_coo.data.bin
R_train_coo.row.bin
R_train_coo.col.bin
R_train_csr.indptr.bin
R_train_csr.indices.bin
R_train_csr.data.bin
R_train_csc.indptr.bin
R_train_csc.indices.bin
R_train_csc.data.bin
""")
    outfile_meta.write(str(nnz_test) + "\n")
    outfile_meta.write("""R_test_coo.data.bin
R_test_coo.row.bin
R_test_coo.col.bin
""")

write extra meta file
CPU times: user 874 µs, sys: 0 ns, total: 874 µs
Wall time: 610 µs


In [20]:
%%time

print("writing extra meta file")

# Layout of "meta": "<m> <n>", "<nnz_train> train.txt", "<nnz_test> test.txt".
# `with` closes (and flushes) the file even if a write fails.
with open("meta", 'w') as outfile_meta:
    outfile_meta.write(str(m) + " " + str(n) + "\n")
    outfile_meta.write(str(nnz_train) + " " + "train.txt\n")
    outfile_meta.write(str(nnz_test) + " " + "test.txt\n")

writing extra meta file
CPU times: user 665 µs, sys: 0 ns, total: 665 µs
Wall time: 592 µs
