In [1]:
import os
from six.moves import urllib
import numpy as np
from scipy import sparse
from sklearn.model_selection import train_test_split

In [2]:
%%time
# Base location of the MovieLens archives; maybe_download appends the file name.
url = 'http://files.grouplens.org/datasets/movielens/'


def maybe_download(filename, expected_bytes):
    """Return a local copy of `filename`, downloading it first when it is absent.

    The file is fetched from `url + filename` only if no file named
    `filename` exists yet. Its on-disk size is then compared with
    `expected_bytes`; a mismatch prints the actual size and raises
    Exception, a match prints a confirmation and returns the path.
    """
    already_present = os.path.exists(filename)
    if not already_present:
        local_path, _headers = urllib.request.urlretrieve(url + filename, filename)
        filename = local_path

    actual_bytes = os.stat(filename).st_size
    if actual_bytes != expected_bytes:
        # Show the size we actually got so a truncated download is easy to spot.
        print(actual_bytes)
        raise Exception('Failed to verify ' + filename + '. Can you get to it with a browser?')

    print('Found and verified', filename)
    return filename


# Fetch the archive (or reuse an existing local copy) and require it to be
# exactly 65566137 bytes on disk; maybe_download raises otherwise.
data_file = maybe_download('ml-10m.zip', 65566137)

Found and verified ml-10m.zip
CPU times: user 370 ms, sys: 214 ms, total: 584 ms
Wall time: 6.27 s


In [3]:
%%time
# Unpack the archive with the system `unzip`; -o overwrites files from a
# previous run without prompting, so the cell can be re-executed.
get_ipython().system(u'unzip -o ml-10m.zip')

Archive:  ml-10m.zip
   creating: ml-10M100K/
  inflating: ml-10M100K/allbut.pl    
  inflating: ml-10M100K/movies.dat   
  inflating: ml-10M100K/ratings.dat  
  inflating: ml-10M100K/README.html  
  inflating: ml-10M100K/split_ratings.sh  
  inflating: ml-10M100K/tags.dat     
CPU times: user 53.9 ms, sys: 30 ms, total: 83.9 ms
Wall time: 3.39 s


In [4]:
%%time
# file should look like
'''
1::122::5::838985046
1::185::5::838983525
1::231::5::838983392
1::292::5::838983421
1::316::5::838983392
1::329::5::838983392
1::355::5::838984474
1::356::5::838983653
1::362::5::838984885
1::364::5::838983707
'''
# Expected dataset dimensions; a later cell asserts the loaded data against them.
# m: largest user id, n: largest item id.
m = 71567
n = 65133
# Sizes of the train / test partitions; together they must equal the rating count.
nnz_train = 9000048
nnz_test = 1000006

data_filename = 'ml-10M100K/ratings.dat'

# Each record has four '::'-separated fields, of which only the first three
# (user, item, rating) are loaded.
# - Recent NumPy releases accept only single-character delimiters, so every
#   '::' is collapsed to ':' on the fly before parsing.
# - usecols makes the "drop the fourth field" behaviour explicit instead of
#   relying on extra columns being silently ignored.
# - np.float64 replaces the removed np.float alias (same underlying type).
with open(data_filename) as ratings_file:
    user, item, rating = np.loadtxt(
        (record.replace('::', ':') for record in ratings_file),
        delimiter=':',
        usecols=(0, 1, 2),
        dtype=[('f0', np.int32), ('f1', np.int32), ('f2', np.float64)],
        unpack=True)

CPU times: user 1min 17s, sys: 913 ms, total: 1min 18s
Wall time: 1min 18s


In [5]:
# Summarise the three loaded columns: raw preview, minima, maxima, number of
# distinct values, and finally the total number of ratings.
loaded_columns = (user, item, rating)

for column in loaded_columns:
    print(column)
print("")

for reducer in (np.min, np.max):
    for column in loaded_columns:
        print(reducer(column))
    print("")

for column in loaded_columns:
    print(np.unique(column).size)
print("")

print(user.size)

# The hard-coded dimensions must agree with what was actually loaded.
assert np.max(user) == m
assert np.max(item) == n
assert user.size == nnz_train + nnz_test

[    1     1     1 ... 71567 71567 71567]
[ 122  185  231 ... 2294 2338 2384]
[5. 5. 5. ... 5. 2. 2.]

1
1
0.5

71567
65133
5.0

69878
10677
10

10000054


In [6]:
%%time
# Stack the id vectors into a 2 x N array; its transpose has one
# (user, item) row per rating, which is the layout train_test_split expects.
user_item = np.vstack((user, item))

split_parts = train_test_split(
    user_item.T,
    rating,
    test_size=nnz_test,  # integer: absolute number of held-out ratings
    random_state=42,     # fixed seed so the partition is reproducible
)
user_item_train, user_item_test, rating_train, rating_test = split_parts

CPU times: user 1.78 s, sys: 1.03 s, total: 2.8 s
Wall time: 2.82 s


In [7]:
%%time
# Held-out ratings as a COO matrix; user ids are used directly as row indices
# and item ids as column indices.
R_test_coo = sparse.coo_matrix((rating_test, (user_item_test[:, 0], user_item_test[:, 1])))
assert R_test_coo.nnz == nnz_test

# Write one "user item rating" line per held-out rating. The context manager
# closes (and therefore flushes) the file when the loop ends, so test.txt is
# complete on disk before anything else reads it.
with open("test.txt", 'w') as outfile_test:
    for i in range(nnz_test):
        outfile_test.write(str((user_item_test[i, 0])) + " " + str((user_item_test[i, 1])) + " " + str(rating_test[i]) + "\n")

CPU times: user 3.2 s, sys: 23.9 ms, total: 3.22 s
Wall time: 3.24 s


In [8]:
%%time
# The test set is only needed in COO form (used to compute the test RMSE).
# Ratings are written as float32; row / column indices are written as stored.
test_values_f32 = R_test_coo.data.astype(np.float32)
test_values_f32.tofile('R_test_coo.data.bin')
R_test_coo.row.tofile('R_test_coo.row.bin')
R_test_coo.col.tofile('R_test_coo.col.bin')

# Read the three files straight back so they can be compared with the originals.
test_data, test_row, test_col = (
    np.fromfile(path, dtype=kind)
    for path, kind in (
        ('R_test_coo.data.bin', np.float32),
        ('R_test_coo.row.bin', np.int32),
        ('R_test_coo.col.bin', np.int32),
    )
)

CPU times: user 62.4 ms, sys: 23 ms, total: 85.4 ms
Wall time: 84 ms


In [9]:
# In-memory COO arrays first, then the copies read back from disk; the two
# groups should print identically.
print(R_test_coo.data, R_test_coo.row, R_test_coo.col, sep="\n")
print("")
print(test_data, test_row, test_col, sep="\n")

[2.  3.5 5.  ... 5.  4.5 3. ]
[  958 40310 69359 ... 40414 36467  5973]
[1270  364 2916 ... 1035 1089  479]

[2.  3.5 5.  ... 5.  4.5 3. ]
[  958 40310 69359 ... 40414 36467  5973]
[1270  364 2916 ... 1035 1089  479]


In [10]:
# Range of the held-out values and indices: maxima first, then minima.
test_parts = (R_test_coo.data, R_test_coo.row, R_test_coo.col)
for extreme in (np.max, np.min):
    for part in test_parts:
        print(extreme(part))
    print("")

# Distinct users / items in the full data set versus in the test split.
for everything, held_out in ((user, R_test_coo.row), (item, R_test_coo.col)):
    print(np.unique(everything).size)
    print(np.unique(held_out).size)

5.0
71567
65091

0.5
1
1

69878
68541
10677
9784


In [11]:
%%time
# Training ratings as a COO matrix; user ids are used directly as row indices
# and item ids as column indices.
R_train_coo = sparse.coo_matrix((rating_train, (user_item_train[:, 0], user_item_train[:, 1])))
assert R_train_coo.nnz == nnz_train

# Write one "user item rating" line per training rating. The context manager
# closes (and therefore flushes) the file when the loop ends, so train.txt is
# complete on disk before anything else reads it.
with open("train.txt", 'w') as outfile_train:
    for i in range(nnz_train):
        outfile_train.write(str((user_item_train[i, 0])) + " " + str((user_item_train[i, 1])) + " " + str(rating_train[i]) + "\n")

CPU times: user 30.6 s, sys: 275 ms, total: 30.9 s
Wall time: 31.3 s


In [12]:
%%time
# The training set is exported in three sparse layouts:
#   COO - used to calculate the training RMSE
#   CSR - used when calculating X from \Theta
#   CSC - used when calculating \Theta from X
# Values are always written as float32; index arrays are written as stored.
R_train_coo.data.astype(np.float32).tofile('R_train_coo.data.bin')
R_train_coo.row.tofile('R_train_coo.row.bin')
R_train_coo.col.tofile('R_train_coo.col.bin')

R_train_csr = R_train_coo.tocsr()
R_train_csc = R_train_coo.tocsc()

for compressed, data_path, indices_path, indptr_path in (
        (R_train_csr, 'R_train_csr.data.bin', 'R_train_csr.indices.bin', 'R_train_csr.indptr.bin'),
        (R_train_csc, 'R_train_csc.data.bin', 'R_train_csc.indices.bin', 'R_train_csc.indptr.bin')):
    compressed.data.astype(np.float32).tofile(data_path)
    compressed.indices.tofile(indices_path)
    compressed.indptr.tofile(indptr_path)

CPU times: user 3.09 s, sys: 715 ms, total: 3.81 s
Wall time: 3.83 s


In [13]:
%%time
# Read every exported training array back from disk: float32 for values,
# int32 for all index / pointer arrays.
value_kind, index_kind = np.float32, np.int32

train_data, train_row, train_col = (
    np.fromfile(path, dtype=kind)
    for path, kind in (
        ('R_train_coo.data.bin', value_kind),
        ('R_train_coo.row.bin', index_kind),
        ('R_train_coo.col.bin', index_kind),
    )
)

train_csc_data, train_csc_indices, train_csc_indptr = (
    np.fromfile(path, dtype=kind)
    for path, kind in (
        ('R_train_csc.data.bin', value_kind),
        ('R_train_csc.indices.bin', index_kind),
        ('R_train_csc.indptr.bin', index_kind),
    )
)

train_csr_data, train_csr_indices, train_csr_indptr = (
    np.fromfile(path, dtype=kind)
    for path, kind in (
        ('R_train_csr.data.bin', value_kind),
        ('R_train_csr.indices.bin', index_kind),
        ('R_train_csr.indptr.bin', index_kind),
    )
)

CPU times: user 4.97 ms, sys: 253 ms, total: 258 ms
Wall time: 261 ms


In [14]:
# Each in-memory triple is followed by the triple read back from disk, so
# consecutive groups should print identically (COO, then CSR, then CSC).
array_groups = (
    (R_train_coo.data, R_train_coo.row, R_train_coo.col),
    (train_data, train_row, train_col),
    (R_train_csr.data, R_train_csr.indices, R_train_csr.indptr),
    (train_csr_data, train_csr_indices, train_csr_indptr),
    (R_train_csc.data, R_train_csc.indices, R_train_csc.indptr),
    (train_csc_data, train_csc_indices, train_csc_indptr),
)
for position, group in enumerate(array_groups):
    if position:
        # Blank separator between groups, none after the last one.
        print("")
    for values in group:
        print(values)

[2.5 3.  3.  ... 5.  4.5 4. ]
[29330 32262 63164 ... 30726 46840 45933]
[ 3701  2976   589 ...  2018 30749   405]

[2.5 3.  3.  ... 5.  4.5 4. ]
[29330 32262 63164 ... 30726 46840 45933]
[ 3701  2976   589 ...  2018 30749   405]

[5. 5. 5. ... 5. 2. 2.]
[ 122  185  292 ... 2294 2338 2384]
[      0       0      18 ... 8999952 8999999 9000048]

[5. 5. 5. ... 5. 2. 2.]
[ 122  185  292 ... 2294 2338 2384]
[      0       0      18 ... 8999952 8999999 9000048]

[1.  3.  3.  ... 2.  2.5 5. ]
[    5    14    18 ... 40570 45430 68151]
[      0       0   23770 ... 9000041 9000041 9000048]

[1.  3.  3.  ... 2.  2.5 5. ]
[    5    14    18 ... 40570 45430 68151]
[      0       0   23770 ... 9000041 9000041 9000048]


In [15]:
# Range of the training values and indices: maxima first, then minima.
train_parts = (R_train_coo.data, R_train_coo.row, R_train_coo.col)
for extreme in (np.max, np.min):
    for part in train_parts:
        print(extreme(part))
    print("")

# Distinct users / items in the full data set versus in the training split.
for everything, kept in ((user, R_train_coo.row), (item, R_train_coo.col)):
    print(np.unique(everything).size)
    print(np.unique(kept).size)

5.0
71567
65133

0.5
1
1

69878
69878
10677
10667


In [16]:
%%time
print("writing extra meta_modified_all file")

# Layout of meta_modified_all:
#   line 1   : "<m> <n>"
#   line 2   : "<nnz_train>"
#   next 9   : names of the binary training files (COO, CSR, CSC)
#   last line: "<nnz_test> test.txt"
# The context manager closes the file so the content is flushed to disk.
with open("meta_modified_all", 'w') as outfile_meta:
    outfile_meta.write(str(m) + " " + str(n) + "\n" + str(nnz_train) + "\n")
    outfile_meta.write("""R_train_coo.data.bin
R_train_coo.row.bin
R_train_coo.col.bin
R_train_csr.indptr.bin
R_train_csr.indices.bin
R_train_csr.data.bin
R_train_csc.indptr.bin
R_train_csc.indices.bin
R_train_csc.data.bin
""")
    outfile_meta.write(str(nnz_test) + " " + "test.txt\n")

writing extra meta_modified_all file
CPU times: user 0 ns, sys: 605 µs, total: 605 µs
Wall time: 440 µs


In [17]:
%%time
print("writing extra meta file")

# Layout of meta:
#   line 1: "<m> <n>"
#   line 2: "<nnz_train> train.txt"
#   line 3: "<nnz_test> test.txt"
# The context manager closes the file so the content is flushed to disk.
with open("meta", 'w') as outfile_meta:
    outfile_meta.write(str(m) + " " + str(n) + "\n")
    outfile_meta.write(str(nnz_train) + " " + "train.txt\n")
    outfile_meta.write(str(nnz_test) + " " + "test.txt\n")

writing extra meta file
CPU times: user 713 µs, sys: 933 µs, total: 1.65 ms
Wall time: 806 µs
