code in part inspired by: https://github.com/EthanRosenthal/torchmf

In [1]:
import os
import mxnet as mx
from mxnet import gluon, nd, ndarray

import pandas as pd
import numpy as np

In [2]:
data_path = '/home/ubuntu/mxnet-the-straight-dope/incubator-mxnet/example/recommenders/ml-100k/'
num_emb = 64
opt = 'Adam'
lr = 0.02
mmntm = 0.
wd = 0.
batch_size = 50
ctx = mx.gpu(4)

In [3]:
def download_ml_data(prefix):
    if not os.path.exists("%s.zip" % prefix):
        print("Downloading MovieLens data: %s" % prefix)
        os.system("wget http://files.grouplens.org/datasets/movielens/%s.zip" % prefix)
        os.system("unzip {}.zip" % prefix)    

In [4]:
def max_id(fname):
    mu = 0
    mi = 0
    with open(fname) as f:
        for line in f:
            tks = line.strip().split('\t')
            if len(tks) != 4:
                continue
            mu = max(mu, int(tks[0]))
            mi = max(mi, int(tks[1]))
    return mu + 1, mi + 1
max_users, max_items = max_id(data_path + 'u.data')

FileNotFoundError: [Errno 2] No such file or directory: '/home/ubuntu/mxnet-the-straight-dope/incubator-mxnet/example/recommenders/ml-100k/u.data'

In [73]:
train_df = pd.read_csv(data_path+'u1.base', header=None, sep='\t')
test_df = pd.read_csv(data_path+'u1.test', header=None, sep='\t')

train_data = nd.array(train_df[[0,1]].values, dtype=np.float32)
train_label = nd.array(train_df[2].values, dtype=np.float32)

test_data = nd.array(test_df[[0,1]].values, dtype=np.float32)
test_label = nd.array(test_df[2].values, dtype=np.float32)

In [74]:
class SparseMatrixDataset(gluon.data.Dataset):
    def __init__(self, data, label):
        assert data.shape[0] == len(label)
        self.data = data
        self.label = label
        if isinstance(label, ndarray.NDArray) and len(label.shape) == 1:
            self._label = label.asnumpy()
        else:
            self._label = label       
        
    def __getitem__(self, idx):
        return self.data[idx, 0], self.data[idx, 1], self.label[idx]
    
    def __len__(self):
        return self.data.shape[0]
        

In [75]:
class MFBlock(gluon.Block):
    def __init__(self, max_users, max_items, num_emb, dropout_p=0.5):
        super(MFBlock, self).__init__()
        
        self.max_users = max_users
        self.max_items = max_items
        self.dropout_p = dropout_p
        self.num_emb = num_emb
        
        with self.name_scope():
            self.user_embeddings = gluon.nn.Embedding(max_users, num_emb)
            self.item_embeddings = gluon.nn.Embedding(max_items, num_emb)
            self.dropout = gluon.nn.Dropout(dropout_p)
            
    def forward(self, users, items):
        a = self.user_embeddings(users)
        b = self.item_embeddings(items)
        predictions = self.dropout(a) * self.dropout(b)      
        predictions = nd.sum(predictions, axis=1)
        return predictions

        

In [76]:
net = MFBlock(max_users=max_users, max_items=max_items, num_emb=num_emb, dropout_p=0.)
net.collect_params()

mfblock4_ (
  Parameter mfblock4_embedding0_weight (shape=(944, 64), dtype=<class 'numpy.float32'>)
  Parameter mfblock4_embedding1_weight (shape=(1683, 64), dtype=<class 'numpy.float32'>)
)

In [77]:
loss_function = gluon.loss.L2Loss()

In [78]:
net.collect_params().initialize(mx.init.Xavier(magnitude=2.24), ctx=ctx, force_reinit=True)

In [79]:
trainer = gluon.Trainer(net.collect_params(), 'sgd',
                        {'learning_rate': lr, 'wd': wd, 'momentum': 0.9})

In [80]:
train_data_iter = gluon.data.DataLoader(SparseMatrixDataset(train_data, train_label), 
                                        shuffle=True, batch_size=batch_size)
test_data_iter = gluon.data.DataLoader(SparseMatrixDataset(test_data, test_label),
                                          shuffle=True, batch_size=batch_size)

In [81]:
def eval_net(data, net):
    acc = mx.metric.RMSE()
    for i, (user, item, label) in enumerate(data):
        user = user.as_in_context(ctx).reshape((batch_size,))
        item = item.as_in_context(ctx).reshape((batch_size,))
        label = label.as_in_context(ctx).reshape((batch_size,))
        predictions = net(user, item)
        loss = loss_function(predictions, label)
        acc.update(preds=predictions, labels=label)
    return acc.get()[1]

In [58]:
eval_net(test_data_iter, net)


3.5359177671939133

In [101]:
epochs = 1
#smoothing_constant = 10

def train(data_iter, net):
    a = []
    b = []
    c = []
    d = []
    for e in range(epochs):
        print("epoch: {}".format(e))
        for i, (user, item, label) in enumerate(train_data_iter):
            user = user.as_in_context(ctx).reshape((batch_size,))
            item = item.as_in_context(ctx).reshape((batch_size,))
            label = label.as_in_context(ctx).reshape((batch_size,))
            with mx.autograd.record():
                output = net(user, item)               
                loss = loss_function(output, label)
                loss.backward()
            net.collect_params().values()
            trainer.step(batch_size)
        a = eval_net(test_data_iter, net)
        b = eval_net(train_data_iter, net)
        print("EPOCH {}: RMSE ON TRAINING and TEST: {}. {}".format(e,a,b))
        
    return a, b

In [102]:
(a,b) = train(train_data_iter, net)

epoch: 0
EPOCH 0: RMSE ON TRAINING and TEST: 0.7893240341067315. 0.7341532736964523


In [103]:
(a,b)

(0.78932403410673146, 0.73415327369645234)