In [1]:
import pickle, time, os
import numpy as np
import paddle
import pandas as pd
import paddle.nn as nn
from tqdm import tqdm

from visualdl import LogWriter
logwriter = LogWriter(logdir='./runs')
# visualdl --logdir ./runs/ --host 0.0.0.0 --port 8040

users_df = pd.read_csv('data/csv/users.csv')
items_df = pd.read_csv('data/csv/items.csv')

In [2]:
emb_scale = 256
batch_size = 16384
len_users = len(users_df)
len_items = len(items_df)

In [3]:
class Dataset(paddle.io.Dataset):
    def __init__(self, data):
        self.data = data
        pass
    def __getitem__(self, idx):
        return self.data[idx][0:2], self.data[idx][2]
    def __len__(self):
        return len(self.data)
    pass 

In [4]:
class Net(nn.Layer):
    def __init__(self):
        super(Net,self).__init__()
        self.users_emb = nn.Embedding(len_users+1, emb_scale)
        self.items_emb = nn.Embedding(len_items+1, emb_scale)
        self.user_fc1 = nn.Linear(emb_scale, 128)
        self.item_fc1 = nn.Linear(emb_scale, 128)
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()
        self.cos = nn.CosineSimilarity()
        pass
    def forward(self, input):
        user = self.users_emb(input[:,0])
        item = self.items_emb(input[:,1])
        user = self.user_fc1(user)
        item = self.item_fc1(item)
        user = self.relu(user)
        item = self.relu(item)
        x = self.cos(user, item)
        x = self.sigmoid(x)
        return x
    pass
net = Net()
optim = paddle.optimizer.Adam(parameters=net.parameters(), learning_rate=0.005, weight_decay=paddle.regularizer.L2Decay(1e-3))
m = paddle.metric.Recall()

In [5]:
train_loss_, train_acc_, eval_loss_, eval_acc_ = [], [], [], []

for epoch_id in range(1):

    train_loss, train_acc, eval_loss, eval_acc = [], [], [], []

    net.train()
    m.reset()
    for batch_id, data in enumerate(tqdm(train_dataset)):
        x_data = data[0]
        y_data = data[1]
        x_data = paddle.cast(x_data, dtype='int32')

        y_pred = net(x_data)
        loss = nn.functional.mse_loss(y_pred, paddle.cast(y_data, dtype='float32'))
        acc = paddle.static.accuracy(paddle.reshape(y_pred, (batch_size, 1)), paddle.reshape(paddle.cast(y_data, dtype='int64'), (batch_size, 1)))
        m.update(y_pred, y_data)
        loss.backward()

        optim.step()
        optim.clear_grad()
        train_loss.append(loss.numpy())
        train_acc.append(acc.numpy())

        logwriter.add_scalar("train_loss", value=loss.numpy(), step=batch_id+epoch_id*(batch_size))
        logwriter.add_scalar("train_acc", value=acc.numpy(), step=batch_id+epoch_id*(batch_size))
        logwriter.add_scalar("train_recall", value=m.accumulate(), step=batch_id+epoch_id*(batch_size))
        
        if m.accumulate() > 0.3:
            break

    file_list = os.listdir('data/net/eval/')
    file = open('data/net/eval/'+np.random.choice(file_list), 'rb')
    train_dataset = paddle.io.DataLoader(Dataset(pickle.load(file)),
                                         drop_last=True,
                                         batch_size=batch_size,
                                         shuffle=True)
    file.close() 

    net.eval()
    m.reset()
    for batch_id, data in enumerate(tqdm(train_dataset)):
        x_data = data[0]
        y_data = data[1]
        x_data = paddle.cast(x_data, dtype='int32')

        y_pred = net(x_data)
        loss = nn.functional.mse_loss(y_pred, paddle.cast(y_data, dtype='float32'))
        acc = paddle.static.accuracy(paddle.reshape(y_pred, (batch_size, 1)), paddle.reshape(paddle.cast(y_data, dtype='int64'), (batch_size, 1)))
        m.update(y_pred, y_data)

    logwriter.add_scalar("eval_loss", value=np.mean(eval_loss), step=epoch_id)
    logwriter.add_scalar("eval_acc", value=np.mean(eval_acc), step=epoch_id)
    logwriter.add_scalar("eval_recall", value=m.accumulate(), step=epoch_id)

    train_loss_.append(train_loss)
    train_acc_.append(train_acc)
    eval_loss_.append(eval_loss)
    eval_acc_.append(eval_acc)
    print("epoch_id: {}, batch_id: {}, loss: {}, acc: {}, recall: {}".format(epoch_id, batch_id+1, loss.numpy(), acc.numpy(), m.accumulate()))

NameError: name 'train_dataset' is not defined

In [6]:
train_loss_, train_acc_, eval_loss_, eval_acc_ = [], [], [], []

file_list = os.listdir('data/net/train/')

for epoch_id, file in enumerate(file_list):
    
    file = open('data/net/train/'+file, 'rb')
    train_dataset = paddle.io.DataLoader(Dataset(pickle.load(file)),
                                         drop_last=True,
                                         batch_size=batch_size,
                                         shuffle=True)
    file.close() 

    train_loss, train_acc, eval_loss, eval_acc = [], [], [], []
    
    net.eval()
    m.reset()
    for batch_id, data in enumerate(train_dataset):
        x_data = data[0]
        y_data = data[1]
        x_data = paddle.cast(x_data, dtype='int32')

        y_pred = net(x_data)
        loss = nn.functional.mse_loss(y_pred, paddle.cast(y_data, dtype='float32'))
        acc = paddle.static.accuracy(paddle.reshape(y_pred, (batch_size, 1)), paddle.reshape(paddle.cast(y_data, dtype='int64'), (batch_size, 1)))
        m.update(y_pred, y_data)

    logwriter.add_scalar("eval_loss", value=np.mean(eval_loss), step=epoch_id)
    logwriter.add_scalar("eval_acc", value=np.mean(eval_acc), step=epoch_id)
    logwriter.add_scalar("eval_recall", value=m.accumulate(), step=epoch_id)

    train_loss_.append(train_loss)
    train_acc_.append(train_acc)
    eval_loss_.append(eval_loss)
    eval_acc_.append(eval_acc)
    print("epoch_id: {}, batch_id: {}, loss: {}, acc: {}, recall: {}".format(epoch_id, batch_id+1, loss.numpy(), acc.numpy(), m.accumulate()))

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


epoch_id: 0, batch_id: 104, loss: [0.33535033], acc: [0.9990845], recall: 1.0
epoch_id: 1, batch_id: 104, loss: [0.33551526], acc: [0.9980469], recall: 1.0
epoch_id: 2, batch_id: 104, loss: [0.33550274], acc: [0.9990845], recall: 1.0
epoch_id: 3, batch_id: 104, loss: [0.33559328], acc: [0.99871826], recall: 1.0
epoch_id: 4, batch_id: 104, loss: [0.3355109], acc: [0.99871826], recall: 1.0
epoch_id: 5, batch_id: 104, loss: [0.33555275], acc: [0.9989624], recall: 1.0
epoch_id: 6, batch_id: 104, loss: [0.33550736], acc: [0.9984131], recall: 1.0
epoch_id: 7, batch_id: 104, loss: [0.33529878], acc: [0.9987793], recall: 1.0
epoch_id: 8, batch_id: 104, loss: [0.33524355], acc: [0.9984741], recall: 1.0
epoch_id: 9, batch_id: 104, loss: [0.33541608], acc: [0.9987793], recall: 1.0
epoch_id: 10, batch_id: 104, loss: [0.33567217], acc: [0.9993286], recall: 1.0
epoch_id: 11, batch_id: 104, loss: [0.33535877], acc: [0.9989624], recall: 1.0
epoch_id: 12, batch_id: 104, loss: [0.33580738], acc: [0.9992

In [18]:
file_list

['1.pkl',
 '10.pkl',
 '100.pkl',
 '11.pkl',
 '12.pkl',
 '13.pkl',
 '14.pkl',
 '15.pkl',
 '16.pkl',
 '17.pkl',
 '18.pkl',
 '19.pkl',
 '2.pkl',
 '20.pkl',
 '21.pkl',
 '22.pkl',
 '23.pkl',
 '24.pkl',
 '25.pkl',
 '26.pkl',
 '27.pkl',
 '28.pkl',
 '29.pkl',
 '3.pkl',
 '30.pkl',
 '31.pkl',
 '32.pkl',
 '33.pkl',
 '34.pkl',
 '35.pkl',
 '36.pkl',
 '37.pkl',
 '38.pkl',
 '39.pkl',
 '4.pkl',
 '40.pkl',
 '41.pkl',
 '42.pkl',
 '43.pkl',
 '44.pkl',
 '45.pkl',
 '46.pkl',
 '47.pkl',
 '48.pkl',
 '49.pkl',
 '5.pkl',
 '50.pkl',
 '51.pkl',
 '52.pkl',
 '53.pkl',
 '54.pkl',
 '55.pkl',
 '56.pkl',
 '57.pkl',
 '58.pkl',
 '59.pkl',
 '6.pkl',
 '60.pkl',
 '61.pkl',
 '62.pkl',
 '63.pkl',
 '64.pkl',
 '65.pkl',
 '66.pkl',
 '67.pkl',
 '68.pkl',
 '69.pkl',
 '7.pkl',
 '70.pkl',
 '71.pkl',
 '72.pkl',
 '73.pkl',
 '74.pkl',
 '75.pkl',
 '76.pkl',
 '77.pkl',
 '78.pkl',
 '79.pkl',
 '8.pkl',
 '80.pkl',
 '81.pkl',
 '82.pkl',
 '83.pkl',
 '84.pkl',
 '85.pkl',
 '86.pkl',
 '87.pkl',
 '88.pkl',
 '89.pkl',
 '9.pkl',
 '90.pkl',
 '91.pk

In [22]:
len(train_dataset) * batch_size * len(file_list)

42803200

In [8]:
paddle.save(net.state_dict(), 'data/model.pdparams')

In [17]:
net = Net()

In [19]:
net.set_state_dict(paddle.load('data/model.pdparams'))

([], [])

In [20]:
net.state_dict()

OrderedDict([('users_emb.weight',
              Parameter containing:
              Tensor(shape=[12119, 256], dtype=float32, place=Place(cpu), stop_gradient=False,
                     [[ 0.01226373, -0.00018846,  0.01930048, ..., -0.00143591,
                       -0.00942702,  0.00551308],
                      [ 0.01507964,  0.01957043, -0.00743642, ...,  0.01419000,
                        0.00617457,  0.00221456],
                      [ 0.01868310, -0.01385227, -0.02084335, ...,  0.00755191,
                       -0.00677125,  0.00990534],
                      ...,
                      [-0.00433739, -0.01126241, -0.01520206, ...,  0.01855347,
                        0.01137436,  0.01814571],
                      [-0.00794235, -0.00775500,  0.01469941, ..., -0.00523128,
                        0.01908569, -0.01021772],
                      [-0.00430771, -0.00947088,  0.01025717, ...,  0.00727685,
                       -0.00015765, -0.01571073]])),
             ('items_emb.