In [37]:
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
import torch.nn.functional as F
import math
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, TensorDataset

In [2]:
# Load the MovieLens-1M ratings file. The '::' delimiter is longer than one
# character, which the C parser does not support, so the python engine is
# requested explicitly instead of relying on pandas' fallback (and its
# ParserWarning).
data = pd.read_csv('../data/ml-1m/ratings.dat', sep='::', engine='python', names=['user','item', 'rating', 'timeStamp'])
data.head()

  """Entry point for launching an IPython kernel.


Unnamed: 0,user,item,rating,timeStamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [3]:
# Re-index raw user ids to contiguous integers 0..n-1, numbered in order of
# first appearance, and rewrite the column with the new ids.
user2id = {uid: idx for idx, uid in enumerate(data['user'].unique().tolist())}
data['user'] = data['user'].map(user2id)

# Same re-indexing for item ids.
item2id = {itemid: idx for idx, itemid in enumerate(data['item'].unique().tolist())}
data['item'] = data['item'].map(item2id)


In [10]:
# Keep only the three columns used downstream; the timeStamp column is dropped.
# Note: this rebinds the notebook-global `data`.
data = data[['user', 'item', 'rating']]
data.head()

Unnamed: 0,user,item,rating
0,0,0,5
1,0,1,3
2,0,2,3
3,0,3,4
4,0,4,5


In [12]:
# Iterating a DataFrame directly yields its column labels, not its rows, so
# `row` is a column name here and row[0] is that name's first character
# (hence the output u / i / r).
for row in data:
    print(row[0])

u
i
r


In [237]:
class DMF(nn.Module):
    """Deep Matrix Factorization scorer.

    Each user is represented by their row of the user-item rating matrix and
    each item by its column. Both are projected to `hidden1` dimensions by a
    tower-specific linear layer, passed through a small MLP, and the
    prediction is the cosine similarity of the two resulting vectors,
    clamped from below at 1e-6.

    Note: `mlp_model` is a single module applied to both the user and the
    item tower, so its weights are shared between the two.
    """

    def __init__(self,data, hidden1, hidden2, outputs):
        """
        data: DataFrame whose first three columns are [user, item, rating],
              with user/item already re-indexed to 0..n-1.
        hidden1: width of the first (tower-specific) projection.
        hidden2: hidden width of the shared MLP.
        outputs: dimension of the final latent vectors.
        """
        super(DMF, self).__init__()
        self.data = data
        self.num_user = len(data['user'].unique())
        self.num_item = len(data['item'].unique())
        print(self.num_user, self.num_item)
        self.hidden1 = hidden1
        # Build the fixed input representations: a user's vector is their
        # ratings over all items; an item's vector is the ratings it received
        # from all users (the transpose of the same matrix).
        self.user_item_matrix = self.generate_useritem_matrix(data)
        weight_user_item = torch.FloatTensor(self.user_item_matrix)
        weight_item_user = torch.FloatTensor(self.user_item_matrix.T)
        # freeze=True: the rating vectors are inputs, not trainable weights.
        self.user_embed = nn.Embedding.from_pretrained(weight_user_item, freeze=True)
        self.item_embed = nn.Embedding.from_pretrained(weight_item_user, freeze=True)

        self.fc_user = nn.Linear(self.num_item, hidden1)
        self.fc_item = nn.Linear(self.num_user, hidden1)
        # MLP applied on top of both towers.
        self.mlp_model = nn.Sequential(
                        nn.Linear(hidden1, hidden2),
                        nn.ReLU(),
                        nn.Linear(hidden2, outputs)
                    )

    def generate_useritem_matrix(self, data):
        """Return a dense float32 [num_user, num_item] matrix of ratings.

        Columns are read positionally (0=user, 1=item, 2=rating). Cells with
        no rating stay 0. Each (user, item) pair is assumed to occur at most
        once in `data`.
        """
        user_item_matrix = np.zeros([self.num_user, self.num_item], np.float32)
        values = data.values
        # Explicit integer casts so indexing still works when `values` has
        # been upcast to float (e.g. because ratings are fractional).
        users = values[:, 0].astype(np.int64)
        items = values[:, 1].astype(np.int64)
        user_item_matrix[users, items] = values[:, 2]
        return user_item_matrix

    def forward(self, user, item):
        """Score (user, item) pairs.

        user, item: 1-D integer tensors of equal length holding ids.
        Returns a 1-D tensor of cosine similarities clamped to >= 1e-6.
        """
        user_input = self.user_embed(user)
        item_input = self.item_embed(item)
        hidden1_user = F.relu(self.fc_user(user_input))
        hidden1_item = F.relu(self.fc_item(item_input))
        user_output = self.mlp_model(hidden1_user)
        item_output = self.mlp_model(hidden1_item)
        # cosine_similarity bounds the denominator by eps, so an all-zero
        # latent vector gives 0 instead of the NaN produced by 0/0.
        predict = F.cosine_similarity(user_output, item_output, dim=1, eps=1e-8)
        # Lower clamp keeps log(predict) finite in the training loss.
        predict = torch.clamp(predict, min=1e-6)
        return predict


In [238]:
class Dataloader:
    """Leave-one-out split and negative sampling for (user, item, rating) data.

    Not to be confused with torch.utils.data.DataLoader (different casing).
    """

    def __init__(self, data):
        """
        data: DataFrame ['user', 'item', 'rating'] with ids re-indexed to 0..n-1.
        """
        self.num_user = len(data['user'].unique())
        self.num_item = len(data['item'].unique())

        self.train, self.test = self.getTrainTest(data)
        self.trainDict = self.getTrainDict(self.train)

    def getTrainTest(self, data):
        """Hold out one interaction per user.

        Rows are sorted by (user, rating); the last row of each user (i.e. one
        of their highest-rated items) becomes the test row, all others are
        training rows. Returns (train, test) where `train` keeps the input
        columns and `test` is a DataFrame with positional columns 0..2.
        """
        data = data.sort_values(by=['user','rating'])
        # One boolean mask replaces the per-user filter + concat loop, which
        # was quadratic in the number of rows.
        is_last = ~data.duplicated(subset='user', keep='last')
        train = data[~is_last]
        test = pd.DataFrame(data[is_last].values)
        return train, test

    def getTrainDict(self,data):
        """Map (user, item) -> rating for every training row."""
        return {(row[0], row[1]): row[2] for row in data.values}

    def _sample_negative(self, user, excluded=()):
        """Draw a random item id that `user` has no training interaction with
        and that is not in `excluded` (rejection sampling)."""
        j = np.random.randint(self.num_item)
        while (user, j) in self.trainDict or j in excluded:
            j = np.random.randint(self.num_item)
        return j

    def generate_train_dataset(self, negative_num):
        """Build training pairs: every observed pair plus `negative_num`
        sampled unobserved items for the same user, labelled 0.0.

        Returns (pairs, ratings) as numpy arrays of shape [n, 2] and [n].
        Never terminates if a user has a training interaction with every item.
        """
        user_item = []
        rating = []
        for user_items, r in self.trainDict.items():
            user_item.append(list(user_items))
            rating.append(r)
            for _ in range(negative_num):
                j = self._sample_negative(user_items[0])
                user_item.append([user_items[0], j])
                rating.append(0.0)
        return np.array(user_item), np.array(rating)

    def generate_test_dataset(self, negative_num_test=99):
        """Build ranking candidates for evaluation.

        For each test row the result holds the held-out (user, item) pair at
        position 0 followed by `negative_num_test` distinct sampled items the
        user has no training interaction with. Returns an array of shape
        [num_test_rows, 1 + negative_num_test, 2].
        Never terminates if a user has fewer eligible items than requested.
        """
        user_item = []
        for row in self.test.values:
            tmp_user_item = []
            u = row[0]
            i = row[1]
            tmp_user_item.append([u, i])
            # Items already used for this user: the target plus drawn negatives.
            neglist = set()
            neglist.add(i)
            # Fixed: previously iterated over the undefined name `negative_num`.
            for _ in range(negative_num_test):
                j = self._sample_negative(u, neglist)
                neglist.add(j)
                tmp_user_item.append([u, j])
            user_item.append(tmp_user_item)
        return np.array(user_item)

In [239]:
# Split the ratings, then materialise the training pairs (5 sampled negatives
# per observed pair) and the evaluation candidates (99 negatives per test row).
# Sampling uses np.random without a seed set in this cell, so results vary
# between runs.
datas = Dataloader(data)
x_train, y_train = datas.generate_train_dataset(5)
x_test = datas.generate_test_dataset(99)

# Wrap the training arrays as tensors and batch them; x_test is left as a
# numpy array because test() converts it itself.
x_train = torch.from_numpy(x_train)
y_train = torch.from_numpy(y_train)
train_dataset = TensorDataset(x_train, y_train)
train_loader = DataLoader(dataset=train_dataset, batch_size=128, shuffle=True)

In [240]:
# Instantiate the model with hidden1=128, hidden2=64 and 32-dim latent outputs.
# NOTE(review): the interaction matrix is built from the full `data`, which
# still contains each user's held-out test rating — confirm whether the
# training split should be used here instead.
dmf = DMF(data, 128, 64, 32)

6040 3706


In [250]:
def train(model, epochs, maxRate,batch_size=256, lr=0.01):
    """Train `model` with the normalised cross-entropy loss and evaluate
    after every epoch.

    model: module called as model(user_ids, item_ids) returning scores in (0, 1].
    epochs: number of passes over the notebook-global `train_loader`.
    maxRate: maximum possible rating; labels are divided by it so the target
        lies in [0, 1].
    batch_size: unused — batching is determined by the global `train_loader`;
        kept only so existing calls keep working.
    lr: Adam learning rate.

    Reads the globals `train_loader` and `x_test` and calls the global `test`.
    """
    eps = 1e-6
    optimizer = optim.Adam(model.parameters(),lr=lr)
    for epoch in range(epochs):
        # test() switches the model to eval mode, so training mode has to be
        # re-enabled at the start of every epoch, not just once.
        model.train()
        losses = []
        print("Training epoch %d" %(epoch+1))
        for data_x, data_y in train_loader:
            y_pred = model(data_x[:,0], data_x[:,1]).double()
            # Keep predictions strictly inside (0, 1): a value of exactly 1
            # would make log(1 - y_pred) -inf and poison the loss with NaN/inf.
            y_pred = torch.clamp(y_pred, eps, 1 - eps)
            regRate = data_y/maxRate
            loss = regRate * torch.log(y_pred) + (1-regRate)*torch.log(1-y_pred)
            loss = -torch.mean(loss)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            losses.append(loss.item())
        print("loss is %.3f"%(sum(losses)/len(losses)))
        hr, ndcg = test(x_test, model)
        print("EVAL hr %.3f ndcg %.3f" %(hr, ndcg))

In [251]:
def getNDCG(ranklist, target):
    """NDCG of a single held-out item within a top-k list.

    `ranklist` is the top-k slice of the ranking built from one user's true
    target item and their sampled negative items; `target` is that true item.
    A hit at 0-based position p scores log(2) / log(p + 2), so earlier hits
    score higher; a miss scores 0.
    """
    for position, candidate in enumerate(ranklist):
        if candidate == target:
            return math.log(2) / math.log(position + 2)
    return 0

def getHR(ranklist, target):
    """Hit ratio of a single held-out item within a top-k list.

    `ranklist` is the top-k slice of the ranking built from one user's true
    target item and their sampled negative items; `target` is that true item.
    Returns 1 if the target appears in the list, otherwise 0.
    """
    return 1 if any(candidate == target for candidate in ranklist) else 0

def test(x_test, model,topk=10):
    model.eval()
    hr = []
    NDCG = []
    x_test = torch.from_numpy(x_test)
    test_user = x_test[:,:,0]
    test_item = x_test[:,:,1]
    for i in range(test_user.shape[0]):
        target = test_item[i][0]
        predict = model(test_user[i], test_item[i])
        ranklist = sorted(zip(test_item[i], predict), key=lambda x:x[1])[:topk]
        ranklist = [item.item() for item, val in ranklist]
        tmp_hr = getHR(ranklist, target)
        hr.append(tmp_hr)
        tmp_ndcg = getNDCG(ranklist, target)
        NDCG.append(tmp_ndcg)
    return np.mean(hr), np.mean(NDCG)

In [252]:
%%time
# Train the DMF model for 3 epochs; maxRate=5 is passed as the rating
# normaliser. The recorded run below was interrupted during epoch 1.
train(dmf, 3, 5)

Training epoch 1


KeyboardInterrupt: 

### 经过上述计算，可以得到 user 和 item 的低维（low-dimensional）表示。利用该表示，可以通过余弦相似度计算用户对 item 的预测评分，然后取 Top-K 进行推荐。

In [None]:
user_embeding = dmf.