In [1]:
import torch as t
import torch.nn as nn
from torch.nn import functional as F
import numpy as np
import os
import random
from collections import defaultdict
from torch.utils.data import Dataset, DataLoader
from torch import optim

'/Users/wenyi/Desktop/个人/学习/常用算法'

In [3]:
def load_data(path):
    user_items = defaultdict(set)
    # 统计用户数和电影数
    max_u_id = -1
    max_i_id = -1
    count = 0
    with open(path, 'r') as f:
        for line in f.readlines():
            count += 1
            u, i, _, _ = line.split('::')
            u = int(u)
            i = int(i)
            user_items[u].add(i)
            max_u_id = max(u, max_u_id)
            max_i_id = max(i, max_i_id)
    print("用户最大id数为:",max_u_id)
    print("电影最大id数为：",max_i_id)
    return max_u_id, max_i_id, user_items, count

def generate_test(user_items):
    user_test = dict()
    for u, i_list in user_items.items():
        user_test[u] = random.sample(user_items[u], 1)[0]
    return user_test
path = os.path.join(os.path.abspath('..'), 'data/ml-1m/ratings.dat')
num_user, num_movie, user_items, count = load_data(path)
user_items_test = generate_test(user_items)

用户最大id数为: 6040
电影最大id数为： 3952


In [4]:
def generate_test_batch(user_items, user_items_test, num_movie):
    for u in user_items.keys():
        test = []
        i = user_items_test[u]
        for j in range(1, num_movie+1):
            if j not in user_items[u]:
                test.append([u,i,j])
        yield t.LongTensor(test)

In [5]:
def generate_train_batch(user_items, user_items_test, num_movie, batch_size=128):
    result = []
    for i in range(batch_size):
        u = random.sample(user_items.keys(), 1)[0]
        i = random.sample(user_items[u], 1)[0]
        while i == user_items_test[u]:
            i = random.sample(user_items[u], 1)[0]
        j = random.randint(1, num_movie)
        while j in user_items[u]:
            j = random.randint(1, num_movie)
        result.append([u, i, j])
    return t.LongTensor(result)

In [6]:
class BPRMF(nn.Module):
    def __init__(self, num_user, num_movie, embed_dim):
        super(BPRMF, self).__init__()
        self.user_embedding = nn.Embedding(num_user+1, embed_dim)
        self.movie_embedding = nn.Embedding(num_movie+1, embed_dim)
        # 可以添加bias
        
    def forward(self, x):
        user_embed = self.user_embedding(x[:, 0])
        i_embed = self.movie_embedding(x[:, 1])
        j_embed = self.movie_embedding(x[:,2])
        
        l2_norm = t.sum(t.mul(user_embed, user_embed))+t.sum(t.mul(i_embed, i_embed))+t.sum(t.mul(j_embed, j_embed))
        xuij = user_embed*i_embed - user_embed*j_embed
        return t.sum(xuij, dim=1),l2_norm
    

In [8]:
# 定义超参数
EPOCH = 10
embed_dim = 16
learn_rate = 0.01

In [20]:
model = BPRMF(num_user, num_movie, embed_dim)
optimizer = optim.Adam(model.parameters(), lr=lean_rate)
for epoch in range(EPOCH):
    model.train()
    print("epoch is %d" %(epoch+1))
    losses = []
    count = 0
    for i in range(1000):
        train = generate_train_batch(user_items, user_items_test, num_movie, batch_size=512)
        result, l2_norm = model(train)
        regulation_rate = 0.001
        loss = regulation_rate*l2_norm - F.logsigmoid(result).mean()
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        losses.append(loss.item())
    print("train loss is %.3f" % (sum(losses)/len(losses)))
    model.eval()
    auc_sum = 0.0
    test_loss = 0.0
    user_count = 0
    for test in generate_test_batch(user_items, user_items_test, num_movie):
        result, l2_norm= model(test)
        mf_auc = t.mean((result>0).float())
        user_count += 1
        auc_sum += mf_auc
        test_loss += loss
    print("Test loss: %.3f test_auc: %.3f" %(test_loss.item()/user_count,auc_sum.item()/user_count))

epoch is 1
train loss is 10.158
Test loss: 1.063 test_auc: 0.503
epoch is 2
train loss is 0.777
Test loss: 0.697 test_auc: 0.505
epoch is 3
train loss is 0.694
Test loss: 0.693 test_auc: 0.512
epoch is 4
train loss is 0.693
Test loss: 0.693 test_auc: 0.507
epoch is 5
train loss is 0.693
Test loss: 0.693 test_auc: 0.503
epoch is 6
train loss is 0.693
Test loss: 0.694 test_auc: 0.496
epoch is 7
train loss is 0.698
Test loss: 0.700 test_auc: 0.500
epoch is 8
train loss is 0.701
Test loss: 0.702 test_auc: 0.501
epoch is 9
train loss is 0.702
Test loss: 0.702 test_auc: 0.500
epoch is 10
train loss is 0.702
Test loss: 0.702 test_auc: 0.501


In [77]:
# 根据训练的模型进行推荐
# 方法：模型通过优化学习到了user_embeding和item_embedding对于每个用户可以用相应的user_embedding
# 和所有的item_embedding进行矩阵相乘得到用户对每部电影的rating，根据rating排序去topN推荐（这里要去掉用户已经评分的电影）
def recommend(user, k=10):
    """
    根据用户id给用户推荐topK个电影
    user: int 用户id
    k: int topK result
    """
    #用户对所有的movie的评分
    score = t.sum(model.user_embedding.weight[user]*model.movie_embedding.weight,dim=1).detach().numpy()
    score = [(val, i) for i, val in enumerate(score)]
    score = sorted(score, key=lambda x:x[0], reverse=True)
    
    # 根据评分推荐
    rec_result = []
    for val, i in score:
        if i not in user_items[1]:
            if len(rec_result) == k:
                break
            rec_result.append(i)
    return rec_result

In [87]:
user = 2
k = 10
rec_result = recommend(user, k)

In [88]:
import pandas as pd
movie = pd.read_csv('/Users/wenyi/Desktop/个人/学习/常用算法/data/ml-1m/movies.dat',sep='::',names=['movie', 'name','type'])

  


In [89]:
# 用户推荐的k个电影
movie[movie['movie'].isin(rec_result)]

Unnamed: 0,movie,name,type
649,655,Mutters Courage (1995),Comedy
651,657,Yankee Zulu (1994),Comedy|Drama
1379,1400,Somebody is Waiting (1996),Drama
1525,1565,Head Above Water (1996),Comedy|Thriller
1711,1767,Music From Another Room (1998),Drama|Romance
2519,2588,Clubland (1998),Drama
3153,3222,Carmen (1984),Drama
3454,3523,Taffin (1988),Action|Thriller
3773,3843,Sleepaway Camp (1983),Horror


In [86]:
# 用户历史观看的电影
movie[movie['movie'].isin(user_items[2])]

Unnamed: 0,movie,name,type
20,21,Get Shorty (1995),Action|Comedy|Drama
93,95,Broken Arrow (1996),Action|Thriller
108,110,Braveheart (1995),Action|Drama|War
161,163,Desperado (1995),Action|Romance|Thriller
163,165,Die Hard: With a Vengeance (1995),Action|Thriller
232,235,Ed Wood (1994),Comedy|Drama
262,265,Like Water for Chocolate (Como agua para choco...,Drama|Romance
289,292,Outbreak (1995),Action|Drama|Thriller
315,318,"Shawshank Redemption, The (1994)",Drama
345,349,Clear and Present Danger (1994),Action|Adventure|Thriller
