# DSSM深度召回实践
关于DSSM的论文和方案非常的多，简单来说就是构建一个user的embedding和item的embedding然后进行匹配，通过优化匹配的距离来达到让用户感兴趣的item跟用户的相似度更高，用户不感兴趣的item跟用户相似度更低的目的。模型的结构看起来非常的简单，但是要能做出效果来可能还需要一定的经验和技巧，很多时候拿一个数据集进行实验，效果可能是非常非常差的，模型基本没有区分能力，跟itemCF完全没有对比性。这里对DSSM方案进行实践，通过经典的方案和一些优化技巧，来让DSSM达到一定的可用效果。

In [1]:
import pandas as pd
import numpy as np
import datetime
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
import torch
import torch.nn as nn
from dnn_model import DNN
from encoder_model import Encoder
import torch.nn.functional as F
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from weight_initiallizer import Initializer
import torch.nn.init as init
from itemcf import itemcf_sim
import time
import os
import faiss

## 数据预处理
本次实验选用的是movielen 1M的数据集，实验对比方案为itemCF，数据切分按照时间以大于2003-01-01的数据为验证集，小于2020-01-01的数据集为训练集，同时对于验证集的用户要保证其在训练集中有历史行为，因此需要过滤掉在训练集中没有行为的用户

In [2]:
root_path = os.path.abspath('.')
user = pd.read_csv(os.path.join(root_path, 'ml-1m', 'users.dat'), sep='::', names = ['user', 'gender', 'age', 
                                                                                     'occupation', 'zip_code'])
movie = pd.read_csv(os.path.join(root_path, 'ml-1m', 'movies.dat'), sep='::', names = ['movie', 'title', 'genres'])
rating = pd.read_csv(os.path.join(root_path, 'ml-1m', 'ratings.dat'), sep='::', names = ['user', 'movie', 'ratings', 
                                                                                     'timestamp'])
# mapping
user_id_dict = dict()
for idx, uid in enumerate(user['user'].tolist()):
    user_id_dict[uid] = idx
movie_id_dict = dict()
for idx, mid in enumerate(movie['movie'].tolist()):
    movie_id_dict[mid] = idx
user['user'] = user['user'].map(user_id_dict)
movie['movie'] = movie['movie'].map(movie_id_dict)
rating['user'] = rating['user'].map(user_id_dict)
rating['movie'] = rating['movie'].map(movie_id_dict)


# 时间处理
rating['timestamp'] = rating['timestamp'].apply(lambda x: time.localtime(x))
rating['time_str'] = rating['timestamp'].apply(lambda x: 
                                                  time.strftime("%Y-%m-%d %H:%M:%S",x))

# 切分训练和验证集，数据是2000到2003年的，以2003年用户的行为为验证集，用用户2000-2002年数据进行预测和验证，过滤掉那些只在2003年有行为的用户
train_data = rating[rating['time_str']<'2003-01-01 00:00:00']
val_data = rating[rating['time_str']>='2003-01-01 00:00:00']
# 过滤
val_data = val_data[val_data['user'].isin(train_data['user'].unique())]

  user = pd.read_csv(os.path.join(root_path, 'ml-1m', 'users.dat'), sep='::', names = ['user', 'gender', 'age',
  movie = pd.read_csv(os.path.join(root_path, 'ml-1m', 'movies.dat'), sep='::', names = ['movie', 'title', 'genres'])
  rating = pd.read_csv(os.path.join(root_path, 'ml-1m', 'ratings.dat'), sep='::', names = ['user', 'movie', 'ratings',


## 基于ItemCF的base方案

In [3]:
# itemcf
user_movie_dict = dict()
for idx, rows in tqdm(train_data.iterrows(), total=len(train_data)):
    u = rows['user']
    m = rows['movie']
    dt = rows['time_str']
    if u not in user_movie_dict:
        user_movie_dict[u] = [(m, dt)]
    else:
         user_movie_dict[u].append((m, dt))
            
# 计算movie之间的相似度
sim = itemcf_sim(user_movie_dict)

user_movie_dict_val = dict()
for idx, rows in tqdm(val_data.iterrows(), total=len(val_data)):
    u = rows['user']
    m = rows['movie']
    if u not in user_movie_dict_val:
        user_movie_dict_val[u] = [m]
    else:
         user_movie_dict_val[u].append(m)
            
            
user_rec_dict = dict()
for uid, _ in tqdm(user_movie_dict_val.items()):
    movies = user_movie_dict[uid]
    user_rec_dict[uid] = dict()
    for mov,_ in movies:
        for m, w in sorted(sim[mov].items(), key=lambda x:x[1], reverse=True)[:50]:
            if m not in movies:
                if m not in user_rec_dict[uid]:
                    user_rec_dict[uid][m] = w
                else:
                    user_rec_dict[uid][m] += w

# 计算召回
hits, total = 0, 0
for uid, movies in user_rec_dict.items():
    rec_movies = [m for m, _ in sorted(movies.items(), key=lambda x: x[1], reverse=True)[:50]]
    hits += len(set(rec_movies) & set(user_movie_dict_val[uid]))
    total += len(user_movie_dict_val[uid])
# itemcf recall = 0.0570
print("recall is %.3f" % (hits/total))

100%|██████████| 996861/996861 [01:47<00:00, 9279.35it/s]
100%|██████████| 6040/6040 [18:58<00:00,  5.31it/s]  
100%|██████████| 3348/3348 [00:00<00:00, 7867.68it/s]
100%|██████████| 178/178 [00:10<00:00, 16.55it/s]

recall is 0.057





## 基于0，1的正负样本构造方案
### 数据集构造
方案1:将用户评分的电影作为正样本，在所有的电影中随机采样若干个电影作为用户的负样本；
方案2:对数据集中用户评分较高的作为正样本，用户评分低的作为负样本；
方案1和2其实都有可用的场景，当我们面对不同的任务的时候可以采样不同的数据构造方案，我们需要清楚的意识到模型是死的，但是数据是活的，同一个模型输入不同的数据，模型能学到的东西是完全不一样的，因此对于不同的任务我们需要对任务的目标构造出适合任务的数据集。这里我们的任务是召回，也就是从全量电影资源中选择用户感兴趣的电影，面对的是整个电影数据资源，感兴趣的对用户来说是一个较为模糊的结果，对精度的要求没有那么高。在来看方案1和2显然方案2所面对的资源是全量电影资源中很少一部分，对于大多数电影是没有预测能力的，同时由于对电影的评分进行了细粒度的量化，模型能很好的区分哪些电影是用户喜欢的哪些是不喜欢的精度较高，显然方案2更适合排序。方案1的数据是全量电影数据，对于正负样本的定义也没那么精准，所以模型学习到的也是一个较为模糊的偏好。

### 随机负采样

In [4]:
# 随机负采样
sample_list = list(train_data['movie'].unique())
data = list()
for idx, rows in tqdm(train_data.iterrows(), total=len(train_data)):
    use = rows['user']
    mov = rows['movie']
    data.append([use, mov, 1])
    for m in np.random.choice(sample_list, 3):
        data.append([use, m, 0])
data = pd.DataFrame(data, columns=['user', 'movie', 'tag'])

100%|██████████| 996861/996861 [08:58<00:00, 1852.32it/s]


### 带权负采样

In [None]:
# 加权负采样(跟上面的随机负采样只能选择一种)
tmp = train_data['movie'].value_counts().reset_index(name='count')
high_frequency_df = tmp[tmp['count']>=5]
high_frequency = high_frequency_df['index'].tolist()
high_frequency_p = list()
for m, c in zip(high_frequency, high_frequency_df['count']):
    high_frequency_p.append(c**(3/4))
hig_frequency_p = [c/sum(high_frequency_p) for c in high_frequency_p]
low_frequency = tmp[tmp['count']<5]['index'].tolist()
data = list()
for idx, rows in tqdm(train_data.iterrows(), total=len(train_data)):
    use = rows['user']
    mov = rows['movie']
    data.append([use, mov, 1])
    for i in range(3):
        if np.random.random() > 0.7:
            mov = np.random.choice(high_frequency, p=hig_frequency_p, size=1)[0]
            data.append([use, mov, 0])
        else:
            mov = np.random.choice(low_frequency, size=1)[0]
            data.append([use, mov, 0])
data = pd.DataFrame(data, columns=['user', 'movie', 'tag'])

### 训练数据构造

In [5]:
class trainset(Dataset):
    def __init__(self, data):
        self.x = data[0]
        self.y = data[1]

    def __getitem__(self, index):
        x = self.x[index]
        y = self.y[index]
        data = (x, y)
        return data

    def __len__(self):
        return len(self.x)

# 训练集验证集随机分割
train_df, test_df = train_test_split(data, test_size=0.2, random_state=2021)
train_x = train_df[['user', 'movie']].values
train_y = train_df['tag'].values
test_x = test_df[['user', 'movie']].values
test_y = test_df['tag'].values

# 构造dataloader
train_dataset = trainset((train_x, train_y))
test_dataset = trainset((test_x, test_y))
train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=128, shuffle=False)

In [33]:
class DNNModel(nn.Module):
    def __init__(self, inp_user, inp_movie, out, input_user_categorical_feature, input_movie_categorical_feature, 
                 hidden_layers, dropouts, batch_norm):
        super(DNNModel, self).__init__()
        self.user_embed = nn.Embedding(input_user_categorical_feature[0][0], input_user_categorical_feature[0][1])
        self.movie_embed = nn.Embedding(input_movie_categorical_feature[0][0], input_movie_categorical_feature[0][1])
        
        self.user_dnn = nn.Sequential(
            nn.Linear(128, 64),
            nn.LeakyReLU(),
            nn.Linear(64, 64),
            nn.LeakyReLU()
        )
        
        self.movie_dnn = nn.Sequential(
            nn.Linear(128, 64),
            nn.LeakyReLU(),
            nn.Linear(64, 64),
            nn.LeakyReLU()
        )
        
    def forward(self, x):
        u = self.user_embed(x[:, 0])
        m = self.movie_embed(x[:, 1])
        u = self.user_dnn(u)
        m = self.movie_dnn(m)
        u = u/torch.sum(u*u, 1).view(-1,1)
        m = m/torch.sum(m*m, 1).view(-1,1)
        return u, m

In [34]:
def train_model(model, train_loader, val_loader, epoch, loss_function, optimizer, path, early_stop):
    """
    pytorch 模型训练通用代码
    :param model: pytorch 模型
    :param train_loader: dataloader, 训练数据
    :param val_loader: dataloader, 验证数据
    :param epoch: int, 训练迭代次数
    :param loss_function: 优化损失函数
    :param optimizer: pytorch优化器
    :param path: save path
    :param early_stop: int, 提前停止步数
    :return: None
    """
    # 是否使用GPU
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
#     device = torch.device("cpu")
    model = model.to(device)
    
    # 多少步内验证集的loss没有变小就提前停止
    patience, eval_loss = 0, 0
    
    # 训练
    for i in range(epoch):
        total_loss, count = 0, 0
        y_pred = list()
        y_true = list()
        for idx, (x, y) in tqdm(enumerate(train_loader), total=len(train_loader)):
            x, y = x.to(device), y.to(device) 
            u, m = model(x)
            predict = torch.sigmoid(torch.sum(u*m, 1))
            y_pred.extend(predict.cpu().detach().numpy())
            y_true.extend(y.cpu().detach().numpy())
            loss = loss_function(predict, y.float())
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            total_loss += float(loss)
            count += 1
            
        train_auc = roc_auc_score(np.array(y_true), np.array(y_pred))
        torch.save(model, path.format(i+1))
        print("Epoch %d train loss is %.3f and train auc is %.3f" % (i+1, total_loss / count, train_auc))
    
        # 验证
        total_eval_loss = 0
        model.eval()
        count_eval = 0
        val_y_pred = list()
        val_true = list()
        for idx, (x, y) in tqdm(enumerate(val_loader), total=len(val_loader)):
            x, y = x.to(device), y.to(device)
            u, m = model(x)
            predict = torch.sigmoid(torch.sum(u*m, 1))
            val_y_pred.extend(predict.cpu().detach().numpy())
            val_true.extend(y.cpu().detach().numpy())
            loss = loss_function(predict, y.float())
            total_eval_loss += float(loss)
            count_eval += 1
        val_auc = roc_auc_score(np.array(y_true), np.array(y_pred))
        print("Epoch %d val loss is %.3fand train auc is %.3f" % (i+1, total_eval_loss / count_eval, val_auc))
        
        # 提前停止策略
        if i == 0:
            eval_loss = total_eval_loss / count_eval
        else:
            if total_eval_loss / count_eval < eval_loss:
                eval_loss = total_eval_loss / count_eval
            else:
                if patience < early_stop:
                    patience += 1
                else:
                    print("val loss is not decrease in %d epoch and break training" % patience)
                    break

In [38]:
# 模型初始化
inp_user = 128
inp_movie = 128
out = 64
input_user_categorical_feature = {0: (6040, 128)}
input_movie_categorical_feature =  {0: (3883, 128)}
hidden_layers = [128, 64]
dropouts = [0.5, 0.5, 0.5]
batch_norm = False

model = DNNModel(inp_user, inp_movie, out, input_user_categorical_feature, input_movie_categorical_feature, 
                 hidden_layers, dropouts, batch_norm)
Initializer.initialize(model=model, initialization=init.xavier_uniform, gain=init.calculate_gain('relu'))
# 模型训练
epoch = 20
loss_function = F.binary_cross_entropy_with_logits
early_stop = 3
learn_rate = 0.005
optimizer = torch.optim.Adam(model.parameters(), lr=learn_rate)
path = 'model/model_{}.pth'

train_model(model, train_loader, test_loader, epoch, loss_function, optimizer, path, early_stop)

  initialization(m.weight.data, **kwargs)
  initialization(m.bias.data)
100%|██████████| 24922/24922 [02:39<00:00, 156.07it/s]
  1%|          | 34/6231 [00:00<00:18, 331.96it/s]

Epoch 1 train loss is 0.691 and train auc is 0.657


100%|██████████| 6231/6231 [00:17<00:00, 351.52it/s]


Epoch 1 val loss is 0.689and train auc is 0.657


100%|██████████| 24922/24922 [02:40<00:00, 155.62it/s]
  0%|          | 18/6231 [00:00<00:37, 167.51it/s]

Epoch 2 train loss is 0.689 and train auc is 0.658


100%|██████████| 6231/6231 [00:19<00:00, 319.49it/s]


Epoch 2 val loss is 0.689and train auc is 0.658


100%|██████████| 24922/24922 [02:39<00:00, 156.43it/s]
  1%|          | 34/6231 [00:00<00:18, 333.36it/s]

Epoch 3 train loss is 0.689 and train auc is 0.666


100%|██████████| 6231/6231 [00:17<00:00, 355.49it/s]


Epoch 3 val loss is 0.689and train auc is 0.666


100%|██████████| 24922/24922 [02:39<00:00, 156.52it/s]
  1%|          | 44/6231 [00:00<00:14, 435.79it/s]

Epoch 4 train loss is 0.689 and train auc is 0.673


100%|██████████| 6231/6231 [00:17<00:00, 363.47it/s]


Epoch 4 val loss is 0.689and train auc is 0.673


100%|██████████| 24922/24922 [02:40<00:00, 155.16it/s]
  1%|          | 33/6231 [00:00<00:19, 325.21it/s]

Epoch 5 train loss is 0.688 and train auc is 0.668


100%|██████████| 6231/6231 [00:17<00:00, 350.48it/s]
  0%|          | 0/24922 [00:00<?, ?it/s]

Epoch 5 val loss is 0.689and train auc is 0.668


100%|██████████| 24922/24922 [02:38<00:00, 157.30it/s]
  1%|          | 43/6231 [00:00<00:14, 415.17it/s]

Epoch 6 train loss is 0.688 and train auc is 0.667


100%|██████████| 6231/6231 [00:17<00:00, 359.52it/s]


Epoch 6 val loss is 0.688and train auc is 0.667


100%|██████████| 24922/24922 [02:41<00:00, 154.08it/s]
  1%|          | 44/6231 [00:00<00:14, 434.58it/s]

Epoch 7 train loss is 0.688 and train auc is 0.667


100%|██████████| 6231/6231 [00:17<00:00, 360.37it/s]


Epoch 7 val loss is 0.689and train auc is 0.667


100%|██████████| 24922/24922 [02:48<00:00, 148.34it/s]
  0%|          | 12/6231 [00:00<00:55, 112.11it/s]

Epoch 8 train loss is 0.688 and train auc is 0.665


100%|██████████| 6231/6231 [00:18<00:00, 341.97it/s]


Epoch 8 val loss is 0.689and train auc is 0.665
val loss is not decrease in 3 epoch and break training


In [39]:
# 结果验证
model.eval()
user['movie'] = 1
test_x = user[['user', 'movie']].values
x = torch.from_numpy(test_x).cuda()
user_embed, _ = model(x)

movie['user'] = 1
test_x = movie[['user', 'movie']].values
x = torch.from_numpy(test_x).cuda()
_, movie_embed = model(x)

movie_embed = movie_embed.cpu().detach().numpy()
user_embed = user_embed.cpu().detach().numpy()


# faiss索引构建
d = 64
nlist = 10
index = faiss.IndexFlatL2(d)
index.add(movie_embed)

# 验证集数据字典化
user_movie_dict_val = dict()
for idx, rows in tqdm(val_data.iterrows(), total=len(val_data)):
    u = rows['user']
    m = rows['movie']
    if u not in user_movie_dict_val:
        user_movie_dict_val[u] = [m]
    else:
         user_movie_dict_val[u].append(m)
            
# 用户推荐结果索引           
D, I = index.search(user_embed[list(val_data['user'].unique())], 50)

# 召回率计算
hits, total = 0, 0
for uid, rec_list in zip(list(val_data['user'].unique()), I):
    hits += len(set(rec_list)&set(user_movie_dict_val[uid]))
    total += len(user_movie_dict_val[uid])
print("recall is %.3f" % (hits/total))

100%|██████████| 3348/3348 [00:00<00:00, 9357.59it/s]

recall is 0.071





## 基于正负样本距离的损失函数构造方案
上面构造数据集的方案有一个很严重的缺陷，因为构造的都是0，1样本这种样本属于hard级别，对样本的要求非常高而且模型学习起来的难度也非常的大，如果样本没有构造好或者模型参数数据量不够，很难达到理想的效果。因此这里换用一种soft的样本构造方案，即基于triplet loss的方案，triplet loss是保证user向量跟正样本movie向量的距离比负样本movie向量的距离更近一些，loss优化的就是两者之间的距离最小：
$$loss = max(d(u,m_p)-d(u,m_n)+margin, 0)$$
DSSM方案是选取一个正样本和若干负样本，保证正样本在其中的概率最大:
$$P(u|m) = \frac{exp(\gamma R(u,m))}{exp(\gamma R(u,m)) + \sum{m \in D^{-}}{-exp(\gamma R(u,m))}}$$
$$loss = -\sum_{i=1}^{n}{log(P(u|m))}$$

### triplet loss 模型

In [40]:
# triplet loss负采样
sample_list = list(train_data['movie'].unique())
data = list()
for idx, rows in tqdm(train_data.iterrows(), total=len(train_data)):
    use = rows['user']
    mov = rows['movie']
    for m in np.random.choice(sample_list, 3):
        data.append([use, mov, m])
data = pd.DataFrame(data, columns=['user', 'movie_pos', 'movie_neg'])

100%|██████████| 996861/996861 [08:57<00:00, 1853.04it/s]


In [27]:
class trainset(Dataset):
    def __init__(self, data):
        self.x = data

    def __getitem__(self, index):
        x = self.x[index]
        return x

    def __len__(self):
        return len(self.x)

train_df, test_df = train_test_split(data, test_size=0.2, random_state=2021)
train_x = train_df[['user', 'movie_pos', 'movie_neg']].values
test_x = test_df[['user', 'movie_pos', 'movie_neg']].values

train_dataset = trainset((train_x))
test_dataset = trainset((test_x))
train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=128, shuffle=False)

In [28]:
# triplet loss DNNmodel
class DNNModel(nn.Module):
    def __init__(self, inp_user, inp_movie, out, input_user_categorical_feature, input_movie_categorical_feature, 
                 hidden_layers, dropouts, batch_norm):
        super(DNNModel, self).__init__()
        self.user_embed = nn.Embedding(input_user_categorical_feature[0][0], input_user_categorical_feature[0][1])
        self.movie_embed = nn.Embedding(input_movie_categorical_feature[0][0], input_movie_categorical_feature[0][1])
        
        self.user_dnn = nn.Sequential(
            nn.Linear(128, 64),
            nn.LeakyReLU(),
            nn.Linear(64, 64),
            nn.LeakyReLU()
        )
        
        self.movie_dnn = nn.Sequential(
            nn.Linear(128, 64),
            nn.LeakyReLU(),
            nn.Linear(64, 64),
            nn.LeakyReLU()
        )

        
    def forward(self, x):
        user = self.user_embed(x[:, 0])
        movie_pos = self.movie_embed(x[:, 1])
        movie_neg = self.movie_embed(x[:, 2])
        user = self.user_dnn(user)
        movie_pos = self.movie_dnn(movie_pos)
        movie_neg = self.movie_dnn(movie_neg)
        user = user/torch.sum(user*user, 1).view(-1,1)
        movie_pos = movie_pos/torch.sum(movie_pos*movie_pos, 1).view(-1,1)
        movie_neg = movie_neg/torch.sum(movie_neg*movie_neg, 1).view(-1,1)
        return user, movie_pos, movie_neg


In [29]:
# triplet DNN model train
def train_model(model, train_loader, val_loader, epoch, loss_function, optimizer, path, early_stop):
    """
    pytorch 模型训练通用代码
    :param model: pytorch 模型
    :param train_loader: dataloader, 训练数据
    :param val_loader: dataloader, 验证数据
    :param epoch: int, 训练迭代次数
    :param loss_function: 优化损失函数
    :param optimizer: pytorch优化器
    :param path: save path
    :param early_stop: int, 提前停止步数
    :return: None
    """
    # 是否使用GPU
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
#     device = torch.device("cpu")
    model = model.to(device)
    
    # 多少步内验证集的loss没有变小就提前停止
    patience, eval_loss = 0, 0
    
    # 训练
    for i in range(epoch):
        total_loss, count = 0, 0
        for idx, x in tqdm(enumerate(train_loader), total=len(train_loader)):
            x = x.to(device)
            u, m_p, m_n = model(x)
            loss = loss_function(u, m_p, m_n)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            total_loss += float(loss)
            count += 1
            
#         train_auc = roc_auc_score(np.array(y_true), np.array(y_pred))
        torch.save(model, path.format(i+1))
        print("Epoch %d train loss is %.3f" % (i+1, total_loss / count))
    
        # 验证
        total_eval_loss = 0
        model.eval()
        count_eval = 0
        for idx, x in tqdm(enumerate(val_loader), total=len(val_loader)):
            x = x.to(device)
            u, m_p, m_n = model(x)
            loss = loss_function(u, m_p, m_n)
            total_eval_loss += float(loss)
            count_eval += 1
        print("Epoch %d val loss is %.3f" % (i+1, total_eval_loss / count_eval))
        
        # 提前停止策略
        if i == 0:
            eval_loss = total_eval_loss / count_eval
        else:
            if total_eval_loss / count_eval < eval_loss:
                eval_loss = total_eval_loss / count_eval
            else:
                if patience < early_stop:
                    patience += 1
                else:
                    print("val loss is not decrease in %d epoch and break training" % patience)
                    break

In [30]:
# 模型初始化
inp_user = 128
inp_movie = 128
out = 64
input_user_categorical_feature = {0: (6040, 128)}
input_movie_categorical_feature =  {0: (3883, 128)}
hidden_layers = [128, 64]
dropouts = [0.5, 0.5, 0.5]
batch_norm = False

model = DNNModel(inp_user, inp_movie, out, input_user_categorical_feature, input_movie_categorical_feature, 
                 hidden_layers, dropouts, batch_norm)

# 模型训练
epoch = 20
loss_function = F.triplet_margin_with_distance_loss
early_stop = 3
learn_rate = 0.005
optimizer = torch.optim.Adam(model.parameters(), lr=learn_rate)
path = 'model/model_{}.pth'

train_model(model, train_loader, test_loader, epoch, loss_function, optimizer, path, early_stop)

100%|██████████| 18692/18692 [02:02<00:00, 152.44it/s]
  1%|          | 26/4673 [00:00<00:18, 256.10it/s]

Epoch 1 train loss is 0.506


100%|██████████| 4673/4673 [00:11<00:00, 417.44it/s]
  0%|          | 1/18692 [00:00<57:24,  5.43it/s]

Epoch 1 val loss is 0.491


100%|██████████| 18692/18692 [02:01<00:00, 154.33it/s]
  1%|          | 44/4673 [00:00<00:10, 439.67it/s]

Epoch 2 train loss is 0.485


100%|██████████| 4673/4673 [00:11<00:00, 410.35it/s]
  0%|          | 0/18692 [00:00<?, ?it/s]

Epoch 2 val loss is 0.485


100%|██████████| 18692/18692 [02:01<00:00, 154.21it/s]
  1%|          | 49/4673 [00:00<00:09, 489.34it/s]

Epoch 3 train loss is 0.483


100%|██████████| 4673/4673 [00:11<00:00, 405.59it/s]
  0%|          | 0/18692 [00:00<?, ?it/s]

Epoch 3 val loss is 0.484


100%|██████████| 18692/18692 [02:01<00:00, 154.21it/s]
  1%|          | 37/4673 [00:00<00:12, 362.48it/s]

Epoch 4 train loss is 0.482


100%|██████████| 4673/4673 [00:11<00:00, 396.37it/s]
  0%|          | 1/18692 [00:00<57:31,  5.42it/s]

Epoch 4 val loss is 0.488


100%|██████████| 18692/18692 [02:01<00:00, 154.21it/s]
  1%|          | 48/4673 [00:00<00:09, 475.85it/s]

Epoch 5 train loss is 0.482


100%|██████████| 4673/4673 [00:11<00:00, 401.61it/s]
  0%|          | 1/18692 [00:00<59:00,  5.28it/s]

Epoch 5 val loss is 0.487


100%|██████████| 18692/18692 [02:03<00:00, 151.72it/s]
  1%|          | 39/4673 [00:00<00:12, 385.75it/s]

Epoch 6 train loss is 0.482


100%|██████████| 4673/4673 [00:12<00:00, 388.97it/s]
  0%|          | 1/18692 [00:00<46:27,  6.70it/s]

Epoch 6 val loss is 0.483


100%|██████████| 18692/18692 [02:02<00:00, 152.80it/s]
  1%|          | 48/4673 [00:00<00:09, 478.93it/s]

Epoch 7 train loss is 0.482


100%|██████████| 4673/4673 [00:11<00:00, 390.17it/s]
  0%|          | 1/18692 [00:00<58:05,  5.36it/s]

Epoch 7 val loss is 0.481


100%|██████████| 18692/18692 [02:02<00:00, 152.75it/s]
  1%|          | 46/4673 [00:00<00:10, 452.35it/s]

Epoch 8 train loss is 0.481


100%|██████████| 4673/4673 [00:12<00:00, 387.48it/s]
  0%|          | 1/18692 [00:00<49:21,  6.31it/s]

Epoch 8 val loss is 0.486


100%|██████████| 18692/18692 [02:02<00:00, 152.48it/s]
  0%|          | 21/4673 [00:00<00:22, 206.02it/s]

Epoch 9 train loss is 0.482


100%|██████████| 4673/4673 [00:12<00:00, 381.35it/s]

Epoch 9 val loss is 0.484
val loss is not decrease in 3 epoch and break training





In [34]:
# 结果验证
model.eval()
user['movie_pos'] = 1
user['movie_neg'] = 2
test_x = user[['user', 'movie_pos', 'movie_neg']].values
x = torch.from_numpy(test_x).cuda()
user_embed, _, _ = model(x)

movie['user'] = 1
movie['movie_neg'] = 1
test_x = movie[['user', 'movie', 'movie_neg']].values
x = torch.from_numpy(test_x).cuda()
_, movie_embed, _ = model(x)

movie_embed = movie_embed.cpu().detach().numpy()
user_embed = user_embed.cpu().detach().numpy()

# embedding 维度
d = 64
nlist = 10
index = faiss.IndexFlatL2(d)
index.add(movie_embed)

user_movie_dict_val = dict()
for idx, rows in tqdm(val_data.iterrows(), total=len(val_data)):
    u = rows['user']
    m = rows['movie']
    if u not in user_movie_dict_val:
        user_movie_dict_val[u] = [m]
    else:
         user_movie_dict_val[u].append(m)
            
D, I = index.search(user_embed[list(val_data['user'].unique())], 50)

# 召回率计算
hits, total = 0, 0
for uid, rec_list in zip(list(val_data['user'].unique()), I):
    hits += len(set(rec_list)&set(user_movie_dict_val[uid]))
    total += len(user_movie_dict_val[uid])
print("recall is %.3f" % (hits/total))

100%|██████████| 3348/3348 [00:00<00:00, 10039.51it/s]

recall is 0.071





### DSSM loss 模型

In [35]:
# DSSM负采样
sample_list = list(train_data['movie'].unique())
data = list()
for idx, rows in tqdm(train_data.iterrows(), total=len(train_data)):
    use = rows['user']
    mov = rows['movie']
    m = np.random.choice(sample_list, 3)
    data.append([use, mov] + list(m))
data = pd.DataFrame(data, columns=['user', 'movie_pos', 'movie_neg1', 'movie_neg2', 'movie_neg3'])

100%|██████████| 996861/996861 [08:34<00:00, 1936.10it/s]


In [39]:
class trainset(Dataset):
    def __init__(self, data):
        self.x = data

    def __getitem__(self, index):
        x = self.x[index]
        return x

    def __len__(self):
        return len(self.x)

train_df, test_df = train_test_split(data, test_size=0.2, random_state=2021)
train_x = train_df[['user', 'movie_pos', 'movie_neg1', 'movie_neg2', 'movie_neg3']].values
test_x = test_df[['user', 'movie_pos', 'movie_neg1', 'movie_neg2', 'movie_neg3']].values

train_dataset = trainset((train_x))
test_dataset = trainset((test_x))
train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=128, shuffle=False)

In [40]:
# DSSM DNNmodel
class DNNModel(nn.Module):
    def __init__(self, inp_user, inp_movie, out, input_user_categorical_feature, input_movie_categorical_feature, 
                 hidden_layers, dropouts, batch_norm):
        super(DNNModel, self).__init__()
        self.user_embed = nn.Embedding(input_user_categorical_feature[0][0], input_user_categorical_feature[0][1])
        self.movie_embed = nn.Embedding(input_movie_categorical_feature[0][0], input_movie_categorical_feature[0][1])
        
        self.user_dnn = nn.Sequential(
            nn.Linear(128, 64),
#             nn.Dropout(0.5),
            nn.Linear(64, 64)
        )
        
        self.movie_dnn = nn.Sequential(
            nn.Linear(128, 64),
#             nn.Dropout(0.5),
            nn.Linear(64, 64)
        )

        
    def forward(self, x):
        user = self.user_embed(x[:, 0])
        movie_pos = self.movie_embed(x[:, 1])
        movie_neg1 = self.movie_embed(x[:, 2])
        movie_neg2 = self.movie_embed(x[:, 3])
        movie_neg3 = self.movie_embed(x[:, 4])
        
        user = self.user_dnn(user)
        movie_pos = self.movie_dnn(movie_pos)
        movie_neg1 = self.movie_dnn(movie_neg1)
        movie_neg2 = self.movie_dnn(movie_neg2)
        movie_neg3 = self.movie_dnn(movie_neg3)
        user = user/torch.sum(user*user, 1).view(-1,1)
        movie_pos = movie_pos/torch.sum(movie_pos*movie_pos, 1).view(-1,1)
        movie_neg1 = movie_neg1/torch.sum(movie_neg1*movie_neg1, 1).view(-1,1)
        movie_neg2 = movie_neg2/torch.sum(movie_neg2*movie_neg2, 1).view(-1,1)
        movie_neg3 = movie_neg3/torch.sum(movie_neg3*movie_neg3, 1).view(-1,1)
        return user, movie_pos, movie_neg1,  movie_neg2, movie_neg3


In [41]:
# dssm DNN Model模型训练
def loss_function(user, movie_pos, movie_neg1,  movie_neg2, movie_neg3):
    d_n = torch.exp(1.2 * torch.sum(user*movie_pos, 1))
    d_n1 = torch.exp(1.2 * torch.sum(user*movie_neg1, 1))
    d_n2 = torch.exp(1.2 * torch.sum(user*movie_neg2, 1))
    d_n3 = torch.exp(1.2 * torch.sum(user*movie_neg3, 1))
    p = torch.sum(-torch.log(d_n/(d_n + d_n1 + d_n2 + d_n3)))
    return p
    


def train_model(model, train_loader, val_loader, epoch, loss_function, optimizer, path, early_stop):
    """
    pytorch 模型训练通用代码
    :param model: pytorch 模型
    :param train_loader: dataloader, 训练数据
    :param val_loader: dataloader, 验证数据
    :param epoch: int, 训练迭代次数
    :param loss_function: 优化损失函数
    :param optimizer: pytorch优化器
    :param path: save path
    :param early_stop: int, 提前停止步数
    :return: None
    """
    # 是否使用GPU
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
#     device = torch.device("cpu")
    model = model.to(device)
    
    # 多少步内验证集的loss没有变小就提前停止
    patience, eval_loss = 0, 0
    
    # 训练
    for i in range(epoch):
        total_loss, count = 0, 0
        for idx, x in tqdm(enumerate(train_loader), total=len(train_loader)):
            x = x.to(device)
            u, m_p, m_n1, m_n2, m_n3 = model(x)
            loss = loss_function(u, m_p, m_n1, m_n2, m_n3)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            total_loss += float(loss)
            count += 1
            
#         train_auc = roc_auc_score(np.array(y_true), np.array(y_pred))
        torch.save(model, path.format(i+1))
        print("Epoch %d train loss is %.3f" % (i+1, total_loss / count))
    
        # 验证
        total_eval_loss = 0
        model.eval()
        count_eval = 0
        for idx, x in tqdm(enumerate(val_loader), total=len(val_loader)):
            x = x.to(device)
            u, m_p, m_n1, m_n2, m_n3 = model(x)
            loss = loss_function(u, m_p, m_n1, m_n2, m_n3)
            total_eval_loss += float(loss)
            count_eval += 1
        print("Epoch %d val loss is %.3f" % (i+1, total_eval_loss / count_eval))
        
        # 提前停止策略
        if i == 0:
            eval_loss = total_eval_loss / count_eval
        else:
            if total_eval_loss / count_eval < eval_loss:
                eval_loss = total_eval_loss / count_eval
            else:
                if patience < early_stop:
                    patience += 1
                else:
                    print("val loss is not decrease in %d epoch and break training" % patience)
                    break


In [42]:
inp_user = 128
inp_movie = 128
out = 64
input_user_categorical_feature = {0: (6040, 128)}
input_movie_categorical_feature =  {0: (3883, 128)}
hidden_layers = [128, 64]
dropouts = [0.5, 0.5, 0.5]
batch_norm = False

model = DNNModel(inp_user, inp_movie, out, input_user_categorical_feature, input_movie_categorical_feature, 
                 hidden_layers, dropouts, batch_norm)

epoch = 20
early_stop = 3
learn_rate = 0.005
optimizer = torch.optim.Adam(model.parameters(), lr=learn_rate)
path = 'model/model_{}.pth'
train_model(model, train_loader, test_loader, epoch, loss_function, optimizer, path, early_stop)

100%|██████████| 6231/6231 [00:47<00:00, 131.80it/s]
  2%|▏         | 36/1558 [00:00<00:04, 350.46it/s]

Epoch 1 train loss is 126.849


100%|██████████| 1558/1558 [00:04<00:00, 350.93it/s]
  0%|          | 3/6231 [00:00<03:36, 28.78it/s]

Epoch 1 val loss is 119.830


100%|██████████| 6231/6231 [00:47<00:00, 129.93it/s]
  2%|▏         | 35/1558 [00:00<00:04, 345.13it/s]

Epoch 2 train loss is 118.201


100%|██████████| 1558/1558 [00:04<00:00, 345.76it/s]
  0%|          | 2/6231 [00:00<05:33, 18.68it/s]

Epoch 2 val loss is 117.911


100%|██████████| 6231/6231 [00:47<00:00, 132.03it/s]
  2%|▏         | 35/1558 [00:00<00:04, 347.60it/s]

Epoch 3 train loss is 116.731


100%|██████████| 1558/1558 [00:04<00:00, 332.72it/s]
  0%|          | 2/6231 [00:00<05:15, 19.77it/s]

Epoch 3 val loss is 116.403


100%|██████████| 6231/6231 [00:46<00:00, 133.76it/s]
  2%|▏         | 32/1558 [00:00<00:04, 313.46it/s]

Epoch 4 train loss is 116.155


100%|██████████| 1558/1558 [00:04<00:00, 315.11it/s]
  0%|          | 2/6231 [00:00<05:22, 19.30it/s]

Epoch 4 val loss is 121.150


100%|██████████| 6231/6231 [00:46<00:00, 133.98it/s]
  2%|▏         | 35/1558 [00:00<00:04, 346.40it/s]

Epoch 5 train loss is 115.750


100%|██████████| 1558/1558 [00:04<00:00, 347.74it/s]
  0%|          | 3/6231 [00:00<03:29, 29.67it/s]

Epoch 5 val loss is 116.955


100%|██████████| 6231/6231 [00:47<00:00, 130.97it/s]
  2%|▏         | 35/1558 [00:00<00:04, 348.97it/s]

Epoch 6 train loss is 115.580


100%|██████████| 1558/1558 [00:04<00:00, 351.10it/s]
  0%|          | 3/6231 [00:00<03:36, 28.74it/s]

Epoch 6 val loss is 115.408


100%|██████████| 6231/6231 [00:47<00:00, 130.32it/s]
  2%|▏         | 35/1558 [00:00<00:04, 346.41it/s]

Epoch 7 train loss is 115.358


100%|██████████| 1558/1558 [00:04<00:00, 342.72it/s]
  0%|          | 3/6231 [00:00<03:38, 28.46it/s]

Epoch 7 val loss is 115.798


100%|██████████| 6231/6231 [00:47<00:00, 130.46it/s]
  2%|▏         | 35/1558 [00:00<00:04, 346.73it/s]

Epoch 8 train loss is 115.267


100%|██████████| 1558/1558 [00:04<00:00, 350.04it/s]
  0%|          | 3/6231 [00:00<03:45, 27.57it/s]

Epoch 8 val loss is 115.401


100%|██████████| 6231/6231 [00:47<00:00, 130.14it/s]
  2%|▏         | 35/1558 [00:00<00:04, 343.39it/s]

Epoch 9 train loss is 115.138


100%|██████████| 1558/1558 [00:04<00:00, 327.42it/s]
  0%|          | 7/6231 [00:00<01:29, 69.73it/s]

Epoch 9 val loss is 115.170


100%|██████████| 6231/6231 [00:47<00:00, 132.51it/s]
  2%|▏         | 35/1558 [00:00<00:04, 346.19it/s]

Epoch 10 train loss is 115.110


100%|██████████| 1558/1558 [00:04<00:00, 329.38it/s]

Epoch 10 val loss is 115.564
val loss is not decrease in 3 epoch and break training





In [59]:
model.eval()
user['movie_pos'] = 1
user['movie_neg1'] = 2
user['movie_neg2'] = 2
user['movie_neg3'] = 2
test_x = user[['user', 'movie_pos', 'movie_neg1', 'movie_neg2', 'movie_neg3']].values
x = torch.from_numpy(test_x).cuda()
user_embed, _, _, _, _ = model(x)

movie['user'] = 1
movie['movie_neg1'] = 1
movie['movie_neg2'] = 1
movie['movie_neg3'] = 1
test_x = movie[['user', 'movie', 'movie_neg1', 'movie_neg2', 'movie_neg3']].values
x = torch.from_numpy(test_x).cuda()
_, movie_embed, _ ,_, _= model(x)

movie_embed = movie_embed.cpu().detach().numpy()
user_embed = user_embed.cpu().detach().numpy()

# embedding 维度
d = 64
nlist = 10
index = faiss.IndexFlatL2(d)
# index = faiss.IndexIVFFlat(article_quantizer, d, nlist, faiss.METRIC_L2)
# index.train(movie_embed)
index.add(movie_embed)

user_movie_dict_val = dict()
for idx, rows in tqdm(val_data.iterrows(), total=len(val_data)):
    u = rows['user']
    m = rows['movie']
    if u not in user_movie_dict_val:
        user_movie_dict_val[u] = [m]
    else:
         user_movie_dict_val[u].append(m)
            
D, I = index.search(user_embed[list(val_data['user'].unique())], 50)

# 召回率计算
hits, total = 0, 0
for uid, rec_list in zip(list(val_data['user'].unique()), I):
    hits += len(set(rec_list)&set(user_movie_dict_val[uid]))
    total += len(user_movie_dict_val[uid])
print("recall is %.3f" % (hits/total))

100%|██████████| 3348/3348 [00:00<00:00, 10164.68it/s]

recall is 0.052





## 添加user 和 movie相关特征
之前的方案仅仅考虑了用户的ID和电影的ID特征，其他跟用户和电影相关的特征是完全没有考虑进来的，添加用户和电影相关的特征理论上是可以提升整个预测的性能

In [94]:
# 随机负采样
sample_list = list(train_data['movie'].unique())
data = list()
for idx, rows in tqdm(train_data.iterrows(), total=len(train_data)):
    use = rows['user']
    mov = rows['movie']
    data.append([use, mov, 1])
    for m in np.random.choice(sample_list, 3):
        data.append([use, m, 0])
data = pd.DataFrame(data, columns=['user', 'movie', 'tag'])

100%|██████████| 996861/996861 [08:40<00:00, 1914.32it/s]


In [95]:
# 合并用户特征和电影特征
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
user['gender'] = le.fit_transform(user['gender'])
user['age'] = le.fit_transform(user['age'])
user['occupation'] = le.fit_transform(user['occupation'])

data = pd.merge(data, user[['user', 'gender', 'age', 'occupation']], how='left', on='user')

genres = list()
tmp = movie['genres'].apply(lambda x: x.split('|'))
for l in tmp.tolist():
    genres += l
    
genres_dict = dict()
for idx, g in  enumerate(list(set(genres))):
    genres_dict[g] = idx + 1
    
movie['genres'] = tmp.apply(lambda x: [genres_dict[i] for i in x])

data = pd.merge(data, movie[['movie', 'genres']], how='left', on='movie')

In [96]:
data.head()

Unnamed: 0,user,movie,tag,gender,age,occupation,genres
0,0,1176,1,0,0,10,[4]
1,0,1360,0,0,0,10,"[13, 12, 10]"
2,0,298,0,0,0,10,"[4, 10]"
3,0,1865,0,0,0,10,[13]
4,0,655,1,0,0,10,"[2, 15, 12]"


In [110]:
class trainset(Dataset):
    def __init__(self, data):
        self.x = data[0]
        self.y = data[1]

    def __getitem__(self, index):
        x = self.x[index]
        y = self.y[index]
        data = (x, y)
        return data

    def __len__(self):
        return len(self.x)

# 训练集验证集随机分割
train_df, test_df = train_test_split(data, test_size=0.2, random_state=2021)
train_x = train_df[['user', 'gender', 'age', 'occupation', 'movie']].values
train_y = train_df['tag'].values
test_x = test_df[['user', 'gender', 'age', 'occupation', 'movie']].values
test_y = test_df['tag'].values

# 构造dataloader
train_dataset = trainset((train_x, train_y))
test_dataset = trainset((test_x, test_y))
train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=128, shuffle=False)

In [115]:
class DNNModel(nn.Module):
    def __init__(self, inp_user, inp_movie, out, input_user_categorical_feature, input_movie_categorical_feature, 
                 hidden_layers, dropouts, batch_norm):
        super(DNNModel, self).__init__()
        self.user_embed = nn.Embedding(input_user_categorical_feature[0][0], input_user_categorical_feature[0][1])
        self.gender_embed = nn.Embedding(input_user_categorical_feature[1][0], input_user_categorical_feature[1][1])
        self.age_embed = nn.Embedding(input_user_categorical_feature[2][0], input_user_categorical_feature[2][1])
        self.occupation_embed = nn.Embedding(input_user_categorical_feature[3][0], input_user_categorical_feature[3][1])
        self.movie_embed = nn.Embedding(input_movie_categorical_feature[0][0], input_movie_categorical_feature[0][1])
#         self.genres_embed = nn.Embedding(input_movie_categorical_feature[1][0], input_movie_categorical_feature[1][1])
        
        self.user_dnn = nn.Sequential(
            nn.Linear(512, 128),
#             nn.Dropout(0.5),
            nn.Linear(128, 64)
        )
        
        self.movie_dnn = nn.Sequential(
            nn.Linear(128, 128),
#             nn.Dropout(0.5),
            nn.Linear(128, 64)
        )

        
    def forward(self, x):
        u = self.user_embed(x[:, 0])
        g = self.gender_embed(x[:, 1])
        a = self.age_embed(x[:, 2])
        oc = self.occupation_embed(x[:, 3])
        m = self.movie_embed(x[:, 4])
        
        u = torch.cat([u, g, a, oc], -1)
        u = self.user_dnn(u)
        m = self.movie_dnn(m)
        u = u/torch.sum(u*u, 1).view(-1,1)
        m = m/torch.sum(m*m, 1).view(-1,1)
        return u, m

In [116]:
def train_model(model, train_loader, val_loader, epoch, loss_function, optimizer, path, early_stop):
    """
    pytorch 模型训练通用代码
    :param model: pytorch 模型
    :param train_loader: dataloader, 训练数据
    :param val_loader: dataloader, 验证数据
    :param epoch: int, 训练迭代次数
    :param loss_function: 优化损失函数
    :param optimizer: pytorch优化器
    :param path: save path
    :param early_stop: int, 提前停止步数
    :return: None
    """
    # 是否使用GPU
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
#     device = torch.device("cpu")
    model = model.to(device)
    
    # 多少步内验证集的loss没有变小就提前停止
    patience, eval_loss = 0, 0
    
    # 训练
    for i in range(epoch):
        total_loss, count = 0, 0
        y_pred = list()
        y_true = list()
        for idx, (x, y) in tqdm(enumerate(train_loader), total=len(train_loader)):
            x, y = x.to(device), y.to(device) 
            u, m = model(x)
            predict = torch.sigmoid(torch.sum(u*m, 1))
            y_pred.extend(predict.cpu().detach().numpy())
            y_true.extend(y.cpu().detach().numpy())
            loss = loss_function(predict, y.float())
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            total_loss += float(loss)
            count += 1
            
        train_auc = roc_auc_score(np.array(y_true), np.array(y_pred))
        torch.save(model, path.format(i+1))
        print("Epoch %d train loss is %.3f and train auc is %.3f" % (i+1, total_loss / count, train_auc))
    
        # 验证
        total_eval_loss = 0
        model.eval()
        count_eval = 0
        val_y_pred = list()
        val_true = list()
        for idx, (x, y) in tqdm(enumerate(val_loader), total=len(val_loader)):
            x, y = x.to(device), y.to(device)
            u, m = model(x)
            predict = torch.sigmoid(torch.sum(u*m, 1))
            val_y_pred.extend(predict.cpu().detach().numpy())
            val_true.extend(y.cpu().detach().numpy())
            loss = loss_function(predict, y.float())
            total_eval_loss += float(loss)
            count_eval += 1
        val_auc = roc_auc_score(np.array(y_true), np.array(y_pred))
        print("Epoch %d val loss is %.3fand train auc is %.3f" % (i+1, total_eval_loss / count_eval, val_auc))
        
        # 提前停止策略
        if i == 0:
            eval_loss = total_eval_loss / count_eval
        else:
            if total_eval_loss / count_eval < eval_loss:
                eval_loss = total_eval_loss / count_eval
            else:
                if patience < early_stop:
                    patience += 1
                else:
                    print("val loss is not decrease in %d epoch and break training" % patience)
                    break

In [117]:
# 模型初始化
inp_user = 128
inp_movie = 128
out = 64
input_user_categorical_feature = {0: (6040, 128), 1: (2, 128), 2: (7, 128), 3: (21, 128)}
input_movie_categorical_feature =  {0: (3883, 128), 1:(18, 128)}
hidden_layers = [128, 64]
dropouts = [0.5, 0.5, 0.5]
batch_norm = False

model = DNNModel(inp_user, inp_movie, out, input_user_categorical_feature, input_movie_categorical_feature, 
                 hidden_layers, dropouts, batch_norm)

# 模型训练
epoch = 20
loss_function = F.binary_cross_entropy_with_logits
early_stop = 3
learn_rate = 0.005
optimizer = torch.optim.Adam(model.parameters(), lr=learn_rate)
path = 'model/model_{}.pth'

train_model(model, train_loader, test_loader, epoch, loss_function, optimizer, path, early_stop)

100%|██████████| 24922/24922 [03:27<00:00, 120.34it/s]
  1%|          | 41/6231 [00:00<00:15, 407.20it/s]

Epoch 1 train loss is 0.700 and train auc is 0.697


100%|██████████| 6231/6231 [00:19<00:00, 313.63it/s]


Epoch 1 val loss is 0.692and train auc is 0.697


100%|██████████| 24922/24922 [03:10<00:00, 131.15it/s]
  1%|          | 44/6231 [00:00<00:14, 435.12it/s]

Epoch 2 train loss is 0.693 and train auc is 0.744


100%|██████████| 6231/6231 [00:19<00:00, 325.07it/s]


Epoch 2 val loss is 0.692and train auc is 0.744


100%|██████████| 24922/24922 [02:40<00:00, 155.62it/s]
  1%|          | 45/6231 [00:00<00:14, 441.30it/s]

Epoch 3 train loss is 0.692 and train auc is 0.738


100%|██████████| 6231/6231 [00:14<00:00, 430.35it/s]


Epoch 3 val loss is 0.691and train auc is 0.738


100%|██████████| 24922/24922 [02:37<00:00, 158.64it/s]
  1%|          | 44/6231 [00:00<00:14, 432.15it/s]

Epoch 4 train loss is 0.691 and train auc is 0.743


100%|██████████| 6231/6231 [00:14<00:00, 435.80it/s]


Epoch 4 val loss is 0.690and train auc is 0.743


100%|██████████| 24922/24922 [02:35<00:00, 160.22it/s]
  1%|          | 44/6231 [00:00<00:14, 432.69it/s]

Epoch 5 train loss is 0.691 and train auc is 0.743


100%|██████████| 6231/6231 [00:14<00:00, 415.44it/s]


Epoch 5 val loss is 0.690and train auc is 0.743


100%|██████████| 24922/24922 [02:34<00:00, 161.64it/s]
  1%|          | 43/6231 [00:00<00:14, 426.91it/s]

Epoch 6 train loss is 0.691 and train auc is 0.742


100%|██████████| 6231/6231 [00:14<00:00, 431.93it/s]


Epoch 6 val loss is 0.691and train auc is 0.742


100%|██████████| 24922/24922 [02:34<00:00, 161.24it/s]
  1%|          | 43/6231 [00:00<00:14, 427.99it/s]

Epoch 7 train loss is 0.691 and train auc is 0.742


100%|██████████| 6231/6231 [00:14<00:00, 427.78it/s]


Epoch 7 val loss is 0.690and train auc is 0.742


100%|██████████| 24922/24922 [02:35<00:00, 160.77it/s]
  1%|          | 34/6231 [00:00<00:18, 328.96it/s]

Epoch 8 train loss is 0.692 and train auc is 0.738


100%|██████████| 6231/6231 [00:14<00:00, 417.40it/s]


Epoch 8 val loss is 0.690and train auc is 0.738


100%|██████████| 24922/24922 [02:34<00:00, 161.28it/s]
  1%|          | 42/6231 [00:00<00:14, 413.43it/s]

Epoch 9 train loss is 0.691 and train auc is 0.740


100%|██████████| 6231/6231 [00:14<00:00, 423.71it/s]


Epoch 9 val loss is 0.693and train auc is 0.740
val loss is not decrease in 3 epoch and break training


In [118]:
# 结果验证
model.eval()
user['movie'] = 1
test_x = user[['user', 'gender', 'age', 'occupation', 'movie']].values
x = torch.from_numpy(test_x).cuda()
user_embed, _ = model(x)

movie['user'] = 1
movie['gender'] = 1
movie['age'] = 1
movie['occupation'] = 1
test_x = movie[['user', 'gender', 'age', 'occupation','movie']].values
x = torch.from_numpy(test_x).cuda()
_, movie_embed = model(x)

movie_embed = movie_embed.cpu().detach().numpy()
user_embed = user_embed.cpu().detach().numpy()


# faiss索引构建
d = 64
nlist = 10
index = faiss.IndexFlatL2(d)
index.add(movie_embed)

# 验证集数据字典化
user_movie_dict_val = dict()
for idx, rows in tqdm(val_data.iterrows(), total=len(val_data)):
    u = rows['user']
    m = rows['movie']
    if u not in user_movie_dict_val:
        user_movie_dict_val[u] = [m]
    else:
         user_movie_dict_val[u].append(m)
            
# 用户推荐结果索引           
D, I = index.search(user_embed[list(val_data['user'].unique())], 50)

# 召回率计算
hits, total = 0, 0
for uid, rec_list in zip(list(val_data['user'].unique()), I):
    hits += len(set(rec_list)&set(user_movie_dict_val[uid]))
    total += len(user_movie_dict_val[uid])
print("recall is %.3f" % (hits/total))

100%|██████████| 3348/3348 [00:00<00:00, 8214.79it/s]


recall is 0.063


## 总结
目前实验效果较好的方案是基于0，1 label的随机负采样方案和基于triplet loss的正负样本距离方案，考虑加入用户相关的属性特征融合到用户向量中目前效果不是特别理想后续会尝试优化，目前实验的效果稳定性还有一定的优化空间参数也有一定的调优空间，具体调惨和初始化方案可以深入尝试。希望通过上述的方案能直观感受到各种深度召回方案之间的差距和问题，实践结合理论才能提升自我。