In [95]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data as Data

import time, json, datetime 
from tqdm import tqdm

import numpy as np 
import pandas as pd 
from sklearn.metrics import log_loss, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

In [83]:
class DeepFM(nn.Module):
    def __init__(self, cate_fea_nuniqs, nume_fea_size=0, emb_size=8, 
                 hid_dims=[256, 128], num_classes=1, dropout=[0.2, 0.2]): 
        """
        cate_fea_nuniqs: 类别特征的唯一值个数列表，也就是每个类别特征的vocab_size所组成的列表
        nume_fea_size: 数值特征的个数，该模型会考虑到输入全为类别型，即没有数值特征的情况 
        """
        super().__init__()
        self.cate_fea_size = len(cate_fea_nuniqs)
        self.nume_fea_size = nume_fea_size
        
        """FM部分"""
        # 一阶
        if self.nume_fea_size != 0:
            self.fm_1st_order_dense = nn.Linear(self.nume_fea_size, 1)  # 数值特征的一阶表示
        self.fm_1st_order_sparse_emb = nn.ModuleList([
            nn.Embedding(voc_size, 1) for voc_size in cate_fea_nuniqs])  # 类别特征的一阶表示
        
        # 二阶
        self.fm_2nd_order_sparse_emb = nn.ModuleList([
            nn.Embedding(voc_size, emb_size) for voc_size in cate_fea_nuniqs])  # 类别特征的二阶表示
        
        """DNN部分"""
        self.all_dims = [self.cate_fea_size * emb_size] + hid_dims
        self.dense_linear = nn.Linear(self.nume_fea_size, self.cate_fea_size * emb_size)  # 数值特征的维度变换到FM输出维度一致
        self.relu = nn.ReLU()
        # for DNN 
        for i in range(1, len(self.all_dims)):
            setattr(self, 'linear_'+str(i), nn.Linear(self.all_dims[i-1], self.all_dims[i]))
            setattr(self, 'batchNorm_' + str(i), nn.BatchNorm1d(self.all_dims[i]))
            setattr(self, 'activation_' + str(i), nn.ReLU())
            setattr(self, 'dropout_'+str(i), nn.Dropout(dropout[i-1]))
        # for output 
        self.dnn_linear1 = nn.Linear(hid_dims[-1]+2, 10)
        self.dnn_linear2 = nn.Linear(hid_dims[-1], 1)
        
    def forward(self, X_sparse, X_dense=None):
        """
        X_sparse: 类别型特征输入  [bs, cate_fea_size]
        X_dense: 数值型特征输入（可能没有）  [bs, dense_fea_size]
        """
        
        """FM 一阶部分"""
        
        print('self.fm_1st_order_sparse_emb', len(self.fm_1st_order_sparse_emb))
        
        fm_1st_sparse_res = [emb(X_sparse[:, i].unsqueeze(1)).view(-1, 1) 
                             for i, emb in enumerate(self.fm_1st_order_sparse_emb)]
        fm_1st_sparse_res = torch.cat(fm_1st_sparse_res, dim=1)  # [bs, cate_fea_size]
        fm_1st_sparse_res = torch.sum(fm_1st_sparse_res, 1,  keepdim=True)  # [bs, 1]
        
        if X_dense is not None:
            fm_1st_dense_res = self.fm_1st_order_dense(X_dense) 
            fm_1st_part = fm_1st_sparse_res + fm_1st_dense_res
        else:
            fm_1st_part = fm_1st_sparse_res   # [bs, 1]
        
        """FM 二阶部分"""
        fm_2nd_order_res = [emb(X_sparse[:, i].unsqueeze(1)) for i, emb in enumerate(self.fm_2nd_order_sparse_emb)]
        fm_2nd_concat_1d = torch.cat(fm_2nd_order_res, dim=1)  # [bs, n, emb_size]  n为类别型特征个数(cate_fea_size)
        
        # 先求和再平方
        sum_embed = torch.sum(fm_2nd_concat_1d, 1)  # [bs, emb_size]
        square_sum_embed = sum_embed * sum_embed    # [bs, emb_size]
        # 先平方再求和
        square_embed = fm_2nd_concat_1d * fm_2nd_concat_1d  # [bs, n, emb_size]
        sum_square_embed = torch.sum(square_embed, 1)  # [bs, emb_size]
        # 相减除以2 
        sub = square_sum_embed - sum_square_embed  
        sub = sub * 0.5   # [bs, emb_size]
        
        fm_2nd_part = torch.sum(sub, 1, keepdim=True)   # [bs, 1]
        
        """DNN部分"""
        dnn_out = torch.flatten(fm_2nd_concat_1d, 1)   # [bs, n * emb_size]
        
        if X_dense is not None:
            dense_out = self.relu(self.dense_linear(X_dense))   # [bs, n * emb_size]
            dnn_out = dnn_out + dense_out   # [bs, n * emb_size]
        
        for i in range(1, len(self.all_dims)):
            dnn_out = getattr(self, 'linear_' + str(i))(dnn_out)
            dnn_out = getattr(self, 'batchNorm_' + str(i))(dnn_out)
            dnn_out = getattr(self, 'activation_' + str(i))(dnn_out)
            dnn_out = getattr(self, 'dropout_' + str(i))(dnn_out)
        
        out1 = self.dnn_linear1(torch.cat([dnn_out, fm_1st_part, fm_2nd_part],1))   # [bs, N]
        # out1 = fm_1st_part + fm_2nd_part + dnn_out   # [bs, 1]
         
        dnn_out2 = self.dnn_linear2(dnn_out)   # [bs, 1]
        out2 = fm_1st_part + fm_2nd_part + dnn_out2   # [bs, 1]
        
        return out1, out2

In [84]:
model = DeepFM(cate_fea_nuniqs=[5910798+1, 50355+1,34, 340, 1927],
              nume_fea_size=2)

In [45]:
model

DeepFM(
  (fm_1st_order_dense): Linear(in_features=2, out_features=1, bias=True)
  (fm_1st_order_sparse_emb): ModuleList(
    (0): Embedding(5910799, 1)
    (1): Embedding(50356, 1)
    (2): Embedding(34, 1)
    (3): Embedding(340, 1)
    (4): Embedding(1927, 1)
  )
  (fm_2nd_order_sparse_emb): ModuleList(
    (0): Embedding(5910799, 8)
    (1): Embedding(50356, 8)
    (2): Embedding(34, 8)
    (3): Embedding(340, 8)
    (4): Embedding(1927, 8)
  )
  (dense_linear): Linear(in_features=2, out_features=40, bias=True)
  (relu): ReLU()
  (linear_1): Linear(in_features=40, out_features=256, bias=True)
  (batchNorm_1): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (activation_1): ReLU()
  (dropout_1): Dropout(p=0.2, inplace=False)
  (linear_2): Linear(in_features=256, out_features=128, bias=True)
  (batchNorm_2): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (activation_2): ReLU()
  (dropout_2): Dropout(p=0.2, inplace=

In [3]:
import pandas as pd
import glob, gc

%pylab inline
import seaborn as sns

INPUT_PATH = './3'

Populating the interactive namespace from numpy and matplotlib


In [None]:
test_data = pd.read_hdf('digix-data.hdf', 'test_data')
user_features = pd.read_hdf('digix-data.hdf', 'user_features')
video_features = pd.read_hdf('digix-data.hdf', 'video_features')
history_behavior = pd.read_hdf('digix-data.hdf', 'history_behavior')

In [5]:
# history_behavior = history_behavior[history_behavior['user_id'].isin(test_data['user_id'].unique())]
val_behavior = history_behavior[history_behavior['pt_d'] == 20210502]
train_behavior = history_behavior[history_behavior['pt_d'] != 20210502]

In [6]:
train_behavior = pd.concat([
    train_behavior[train_behavior['watch_label'] == 0].sample(1000000),
    train_behavior[train_behavior['watch_label'] != 0],
])

In [7]:
val_behavior = pd.concat([
    val_behavior[val_behavior['watch_label'] == 0].sample(1000000),
    val_behavior[val_behavior['watch_label'] != 0],
])

In [12]:
user_features = user_features.set_index('user_id')

In [13]:
user_features

Unnamed: 0_level_0,age,gender,country,province,city,city_level,device_name
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1757005,3,1,0,9,6,3,327
17938,0,0,0,4,22,3,327
4263520,1,0,0,19,1,5,327
1411600,3,0,0,5,138,1,327
3992242,2,0,0,0,142,0,327
...,...,...,...,...,...,...,...
3223427,4,0,0,3,3,3,28
4707826,4,0,0,17,249,1,28
5907653,0,0,0,11,65,0,28
3633224,3,0,0,2,57,1,28


In [25]:
import torch
import torch.nn.functional as F
from torch.autograd import Variable
from torch import nn
from torch.utils.data import Dataset, DataLoader
torch.manual_seed(0)

class MLPDataset(Dataset):
    def __init__(self, history_behavior, train=True):
        self.history_behavior = history_behavior
        self.train = train
        
    def __getitem__(self, index):
        user_id = self.history_behavior.iloc[index]['user_id']
        video_id = self.history_behavior.iloc[index]['video_id']
        
        feed_dict = {
            'user_id': user_id,
            'video_id': video_id,
            'user_province': user_features.loc[user_id]['province'],
            'user_city': user_features.loc[user_id]['city'],
            'user_device': user_features.loc[user_id]['device_name'],
            'user_age': user_features.loc[user_id]['age'],
            'city_level': user_features.loc[user_id]['city_level'],
        }
        
        if self.train:
            watch_label = self.history_behavior.iloc[index]['watch_label']
            share_label = self.history_behavior.iloc[index]['is_share']

            feed_dict['watch_label'] = watch_label
            feed_dict['share_label'] = share_label
            
            return feed_dict
            
#             return user_id, video_id, \
#                 torch.from_numpy(np.array(watch_label)), \
#                 torch.from_numpy(np.array([share_label]))
        else:
            return feed_dict
        
    def __len__(self):
        return len(self.history_behavior)
    
train_loader = torch.utils.data.DataLoader(
    dataset = MLPDataset(train_behavior),
    batch_size=100, shuffle=True, num_workers=5,
)

# batch_size 1000

val_loader = torch.utils.data.DataLoader(
    dataset = MLPDataset(val_behavior, train=True),
    batch_size=200, shuffle=False, num_workers=5,
)

In [26]:
for data in train_loader:
    break

In [28]:
data.keys()

dict_keys(['user_id', 'video_id', 'user_province', 'user_city', 'user_device', 'user_age', 'city_level', 'watch_label', 'share_label'])

In [53]:
sparse_feat = torch.stack([data[x].long() 
                for x in ['user_id', 'video_id', 'user_province', 
                          'user_city', 'user_device']]).T

dense_feat = torch.stack([data[x].float() for x in ['user_age', 'city_level']]).T

In [85]:
model(sparse_feat, dense_feat)[0].shape

self.fm_1st_order_sparse_emb 5


torch.Size([100, 10])

In [86]:
model(sparse_feat, dense_feat)[1].shape

self.fm_1st_order_sparse_emb 5


torch.Size([100, 1])

In [98]:
video_features

Unnamed: 0,video_id,video_name,video_tags,video_description,video_release_date,video_director_list,video_actor_list,video_score,video_second_class,video_duration
0,3460,脱皮爸爸,"院线电影,家庭关系,命运","中年失意的儿子田力行（古天乐饰）在生活上遇到了重重危机：母亲病逝,工作不顺,妻子要求离婚。正...",2017-04-27,司徒慧焯,"吴镇宇,古天乐,春夏,蔡洁",7.398438,"剧情,喜剧,奇幻",5913
1,14553,喜气洋洋小金莲,"古装喜剧,剧情片,喜剧片,内地电影,欢乐喜剧,爱情纠纷",故事始于西门庆为西门药业的“伟哥”产品寻找代言人，西门庆初见潘金莲，一时惊为天人，为成功抱得...,2015-12-30,"杨珊珊,李亚玲","陈南飞,程隆妮,王闯,贾海涛,闫薇儿",5.601562,喜剧,6217
2,1214,风流家族,"男女关系,家庭关系,命运,院线电影",香世仁（钟镇涛 饰）是家财万贯的香港富豪，在满足了一切物质上的要求后，他将生活的重心放在了儿...,2002-03-07,"邱礼涛,杨漪珊","张家辉,卢巧音,钟镇涛,叶童,李蕙敏,张坚庭,袁洁莹,黄佩霞,齐芷瑶,刘以达,叶伟信,邹凯光...",6.800781,"都市,喜剧,爱情,家庭",5963
3,30639,大提琴的故事,"短片,动画片",低音大提琴演奏家史密斯科夫正要去参加某贵族的沙龙，途中他被河边的美丽景色所吸引，驻足观看。兴...,1949-01-01,"伊里·特恩卡,契诃夫",,,"动画,爱情",17371
4,38522,歌舞大王齐格飞,"喜剧片,人物传记,浪漫爱情",罗伯特．Z．伦纳德导演的这部影片以百老汇最大的歌舞团——齐格菲歌舞团的创办人佛罗伦斯．齐格菲...,1936-04-08,"罗伯特·Z·伦纳德,William Anthony McGuire","威廉·鲍威尔,玛娜·洛伊,路易丝·赖纳,弗兰克·摩根,范妮·布莱斯,弗吉尼亚·布鲁斯,雷吉纳...",7.699219,"剧情,歌舞,喜剧",10608
...,...,...,...,...,...,...,...,...,...,...
49726,36024,跆拳震九州岛,"动作片,战争片",日本占领韩国时期，日军为镇压反抗志士，设特务机关，汉城“横山道馆”亦属其行列。跆拳道首徒金正...,1973-09-12,黄枫,"茅瑛,黄家达,黄仁植,安德鲁·摩根,陈会毅,金琪珠",6.898438,"剧情,动作,战争",5736
49727,11306,小豹杜玛,"境外院线,冒险,家庭,人与动物,南非草原",广袤野性的南非草原。入夜，一只迷路的印度豹幼崽跌跌撞撞地闯进高速公路。幸运的是，彼得（坎贝尔...,2005-04-22,拉罗尔·巴尔兰德,"Alex Michaeletos,坎贝尔·斯科特,霍普·戴维斯,伊默恩·沃克,Mary Ma...",8.898438,"剧情,冒险,家庭,儿童",6034
49728,16178,父子,"家庭关系,人际关系,命运",在马来西亚的一个华人社区里，伴随着一串年轻稚嫩的歌声，小男孩阿宝梦见爸爸骑着自行车载着他穿过...,2006-11-30,"谭家明,田开良","郭富城,吴澋滔,杨采妮,林熙蕾",8.000000,"剧情,家庭",6918
49729,5337,泪痕,"院线电影,矛盾冲突,冲破万难,人性,人际关系,家庭关系,阴谋,挽救局面,恶势力",“四人帮”被打倒后，朱克实（李仁堂 饰）受上级指派，作为金县的县委书记走马上任。此前该县书记...,1979-01-01,"李文化,马烽,孙谦","李仁堂,谢芳,杨威,邵万林,方辉,许福印,茂路,侯冠群",7.898438,剧情,6885


In [90]:
import numpy as np
import pandas as pd 
from gensim import models

def to_text_vector(txt, model):
    '''
        将文本txt转化为文本向量
    '''
    words = txt.split(',') 
    array = np.asarray([model.wv[w] for w in words if w in words],dtype='float32') 
    return array.mean(axis=0)

## 案例
sentences = ["1,2,3",'3,4,1','1,4,2']
model = models.Word2Vec(sentences, workers=8, size=20, min_count = 1, window = 2)
to_text_vector(txt="1,2,3", model= model)

array([ 5.2690147e-03, -1.8744482e-02, -3.9180042e-03, -3.1476885e-03,
       -4.5331237e-03, -1.0070135e-02, -4.8246565e-03, -1.4685404e-04,
        1.5172020e-02,  2.6108501e-03, -6.7785494e-03,  1.1122740e-02,
        1.0333024e-03, -3.7350107e-03, -3.3984147e-04,  2.9818466e-05,
       -1.7731095e-03,  1.9154087e-02,  7.0528746e-05, -1.8537654e-02],
      dtype=float32)

In [93]:
from sklearn.feature_extraction.text import CountVectorizer
corpus = [
    'This is the first document.',
    'This document is the second document.',
    'And this is the third one.',
    'Is this the first document?',]

vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)
X.toarray()

array([[0, 1, 1, 1, 0, 0, 1, 0, 1],
       [0, 2, 0, 1, 0, 1, 1, 0, 1],
       [1, 0, 0, 1, 1, 0, 1, 1, 1],
       [0, 1, 1, 1, 0, 0, 1, 0, 1]])

In [94]:
from sklearn.feature_extraction.text import TfidfVectorizer
corpus = [
    'This is the first document.',
    'This document is the second document.',
    'And this is the third one.',
    'Is this the first document?',]

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)
X.toarray()

array([[0.        , 0.46979139, 0.58028582, 0.38408524, 0.        ,
        0.        , 0.38408524, 0.        , 0.38408524],
       [0.        , 0.6876236 , 0.        , 0.28108867, 0.        ,
        0.53864762, 0.28108867, 0.        , 0.28108867],
       [0.51184851, 0.        , 0.        , 0.26710379, 0.51184851,
        0.        , 0.26710379, 0.51184851, 0.26710379],
       [0.        , 0.46979139, 0.58028582, 0.38408524, 0.        ,
        0.        , 0.38408524, 0.        , 0.38408524]])