Paper (论文): Wide & Deep Learning for Recommender Systems (Cheng et al., 2016) — https://arxiv.org/pdf/1606.07792.pdf

In [12]:
import numpy as np
import tensorflow as tf
from tensorflow.keras import backend as K

from tensorflow.keras.layers import Layer
from tensorflow import keras
from tensorflow.keras.regularizers import l2

import pandas as pd
from sklearn.model_selection import train_test_split

In [8]:
import sys
# Make the local ./util/ directory importable. The path is relative, so this only
# works when the notebook's working directory is the folder that contains util/.
sys.path.append('./util/')
# The helper name is spelled "embdding" here; it must match whatever utils defines.
from utils import load_data_embdding 

# Load the three source tables. Later cells use them as pandas DataFrames joined on
# 'movieid' / 'userid'; presumably this is a MovieLens-style dataset -- confirm in utils.
users, movies, ratings = load_data_embdding()

In [9]:
# --- Assemble one flat table: each rating joined with its movie and user attributes. ---
# `columns=` already selects the axis, so the redundant `axis=1` is not passed.
data1 = pd.merge(ratings.drop(columns=['timestamp']), movies, how='left', on='movieid')
data = pd.merge(data1, users, how='left', on='userid')

# Features are everything except the identifiers, the title and the target itself.
X = data.drop(columns=['userid', 'movieid', 'title', 'rating'])
Y = data['rating'].values

# genres, gender, age and occupationid are the four features that need an embedding.
# They could be merged into one vocabulary, but here each one gets its own id space.

# --- genres: variable-length list of genre names per movie ---
# Iterate over the column values directly (the previous version fed index *labels*
# into positional `.iloc`, which is only correct for a default 0..n-1 index).
# Sorting makes the genre -> id mapping identical across kernel restarts; iteration
# order of a set of strings is otherwise subject to hash randomization.
set_genres = sorted({genre for genres in movies['genres'] for genre in genres})
dic_genres = {genre: idx for idx, genre in enumerate(set_genres)}
dic_genres['UNK'] = len(dic_genres)  # extra id reserved for padding
X['genres'] = X['genres'].apply(lambda genres: [dic_genres[g] for g in genres])
# Pad every genre list on the right with the UNK id up to a fixed length of 6.
# NOTE(review): rows with more than 6 genres are truncated by pad_sequences -- confirm
# that 6 really is the maximum genre count in the data.
x_genres = tf.keras.preprocessing.sequence.pad_sequences(list(X['genres'].values),
                                                         value=dic_genres['UNK'],
                                                         padding='post',
                                                         maxlen=6)

# --- gender / age / occupation: one id per row, wrapped in a length-1 list ---
dic_gender = {'F': 0, 'M': 1}
# Look the whole value up (instead of iterating over its characters) so the encoding
# is consistent with age/occupation and fails loudly on any unexpected value.
X['gender'] = X['gender'].apply(lambda x: [dic_gender[x]])
dic_age = {1: 0, 56: 1, 25: 2, 45: 3, 50: 4, 35: 5, 18: 6}
X['age'] = X['age'].apply(lambda x: [dic_age[x]])
# Occupation ids are assigned in order of first appearance in the merged table.
list_occ = list(pd.unique(data['occupationid']))
dic_occ = {occ: idx for idx, occ in enumerate(list_occ)}
X['occupationid'] = X['occupationid'].apply(lambda x: [dic_occ[x]])

x_gender = list(X['gender'].values)
x_age = list(X['age'].values)
x_occupationid = list(X['occupationid'].values)

# --- train / test split ---
# A single call splits all arrays with the same row permutation, so the features and
# the target stay aligned by construction rather than by repeating `random_state`.
(train_x_genres, test_x_genres,
 train_x_gender, test_x_gender,
 train_x_age, test_x_age,
 train_x_occupationid, test_x_occupationid,
 train_y, test_y) = train_test_split(
    np.array(x_genres),
    np.array(x_gender),
    np.array(x_age),
    np.array(x_occupationid),
    Y,
    random_state=11,
)

In [13]:
# Shared hyper-parameters for the embedding tables.
EMBEDDING_DIM = 16
GENRES_LEN = 6  # length of each padded genre-id sequence fed to the "genres" input

# One Input + Embedding pair per categorical feature. Vocabulary sizes are read from
# the id dictionaries built during encoding (instead of hard-coded 2 / 7 / 21), so a
# table can never be smaller than the largest id it will be asked to look up.
input_genres = keras.layers.Input(shape=(GENRES_LEN,), name="genres")
embedding_genres = keras.layers.Embedding(output_dim=EMBEDDING_DIM, input_dim=len(dic_genres),
                                          input_length=GENRES_LEN)(input_genres)

input_gender = keras.layers.Input(shape=(1,), name="gender")
embedding_gender = keras.layers.Embedding(output_dim=EMBEDDING_DIM, input_dim=len(dic_gender),
                                          input_length=1)(input_gender)

input_age = keras.layers.Input(shape=(1,), name="age")
embedding_age = keras.layers.Embedding(output_dim=EMBEDDING_DIM, input_dim=len(dic_age),
                                       input_length=1)(input_age)

input_occ = keras.layers.Input(shape=(1,), name="occupationid")
embedding_occ = keras.layers.Embedding(output_dim=EMBEDDING_DIM, input_dim=len(dic_occ),
                                       input_length=1)(input_occ)

# Stack the four embedded sequences along the sequence axis (6 + 1 + 1 + 1 positions),
# then average over that axis to get one EMBEDDING_DIM-sized vector per sample.
# NOTE(review): no masking is configured, so padded 'UNK' genre positions take part in
# the average -- confirm this is intended.
embedding_combine = keras.layers.concatenate(inputs=[embedding_genres, embedding_gender, embedding_age,
                                                     embedding_occ], axis=1)
embedding_combine = keras.layers.GlobalAveragePooling1D()(embedding_combine)

In [14]:
## wide: a single linear unit applied directly to the pooled embedding vector
wide_layer = keras.layers.Dense(units=1)(embedding_combine)

## dnn: two ReLU hidden layers, each followed by batch normalization, then a linear head
dnn_layer = embedding_combine
for hidden_units in (64, 32):
    dnn_layer = keras.layers.Dense(hidden_units, activation='relu')(dnn_layer)
    dnn_layer = keras.layers.BatchNormalization()(dnn_layer)
dnn_layer = keras.layers.Dense(units=1)(dnn_layer)

## wide_deep
# Regression task, so the two heads are simply averaged; for classification they would
# be added and followed by an activation instead.
outputs = keras.layers.average([wide_layer, dnn_layer])

model = tf.keras.Model(
    inputs=[input_genres, input_gender, input_age, input_occ],
    outputs=[outputs],
)

# Train on MSE; MAE and MSE are also reported as metrics.
optimizer = keras.optimizers.RMSprop(learning_rate=0.001)
model.compile(
    loss='mean_squared_error',
    optimizer=optimizer,
    metrics=['mean_absolute_error', 'mean_squared_error'],
)

In [15]:
# Stop once val_loss has not improved for 3 consecutive epochs and roll the model back
# to the weights of the best epoch. Without `restore_best_weights=True` the model would
# keep the weights from the last (i.e. `patience` epochs past the best) epoch.
early_stopping = keras.callbacks.EarlyStopping(monitor='val_loss', patience=3,
                                               restore_best_weights=True)

# NOTE(review): the held-out split is used as validation data and therefore drives early
# stopping; the val_* metrics are an optimistic estimate of test performance. Consider
# carving a separate validation split out of the training data.
# The History object is kept so the per-epoch curves can be inspected or plotted later.
history = model.fit(
    [train_x_genres, train_x_gender, train_x_age, train_x_occupationid], train_y,
    epochs=100,
    validation_data=([test_x_genres, test_x_gender, test_x_age, test_x_occupationid], test_y),
    batch_size=256, shuffle=True,
    callbacks=[early_stopping]
)

Train on 750156 samples, validate on 250053 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100


<tensorflow.python.keras.callbacks.History at 0x7fd7198b07b8>