In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import datetime
import tensorflow.keras.backend as K
from tensorflow.keras import layers
from tensorflow.keras import Model
from tensorflow.keras import optimizers
from tensorflow.keras import metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler,StandardScaler


class SENET(layers.Layer):
    """Squeeze-and-excitation style re-weighting of per-field embeddings.

    The flat input is viewed as (batch, field_size, emb_size). Each field is
    averaged over its embedding axis, the per-field means go through a
    two-layer ReLU bottleneck (field_size // r units, then field_size units),
    and the resulting non-negative weights rescale every field's embedding.

    Args:
        field_size: number of embedding fields in the input.
        emb_size: width of each field's embedding.
        r: reduction ratio of the bottleneck layer.
    """

    def __init__(self, field_size, emb_size, r):
        super().__init__()
        self.f = field_size
        self.m = emb_size
        l2 = tf.keras.regularizers.l2
        # Squeeze to field_size // r units, then expand back to one weight per field.
        self.MLP1 = layers.Dense(units=field_size // r, activation='relu', kernel_regularizer=l2(0.01))
        self.MLP2 = layers.Dense(units=field_size, activation='relu', kernel_regularizer=l2(0.01))

    def call(self, inputs):
        """Return the re-weighted embeddings, flattened to (batch, field_size * emb_size)."""
        fields = tf.reshape(inputs, shape=[-1, self.f, self.m])
        # One summary statistic per field: the mean over its embedding axis.
        field_means = tf.reshape(tf.reduce_mean(fields, axis=2), shape=[-1, self.f])
        field_weights = self.MLP2(self.MLP1(field_means))
        # Broadcast each field's weight across that field's embedding.
        reweighted = fields * tf.reshape(field_weights, shape=[-1, self.f, 1])
        return tf.reshape(reweighted, shape=[-1, self.f * self.m])


class ResNet(layers.Layer):
    """Two dense layers wrapped in an additive skip connection.

    Args:
        hidden_unit: width of the inner ReLU layer.
        dim_stack: width of the linear output layer; it is added to the input,
            so it has to be compatible with the input's last dimension.
    """

    def __init__(self, hidden_unit, dim_stack):
        super().__init__()
        l2 = tf.keras.regularizers.l2
        self.layer1 = layers.Dense(units=hidden_unit, activation='relu', kernel_regularizer=l2(0.01))
        self.layer2 = layers.Dense(units=dim_stack, activation=None, kernel_regularizer=l2(0.01))

    def call(self, inputs):
        """Return ``layer2(layer1(inputs)) + inputs``."""
        residual = self.layer2(self.layer1(inputs))
        return residual + inputs

class DeepFM(Model):
    """DeepFM over label-encoded sparse features.

    Every input column is looked up in its own embedding table of width ``k``.
    The concatenated embeddings feed both a factorization-machine part
    (linear term plus second-order interactions) and an optional deep tower;
    the two logits are summed and passed through a sigmoid.

    Args:
        spare_feature_columns: list of feature descriptors; each must provide
            a ``'vocabulary_size'`` entry used to size its embedding table.
        k: embedding width, also used as the FM latent dimension.
        w_reg: L2 coefficient for the FM linear weights ``w``.
        v_reg: L2 coefficient for the FM interaction weights ``v``.
        hidden_units: iterable of hidden widths for the deep tower; may be
            empty, in which case the tower is only the final dense layer.
        output_dim: width of the deep tower's output layer.
        activation: activation applied after each hidden block.
        drop_out: dropout rate applied after each hidden block.
        Use_DNN: build a plain dense tower.
        Use_Res: build a residual tower instead. Takes precedence over
            ``Use_DNN``. Previously the plain tower silently replaced the
            residual one whenever ``Use_DNN`` kept its default of True.
            When both flags are False the model is FM-only.
    """

    def __init__(self, spare_feature_columns, k, w_reg, v_reg, hidden_units, output_dim, activation, drop_out,Use_DNN=True,Use_Res=False):
        super(DeepFM, self).__init__()
        self.spare_feature_columns = spare_feature_columns
        self.w_reg = w_reg
        self.v_reg = v_reg
        self.k = k
        self.Use_DNN = Use_DNN
        self.Use_Res = Use_Res

        # One embedding table of width k per sparse feature.
        self.embedding_layer = {'embed_layer{}'.format(i): layers.Embedding(feat['vocabulary_size'], self.k)
                                for i, feat in enumerate(self.spare_feature_columns)}

        # Width of the concatenated embedding vector fed to FM and the deep tower.
        self.onedim = self.k * len(self.spare_feature_columns)

        # Exactly one tower is built; None means the model is FM-only.
        if self.Use_Res:
            self.DNN = self._build_tower(hidden_units, output_dim, activation, drop_out, residual=True)
        elif self.Use_DNN:
            self.DNN = self._build_tower(hidden_units, output_dim, activation, drop_out, residual=False)
        else:
            self.DNN = None

    def _build_tower(self, hidden_units, output_dim, activation, drop_out, residual):
        """Stack [block -> BatchNorm -> activation -> Dropout] per hidden width, then a linear head."""
        tower = tf.keras.Sequential()
        for hidden in hidden_units:
            if residual:
                # The skip connection adds the block output to its input, and
                # call() feeds the tower a tensor of width onedim, so the
                # block must also emit onedim features.
                tower.add(ResNet(hidden, self.onedim))
            else:
                tower.add(layers.Dense(hidden, kernel_regularizer=tf.keras.regularizers.l2(0.01)))
            tower.add(layers.BatchNormalization())
            tower.add(layers.Activation(activation))
            tower.add(layers.Dropout(drop_out))
        tower.add(layers.Dense(output_dim, activation=None))
        return tower

    def build(self, input_shape):
        """Create the FM parameters: bias ``b``, linear weights ``w`` and interaction weights ``v``."""
        self.b = self.add_weight(name='b', shape=(1,), initializer=tf.zeros_initializer(), trainable=True, )
        self.w = self.add_weight(name='w', shape=(self.onedim, 1), initializer=tf.random_normal_initializer(), trainable=True, regularizer=tf.keras.regularizers.l2(self.w_reg))
        self.v = self.add_weight(name='v', shape=(self.onedim, self.k), initializer=tf.random_normal_initializer(), trainable=True, regularizer=tf.keras.regularizers.l2(self.v_reg))

    def call(self, inputs, training=None, mask=None):
        """Return sigmoid(FM logit [+ deep logit]) for a batch of encoded feature rows.

        ``inputs`` is indexed as ``inputs[:, i]``, one integer-coded column
        per entry of ``spare_feature_columns``.
        """
        sparse_inputs = inputs

        # Look up each column in its own table and concatenate: (batch, n_fields * k).
        sparse_embed = tf.concat([self.embedding_layer['embed_layer{}'.format(i)](sparse_inputs[:, i]) for i in range(sparse_inputs.shape[1])], axis=1)

        # FM and the deep tower share the same embeddings.
        FM_x = sparse_embed

        # FM part
        linear_part = tf.matmul(FM_x, self.w) + self.b  # (batch, 1)
        inter_cross1 = tf.square(FM_x @ self.v)  # (batch, k): square of the sum
        inter_cross2 = tf.matmul(tf.pow(FM_x, 2), tf.pow(self.v, 2))  # (batch, k): sum of the squares
        cross_part = 0.5 * tf.reduce_sum(inter_cross1 - inter_cross2, axis=1, keepdims=True)  # (batch, 1)
        fm_output = linear_part + cross_part

        if self.DNN is None:
            # Both Use_DNN and Use_Res are off: FM-only prediction.
            return tf.nn.sigmoid(fm_output)

        # Deep part: (batch, output_dim).
        dnn_out = self.DNN(FM_x, training=training)
        return tf.nn.sigmoid(fm_output + dnn_out)




In [2]:
def sparseFeature(feat, vocabulary_size, embed_dim):
    """Bundle a categorical feature's name, vocabulary size and embedding width into a dict."""
    return dict(feat=feat, vocabulary_size=vocabulary_size, embed_dim=embed_dim)

def denseFeature(feat):
    """Wrap a numeric feature's name in a single-key descriptor dict."""
    return dict(feat=feat)

def Focal_Loss(y_true, y_pred):
    """Class-weighted, focal-style binary cross-entropy.

    The positive-class term is weighted 0.7 and scaled by ``(1 - y_pred)``;
    the negative-class term is weighted 0.3 and scaled by ``y_pred``. Both
    modulating factors use an exponent of 1.0. A 1e-10 offset keeps the
    logarithms finite. The result is averaged over the last axis.
    """
    y_pred = tf.convert_to_tensor(y_pred)
    y_true = tf.cast(y_true, y_pred.dtype)
    positive_term = 0.7*K.pow(1.0 - y_pred,1.0)*y_true * K.log(y_pred + 1e-10)
    negative_term = 0.3*K.pow(y_pred,1.0)*(1.0 - y_true) * K.log(1.0 - y_pred + 1e-10)
    weighted_log_likelihood = positive_term + negative_term
    return -K.mean(weighted_log_likelihood, axis=-1)

In [3]:
# Notebook display settings.
pd.set_option('display.max_columns', None)  # show every column
pd.set_option('display.max_rows', None)  # show every row
pd.set_option('display.expand_frame_repr', False)  # do not wrap wide frames onto several lines
pd.set_option('display.max_colwidth', 100)  # cap the displayed width of a column at 100 characters

# Users: bucket the raw age into labelled ranges and drop the zip code.
header = ['user_id', 'age', 'gender', 'occupation', 'zip_code']
df_user = pd.read_csv('data/MovieLens/u.user', sep='|', names=header)
df_user = df_user.assign(
    age=pd.cut(
        df_user['age'],
        bins=[0, 8, 14, 16, 18, 22, 35, 50, 60, 75, 100],
        labels=['0-8', '8-14', '14-16', '16-18', '18-22', '22-35', '35-50', '50-60', '60-75', '75-100'],
    )
).drop(columns=['zip_code'])

# Items: keep the id and the genre indicator columns only.
header = ['item_id', 'title', 'release_date', 'video_release_date', 'IMDb_URL', 'unknown', 'Action', 'Adventure', 'Animation', 'Children',
        'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi',
        'Thriller', 'War', 'Western']
df_item = (
    pd.read_csv('data/MovieLens/u.item', sep='|', names=header, encoding="ISO-8859-1")
    .drop(columns=['title', 'release_date', 'video_release_date', 'IMDb_URL', 'unknown'])
)

# Ratings: drop the timestamp, then attach user and item attributes with left
# joins so every rating row is kept.
header = ['user_id', 'item_id', 'rating', 'timestamp']
df_train = (
    pd.read_csv('data/MovieLens/ua.base', sep='\t', names=header)
    .drop(columns=['timestamp'])
    .merge(df_user, on='user_id', how='left')
    .merge(df_item, on='item_id', how='left')
)

df_test = (
    pd.read_csv('data/MovieLens/ua.test', sep='\t', names=header)
    .drop(columns=['timestamp'])
    .merge(df_user, on='user_id', how='left')
    .merge(df_item, on='item_id', how='left')
)



In [4]:
# Stack the two rating files into a single frame with a fresh 0..n-1 index.
data = pd.concat(objs=[df_train, df_test], ignore_index=True)


In [5]:
# Feature columns: everything except the two ids and the target.
# age, gender and occupation are kept as features.
cols = [
    column
    for column in data.columns.values.tolist()
    if column not in ('user_id', 'item_id', 'rating')
]

In [6]:
#data.insert(loc=0, column='spare_feature', value='')

In [7]:
# for f in cols:
#     data[f] = data[f].apply(lambda x: f if x==1 else '')

In [8]:
# for f in cols:
#     data['spare_feature'] = data['spare_feature'] + data[f]

In [9]:
# Binarise the target: ratings above 3 become 1, all others 0.
# NOTE: this overwrites the column, so running the cell a second time on the
# already-binarised values would map every row to 0.
data['rating'] = (data['rating'] > 3).astype('int64')

In [10]:
# Replace each feature column with integer codes, fitting a fresh encoder per
# column on the combined train + test frame.
for f in cols:
    column_encoder = LabelEncoder()
    data[f] = column_encoder.fit_transform(data[f])

In [11]:
from sklearn.utils import shuffle

# Shuffle the rows with a fixed seed. The split further down already uses
# random_state=0, but it can only be reproduced across kernel restarts if the
# row order it receives is deterministic as well.
data = shuffle(data, random_state=0)

In [12]:
# Feature matrix and binary target.
data_X = data[cols]
data_y = data['rating'].values

# One descriptor per feature. The number of distinct values is used both as
# the vocabulary size and as the descriptor's embed_dim.
spare_feature_columns = [
    sparseFeature(
        feat=feat,
        vocabulary_size=data_X[feat].nunique(),
        embed_dim=data_X[feat].nunique(),
    )
    for feat in cols
]

# Stratified 90/10 split with a fixed seed.
train_X, test_X, train_y, test_y = train_test_split(
    data_X,
    data_y,
    test_size=0.1,
    random_state=0,
    stratify=data_y,
)

In [13]:
def trian(emb_k,units,dropout,spare_column,train_X,train_y,test_X, test_y,
          learning_rate=0.005, batch_size=1024, epochs=100):
    """Build, compile and fit a DeepFM model, then return it.

    Args:
        emb_k: embedding width / FM latent dimension passed to DeepFM as ``k``.
        units: hidden-layer widths of the deep tower (may be empty).
        dropout: dropout rate of the deep tower.
        spare_column: sparse feature descriptors passed to DeepFM.
        train_X, test_X: feature frames; their ``.values`` arrays are fed to Keras.
        train_y, test_y: binary targets.
        learning_rate: Adam step size (default keeps the previous 0.005).
        batch_size: mini-batch size (default keeps the previous 1024).
        epochs: number of training epochs (default keeps the previous 100).

    Returns:
        The fitted DeepFM model.

    Note:
        ``test_X`` / ``test_y`` are passed to ``fit`` as validation data, so
        the per-epoch validation metrics are computed on that split.
    """
    model = DeepFM(spare_feature_columns = spare_column,
                   k = emb_k,
                   w_reg = 0.01,
                   v_reg = 0.001,
                   hidden_units= units,
                   output_dim = 1,
                   activation = 'relu',
                   drop_out = dropout,
                   Use_DNN=True,
                   Use_Res=False)

    # `learning_rate` is the supported keyword; the old `lr` alias is
    # deprecated and rejected by current Keras optimizers.
    adam = optimizers.Adam(learning_rate=learning_rate)

    model.compile(
        optimizer=adam,
        loss='binary_crossentropy',
        metrics=[metrics.AUC()]
    )

    model.fit(
        train_X.values, train_y,
        validation_data=(test_X.values, test_y),
        batch_size=batch_size,
        epochs=epochs,
        verbose=1
    )
    return model

In [14]:

# Hyper-parameter sweep over hidden-layer widths, embedding width and dropout.
# Each list currently holds a single candidate, so exactly one model is trained.
# `model` is rebound on every iteration, so only the last trained model is kept.
# With units == [] the deep tower is just its final Dense layer; Dropout layers
# are only added per hidden layer, so the dropout value has no effect here.
for units in [[]]:
    for k in [16]:
        for dropout in [0.7]:
            model = trian(k,units,dropout,spare_feature_columns,train_X,train_y,test_X, test_y)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100


Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100
