In [1]:
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
import matplotlib.pyplot as plt
# import seaborn as sns
import pickle
from time import time
import gc
from tqdm import tqdm, tqdm_notebook

%matplotlib inline

#每次可以输出多个变量
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

from pylab import rcParams
rcParams['figure.figsize'] = 14, 6

import warnings
warnings.filterwarnings("ignore")

#中文字体
import matplotlib
matplotlib.use('qt4agg')
#指定默认字体
matplotlib.rcParams['font.sans-serif'] = ['SimHei']
matplotlib.rcParams['font.family'] = 'sans-serif'
#解决负号'-'显示为方块的问题
matplotlib.rcParams['axes.unicode_minus'] = False

In [2]:
import tensorflow as tf
from tensorflow.contrib.layers import xavier_initializer
import numpy as np
import pandas as pd
from sklearn.metrics import log_loss, roc_auc_score
import math
import logging
from scipy.sparse.csr import csr_matrix
from scipy.sparse.lil import lil_matrix
from scipy.sparse import hstack, vstack
import matplotlib.pyplot as plt
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

#每次可以输出多个变量
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# config = tf.ConfigProto()
# config.gpu_options.allow_growth = True
# sess = tf.Session(config=config)

## tensorflow-net

In [3]:
def MLP(inp, hidden_dims):
    x = tf.layers.Dense(hidden_dims[0], kernel_initializer=tf.keras.initializers.he_normal(), dtype=tf.float32, activation=tf.nn.leaky_relu)(inp)
    x = tf.layers.BatchNormalization(dtype=tf.float32)(x)
    x = tf.nn.leaky_relu(x)
    for i, dim in enumerate(hidden_dims):
        if i > 0:
            x = tf.layers.Dense(dim, kernel_initializer=tf.keras.initializers.he_normal(), dtype=tf.float32, activation=tf.nn.leaky_relu)(x)
            x = tf.layers.BatchNormalization(dtype=tf.float32)(x)
            x = tf.nn.leaky_relu(x)
    return x

In [4]:
class DCFN:
    def __init__(self, learning_rate, embedding_size, dnn_layers, att_layer, cross_layer_num, conti_fea_cnt,
                 cate_embedding_uni_cnt_list, cate_embedding_w_list=None, fm_embedding_w=None, no_nan_w=None,
                 nan_w=None, fm_drop_outs=[1, 1]):
        self.lr = learning_rate
        self.conti_fea_cnt = conti_fea_cnt
        self.embedding_size = embedding_size
        self.fm_drop_outs = fm_drop_outs
        self.dnn_layers = dnn_layers
        self.att_layer = att_layer
        self.cross_layer_num = cross_layer_num
        # cate_embedding_uni_cnt_list离散特征计数
        self.cate_embedding_uni_cnt_list = cate_embedding_uni_cnt_list
        self.cate_embedding_w_list = cate_embedding_w_list

        self.fm_embedding_w = fm_embedding_w
        self.no_nan_w = no_nan_w
        self.nan_w = nan_w

        self.build()

    def build(self):
        self.graph = tf.Graph()
        with self.graph.as_default():
            config = tf.ConfigProto()
            config.gpu_options.allow_growth = True
            self.sess = tf.Session(config=config)

            self.input_vecs = []

            self.conti_vec = tf.placeholder(tf.float32, shape=[None, self.conti_fea_cnt], name='conti_vec')
            self.cate_indexs = tf.placeholder(tf.int16, shape=[None, sum(self.cate_embedding_uni_cnt_list)],
                                              name='cate_indexs')
            self.label = tf.placeholder(dtype=tf.int8, shape=[None, 1], name='label')

            self.cate_embeddings = []
            self.fm_fea_size = 0

            # 第一层embedding：降维
            cate_offset = 0
            for cate_idx, uni_cnt in enumerate(self.cate_embedding_uni_cnt_list):
                w = self.cate_embedding_w_list[cate_idx] if self.cate_embedding_w_list else tf.keras.initializers.he_normal()
                embedding_k = uni_cnt if int(2 * np.power(uni_cnt, 1 / 4)) > uni_cnt else int(
                    2 * np.power(uni_cnt, 1 / 4))
                self.fm_fea_size += embedding_k
                # embedding矩阵
                self.cate_embeddings.append(
                    tf.get_variable('cate_%d_embedding' % cate_idx, shape=[uni_cnt, embedding_k], dtype=tf.float32,
                                    initializer=w))

                crt_vec_index = self.cate_indexs[:, cate_offset:cate_offset + uni_cnt]  # None * uni_cnt
                cate_offset += uni_cnt
                crt_vec_index = tf.Print(crt_vec_index, [crt_vec_index], message='Debug:', summarize=50)

                crt_vec = tf.nn.embedding_lookup(self.cate_embeddings[cate_idx],
                                                 [i for i in range(uni_cnt)])  # uni_cnt * K
                # 等于1的加起来，求平均（embedding相当于多行相加，multi-hot要除1的个数保证一致）
                crt_vec = tf.matmul(tf.cast(crt_vec_index, tf.float32), crt_vec)  # None * K
                one_cnt = tf.cast(tf.reduce_sum(crt_vec_index, axis=1, keep_dims=True), dtype=tf.float32)  # None * 1
                crt_vec = tf.div(crt_vec, one_cnt)  # None * K
                self.input_vecs.append(crt_vec)

            mv_conti_vec = self.conti_vec
#             with tf.variable_scope('Missing-Value-Layer'):
#                 self.no_nan_w = tf.get_variable('no_nan_w', shape=[self.conti_fea_cnt, ],
#                                                 initializer=self.no_nan_w if self.no_nan_w else tf.ones_initializer())
#                 self.nan_w = tf.get_variable('nan_w', shape=[self.conti_fea_cnt, ],
#                                                          initializer=self.nan_w if self.nan_w else tf.zeros_initializer())
#                 mv_conti_vec = tf.multiply(self.conti_vec, self.no_nan_w)
#                 conti_zero_flag = tf.cast(tf.equal(mv_conti_vec, 0), tf.float32)
#                 mv_conti_vec += tf.multiply(conti_zero_flag, tf.reshape(self.nan_w, [-1, self.nan_w.shape[0]]))

            self.input_vecs.append(mv_conti_vec)
            self.fm_fea_size += self.conti_fea_cnt

            # 准备输入-----------------------------------------------------------------------------------------------------
            fm_fea = tf.concat(self.input_vecs, axis=-1)

            self.feat_index = [i for i in range(self.fm_fea_size)]
            if self.fm_embedding_w is not None:
                self.fea_embedding = tf.Variable(self.fm_embedding_w, name='fea_embedding', dtype=tf.float32)
            else:
                self.fea_embedding = tf.get_variable('fea_embedding', shape=[self.fm_fea_size, self.embedding_size],
                                                     initializer=tf.keras.initializers.he_normal(), dtype=tf.float32)
            # FM一阶部分权重
            self.feature_bias = tf.get_variable('fea_bias', shape=[self.fm_fea_size, 1],
                                                initializer=tf.keras.initializers.he_normal(), dtype=tf.float32)
            # attention部分权重
            self.attention_h = tf.Variable(np.random.normal(loc=0, scale=1, size=[self.att_layer,]), 
                                           dtype=np.float32, name='attention_h')
            self.attention_p = tf.Variable(np.ones([self.embedding_size, ], dtype=np.float32), 
                                           dtype=tf.float32, name='attention_p')
            # cross部分权重
            self.cross_w = [tf.get_variable(name='cross_weight_%d' % i, shape=[self.fm_fea_size, 1],
                                            initializer=tf.keras.initializers.he_normal(), dtype=tf.float32) for i in
                            range(self.cross_layer_num)]
            self.cross_b = [tf.get_variable(name='cross_bias_%d' % i, shape=[self.fm_fea_size, 1],
                                            initializer=tf.keras.initializers.he_normal(), dtype=tf.float32) for i in
                            range(self.cross_layer_num)]

            # 构造输入
            # 第二层embedding：潜在隐变量
            embeddings = tf.nn.embedding_lookup(self.fea_embedding, self.feat_index)  # None * F * K
            feat_value = tf.reshape(fm_fea, shape=[-1, self.fm_fea_size, 1])
            embeddings = tf.multiply(embeddings, feat_value)  # None * F * K
#             print(embeddings)
#             embeddings = tf.Print(embeddings, [embeddings], message='Debug:', summarize=30)

            # 搭建网络-----------------------------------------------------------------------------------------------------
            # FM部分
            with tf.variable_scope('FM-part'):
                # first order term:输入为原始sparse features
                y_first_order = tf.nn.embedding_lookup(self.feature_bias, self.feat_index)  # None * F * 1
                y_first_order = tf.reduce_sum(tf.multiply(y_first_order, feat_value), 2)  # None * F(对1、2维求和都可以)
                y_first_order = tf.nn.dropout(y_first_order, self.fm_drop_outs[0])  # None * F
                # second order term:输入为dense embedding
                summed_features_emb = tf.reduce_sum(embeddings, 1)  # None * K
                summed_features_emb_square = tf.square(summed_features_emb)  # None * K
                squared_features_emb = tf.square(embeddings)
                squared_sum_features_emb = tf.reduce_sum(squared_features_emb, 1)  # None * K
                y_second_order = 0.5 * tf.subtract(summed_features_emb_square, squared_sum_features_emb)  # None * K
                y_second_order = tf.nn.dropout(y_second_order, self.fm_drop_outs[1])  # None * K
#                 # second order term:加入attention
#                 # Pair-wise Interation Layer
#                 element_wise_product = None
#                 for i in range(0, self.fm_fea_size):
#                     if element_wise_product is None:
#                         element_wise_product = tf.multiply(tf.gather(embeddings, [i], axis=1), 
#                                                            embeddings[:, i+1:self.fm_fea_size, :])
#                     else:
#                         element_wise_product = tf.concat([element_wise_product,
#                                                          tf.multiply(tf.gather(embeddings, [i], axis=1), 
#                                                                      embeddings[:, i+1:self.fm_fea_size, :])],
#                                                          axis=1) # None * (F*(F-1))/2 * K
#                 # Attention-based Pooling Layer
#                 attention_mul = tf.layers.Dense(self.att_layer)(element_wise_product)
#                 attention_exp = tf.exp(tf.reduce_sum(tf.multiply(self.attention_h, tf.nn.relu(attention_mul)),
#                                                      2, keep_dims=True))  # None * (H*(H-1)) * 1
#                 attention_sum = tf.reduce_sum(attention_exp, 1, keep_dims=True)  # None * 1 * 1
#                 attention_out = tf.div(attention_exp, attention_sum, name='attention_out')  #  None * (H*(H-1)) * 1
#                 y_second_order = tf.reduce_sum(tf.multiply(attention_out, element_wise_product), 1, name='afm')  # None * K
#                 y_second_order= tf.multiply(y_second_order, self.attention_p)  # None * K
#                 y_second_order = tf.nn.dropout(y_second_order, self.fm_drop_outs[1])  # None * K
    
            # Cross Layer部分
            with tf.variable_scope('Cross-part'):
                x_0 = feat_value
                x_l = x_0
                for l in range(self.cross_layer_num):
                    x_l = tf.tensordot(tf.matmul(x_0, x_l, transpose_b=True), self.cross_w[l], 1) + self.cross_b[
                        l] + x_l
                cross_output = tf.reshape(x_l, shape=[-1, self.fm_fea_size])

            # DNN部分
            with tf.variable_scope('Deep-part'):
                y_deep = tf.reshape(embeddings, shape=[-1, self.fm_fea_size * self.embedding_size])  # None*(F*K)
                y_deep = MLP(y_deep, self.dnn_layers)

                # 合并
            print('y_deep:{}, cross_output:{}, y_first_order:{}, y_second_order:{}'
                  .format(y_deep, cross_output, y_first_order, y_second_order))
#             last_input = tf.concat([y_deep], axis=-1) # DNN
#             last_input = tf.concat([y_first_order, y_second_order], axis=-1) # FM
#             last_input = tf.concat([y_deep, y_first_order, y_second_order], axis=-1) # DeepFM
            last_input = tf.concat([y_deep, cross_output], axis=-1) # DCN
#             last_input = tf.concat([y_deep, y_first_order, y_second_order, cross_output], axis=-1)  # DCFN

            self.y_pre = tf.layers.Dense(1, activation=tf.nn.sigmoid, kernel_initializer=tf.keras.initializers.he_normal())(
                last_input)  # 二分类
#             self.y_pre = tf.layers.Dense(5, activation=tf.nn.softmax, kernel_initializer=tf.keras.initializers.he_normal())(last_input) # 多分类

            # 损失函数(二分类交叉熵等同于logloss)
            self.loss = tf.losses.log_loss(self.label, self.y_pre)  # 二分类
#             self.loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=tf.one_hot(tf.cast(self.label, tf.int32), 5), logits=self.y_pre)) # 多分类

            # 优化方法
            self.opt = tf.train.AdamOptimizer(self.lr).minimize(self.loss)
            self.saver = tf.train.Saver()

    def save_model(self, model_path):
        self.saver.save(self.sess, model_path)

    def load_model(self, model_path):
        self.saver.restore(self.sess, model_path)

    def shuffle_csr_and_list(self, my_array, rng_state):
        np.random.set_state(rng_state)
        if type(my_array) == csr_matrix:
            index = np.arange(np.shape(my_array)[0])
            np.random.shuffle(index)
            print('shuffle csr_matrix ' + str(my_array.shape))
            return my_array[index, :]
        else:
            np.random.shuffle(my_array)
            return my_array

    def shuffle(self, cate_feas, conti_feas, labels):
        rng_state = np.random.get_state()
        cate_feas = self.shuffle_csr_and_list(cate_feas, rng_state)
        conti_feas = self.shuffle_csr_and_list(conti_feas, rng_state)
        labels = self.shuffle_csr_and_list(labels, rng_state)
        return cate_feas, conti_feas, labels

    def get_feed_dict(self, cate_feas, conti_feas, labels=None):
        feed_dict = {
            self.conti_vec: conti_feas,
            self.cate_indexs: cate_feas.todense(),
        }
        if labels is not None:
            feed_dict[self.label] = labels
        return feed_dict

    def gene_data(self, cate_feas, conti_feas, labels, bs, shuffle=False):
        if shuffle:
            cate_feas, conti_feas, labels = self.shuffle(cate_feas, conti_feas, labels)
        bm = math.ceil(cate_feas.shape[0] / bs)
        for j in range(bm):
            a = cate_feas[j * bs:(j + 1) * bs]
            b = conti_feas[j * bs:(j + 1) * bs]
            c = labels[j * bs:(j + 1) * bs]
            yield a, b, c

    def gene_balance_data(self, cate_feas, conti_feas, labels, bs, shuffle=False):
        pos_flag = np.array([l[0] == 1 for l in labels])
        pos_indexing, neg_indexing = np.arange(len(labels))[pos_flag], np.arange(len(labels))[~pos_flag]
        np.random.shuffle(neg_indexing)

        bm = math.ceil(sum(~pos_flag) / bs)
        for j in range(bm):
            need_cnt = int(bs / 2)
            crt_indexing = np.random.choice(pos_indexing, need_cnt).tolist() + neg_indexing[
                                                                               j * need_cnt:(j + 1) * need_cnt].tolist()

            a = cate_feas[crt_indexing, :]
            b = np.take(conti_feas, crt_indexing, axis=0)
            c = np.take(labels, crt_indexing, axis=0)
            yield a, b, c

    def fit(self, model_path, batch_size, epoch, cate_feas, conti_feas, labels, v_cate_feas, v_conti_feas, v_labels,
            es=5):
        print('start training ---------------------------------------------------')
        logging.info('start train')
        with self.graph.as_default():
            self.sess.run(tf.global_variables_initializer())
            best_auc = 0.0
            no_num = 0
            writer = tf.summary.FileWriter('./logs', self.sess.graph)
            for i in range(epoch):
                t1 = time()
                epoch_losses = []
                for cate_feas_batch, conti_feas_batch, labels_batch in self.gene_data(cate_feas, conti_feas,
                                                                                      labels, batch_size,
                                                                                      shuffle=False):
                    feed = self.get_feed_dict(cate_feas_batch, conti_feas_batch, labels_batch)
                    loss, _ = self.sess.run([self.loss, self.opt], feed_dict=feed)
                    epoch_losses.append(loss)

                v_loss, v_auc = self.eval(batch_size, v_cate_feas, v_conti_feas, v_labels)
                t_loss = np.mean(np.array(epoch_losses))
                logging.info('epoch: %s---train loss %.4f---valid loss: %.4f---valid auc: %.4f'
                             % ((i + 1), t_loss, v_loss, v_auc))
                print('epoch: %s---train loss %.4f---valid loss: %.4f---valid auc: %.4f [%.1f s]'
                      % ((i + 1), t_loss, v_loss, v_auc, time() - t1))
                if v_auc > best_auc:
                    no_num = 0
                    self.save_model(model_path)
                    logging.info('---------- auc from %.4f to %.4f, saving model' % (best_auc, v_auc))
                    print('---------- auc from %.4f to %.4f, saving model' % (best_auc, v_auc))
                    best_auc = v_auc
                else:
                    no_num += 1
                    self.lr = self.lr / 5
                    if no_num >= es:
                        break

    def eval(self, batch_size, cate_feas, conti_feas, labels):
        with self.graph.as_default():
            y_pre = []
            for cate_feas_batch, conti_feas_batch, label_batch in self.gene_data(cate_feas, conti_feas, labels,
                                                                                 batch_size, shuffle=False):
                feed = self.get_feed_dict(cate_feas_batch, conti_feas_batch, label_batch)
                y_ = self.sess.run([self.y_pre], feed_dict=feed)[0]
                y_pre += y_.tolist()
            y_pre = np.array(y_pre)
            #             print(y_pre)
            y_pre = np.reshape(y_pre, (y_pre.shape[0],))
            labels = np.reshape(labels, (labels.shape[0],))
            loss = log_loss(labels, y_pre)
            auc = roc_auc_score(labels, y_pre)
            return loss, auc

    def predict(self, cate_feas, conti_feas, batch_size):
        def gd(cate_feas, conti_feas, bs):
            bm = math.ceil(len(conti_feas) / bs)
            for j in range(bm):
                a = cate_feas[j * bs: (j + 1) * bs]
                b = conti_feas[j * bs: (j + 1) * bs]
                yield a, b

        with self.graph.as_default():
            y_pre = []
            for cate_feas_batch, conti_feas_batch in gd(cate_feas, conti_feas, batch_size):
                feed = self.get_feed_dict(cate_feas_batch, conti_feas_batch)
                y_ = self.sess.run([self.y_pre], feed_dict=feed)[0]
                y_pre += y_.tolist()
            y_pre = np.array(y_pre)
            y_pre = np.reshape(y_pre, (y_pre.shape[0],))
            return y_pre

    def embedding_weights(self):
        cate_embeddings, fea_embedding = self.sess.run([self.cate_embeddings, self.fea_embedding])
        return cate_embeddings, fea_embedding

    def miss_value_layer_w(self):
        nan_embeddings, no_nan_embedding = self.sess.run([self.nan_w, self.no_nan_w])
        return nan_embeddings, no_nan_embedding

## dataset

In [6]:
# from py2ifttt import IFTTT

# # 这里要填你之前的url后面的那段英文字母，然后 event_name 名字也要和你之前的一致
# ifttt = IFTTT('your key str', 'event_name')
# # 执行这句话，你就会收到推送
# ifttt.notify('value1', 'value2', 'value3')

In [7]:
m_cols = ['movie_id', 'title', 'genres']
u_cols = ['user_id', 'gender', 'age', 'occupation', 'zip_code']
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']

movies = pd.read_csv('./dataset/movieLens/movies.dat', sep='::', names=m_cols, encoding='latin-1')
users = pd.read_csv('./dataset/movieLens/users.dat', sep='::', names=u_cols, encoding='latin-1')
ratings = pd.read_csv('./dataset/movieLens/ratings.dat', sep='::', names=r_cols, encoding='latin-1')

movies.shape
movies.head()
users.shape
users.head()
ratings.shape
ratings.head()

(3883, 3)

Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


(6040, 5)

Unnamed: 0,user_id,gender,age,occupation,zip_code
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


(1000209, 4)

Unnamed: 0,user_id,movie_id,rating,unix_timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [8]:
def split_genres(x):
    return x.split('|')

movies['split_genres'] = movies.genres.apply(split_genres)
movies['len_genres'] = movies.split_genres.apply(lambda x:len(x))

for i in range(6):
    movies['genres' + str(i)] = movies.split_genres.apply(lambda x:x.pop() if len(x)>0 else None)
movies['split_genres'] = movies.genres.apply(split_genres)

In [9]:
movie_genres = list(movies.genres0) + list(movies.genres1) + list(movies.genres2) + list(movies.genres3) + list(movies.genres4) + list(movies.genres5)
movie_genres = list(set(movie_genres))
movie_genres.remove(None)
len(movie_genres)

18

In [10]:
def multi_hot(x):
    tmp = np.zeros(len(movie_genres))
    for i, genre in enumerate(movie_genres):
        if genre in x:
            tmp[i] = 1
    return list(map(int, tmp))

movies['genres_multi_hot'] = movies.split_genres.apply(multi_hot)

In [11]:
for i in range(len(movies.genres_multi_hot[0])):
    movies['genres_multi_hot_' + str(i)] = movies.genres_multi_hot.apply(lambda x:x.pop() if len(x)>0 else None)

In [12]:
# movies = movies[['movie_id', 'title', 'genres', 'len_genres', 'genres_multi_hot']]
movies = movies.drop(['title','genres','split_genres','genres0','genres1','genres2','genres3','genres4','genres5','genres_multi_hot'], axis=1)
movies.head()

Unnamed: 0,movie_id,len_genres,genres_multi_hot_0,genres_multi_hot_1,genres_multi_hot_2,genres_multi_hot_3,genres_multi_hot_4,genres_multi_hot_5,genres_multi_hot_6,genres_multi_hot_7,genres_multi_hot_8,genres_multi_hot_9,genres_multi_hot_10,genres_multi_hot_11,genres_multi_hot_12,genres_multi_hot_13,genres_multi_hot_14,genres_multi_hot_15,genres_multi_hot_16,genres_multi_hot_17
0,1,3,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,1,0,0
1,2,3,0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0
2,3,2,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1
3,4,2,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0
4,5,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [13]:
def encode_feature(values):
    uniq = values.unique()
    mapping = dict(zip(uniq,range(1,len(uniq) + 1)))
    return values.map(mapping)

# movies.title = encode_feature(movies.title)
users.gender = encode_feature(users.gender)

In [14]:
df = ratings[['user_id', 'movie_id', 'rating']].merge(movies, on=['movie_id'], how='left')
df = df.merge(users[['user_id', 'gender', 'age', 'occupation']], on=['user_id'], how='left')

In [15]:
for i in ['gender', 'age', 'occupation']:
    df = pd.concat([df, pd.get_dummies(df[i], prefix=i)], axis=1)
    df = df.drop([i], axis=1)

In [16]:
df.groupby(['rating']).user_id.count()

rating
1     56174
2    107557
3    261197
4    348971
5    226310
Name: user_id, dtype: int64

In [17]:
del movies
del users
del ratings
gc.collect()

134

In [18]:
df['rating_2'] = df.rating.apply(lambda x:0 if x<4 else 1)

In [19]:
df = df.astype('int16')

df.shape
df.head()

(1000209, 53)

Unnamed: 0,user_id,movie_id,rating,len_genres,genres_multi_hot_0,genres_multi_hot_1,genres_multi_hot_2,genres_multi_hot_3,genres_multi_hot_4,genres_multi_hot_5,genres_multi_hot_6,genres_multi_hot_7,genres_multi_hot_8,genres_multi_hot_9,genres_multi_hot_10,genres_multi_hot_11,genres_multi_hot_12,genres_multi_hot_13,genres_multi_hot_14,genres_multi_hot_15,genres_multi_hot_16,genres_multi_hot_17,gender_1,gender_2,age_1,age_18,age_25,age_35,age_45,age_50,age_56,occupation_0,occupation_1,occupation_2,occupation_3,occupation_4,occupation_5,occupation_6,occupation_7,occupation_8,occupation_9,occupation_10,occupation_11,occupation_12,occupation_13,occupation_14,occupation_15,occupation_16,occupation_17,occupation_18,occupation_19,occupation_20,rating_2
0,1,1193,5,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1
1,1,661,3,3,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
2,1,914,3,2,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
3,1,3408,4,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1
4,1,2355,5,3,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1


In [20]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df.drop(['rating', 'rating_2'], axis=1), df.rating_2, test_size=0.2, random_state =1024)
X_train.reset_index(drop=True, inplace=True)
X_test.reset_index(drop=True, inplace=True)
y_train.reset_index(drop=True, inplace=True)
y_test.reset_index(drop=True, inplace=True)
print(X_train.shape, X_test.shape)

(800167, 51) (200042, 51)


## train

In [21]:
import re
pattern = re.compile(r'[a-z]_[0-9]+')
conti_feas = ['len_genres']
cate_feas = ['user_id', 'movie_id', 'genres_multi_hot', 'gender', 'age', 'occupation']
cate_long_feas = ['user_id', 'movie_id']

cate_embedding_uni_cnt = {'user_id':6040, 'movie_id':3952, 'genres_multi_hot':18, 'gender':2, 'age':7, 'occupation':21}
cate_embedding_uni_cnt_list = [cate_embedding_uni_cnt[i] for i in cate_feas]

In [22]:
def build_ori_cate_feas(data_list, cate_feas):
#     col_num = sum([cate_embedding_uni_cnt[i] for i in cate_feas])
    format_data_list = []
    offset = 0
    for ii, i in enumerate(cate_feas):
        rows, cols, data = [], [], []
        tmp_len = max(data_list[i])
        for j in range(data_list.shape[0]):
            if(df[i][j] != 0):
                rows += [j]
                cols += [data_list[i][j]-1]
#                 cols += [df[i][j]-1+offset]
                data += [1]
        if ii == 0:
            tmp_csr = csr_matrix((data, (rows, cols)), shape=(data_list.shape[0], tmp_len))
        else:
            tmp_csr = hstack([tmp_csr, csr_matrix((data, (rows, cols)), shape=(data_list.shape[0], tmp_len))])
#         offset += tmp_len
#         format_data_list.append(csr_matrix((data, (rows, cols)), shape=(data_list.shape[0], col_num)))
    return tmp_csr

In [23]:
from sklearn.preprocessing import StandardScaler
conti_cols = conti_feas
data = pd.concat([X_train[conti_cols], X_test[conti_cols]])
for fea in conti_cols:
    scaler_val = data[fea][~data[fea].isnull()].values
    scaler = StandardScaler().fit(scaler_val.reshape((len(scaler_val), 1)))
    data[fea].fillna(scaler.mean_[0], inplace=True)
    data[fea] = scaler.transform(data[fea].values.reshape((len(data), 1))).reshape((len(data),)).tolist()

In [25]:
print('loading conti data...')
# train_conti_feas, val_conti_feas = X_train[[]].as_matrix(), X_test[[]].as_matrix()
# train_conti_feas, val_conti_feas = data[:X_train.shape[0]].as_matrix(), data[X_train.shape[0]:].as_matrix()
print('train conti feas shape: {}, val conti feas shape: {}'.format(np.shape(train_conti_feas),
                                                                    np.shape(val_conti_feas)))

print('loading ori cate data...')
# train_cate_csr = build_ori_cate_feas(X_train, cate_long_feas)
# train_cate_csr = hstack([train_cate_csr, csr_matrix(X_train.iloc[:, -sum([cate_embedding_uni_cnt[i] for i in list(set(cate_feas)-set(cate_long_feas))]):].as_matrix())])
# train_cate_csr = csr_matrix(train_cate_csr)
# val_cate_csr = build_ori_cate_feas(X_test, cate_long_feas)
# val_cate_csr = hstack([val_cate_csr, csr_matrix(X_test.iloc[:, -sum([cate_embedding_uni_cnt[i] for i in list(set(cate_feas)-set(cate_long_feas))]):].as_matrix())])
# val_cate_csr = csr_matrix(val_cate_csr)
print('train cate shape:{}, val cate shape:{}'.format(train_cate_csr.shape, val_cate_csr.shape))

print('training...')
model_name = 'movieLens'
cate_embedding_w_list, fm_embedding_w, no_nan_w, nan_w = None, None, None, None

dcfn_params = {
    'learning_rate': 0.001,
    'embedding_size': 8,
    'dnn_layers': [2048, 512, 128],
    'att_layer': 8,
    'cross_layer_num': 1,
    'conti_fea_cnt': train_conti_feas.shape[1],
    'cate_embedding_uni_cnt_list': cate_embedding_uni_cnt_list,
    'cate_embedding_w_list': cate_embedding_w_list,
    'fm_embedding_w': fm_embedding_w,
    'no_nan_w': no_nan_w,
    'nan_w': nan_w,
    'fm_drop_outs': [0.5, 0.5]
}
model = DCFN(**dcfn_params)

fit_params = {
    'model_path': './model/nn/dcfm_%s.ckpt' % model_name,
    'batch_size': 1024,
    'epoch': 100,
    'cate_feas': train_cate_csr,
    'conti_feas': train_conti_feas,
    'labels': y_train.values.reshape(-1, 1),
    'v_cate_feas': val_cate_csr,
    'v_conti_feas': val_conti_feas,
    'v_labels': y_test.values.reshape(-1, 1),
    'es': 2
}
model.fit(**fit_params)
# cate_w, fm_em_w = model.embedding_weights()
# nan_w, no_nan_w = model.miss_value_layer_w()
# np.array(cate_w).dump('./model/nn/%s_cate_w.np' % model_name)
# np.array(fm_em_w).dump('./model/nn/%s_fm_em_w.np' % model_name)
# np.array(nan_w).dump('./model/nn/%s_nan_w.np' % model_name)
# np.array(no_nan_w).dump('./model/nn/%s_no_nan_w.np' % model_name)

loading conti data...
train conti feas shape: (800167, 1), val conti feas shape: (200042, 1)
loading ori cate data...
train cate shape:(800167, 10040), val cate shape:(200042, 10040)
training...
y_deep:Tensor("Deep-part/LeakyRelu_2/Maximum:0", shape=(?, 128), dtype=float32), cross_output:Tensor("Cross-part/Reshape:0", shape=(?, 46), dtype=float32), y_first_order:Tensor("FM-part/dropout/mul:0", shape=(?, 46), dtype=float32), y_second_order:Tensor("FM-part/dropout_1/mul:0", shape=(?, 8), dtype=float32)
start training ---------------------------------------------------
epoch: 1---train loss 0.5577---valid loss: 0.5395---valid auc: 0.7919 [326.8 s]
---------- auc from 0.0000 to 0.7919, saving model
epoch: 2---train loss 0.5313---valid loss: 0.5347---valid auc: 0.7972 [227.4 s]
---------- auc from 0.7919 to 0.7972, saving model
epoch: 3---train loss 0.5172---valid loss: 0.5291---valid auc: 0.8037 [413.5 s]
---------- auc from 0.7972 to 0.8037, saving model
epoch: 4---train loss 0.5010---val

In [None]:
# DCFN
0.001
epoch: 4---train loss 0.4972---valid loss: 0.5265---valid auc: 0.8074 [93.0 s]
---------- auc from 0.8059 to 0.8074, saving model
0.0005
epoch: 7---train loss 0.4818---valid loss: 0.5312---valid auc: 0.8073 [73.3 s]
---------- auc from 0.8068 to 0.8073, saving model

0.001+tf.keras.initializers.he_normal
epoch: 4---train loss 0.4977---valid loss: 0.5250---valid auc: 0.8081 [92.6 s]
---------- auc from 0.8073 to 0.8081, saving model

0.001+tf.keras.initializers.he_normal+tf.nn.leaky_relu
epoch: 5---train loss 0.4876---valid loss: 0.5266---valid auc: 0.8089 [100.8 s]
---------- auc from 0.8078 to 0.8089, saving model

In [None]:
# FM
epoch: 15---train loss 0.5151---valid loss: 0.5369---valid auc: 0.7981 [108.9 s]
---------- auc from 0.7974 to 0.7981, saving model

In [None]:
# Deep-FM
[512, 256]
epoch: 7---train loss 0.4737---valid loss: 0.5389---valid auc: 0.8041 [75.3 s]
---------- auc from 0.8040 to 0.8041, saving model

[1024, 512, 256, 128]
epoch: 4---train loss 0.5041---valid loss: 0.5278---valid auc: 0.8061 [75.7 s]
---------- auc from 0.8035 to 0.8061, saving model

[2048, 512, 128]
epoch: 4---train loss 0.5003---valid loss: 0.5262---valid auc: 0.8069 [128.1 s]
---------- auc from 0.8054 to 0.8069, saving model

[256, 128]
epoch: 4---train loss 0.5040---valid loss: 0.5278---valid auc: 0.8052 [80.3 s]
---------- auc from 0.8042 to 0.8052, saving model

[256, 128, 32]
epoch: 4---train loss 0.5057---valid loss: 0.5292---valid auc: 0.8048 [175.8 s]
---------- auc from 0.8027 to 0.8048, saving model

[2048, 512, 128, 32]
epoch: 4---train loss 0.5023---valid loss: 0.5276---valid auc: 0.8057 [268.1 s]
---------- auc from 0.8042 to 0.8057, saving model

In [None]:
# Cross-layer
1
epoch: 4---train loss 0.5054---valid loss: 0.5288---valid auc: 0.8053 [107.3 s]
---------- auc from 0.8024 to 0.8053, saving model

2
epoch: 6---train loss 0.4865---valid loss: 0.5329---valid auc: 0.8051 [137.2 s]
---------- auc from 0.8049 to 0.8051, saving model

In [None]:
# embedding-size
4
epoch: 2---train loss 0.5316---valid loss: 0.5344---valid auc: 0.7971 [244.2 s]
---------- auc from 0.7919 to 0.7971, saving model

16
epoch: 4---train loss 0.4992---valid loss: 0.5278---valid auc: 0.8058 [311.2 s]
---------- auc from 0.8043 to 0.8058, saving model

In [None]:
# Deep-layer
[512, 512]
epoch: 2---train loss 0.5311---valid loss: 0.5332---valid auc: 0.7976 [118.2 s]
---------- auc from 0.7919 to 0.7976, saving model

[64, 32]
epoch: 2---train loss 0.5328---valid loss: 0.5361---valid auc: 0.7952 [60.0 s]
---------- auc from 0.7908 to 0.7952, saving model

[128, 64]
epoch: 2---train loss 0.5323---valid loss: 0.5355---valid auc: 0.7956 [75.7 s]
---------- auc from 0.7913 to 0.7956, saving model
            
[128, 128]
epoch: 2---train loss 0.5293---valid loss: 0.5332---valid auc: 0.7980 [62.8 s]
---------- auc from 0.7934 to 0.7980, saving model

[256, 128]
epoch: 2---train loss 0.5287---valid loss: 0.5320---valid auc: 0.7990 [67.3 s]
---------- auc from 0.7931 to 0.7990, saving model

[256, 128, 64]
epoch: 2---train loss 0.5300---valid loss: 0.5332---valid auc: 0.7982 [70.5 s]
---------- auc from 0.7923 to 0.7982, saving model

[256, 128, 64, 32]
epoch: 2---train loss 0.5326---valid loss: 0.5348---valid auc: 0.7963 [227.4 s]
---------- auc from 0.7903 to 0.7963, saving model

[512, 256]
epoch: 2---train loss 0.5300---valid loss: 0.5332---valid auc: 0.7979 [67.0 s]
---------- auc from 0.7920 to 0.7979, saving model

[2048, 512, 128]
epoch: 4---train loss 0.5024---valid loss: 0.5245---valid auc: 0.8083 [81.2 s]
---------- auc from 0.8049 to 0.8083, saving model