In [1]:
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
import matplotlib.pyplot as plt
# import seaborn as sns
import pickle
from time import time
import gc
from tqdm import tqdm, tqdm_notebook

%matplotlib inline

#每次可以输出多个变量
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

from pylab import rcParams
rcParams['figure.figsize'] = 14, 6

import warnings
warnings.filterwarnings("ignore")

#中文字体
import matplotlib
matplotlib.use('qt4agg')
#指定默认字体
matplotlib.rcParams['font.sans-serif'] = ['SimHei']
matplotlib.rcParams['font.family'] = 'sans-serif'
#解决负号'-'显示为方块的问题
matplotlib.rcParams['axes.unicode_minus'] = False

In [2]:
import tensorflow as tf
from tensorflow.contrib.layers import xavier_initializer
import numpy as np
import pandas as pd
from sklearn.metrics import log_loss, roc_auc_score
import math
import logging
from scipy.sparse.csr import csr_matrix
from scipy.sparse.lil import lil_matrix
from scipy.sparse import hstack, vstack
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

#每次可以输出多个变量
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# config = tf.ConfigProto()
# config.gpu_options.allow_growth = True
# sess = tf.Session(config=config)

## xDeepFM

In [3]:
def MLP(inp, hidden_dims):
    x = tf.layers.Dense(hidden_dims[0], kernel_initializer=tf.keras.initializers.he_normal(), dtype=tf.float32, activation=tf.nn.relu)(inp)
    x = tf.layers.BatchNormalization(dtype=tf.float32)(x)
    x = tf.nn.relu(x)
    for i, dim in enumerate(hidden_dims):
        if i > 0:
            x = tf.layers.Dense(dim, kernel_initializer=tf.keras.initializers.he_normal(), dtype=tf.float32, activation=tf.nn.relu)(x)
            x = tf.layers.BatchNormalization(dtype=tf.float32)(x)
            x = tf.nn.relu(x)
    return x

In [4]:
def Res_Network(inp, hidden_dims):
    x = inp
    res = inp
    for i, dim in enumerate(hidden_dims):
        res = tf.layers.Dense(hidden_dims[i], kernel_initializer=tf.keras.initializers.he_normal(),
                              dtype=tf.float32, activation=tf.nn.relu, name='Dense_inp' + str(i))(res)
        res = tf.layers.BatchNormalization(dtype=tf.float32, name='BN_inp' + str(i))(res)
        res = tf.nn.relu(res)
        res = tf.layers.Dense(dim, kernel_initializer=tf.keras.initializers.he_normal(),
                              dtype=tf.float32, activation=tf.nn.relu, name='Dense_res' + str(i))(res)
        res = tf.concat([res, x], axis=1)
        x = res
        res = tf.layers.BatchNormalization(dtype=tf.float32, name='BN_res' + str(i))(res)
        res = tf.nn.relu(res)
    return res

In [5]:
class xDeepFM:
    def __init__(self, learning_rate, embedding_size, dnn_layers, cross_layers, res_layers, conti_fea_cnt,
                 cate_embedding_uni_cnt_list, cate_embedding_w_list=None, fm_embedding_w=None):
        self.lr = learning_rate
        self.embedding_size = embedding_size
        self.dnn_layers = dnn_layers
        self.cross_layers = cross_layers
        self.res_layers = res_layers
        self.conti_fea_cnt = conti_fea_cnt
        # cate_embedding_uni_cnt_list离散特征计数
        self.cate_embedding_uni_cnt_list = cate_embedding_uni_cnt_list
        self.cate_embedding_w_list = cate_embedding_w_list
        self.fm_embedding_w = fm_embedding_w
        
        self.build()

    def build(self):
        self.graph = tf.Graph()
        with self.graph.as_default():
            config = tf.ConfigProto()
            config.gpu_options.allow_growth = True
            self.sess = tf.Session(config=config)

            self.input_vecs = []

            self.conti_vec = tf.placeholder(tf.float32, shape=[None, self.conti_fea_cnt], name='conti_vec')
            self.cate_indexs = tf.placeholder(tf.int16, shape=[None, sum(self.cate_embedding_uni_cnt_list)], name='cate_indexs')
            self.label = tf.placeholder(dtype=tf.int8, shape=[None, 1], name='label')

            self.cate_embeddings = []
            self.fm_fea_size = 0

            # 第一层embedding：降维
            cate_offset = 0
            for cate_idx, uni_cnt in enumerate(self.cate_embedding_uni_cnt_list):
                w = self.cate_embedding_w_list[cate_idx] if self.cate_embedding_w_list else tf.keras.initializers.he_normal()
                embedding_k = uni_cnt if int(2 * np.power(uni_cnt, 1 / 4)) > uni_cnt else int(2 * np.power(uni_cnt, 1 / 4))
                self.fm_fea_size += embedding_k
                # embedding矩阵
                self.cate_embeddings.append(
                    tf.get_variable('cate_%d_embedding' % cate_idx, shape=[uni_cnt, embedding_k], dtype=tf.float32,
                                    initializer=w))

                crt_vec_index = self.cate_indexs[:, cate_offset:cate_offset + uni_cnt]  # None * uni_cnt
                cate_offset += uni_cnt

                crt_vec = tf.nn.embedding_lookup(self.cate_embeddings[cate_idx], [i for i in range(uni_cnt)])  # uni_cnt * K
                crt_vec = tf.matmul(tf.cast(crt_vec_index, tf.float32), crt_vec)  # None * K
                self.input_vecs.append(crt_vec)

            mv_conti_vec = self.conti_vec

            self.input_vecs.append(mv_conti_vec)
            self.fm_fea_size += self.conti_fea_cnt

            # 准备输入-----------------------------------------------------------------------------------------------------
            fm_fea = tf.concat(self.input_vecs, axis=-1)

            self.feat_index = [i for i in range(self.fm_fea_size)]
            if self.fm_embedding_w is not None:
                self.fea_embedding = tf.Variable(self.fm_embedding_w, name='fea_embedding', dtype=tf.float32)
            else:
                self.fea_embedding = tf.get_variable('fea_embedding', shape=[self.fm_fea_size, self.embedding_size],
                                                     initializer=tf.keras.initializers.he_normal(), dtype=tf.float32)

            # 构造输入
            # 第二层embedding：潜在隐变量
            embeddings = tf.nn.embedding_lookup(self.fea_embedding, self.feat_index)  # None * F * K
            feat_value = tf.reshape(fm_fea, shape=[-1, self.fm_fea_size, 1])
            embeddings = tf.multiply(embeddings, feat_value)  # None * F * K
#             print(embeddings)

            # 搭建网络-----------------------------------------------------------------------------------------------------
            # CIN部分
            with tf.variable_scope('CIN-part'):
                # step 1:x(0) dot x(k) = z(k+1)
                cin_layers = []
                field_nums = []
                final_result = []
                final_len = 0
                cin_input = tf.reshape(embeddings, [-1, self.fm_fea_size, self.embedding_size]) # None * F * K
                cin_layers.append(cin_input)
                field_nums.append(self.fm_fea_size)
                split_tensor0 = tf.split(cin_layers[0], self.embedding_size * [1], 2) # (None * F * 1) * K
                for i, layer_size in enumerate(self.cross_layers):
                    split_tensor = tf.split(cin_layers[-1], self.embedding_size * [1], 2) # (None * L(k) * 1) * K
                    dot_result = tf.matmul(split_tensor0, split_tensor, transpose_b=True) # K * None * F * L(k)
                    dot_result = tf.reshape(dot_result, shape=[self.embedding_size, -1, field_nums[0]*field_nums[-1]]) # K * None * (F * L(k))
                    dot_result = tf.transpose(dot_result, [1, 0 ,2]) # None * K * (F * L(k))
                # step 2:z(k+1) * cross_w(k+1) = x(k+1)
                    filter_k = tf.get_variable('filter_k'+str(i), shape=[1, field_nums[0]*field_nums[-1], layer_size]) # 1 * (F*L(k)) * K
                    cross_out = tf.nn.conv1d(dot_result, filter_k, stride=1, padding='VALID') # None * K * L(k+1)
                    cross_b = tf.get_variable('cross_b'+str(i), shape=[layer_size], initializer=tf.keras.initializers.he_normal(), dtype=tf.float32)
                    cross_out = tf.nn.bias_add(cross_out, cross_b) # None * K * L(k+1)
                    cross_out = tf.nn.relu(cross_out)
                    cross_out = tf.transpose(cross_out, [0, 2, 1]) # None * L(k+1) * K
                    # direct connect
#                     direct_connect = cross_out
#                     next_hidden = cross_out
#                     final_len += layer_size
#                     field_nums.append(int(layer_size))
                    # split connect
                    if i != len(self.cross_layers) - 1:
                        next_hidden, direct_connect = tf.split(cross_out, 2 * [int(layer_size / 2)], 1)
                        final_len += int(layer_size / 2)
                    else:
                        direct_connect = cross_out
                        next_hidden = 0
                        final_len += layer_size
                    field_nums.append(int(layer_size / 2))
                    final_result.append(direct_connect)
                    cin_layers.append(next_hidden)
                # step 3:sum pooling
                result = tf.concat(final_result, 1) # None * sum(layer_size) * K
                y_cin = tf.reduce_sum(result, -1) # None * sum(layer_size)

            # DNN部分
            with tf.variable_scope('Deep-part'):
                y_deep = tf.reshape(embeddings, shape=[-1, self.fm_fea_size * self.embedding_size])  # None*(F*K)
                y_deep = MLP(y_deep, self.dnn_layers)

                # 合并
            print('y_deep:{}, y_cin:{}'.format(y_deep, y_cin))
#             last_input = tf.concat([y_deep], axis=-1) # DNN
            last_input = tf.concat([y_deep, y_cin], axis=-1) # xDeepFM

            # dense
#             self.y_pre = tf.layers.Dense(1, activation=tf.nn.sigmoid,
#                                          kernel_initializer=tf.keras.initializers.he_normal())(last_input)  # 二分类
            # residual network
            with tf.variable_scope('Res-network'):
                with tf.variable_scope('Res'):
                    res = Res_Network(last_input, self.res_layers)
                with tf.variable_scope('MLP'):
                    res = MLP(res, [1024, 256, 64])
                    print(res)
                
            self.y_pre = tf.layers.Dense(1, activation=tf.nn.sigmoid,
                                         kernel_initializer=tf.keras.initializers.he_normal())(res)  # 二分类

            # 损失函数(二分类交叉熵等同于logloss)
            self.loss = tf.losses.log_loss(self.label, self.y_pre)  # 二分类

            # 优化方法
            self.opt = tf.train.AdamOptimizer(self.lr).minimize(self.loss)
            self.saver = tf.train.Saver()

    def save_model(self, model_path):
        self.saver.save(self.sess, model_path)

    def load_model(self, model_path):
        self.saver.restore(self.sess, model_path)

    def shuffle_csr_and_list(self, my_array, rng_state):
        np.random.set_state(rng_state)
        if type(my_array) == csr_matrix:
            index = np.arange(np.shape(my_array)[0])
            np.random.shuffle(index)
            print('shuffle csr_matrix ' + str(my_array.shape))
            return my_array[index, :]
        else:
            np.random.shuffle(my_array)
            return my_array

    def shuffle(self, cate_feas, conti_feas, labels):
        rng_state = np.random.get_state()
        cate_feas = self.shuffle_csr_and_list(cate_feas, rng_state)
        conti_feas = self.shuffle_csr_and_list(conti_feas, rng_state)
        labels = self.shuffle_csr_and_list(labels, rng_state)
        return cate_feas, conti_feas, labels

    def get_feed_dict(self, cate_feas, conti_feas, labels=None):
        feed_dict = {
            self.conti_vec: conti_feas,
            self.cate_indexs: cate_feas.todense(),
        }
        if labels is not None:
            feed_dict[self.label] = labels
        return feed_dict

    def gene_data(self, cate_feas, conti_feas, labels, bs, shuffle=False):
        if shuffle:
            cate_feas, conti_feas, labels = self.shuffle(cate_feas, conti_feas, labels)
        bm = math.ceil(cate_feas.shape[0] / bs)
        for j in range(bm):
            a = cate_feas[j * bs:(j + 1) * bs]
            b = conti_feas[j * bs:(j + 1) * bs]
            c = labels[j * bs:(j + 1) * bs]
            yield a, b, c

    def gene_balance_data(self, cate_feas, conti_feas, labels, bs, shuffle=False):
        pos_flag = np.array([l[0] == 1 for l in labels])
        pos_indexing, neg_indexing = np.arange(len(labels))[pos_flag], np.arange(len(labels))[~pos_flag]
        np.random.shuffle(neg_indexing)

        bm = math.ceil(sum(~pos_flag) / bs)
        for j in range(bm):
            need_cnt = int(bs / 2)
            crt_indexing = np.random.choice(pos_indexing, need_cnt).tolist() + neg_indexing[
                                                                               j * need_cnt:(j + 1) * need_cnt].tolist()

            a = cate_feas[crt_indexing, :]
            b = np.take(conti_feas, crt_indexing, axis=0)
            c = np.take(labels, crt_indexing, axis=0)
            yield a, b, c

    def fit(self, model_path, batch_size, epoch, cate_feas, conti_feas, labels, v_cate_feas, v_conti_feas, v_labels,
            es=5):
        print('start training ---------------------------------------------------')
        logging.info('start train')
        with self.graph.as_default():
            self.sess.run(tf.global_variables_initializer())
            best_f1 = 0.0  # 二分类
            no_num = 0
            writer = tf.summary.FileWriter('./logs', self.sess.graph)
            for i in range(epoch):
                t1 = time()
                epoch_losses = []
                for cate_feas_batch, conti_feas_batch, labels_batch in self.gene_data(cate_feas, conti_feas,
                                                                                      labels, batch_size,
                                                                                      shuffle=False):
                    feed = self.get_feed_dict(cate_feas_batch, conti_feas_batch, labels_batch)
                    loss, _ = self.sess.run([self.loss, self.opt], feed_dict=feed)
                    epoch_losses.append(loss)

                # 二分类
                v_loss, v_f1 = self.eval(batch_size, v_cate_feas, v_conti_feas, v_labels)
                t_loss = np.mean(np.array(epoch_losses))
                logging.info('epoch: %s---train loss %.4f---valid loss: %.4f---valid f1: %.4f'
                             % ((i + 1), t_loss, v_loss, v_f1))
                print('epoch: %s---train loss %.4f---valid loss: %.4f---valid f1: %.4f [%.1f s]'
                      % ((i + 1), t_loss, v_loss, v_f1, time() - t1))
                if v_f1 > best_f1:
                    no_num = 0
                    self.lr = self.lr * 0.8
                    self.save_model(model_path)
                    logging.info('---------- f1 from %.4f to %.4f, saving model' % (best_f1, v_f1))
                    print('---------- f1 from %.4f to %.4f, saving model' % (best_f1, v_f1))
                    best_f1 = v_f1
                else:
                    no_num += 1
                    self.lr = self.lr / 2
                    if no_num >= es:
                        break

    def eval(self, batch_size, cate_feas, conti_feas, labels):
        with self.graph.as_default():
            y_pre = []
            for cate_feas_batch, conti_feas_batch, label_batch in self.gene_data(cate_feas, conti_feas, labels,
                                                                                 batch_size, shuffle=False):
                feed = self.get_feed_dict(cate_feas_batch, conti_feas_batch, label_batch)
                y_ = self.sess.run([self.y_pre], feed_dict=feed)[0]
                y_pre += y_.tolist()
            y_pre = np.array(y_pre)
            # 二分类
            y_pre = np.reshape(y_pre, (y_pre.shape[0],))
#             print(y_pre)
            labels = np.reshape(labels, (labels.shape[0],))
            loss = log_loss(labels, y_pre)
            f1s = []
            for limit in np.arange(0.4, 0.44, 0.01):
                pred = [int(i>limit) for i in y_pre]
                f1s.append(f1_score(labels, pred))
            return loss, max(f1s)

    def predict(self, cate_feas, conti_feas, batch_size):
        def gd(cate_feas, conti_feas, bs):
            bm = math.ceil(len(conti_feas) / bs)
            for j in range(bm):
                a = cate_feas[j * bs: (j + 1) * bs]
                b = conti_feas[j * bs: (j + 1) * bs]
                yield a, b

        with self.graph.as_default():
            y_pre = []
            for cate_feas_batch, conti_feas_batch in gd(cate_feas, conti_feas, batch_size):
                feed = self.get_feed_dict(cate_feas_batch, conti_feas_batch)
                y_ = self.sess.run([self.y_pre], feed_dict=feed)[0]
                y_pre += y_.tolist()
            y_pre = np.array(y_pre)
            y_pre = np.reshape(y_pre, (y_pre.shape[0],))
            return y_pre
        
    def embedding_weights(self):
        cate_embeddings, fea_embedding = self.sess.run([self.cate_embeddings, self.fea_embedding])
        return cate_embeddings, fea_embedding

## data

In [6]:
usecols = ['register_type', 'device_type', 'user_reg_days', 'user_lastweek_launchday', 'user_last_launch_dist', 'user_hist_launch_freq', 'user_hist_launchday', 'user_mean_continue_launch_times_lastweek', 'user_max_continue_launch_times_lastweek', 'user_activity_days_hist', 'user_activity_days_lastweek', 'user_min_continue_launch_times_lastweek', 'user_mean_continue_launch_days_lastweek', 'user_max_continue_launch_days_lastweek', 'user_mean_continue_activity_days_lastweek', 'user_max_continue_activity_days_lastweek', 'user_lastweek_act_0_freq', 'user_lastweek_actcount', 'user_lastweek_act_video_uniquecount', 'user_max_continue_launch_times_hist', 'user_min_continue_launch_days_lastweek', 'user_mean_continue_launch_times_hist', 'user_min_continue_activity_days_lastweek', 'user_hist_act_0_count', 'user_hist_actcount', 'user_mean_continue_activity_days_hist', 'user_max_continue_launch_days_hist', 'user_hist_act_freq', 'user_mean_continue_launch_days_hist', 'user_hist_act_author_count', 'user_mean_no_launch_days_hist', 'user_min_activity_daytimes_lastweek', 'user_lastweek_act', 'user_lastweek_act_0', 'user_mean_continue_activity_times_hist', 'user_max_launch_daytimes_lastweek', 'user_lastweek_launch', 'user_lastweek_act_page_3_count', 'user_lastweek_act_page_1_count', 'user_max_no_launch_days_hist', 'user_last_act_date', 'user_lastweek_act_2_freq', 'user_lastweek_video_freq', 'user_lastweek_act_2_count', 'user_lastweek_launch_freq', 'user_max_no_activity_days_lastweek_hist_dist', 'user_lastweek_act_page_2_count', 'user_var_continue_activity_times_lastweek', 'user_kurt_continue_activity_days_hist', 'user_launch_range_percent', 'user_activity_div_launch_days_hist', 'user_hist_act_video_meancount', 'user_hist_video_activity_types', 'user_activity_range_percent', 'user_5daybefore_act_page_1_count', 'user_min_continue_activity_times_5daywin', 'user_lastweek_hist_act_3_count_dist', 'user_lastweek_hist_act_page_3_count_dist', 'user_mean_createvideo_date_lastweek_hist_dist', 'user_4daybefore_act_page_4_count', 'user_kurt_no_launch_days_6daywin', 'user_max_continue_createvideo_days_4daywin', 'user_2daybefore_act_1_count', 'user_max_continue_createvideo_days_hist']
by = ['user_id', 'data_weeknum']
target = ['label']
len(usecols)

64

In [7]:
df = pd.read_csv('./features/b/baseline_features12_ab.csv', usecols=usecols+by+target)
df.shape

(198057, 67)

In [8]:
cate_feas = ['register_type', 'device_type']
conti_feas = list(set(usecols) - set(cate_feas))
cate_long_feas = ['device_type']
cate_embedding_uni_cnt = {'register_type':12, 'device_type':4760}
cate_embedding_uni_cnt_list = [cate_embedding_uni_cnt[i] for i in cate_feas]

In [9]:
# 标准化
from sklearn.preprocessing import StandardScaler
conti_cols = conti_feas
data = df[conti_cols]
for fea in conti_cols:
    scaler_val = data[fea][~data[fea].isnull()].values
    scaler = StandardScaler().fit(scaler_val.reshape((len(scaler_val), 1)))
    data[fea].fillna(scaler.mean_[0], inplace=True)
    data[fea] = scaler.transform(data[fea].values.reshape((len(data), 1))).reshape((len(data),)).tolist()
df = pd.concat([data, df[cate_feas+by+target]], axis=1)

In [10]:
def encode_feature(values):
    uniq = values.unique()
    mapping = dict(zip(uniq,range(1,len(uniq) + 1)))
    return values.map(mapping)

for i in cate_feas:
    df[i] = encode_feature(df[i])

In [11]:
for i in list(set(cate_feas) - set(cate_long_feas)):
    df = pd.concat([df, pd.get_dummies(df[i], prefix=i)], axis=1)
    usecols += list(pd.get_dummies(df[i], prefix=i).columns)
    df = df.drop([i], axis=1)
    usecols.remove(i)

In [12]:
df.shape
df.head()

(198057, 78)

Unnamed: 0,user_max_continue_activity_days_lastweek,user_min_continue_activity_days_lastweek,user_mean_continue_launch_times_lastweek,user_max_no_launch_days_hist,user_lastweek_act_video_uniquecount,user_lastweek_launchday,user_launch_range_percent,user_lastweek_actcount,user_max_continue_launch_times_hist,user_mean_no_launch_days_hist,user_var_continue_activity_times_lastweek,user_max_launch_daytimes_lastweek,user_lastweek_act_page_3_count,user_mean_continue_launch_days_hist,user_lastweek_video_freq,user_max_no_activity_days_lastweek_hist_dist,user_reg_days,user_hist_launchday,user_mean_continue_launch_times_hist,user_hist_act_author_count,user_hist_act_freq,user_4daybefore_act_page_4_count,user_max_continue_launch_days_hist,user_activity_days_hist,user_min_activity_daytimes_lastweek,user_lastweek_act_2_count,user_lastweek_act_0_freq,user_max_continue_createvideo_days_hist,user_mean_continue_activity_days_lastweek,user_lastweek_act,user_hist_actcount,user_lastweek_act_page_1_count,user_mean_continue_activity_days_hist,user_last_launch_dist,user_2daybefore_act_1_count,user_hist_act_0_count,user_lastweek_act_2_freq,user_min_continue_launch_times_lastweek,user_mean_continue_launch_days_lastweek,user_hist_launch_freq,user_activity_days_lastweek,user_lastweek_launch,user_lastweek_act_0,user_activity_div_launch_days_hist,user_mean_continue_activity_times_hist,user_last_act_date,user_hist_act_video_meancount,user_kurt_continue_activity_days_hist,user_5daybefore_act_page_1_count,user_lastweek_hist_act_3_count_dist,user_max_continue_createvideo_days_4daywin,user_kurt_no_launch_days_6daywin,user_lastweek_act_page_2_count,user_lastweek_launch_freq,user_max_continue_launch_days_lastweek,user_mean_createvideo_date_lastweek_hist_dist,user_min_continue_activity_times_5daywin,user_hist_video_activity_types,user_lastweek_hist_act_page_3_count_dist,user_min_continue_launch_days_lastweek,user_max_continue_launch_times_lastweek,user_activity_range_percent,device_type,user_id,data_weeknum,label,register_type_1,register_type_2,register_type_3,register_type_4,register_type_5,register_type_6,register_type_7,register_type_8,register_type_9,register_type_10,register_type_11,register_type_12
0,0.247543,0.349208,0.379038,1.498626,-0.206576,-0.009945,1.034716,-0.249634,-0.045693,1.242095,-0.076443,1.004443,-0.315589,-0.335039,-0.166874,-1.6385,-0.17852,-0.215638,-0.064201,-0.108227,-0.343108,-0.073178,-0.278441,-0.09474,-0.185099,-0.238821,-0.318459,-0.293678,0.30216,0.770586,-0.281889,-0.281456,-0.219169,-0.609325,-0.131104,-0.276376,-0.210116,0.422373,0.158963,-0.4096,0.124359,0.614676,0.77066,0.588383,-0.269638,-0.661824,-0.059767,-0.320314,-0.187274,0.12472,-0.228157,-0.609547,-0.266604,-0.365386,0.107558,0.164084,-0.151078,-0.3402,0.246627,0.205738,0.329071,1.176723,1,744025_a,0,0,1,0,0,0,0,0,0,0,0,0,0,0
1,-0.806428,-0.748524,-0.490522,-0.313167,-0.438637,-0.956922,-0.73403,-0.435995,-0.045693,-0.120225,-0.076443,-0.190467,-0.326267,-0.16662,-0.166874,0.312403,-0.17852,-0.446747,0.079728,-0.389216,-0.38464,-0.073178,-0.278441,-0.334261,-0.327347,-0.238821,-0.464357,-0.293678,-0.787085,-1.297714,-0.317967,-0.281456,-0.038162,0.39762,-0.131104,-0.316652,-0.210116,-0.45665,-0.882014,-0.736365,-0.847477,-1.626874,-1.297588,0.588383,-0.218673,-1.288577,0.049961,-0.320314,-0.187274,-1.211189,-0.228157,1.440857,-0.266604,-1.11773,-0.90589,0.164084,-0.392555,-0.3402,0.164714,-0.836967,-0.51617,-0.611905,2,1270299_a,0,0,1,0,0,0,0,0,0,0,0,0,0,0
2,-0.806428,-0.748524,-0.490522,-0.313167,-0.438637,-0.956922,-0.73403,-0.435995,-0.045693,-0.120225,-0.076443,-0.190467,-0.326267,-0.16662,-0.166874,0.312403,-0.17852,-0.446747,0.079728,-0.293182,-0.407004,-0.073178,-0.278441,-0.334261,-0.327347,-0.238821,-0.464357,-0.293678,-0.787085,-1.297714,-0.337394,-0.281456,-0.038162,0.39762,-0.131104,-0.332474,-0.210116,-0.45665,-0.882014,-0.736365,-0.847477,-1.626874,-1.297588,0.588383,-0.242859,-1.288577,-0.040997,-0.320314,-0.187274,0.12472,-0.228157,1.440857,-0.266604,-1.11773,-0.90589,0.164084,-0.392555,-0.3402,0.173815,-0.836967,-0.51617,-0.611905,3,571220_a,0,0,1,0,0,0,0,0,0,0,0,0,0,0
3,-0.279443,-0.199658,-0.055742,0.049192,-0.363559,-0.009945,0.150343,-0.278507,0.20699,0.061418,-0.06936,1.004443,-0.144741,-0.16662,-0.166874,0.312403,-0.17852,0.015471,0.079728,-0.421227,-0.295185,-0.073178,0.013898,0.144782,0.070947,-0.238821,-0.378903,-0.293678,-0.242462,0.770586,-0.240261,0.160326,-0.038162,-0.105852,-0.131104,-0.257677,-0.210116,-0.017138,-0.361526,-0.082835,0.124359,0.614676,0.77066,0.588383,-0.243723,-0.975201,1.445931,-0.320314,1.156722,0.12472,-0.228157,-0.609547,-0.266604,-0.365386,-0.399166,0.164084,-0.34494,-0.3402,0.087352,-0.315615,-0.093549,0.282409,4,1308501_a,0,1,0,1,0,0,0,0,0,0,0,0,0,0
4,0.774528,0.898074,1.248598,0.049192,-0.380622,0.937032,0.739925,-0.36775,0.459673,-0.011239,-0.076443,1.004443,-0.326267,0.170219,-0.166874,-0.467958,-0.17852,0.477688,0.367587,-0.378545,-0.4709,-0.073178,0.306238,0.384303,-0.241998,-0.096063,-0.414335,-0.293678,0.846782,0.770586,-0.392898,-0.281456,0.142846,-0.4415,-0.131104,-0.391449,-0.132553,1.301396,1.199941,0.570694,0.610277,0.614676,0.77066,0.12615,-0.338743,-0.766283,0.096189,-0.320314,-0.187274,0.12472,-0.228157,-0.609547,0.190579,0.386958,1.121006,0.164084,-0.304127,-0.3402,0.246627,1.248444,1.174313,0.878618,5,745554_a,0,1,0,0,1,0,0,0,0,0,0,0,0,0


In [32]:
# df['device_type'] = df['device_type'].apply(lambda x:2385 if x > 2384 else x)

In [13]:
df['user_id'] = df.user_id.astype(str)

train = df[(df.data_weeknum < df.data_weeknum.max()-1)]
val = df[(df.data_weeknum == df.data_weeknum.max()-1) & (df.user_id.map(lambda x:'_a' not in x))]
test = df[df.data_weeknum == df.data_weeknum.max()]
trainval = df[df.data_weeknum <df.data_weeknum.max()]

## 全离散化处理

In [300]:
# 连续特征转排名
for fea in tqdm_notebook(usecols):
    uniq = df[fea].unique()
    uniq = sorted(uniq, reverse=True)
    mapping = dict(zip(uniq, range(1,len(uniq) + 1)))
    df[fea] = df[fea].map(mapping)




In [301]:
cate_feas = usecols
conti_feas = []
cate_long_feas = usecols

In [302]:
def get_uni_cnt_dict():
    tmp_dict = {}
    for i in cate_feas:
        tmp_dict[i] = max(df[i])
    return tmp_dict

cate_embedding_uni_cnt = get_uni_cnt_dict()
cate_embedding_uni_cnt_list = [cate_embedding_uni_cnt[i] for i in cate_feas]

In [304]:
train = df[(df.data_weeknum < df.data_weeknum.max()-1)] #& (df.data_weeknum>0)]
val = df[df.data_weeknum == df.data_weeknum.max()-1]
test = df[df.data_weeknum == df.data_weeknum.max()]
trainval = df[df.data_weeknum <df.data_weeknum.max()]

## train

In [18]:
X_train, X_val = train[usecols], val[usecols]
y_train, y_val = train['label'], val['label']

X_train.reset_index(drop=True, inplace=True)
X_val.reset_index(drop=True, inplace=True)
y_train.reset_index(drop=True, inplace=True)
y_val.reset_index(drop=True, inplace=True)
X_test.reset_index(drop=True, inplace=True)

In [16]:
def build_ori_cate_feas(data_list, cate_feas):
    format_data_list = []
    offset = 0
    for ii, i in enumerate(cate_feas):
        rows, cols, data = [], [], []
        tmp_len = cate_embedding_uni_cnt[i]
        for j in range(data_list.shape[0]):
            if(df[i][j] != 0):
                rows += [j]
                cols += [data_list[i][j]-1]
                data += [1]
        if ii == 0:
            tmp_csr = csr_matrix((data, (rows, cols)), shape=(data_list.shape[0], tmp_len))
        else:
            tmp_csr = hstack([tmp_csr, csr_matrix((data, (rows, cols)), shape=(data_list.shape[0], tmp_len))])
    return tmp_csr

In [20]:
print('loading conti data...')
# train_conti_feas, val_conti_feas = X_train[conti_feas].as_matrix(), X_val[conti_feas].as_matrix()
print('train conti feas shape: {}, val conti feas shape: {}'.format(np.shape(train_conti_feas),
                                                                    np.shape(val_conti_feas)))

print('loading ori cate data...')
# train_cate_csr = build_ori_cate_feas(X_train, cate_long_feas)
# train_cate_csr = hstack([train_cate_csr, csr_matrix(X_train.iloc[:, -sum([cate_embedding_uni_cnt[i] for i in list(set(cate_feas)-set(cate_long_feas))]):].as_matrix())])
# train_cate_csr = csr_matrix(train_cate_csr)
# val_cate_csr = build_ori_cate_feas(X_val, cate_long_feas)
# val_cate_csr = hstack([val_cate_csr, csr_matrix(X_val.iloc[:, -sum([cate_embedding_uni_cnt[i] for i in list(set(cate_feas)-set(cate_long_feas))]):].as_matrix())])
# val_cate_csr = csr_matrix(val_cate_csr)
print('train cate shape:{}, val cate shape:{}'.format(train_cate_csr.shape, val_cate_csr.shape))

print('training...')
model_name = 'kuaishou'
cate_embedding_w_list, fm_embedding_w = None, None

dcfn_params = {
    'learning_rate': 0.0005,
    'embedding_size': 8,
    'dnn_layers': [2048, 512, 128],
    'cross_layers': [60, 60, 60],
    'res_layers': [128, 64, 32],
    'conti_fea_cnt': train_conti_feas.shape[1],
    'cate_embedding_uni_cnt_list': cate_embedding_uni_cnt_list,
    'cate_embedding_w_list': cate_embedding_w_list,
    'fm_embedding_w': fm_embedding_w
}
model = xDeepFM(**dcfn_params)

fit_params = {
    'model_path': './model/nn/xdeepfm_%s.ckpt' % model_name,
    'batch_size': 4096,
    'epoch': 100,
    'cate_feas': train_cate_csr,
    'conti_feas': train_conti_feas,
    'labels': y_train.values.reshape(-1, 1),
    'v_cate_feas': val_cate_csr,
    'v_conti_feas': val_conti_feas,
    'v_labels': y_val.values.reshape(-1, 1),
    'es': 2
}

model.fit(**fit_params)

loading conti data...
train conti feas shape: (71796, 62), val conti feas shape: (37335, 62)
loading ori cate data...
train cate shape:(71796, 4772), val cate shape:(37335, 4772)
training...
y_deep:Tensor("Deep-part/Relu_2:0", shape=(?, 128), dtype=float32), y_cin:Tensor("CIN-part/Sum:0", shape=(?, 120), dtype=float32)
Tensor("Res-network/MLP/Relu_2:0", shape=(?, 64), dtype=float32)
start training ---------------------------------------------------
epoch: 1---train loss 0.5619---valid loss: 0.4556---valid f1: 0.8035 [7.4 s]
---------- f1 from 0.0000 to 0.8035, saving model
epoch: 2---train loss 0.4610---valid loss: 0.4429---valid f1: 0.8049 [6.7 s]
---------- f1 from 0.8035 to 0.8049, saving model
epoch: 3---train loss 0.4491---valid loss: 0.4406---valid f1: 0.8049 [6.9 s]
epoch: 4---train loss 0.4449---valid loss: 0.4490---valid f1: 0.8055 [6.9 s]
---------- f1 from 0.8049 to 0.8055, saving model
epoch: 5---train loss 0.4402---valid loss: 0.4523---valid f1: 0.8062 [6.9 s]
---------- f

In [None]:
start:epoch: 2---train loss 0.4466---valid loss: 0.4388---valid f1: 0.8047 [3.4 s]
'cross_layers': [50, 50, 25]:epoch: 2---train loss 0.4463---valid loss: 0.4346---valid f1: 0.8042 [4.1 s]
'cross_layers': [100, 100, 50]:epoch: 2---train loss 0.4473---valid loss: 0.4313---valid f1: 0.8035 [5.1 s]
'cross_layers': [30, 30, 15]:epoch: 2---train loss 0.4472---valid loss: 0.4325---valid f1: 0.8034 [3.8 s]
'cross_layers': [200, 200, 100]:epoch: 3---train loss 0.4322---valid loss: 0.4435---valid f1: 0.8025 [7.0 s]
'cross_layers': [70, 50, 30]:epoch: 2---train loss 0.4466---valid loss: 0.4371---valid f1: 0.8041 [3.7 s]

In [None]:
# b
epoch: 5---train loss 0.4511---valid loss: 0.4203---valid f1: 0.8108 [6.1 s]

## submit

In [14]:
X_trainval, X_test = trainval[usecols], test[usecols]
y_trainval = trainval['label']

X_trainval.reset_index(drop=True, inplace=True)
X_test.reset_index(drop=True, inplace=True)
y_trainval.reset_index(drop=True, inplace=True)

In [23]:
print('loading conti data...')
# trainval_conti_feas, test_conti_feas = X_trainval[conti_feas].as_matrix(), X_test[conti_feas].as_matrix()
print('trainval conti feas shape: {}, test conti feas shape: {}'.format(np.shape(trainval_conti_feas),
                                                                        np.shape(test_conti_feas)))

print('loading ori cate data...')
# trainval_cate_csr = build_ori_cate_feas(X_trainval, cate_long_feas)
# trainval_cate_csr = hstack([trainval_cate_csr, csr_matrix(X_trainval.iloc[:, -sum([cate_embedding_uni_cnt[i] for i in list(set(cate_feas)-set(cate_long_feas))]):].as_matrix())])
# trainval_cate_csr = csr_matrix(trainval_cate_csr)
# test_cate_csr = build_ori_cate_feas(X_test, cate_long_feas)
# test_cate_csr = hstack([test_cate_csr, csr_matrix(X_test.iloc[:, -sum([cate_embedding_uni_cnt[i] for i in list(set(cate_feas)-set(cate_long_feas))]):].as_matrix())])
# test_cate_csr = csr_matrix(test_cate_csr)
print('trainval cate shape:{}, test cate shape:{}'.format(trainval_cate_csr.shape, test_cate_csr.shape))

print('submitting...')
model_name = 'kuaishou'
cate_embedding_w_list, fm_embedding_w = None, None

dcfn_params = {
    'learning_rate': 0.0005,
    'embedding_size': 8,
    'dnn_layers': [2048, 512, 128],
    'cross_layers': [60, 60, 60],
    'res_layers': [128, 64, 32],
    'conti_fea_cnt': trainval_conti_feas.shape[1],
    'cate_embedding_uni_cnt_list': cate_embedding_uni_cnt_list,
    'cate_embedding_w_list': cate_embedding_w_list,
    'fm_embedding_w': fm_embedding_w
}
model = xDeepFM(**dcfn_params)

submit_params = {
    'model_path': './model/nn/xdeepfm_%s.ckpt' % model_name,
    'batch_size': 4096,
    'epoch': 8,
    'cate_feas': trainval_cate_csr,
    'conti_feas': trainval_conti_feas,
    'labels': y_trainval.values.reshape(-1, 1),
    'v_cate_feas': val_cate_csr,
    'v_conti_feas': val_conti_feas,
    'v_labels': y_val.values.reshape(-1, 1),
    'es': 2
}

pre_params = {
    'batch_size': 4096,
    'cate_feas': test_cate_csr,
    'conti_feas': test_conti_feas
}

for i in range(114, 115):
    submit_params['epoch'] = i // 100 + 3
    model.fit(**submit_params)
    result = pd.DataFrame(model.predict(**pre_params))
    result.to_csv('./result/b/nn/submit_' + str(i) + '.csv', header=None, index=None)

loading conti data...
trainval conti feas shape: (146577, 62), test conti feas shape: (51480, 62)
loading ori cate data...
trainval cate shape:(146577, 4772), test cate shape:(51480, 4772)
submitting...
y_deep:Tensor("Deep-part/Relu_2:0", shape=(?, 128), dtype=float32), y_cin:Tensor("CIN-part/Sum:0", shape=(?, 120), dtype=float32)
Tensor("Res-network/MLP/Relu_2:0", shape=(?, 64), dtype=float32)
start training ---------------------------------------------------
epoch: 1---train loss 0.4780---valid loss: 0.4253---valid f1: 0.8105 [12.8 s]
---------- f1 from 0.0000 to 0.8105, saving model
epoch: 2---train loss 0.4349---valid loss: 0.4137---valid f1: 0.8126 [12.3 s]
---------- f1 from 0.8105 to 0.8126, saving model
epoch: 3---train loss 0.4296---valid loss: 0.4099---valid f1: 0.8144 [12.0 s]
---------- f1 from 0.8126 to 0.8144, saving model
epoch: 4---train loss 0.4256---valid loss: 0.4059---valid f1: 0.8167 [11.9 s]
---------- f1 from 0.8144 to 0.8167, saving model


In [24]:
result = pd.DataFrame()
for i in tqdm_notebook(range(300)):
    tmp = pd.read_csv('./result/b/nn/submit_' + str(i) + '.csv', header=None)
    if i == 0:
        result = tmp
    else:
        result = result + tmp
result = result / 300
result.columns = ['pred']




In [25]:
test.reset_index(drop=True, inplace=True)
submit = pd.concat([test[['user_id']], result], axis=1)
submit.shape
submit.head()

(51480, 2)

Unnamed: 0,user_id,pred
0,167777,0.027161
1,886972,0.06435
2,921231,0.045289
3,904908,0.919658
4,460291,0.997321


In [38]:
submit.to_csv('./result/b/xdeepfm_zero.csv',index=False, header=None)

In [36]:
save = submit[submit.pred>0.41].sort_values('pred', ascending=False)[['user_id']]
print(save.shape)

(23962, 1)


In [30]:
save.to_csv('./result/b/xdeepfm_zero_0627_limit041.csv',index=False, header=None)

In [None]:
a:23726/51709
b:23678/51480