In [1]:
import numpy as np 
import pandas as pd 
import tensorflow as tf 


import warnings
warnings.filterwarnings("ignore")

In [2]:
from utils import *

In [3]:
from tqdm.autonotebook import tqdm
from collections import Counter

# 1. 数据的准备

In [4]:
# Load the Criteo CSV into a DataFrame; later cells split its columns by name prefix.
data = pd.read_csv("../data/criteo/criteo_data.csv")

In [5]:
# Pick out the continuous-feature columns (names starting with "I") and the
# categorical-feature columns (names starting with "C").
con = list(filter(lambda name: name.startswith("I"), data.columns))
cat = list(filter(lambda name: name.startswith("C"), data.columns))

In [6]:
# Build the field handler from the same CSV that was loaded into `data`,
# telling it which columns are continuous and which are categorical.
# Config later reads `field_dict` and `feature_nums` from this object.
# NOTE(review): FieldHandler comes from `utils`; presumably it builds the
# feature-index mapping — confirm against its implementation.
field_handler = FieldHandler(train_file_path="../data/criteo/criteo_data.csv",
                            continuation_columns=con,
                            category_columns=cat)

In [7]:
# Get the model input features and the label values.
# `features` is later indexed with the keys "df_i" and "df_v"; `labels` is
# sliced alongside them when the train/validation split is made.
# NOTE(review): label="Label" presumably names the target column — confirm in utils.
features, labels = transformation_data(data, 
                                      field_handler=field_handler,
                                      label="Label")

cat:  C1
cat:  C2
cat:  C3
cat:  C4
cat:  C5
cat:  C6
cat:  C7
cat:  C8
cat:  C9
cat:  C10
cat:  C11
cat:  C12
cat:  C13
cat:  C14
cat:  C15
cat:  C16
cat:  C17
cat:  C18
cat:  C19
cat:  C20
cat:  C21
cat:  C22
cat:  C23
cat:  C24
cat:  C25
cat:  C26
con:  I1
con:  I2
con:  I3
con:  I4
con:  I5
con:  I6
con:  I7
con:  I8
con:  I9
con:  I10
con:  I11
con:  I12
con:  I13


In [8]:
# Basic hyper-parameter settings.
class Config(dict):
    """Dict-backed bundle of model, training and bookkeeping settings.

    Every setting is stored as a dictionary key.  ``threshold`` is also kept
    as an instance attribute, because that was previously the only way it was
    exposed and existing readers may rely on ``config.threshold``.

    Args:
        field_handler: object exposing ``field_dict`` (its length becomes
            ``field_size``) and ``feature_nums`` (becomes ``feature_size``).
    """

    def __init__(self, field_handler):
        super().__init__()

        # Model parameters.
        self['field_size'] = len(field_handler.field_dict)
        self['feature_size'] = field_handler.feature_nums
        self['embedding_size'] = 50
        # One entry per deep layer; the trainer feeds this list to the model's
        # `dropout_keep_prob` placeholder, so the values are keep probabilities.
        self['dropout_prob'] = [0.8, 0.8, 0.8]
        self['deep_layers'] = [64, 64, 64]
        self['seed'] = 2019
        self['l2_reg'] = 0.001

        # Training parameters.
        self['num_epochs'] = 5
        self['batch_size'] = 128
        self['evaluateEvery'] = 1000
        self['checkpointEvery'] = 1000
        self['lr'] = 0.01
        self['decay_steps'] = 200
        self['decay_rate'] = 0.9
        self['grad_clip'] = 5.0

        # Other parameters.
        self['num_classes'] = 1
        self['train_size'] = 0.8
        # Stored as a key like every other setting so that dict-style access,
        # iteration and serialisation see it; mirrored as an attribute for
        # backward compatibility with code reading `config.threshold`.
        self['threshold'] = 0.5
        self.threshold = self['threshold']
        self['checkpoint_dir'] = "../model/NFM/checkpoint"
        self['summary_dir'] = "../model/NFM/summary"
        self['max_to_keep'] = 5

# 2. 定义模型

In [9]:
class NFM(BaseModel):
    """TF1 graph for a Neural Factorization Machine style click model.

    The graph combines a first-order (linear) term, a bi-interaction pooling
    of the scaled feature embeddings pushed through a stack of dense layers,
    and a scalar bias.  Building the instance creates placeholders, weights,
    the loss and the training op in the current default graph.
    """

    def __init__(self, config):
        """Seed the graph, build it and create the checkpoint saver.

        Args:
            config: mapping with the keys read in `build_model`,
                `_init_weights` and `init_saver` (e.g. ``seed``,
                ``feature_size``, ``deep_layers``, ``max_to_keep``).
        """
        super().__init__(config)
        tf.set_random_seed(self.config['seed'])
        self.build_model()
        self.init_saver()

    def build_model(self):
        """Create placeholders, forward pass, loss and `train_op`."""
        cfg = self.config

        # Input placeholders.  feat_index / feat_value are rank-2; the trainer
        # feeds one row per example.
        self.feat_index = tf.placeholder(tf.int32, shape=[None, None], name="feat_index")
        self.feat_value = tf.placeholder(tf.float32, shape=[None, None], name="feat_value")
        self.labels = tf.placeholder(tf.float32, shape=[None, cfg['num_classes']], name="label")
        # One keep probability per deep layer (indexed by layer below).
        self.dropout_keep_prob = tf.placeholder(tf.float32, shape=[None], name="dropout_keep_prob")
        # Fed by the trainer but not consumed by any op in this graph.
        self.is_training = tf.placeholder(tf.bool, name="is_training")

        self.weights = self._init_weights()

        # Embedding layer: look up one vector per index and scale it by the
        # matching feature value.  Result: [batch, n_fields, embedding_size].
        with tf.name_scope("embedding"):
            looked_up = tf.nn.embedding_lookup(self.weights['feature_embeddings'],
                                               self.feat_index)
            feat_value = tf.expand_dims(self.feat_value, 2)
            self.embeddings = tf.multiply(looked_up, feat_value)

        # First-order term: per-feature scalar weight times feature value.
        # Result: [batch, n_fields].
        with tf.name_scope("first-order"):
            per_feature_weight = tf.nn.embedding_lookup(self.weights['feature_weights'],
                                                        self.feat_index)
            self.y_first_order = tf.reduce_sum(tf.multiply(per_feature_weight, feat_value), 2)

        # Second-order (bi-interaction) term:
        # 0.5 * ((sum of embeddings)^2 - sum of (embeddings^2)), pooled over the
        # field axis.  Every tensor here is [batch, embedding_size].
        with tf.name_scope("second-order"):
            # Square of the sum.
            self.summed_features_emb = tf.reduce_sum(self.embeddings, 1)
            self.summed_features_emb_square = tf.square(self.summed_features_emb)

            # Sum of the squares.
            self.squared_features_emb = tf.square(self.embeddings)
            self.squared_sum_features_emb = tf.reduce_sum(self.squared_features_emb, 1)

            square_minus_sum = tf.subtract(self.summed_features_emb_square,
                                           self.squared_sum_features_emb)
            self.y_second_order = 0.5 * square_minus_sum

        # Deep part: dense -> ReLU -> dropout, once per configured layer width.
        with tf.name_scope("deep-layer"):
            hidden = self.y_second_order
            for i, units in enumerate(cfg['deep_layers']):
                hidden = tf.layers.dense(hidden, units,
                                         kernel_initializer=tf.initializers.glorot_normal(),
                                         bias_initializer=tf.initializers.constant(0.1),
                                         activation=None,
                                         name=f"dense_{i}")
                hidden = tf.nn.relu(hidden)
                hidden = tf.nn.dropout(hidden, self.dropout_keep_prob[i])
            self.y_deep = hidden

        # Output: logit = sum(first-order) + sum(deep output) + bias.
        # NOTE(review): the deep term is a plain sum of post-ReLU activations,
        # so its contribution to the logit is never negative; the NFM paper
        # appears to use a learned projection here — confirm whether this
        # simplification is intended.
        with tf.name_scope("output"):
            self.y_bias = self.weights['bias'] * tf.ones_like(self.labels)
            first_order_term = tf.reduce_sum(self.y_first_order, axis=1, keepdims=True)
            deep_term = tf.reduce_sum(self.y_deep, axis=1, keepdims=True)
            self.logits = tf.add_n([first_order_term, deep_term, self.y_bias])
            self.predictions = tf.nn.sigmoid(self.logits)

        # Loss: mean sigmoid cross-entropy, plus optional L2 on every
        # trainable variable whose name contains neither "bia" nor "embedding".
        with tf.name_scope("loss"):
            losses = tf.nn.sigmoid_cross_entropy_with_logits(labels=self.labels,
                                                             logits=self.logits)
            self.loss = tf.reduce_mean(losses)
            if cfg['l2_reg'] > 0:
                regularized_vars = [
                    var for var in tf.trainable_variables()
                    if not ("bia" in var.name or "embedding" in var.name)
                ]
                l2_loss = tf.add_n([tf.nn.l2_loss(var) for var in regularized_vars])
                self.loss += cfg['l2_reg'] * l2_loss

        # Optimisation: Adam with staircase exponential learning-rate decay.
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(update_ops):
            learning_rate = tf.train.exponential_decay(cfg['lr'],
                                                       self.global_step_tensor,
                                                       cfg['decay_steps'],
                                                       cfg['decay_rate'],
                                                       staircase=True)
            optimizer = tf.train.AdamOptimizer(learning_rate)
            # Clip each gradient by its own norm to guard against exploding
            # gradients; variables without a gradient are passed through.
            clipped_grads_and_vars = [
                (grad, var) if grad is None
                else (tf.clip_by_norm(grad, cfg['grad_clip']), var)
                for grad, var in optimizer.compute_gradients(self.loss)
            ]
            self.train_op = optimizer.apply_gradients(clipped_grads_and_vars,
                                                      global_step=self.global_step_tensor)

    def _init_weights(self):
        """Create the embedding table, first-order weights and scalar bias.

        Returns:
            dict with keys ``feature_embeddings`` ([feature_size,
            embedding_size]), ``feature_weights`` ([feature_size, 1]) and
            ``bias`` (scalar).
        """
        n_features = self.config['feature_size']
        embed_dim = self.config['embedding_size']

        # Dict entries are evaluated in order, so variables are created in the
        # order embeddings -> first-order weights -> bias.
        return {
            'feature_embeddings': tf.Variable(
                tf.truncated_normal([n_features, embed_dim], 0.0, 0.01),
                name="feature_embeddings"),
            'feature_weights': tf.Variable(
                tf.truncated_normal([n_features, 1], 0.0, 1.0),
                name="feature_weights"),
            'bias': tf.Variable(tf.constant(0.1), name="bias"),
        }

    def init_saver(self):
        """Create the checkpoint saver, keeping at most `max_to_keep` files."""
        self.saver = tf.train.Saver(max_to_keep=self.config['max_to_keep'])

# 3. 训练类

In [10]:
class Trainer(BaseTrain):
    """Training loop for the NFM model with periodic evaluation and checkpoints."""

    def __init__(self, sess, model, data, config, logger):
        """Store the train / eval generators taken from `data[0]` and `data[1]`."""
        super().__init__(sess, model, data, config, logger)
        self.train, self.eval = data[0], data[1]

    def train_epoch(self):
        """Run one epoch of training steps.

        Every step logs the train loss / accuracy / F-score.  Every
        `evaluateEvery` steps the whole eval set is scored and logged, and
        every `checkpointEvery` steps the model is saved.
        """
        batch_size = self.config['batch_size']
        # Number of iterations in this epoch.
        steps_per_epoch = self.train.length // batch_size

        for _ in tqdm(range(steps_per_epoch)):
            loss, metrics, step = self.train_step()
            train_acc = metrics['accuracy']
            train_f_score = metrics['f_score']

            # Write the training metrics for this step.
            train_summary = {"loss": loss,
                             "acc": np.array(train_acc),
                             "f_score": np.array(train_f_score)}
            self.logger.summarize(step, summarizer='train', scope="train_summary",
                                  summaries_dict=train_summary)

            if step % self.config['evaluateEvery'] == 0:
                print("Train - Step: {} | Loss: {} | Acc: {} | F1_score: {}".format(
                    step, loss, train_acc, train_f_score))

                # Score the full eval set batch by batch.
                eval_losses, eval_pred, eval_true = [], [], []
                for eval_batch in self.eval.iter_all(batch_size):
                    eval_loss, eval_predictions = self.eval_step(eval_batch)
                    eval_losses.append(eval_loss)
                    eval_pred.extend(eval_predictions)
                    # The last element of each batch holds the labels.
                    eval_true.extend(eval_batch[-1])

                eval_metrics = Metric(np.array(eval_pred), np.array(eval_true),
                                      self.config).get_metrics()
                acc_mean = np.round(eval_metrics['accuracy'], 5)
                gini_mean = np.round(eval_metrics['gini_norm'], 5)
                loss_mean = np.round(np.mean(eval_losses), 5)
                print("Eval | Loss: {} | Accuracy: {} | Gini: {}".format(
                    loss_mean, acc_mean, gini_mean))

                eval_summary = {"loss": np.array(loss_mean),
                                "accuracy": np.array(acc_mean),
                                "gini": np.array(gini_mean)}
                self.logger.summarize(step, summarizer="test", scope="test_summary",
                                      summaries_dict=eval_summary)

            if step % self.config['checkpointEvery'] == 0:
                self.model.save(self.sess)

    def train_step(self):
        """Run one optimisation step on the next training batch.

        Returns:
            (loss, metrics, step): the batch loss, the metrics returned by
            `Metric.get_metrics()` for this batch, and the global step value.
        """
        # NOTE(review): a fresh generator is created on every call and only its
        # first item is used; whether successive calls see different batches
        # depends on DataGenerator.next_batch — confirm.
        batch_gen = self.train.next_batch(self.config['batch_size'])
        batch_feat_i, batch_feat_v, batch_y = next(batch_gen)

        feed_dict = {
            self.model.feat_index: batch_feat_i,
            self.model.feat_value: batch_feat_v,
            self.model.labels: batch_y,
            self.model.dropout_keep_prob: self.config['dropout_prob'],
            self.model.is_training: True,
        }
        fetches = [self.model.train_op,
                   self.model.loss,
                   self.model.predictions,
                   self.model.global_step_tensor]
        _, loss, predictions, step = self.sess.run(fetches, feed_dict=feed_dict)

        metrics = Metric(predictions, batch_y, self.config).get_metrics()
        return loss, metrics, step

    def eval_step(self, batch):
        """Score one eval batch with dropout disabled (keep probability 1.0).

        Args:
            batch: indexable with feature indices at 0, feature values at 1
                and labels at 2.

        Returns:
            (loss, predictions) for the batch.
        """
        keep_everything = [1.0] * len(self.config['dropout_prob'])
        feed_dict = {
            self.model.feat_index: batch[0],
            self.model.feat_value: batch[1],
            self.model.labels: batch[2],
            self.model.dropout_keep_prob: keep_everything,
            self.model.is_training: False,
        }
        loss, predictions = self.sess.run([self.model.loss, self.model.predictions],
                                          feed_dict=feed_dict)
        return loss, predictions

# 4. 训练

In [11]:
# Split the dataset: the first 80% of the rows go to training, the rest to validation.
train_idx = slice(0, int(len(labels)*0.8))
val_idx = slice(train_idx.stop, len(labels))

train_df_i = features["df_i"][train_idx]
train_df_v = features["df_v"][train_idx]
train_df_y = labels[train_idx]

val_df_i = features["df_i"][val_idx]
val_df_v = features["df_v"][val_idx]
val_df_y = labels[val_idx]

# DataGenerator takes the labels first, then the feature indices and values.
train = DataGenerator(train_df_y, train_df_i, train_df_v)
val = DataGenerator(val_df_y, val_df_i, val_df_v)

In [12]:
def main():
    """Build the NFM model and train it on the prepared train / val generators.

    Reads the notebook-level `field_handler`, `train` and `val` objects,
    overrides the configured epoch count to 2, creates the summary and
    checkpoint directories, resets the default graph and runs
    `Trainer.train_all()`.

    The TensorFlow session is opened in a `with` block so that it is closed
    (and its device memory released) when training finishes or fails;
    previously the session was never closed, leaking one session per re-run
    of this cell.
    """
    config = Config(field_handler)
    config['num_epochs'] = 2
    create_dirs([config['summary_dir'], config['checkpoint_dir']])

    # Start from an empty graph so re-running the cell does not stack graphs.
    tf.reset_default_graph()
    session_conf = tf.ConfigProto(allow_soft_placement=True, log_device_placement=True)
    session_conf.gpu_options.per_process_gpu_memory_fraction = 0.8
    session_conf.gpu_options.allow_growth = True

    model = NFM(config)
    pack_data = [train, val]
    with tf.Session(config=session_conf) as sess:
        logger = Logger(sess, config)
        trainer = Trainer(sess, model, pack_data, config, logger)
        trainer.train_all()

In [13]:
# Run the training entry point; progress and eval metrics are printed below.
main()

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use keras.layers.dense instead.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Use tf.cast instead.

当前正处于第1次迭代


HBox(children=(IntProgress(value=0, max=6250), HTML(value='')))

Train - Step: 1000 | Loss: 0.5596783757209778 | Acc: 0.74016 | F1_score: 0.29787
Eval | Loss: 0.5529299974441528 | Accuracy: 0.76815 | Gini: 0.47415
Saving model...
Model saved
Train - Step: 2000 | Loss: 0.5200712084770203 | Acc: 0.75591 | F1_score: 0.2439
Eval | Loss: 0.5337799787521362 | Accuracy: 0.77094 | Gini: 0.48206
Saving model...
Model saved
Train - Step: 3000 | Loss: 0.5162172317504883 | Acc: 0.77165 | F1_score: 0.38298
Eval | Loss: 0.5192099809646606 | Accuracy: 0.7702 | Gini: 0.4905
Saving model...
Model saved
Train - Step: 4000 | Loss: 0.49115726351737976 | Acc: 0.75591 | F1_score: 0.27907
Eval | Loss: 0.5057899951934814 | Accuracy: 0.77305 | Gini: 0.50249
Saving model...
Model saved
Train - Step: 5000 | Loss: 0.479180246591568 | Acc: 0.7874 | F1_score: 0.4
Eval | Loss: 0.49764999747276306 | Accuracy: 0.77358 | Gini: 0.50992
Saving model...
Model saved
Train - Step: 6000 | Loss: 0.45700758695602417 | Acc: 0.7874 | F1_score: 0.4
Eval | Loss: 0.49605000019073486 | Accuracy: 

HBox(children=(IntProgress(value=0, max=6250), HTML(value='')))

Train - Step: 7000 | Loss: 0.42785683274269104 | Acc: 0.8189 | F1_score: 0.54902
Eval | Loss: 0.49608999490737915 | Accuracy: 0.77178 | Gini: 0.50899
Saving model...
Model saved
Train - Step: 8000 | Loss: 0.46074774861335754 | Acc: 0.75591 | F1_score: 0.27907
Eval | Loss: 0.4953700006008148 | Accuracy: 0.77224 | Gini: 0.50954
Saving model...
Model saved
Train - Step: 9000 | Loss: 0.5141562223434448 | Acc: 0.75591 | F1_score: 0.47458
Eval | Loss: 0.4950000047683716 | Accuracy: 0.77134 | Gini: 0.50958
Saving model...
Model saved
Train - Step: 10000 | Loss: 0.47839486598968506 | Acc: 0.77165 | F1_score: 0.50847
Eval | Loss: 0.49480000138282776 | Accuracy: 0.77135 | Gini: 0.5085
Saving model...
Model saved
Train - Step: 11000 | Loss: 0.4579806625843048 | Acc: 0.80315 | F1_score: 0.5283
Eval | Loss: 0.4945699870586395 | Accuracy: 0.77152 | Gini: 0.50829
Saving model...
Model saved
Train - Step: 12000 | Loss: 0.4137086570262909 | Acc: 0.85039 | F1_score: 0.64151
Eval | Loss: 0.49441999197006