In [1]:
import numpy as np
import tensorflow as tf
import pandas as pd

from collections import Counter

import warnings
warnings.filterwarnings("ignore")

In [2]:
from utils import *

In [3]:
from tqdm.autonotebook import tqdm

# 1. 数据的准备

In [4]:
# Read the Criteo CSV into a DataFrame; later cells derive column lists and model inputs from it.
data = pd.read_csv(filepath_or_buffer="../data/criteo/criteo_data.csv")

In [5]:
# Separate the column names into continuous features (prefix "I")
# and categorical features (prefix "C").
con = list(filter(lambda name: name.startswith("I"), data.columns))
cat = list(filter(lambda name: name.startswith("C"), data.columns))

In [6]:
# Build the field handler from the same CSV, telling it which columns are
# continuous and which are categorical.
field_handler = FieldHandler(
    train_file_path="../data/criteo/criteo_data.csv",
    continuation_columns=con,
    category_columns=cat,
)

In [7]:
# Produce the model inputs (`features`, later indexed by "df_i" / "df_v")
# and the targets (`labels`) from the raw frame.
# NOTE: `field_hander` (sic) is the keyword spelling this call relies on.
features, labels = transformation_data(
    data,
    field_hander=field_handler,
    label="Label",
)

cat:  C1
cat:  C2
cat:  C3
cat:  C4
cat:  C5
cat:  C6
cat:  C7
cat:  C8
cat:  C9
cat:  C10
cat:  C11
cat:  C12
cat:  C13
cat:  C14
cat:  C15
cat:  C16
cat:  C17
cat:  C18
cat:  C19
cat:  C20
cat:  C21
cat:  C22
cat:  C23
cat:  C24
cat:  C25
cat:  C26
con:  I1
con:  I2
con:  I3
con:  I4
con:  I5
con:  I6
con:  I7
con:  I8
con:  I9
con:  I10
con:  I11
con:  I12
con:  I13


In [15]:
class Config(dict):
    """Dictionary of model / training hyper-parameters for the PNN notebook.

    Args:
        field_handler: object exposing ``field_dict`` (its length becomes
            ``field_size``) and ``feature_nums`` (becomes ``feature_size``).

    Every setting is stored as a dict key. ``threshold`` is additionally kept
    as an instance attribute so that code reading ``config.threshold`` keeps
    working.
    """

    def __init__(self, field_handler):
        super().__init__()

        # Model parameters
        self['field_size'] = len(field_handler.field_dict)
        self['feature_size'] = field_handler.feature_nums
        self['embedding_size'] = 50
        self['deep_init_size'] = 100
        self['deep_layers'] = [64, 64]
        # Must have one more entry than deep_layers.
        self['dropout_prob'] = [0.5, 0.8, 0.8]
        self['seed'] = 2019
        self['l2_reg'] = 0.001
        self['use_inner'] = False

        # Training parameters
        self['num_epochs'] = 5
        self['batch_size'] = 128
        self['evaluateEvery'] = 1000
        self['checkpointEvery'] = 1000
        self['lr'] = 0.01
        self['decay_steps'] = 200
        self['decay_rate'] = 0.9
        self['grad_clip'] = 5.0

        # Other parameters
        self['num_classes'] = 1
        self['train_size'] = 0.8
        # Stored both ways: as a key (consistent with every other setting and
        # visible to dict(config) / iteration) and as the original attribute.
        self['threshold'] = 0.5
        self.threshold = self['threshold']
        self['checkpoint_dir'] = "../model/PNN/checkpoint"
        self['summary_dir'] = "../model/PNN/summary"
        self['max_to_keep'] = 5
        


# 2. 模型

In [16]:
class PNN(BaseModel):
    """Product-based Neural Network for binary prediction (TF1 graph mode).

    Graph layout: embedding lookup scaled by the feature values -> product
    layer (linear signal ``lz`` plus a quadratic signal ``lp``; the inner or
    outer variant is selected by ``config['use_inner']``) -> dense / ReLU /
    batch-norm stack -> a ``num_classes``-wide logit trained with sigmoid
    cross-entropy plus optional L2 regularisation.

    ``self.config`` and ``self.global_step_tensor`` are used here but not
    defined in this class; they are expected to come from ``BaseModel``.
    """

    def __init__(self, config):
        super().__init__(config)
        tf.set_random_seed(self.config['seed'])
        self.build_model()
        self.init_saver()

    def build_model(self):
        """Create placeholders, the forward graph, the loss and ``train_op``."""
        # Inputs
        self.feat_index = tf.placeholder(tf.int32, shape=[None, None], name='feat_index')
        self.feat_value = tf.placeholder(tf.float32, shape=[None, None], name='feat_value')
        self.labels = tf.placeholder(tf.float32, shape=[None, self.config['num_classes']], name='label')
        self.dropout_keep_prob = tf.placeholder(tf.float32, shape=[None], name='dropout_keep_prob')
        self.is_training = tf.placeholder(tf.bool, name="is_training")

        self.weights = self._init_weights()

        # Embedding layer
        with tf.name_scope("embedding"):
            # [batch, field_size, embed_size]
            self.embeddings = tf.nn.embedding_lookup(self.weights['feature_embeddings'], self.feat_index)
            feat_value = tf.reshape(self.feat_value, shape=[-1, self.config['field_size'], 1])
            # Scale every field embedding by its feature value; shape unchanged.
            self.embeddings = tf.multiply(self.embeddings, feat_value)

        # Linear signal
        with tf.name_scope("linear_singal"):
            # embeddings: [batch, field_size, embed_size]
            # product_linear: [deep_size, field_size, embed_size]
            # Contracting both trailing axes in one tensordot gives [batch, deep_size].
            self.lz = tf.tensordot(self.embeddings, self.weights['product_linear'],
                                   axes=((1, 2), (1, 2)), name="linear_dot")

        # Quadratic (feature-interaction) signal
        with tf.name_scope("quardatic_singal"):
            if self.config['use_inner']:
                # [batch, embed_size, deep_size]
                theta = tf.tensordot(self.embeddings, self.weights['product_quadratic_inner'],
                                     axes=([1], [1]), name="inner_dot")
                # Squared L2 norm over the embedding axis -> [batch, deep_size].
                # The original used tf.norm (a square root), whose gradient is
                # undefined at 0 and unbounded near it; the squared norm is the
                # form used by the inner-product PNN and has a finite gradient.
                self.lp = tf.reduce_sum(tf.square(theta), axis=1)
            else:
                # [batch, embed_size]
                embedding_sum = tf.reduce_sum(self.embeddings, axis=1)
                # Outer product of the summed embedding with itself:
                # [batch, embed_size, embed_size]
                p = tf.matmul(tf.expand_dims(embedding_sum, 2), tf.expand_dims(embedding_sum, 1))
                # [batch, deep_size]
                self.lp = tf.tensordot(p, self.weights['product_quadratic_outer'],
                                       axes=((1, 2), (1, 2)), name="outer_dot")

        # Combine linear and quadratic signals, then apply dropout.
        self.y_deep = tf.nn.relu(tf.add(tf.add(self.lz, self.lp), self.weights['product_bias']))
        self.y_deep = tf.nn.dropout(self.y_deep, self.dropout_keep_prob[0])

        # Deep layers: dense -> ReLU -> batch normalisation.
        # Only dropout_keep_prob[0] is consumed; the hidden layers use batch
        # normalisation instead of dropout.
        with tf.name_scope("deep_layers"):
            for units in self.config['deep_layers']:
                self.y_deep = tf.layers.dense(self.y_deep, units,
                                              kernel_initializer=tf.contrib.layers.xavier_initializer(),
                                              bias_initializer=tf.initializers.constant(0.01),
                                              activation=None)
                self.y_deep = tf.nn.relu(self.y_deep)
                self.y_deep = tf.layers.batch_normalization(self.y_deep, training=self.is_training)

        with tf.name_scope("output"):
            self.logits = tf.layers.dense(self.y_deep, self.config['num_classes'],
                                          kernel_initializer=tf.contrib.layers.xavier_initializer(),
                                          bias_initializer=tf.initializers.constant(0.01),
                                          activation=None)
            # Probabilities
            self.predictions = tf.nn.sigmoid(self.logits)

        with tf.name_scope("loss"):
            losses = tf.nn.sigmoid_cross_entropy_with_logits(labels=self.labels, logits=self.logits)
            self.loss = tf.reduce_mean(losses)
            if self.config['l2_reg'] > 0:
                # L2 penalty over trainable variables whose name contains
                # neither "bia" nor "embedding".
                self.l2_loss = tf.add_n([tf.nn.l2_loss(cand_var)
                                         for cand_var in tf.trainable_variables()
                                         if "bia" not in cand_var.name and "embedding" not in cand_var.name])
                self.loss += self.config['l2_reg'] * self.l2_loss
            else:
                # Always expose l2_loss so callers can fetch it even when
                # regularisation is disabled.
                self.l2_loss = tf.constant(0.0, name="l2_loss")

        # Run the collected update ops (batch-norm moving averages) with every train step.
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(update_ops):
            learning_rate = tf.train.exponential_decay(self.config['lr'],
                                                       self.global_step_tensor,
                                                       self.config['decay_steps'],
                                                       self.config['decay_rate'],
                                                       staircase=True)
            optimizer = tf.train.AdamOptimizer(learning_rate)
            grads_and_vars = optimizer.compute_gradients(self.loss)

            # Clip each gradient by norm to guard against exploding gradients.
            # Variables with no gradient (grad is None) are passed through as-is.
            clip_norm = self.config['grad_clip']
            clipped = [(tf.clip_by_norm(grad, clip_norm), var) if grad is not None else (grad, var)
                       for grad, var in grads_and_vars]

            self.train_op = optimizer.apply_gradients(clipped, global_step=self.global_step_tensor)

    def _init_weights(self):
        """Create the embedding and product-layer variables.

        Returns:
            dict mapping weight names to ``tf.Variable`` objects. Dense-layer
            and output weights are not in this dict; they are created by
            ``tf.layers.dense`` inside ``build_model``.
        """
        weights = dict()
        feature_size = self.config['feature_size']
        embedding_size = self.config['embedding_size']
        field_size = self.config['field_size']
        deep_init_size = self.config['deep_init_size']

        # Embeddings
        weights['feature_embeddings'] = tf.Variable(
            tf.truncated_normal([feature_size, embedding_size], 0.0, 0.01),
            name='feature_embeddings')
        # Not referenced by build_model; kept so the variable set is unchanged.
        weights['feature_bias'] = tf.Variable(
            tf.random_normal([feature_size, 1], 0.0, 1.0),
            name='feature_bias')

        # Product layer: only the variant selected by use_inner is created.
        if self.config['use_inner']:
            weights['product_quadratic_inner'] = tf.Variable(
                tf.truncated_normal([deep_init_size, field_size], 0.0, 0.01),
                name="product_inner")
        else:
            weights['product_quadratic_outer'] = tf.Variable(
                tf.truncated_normal([deep_init_size, embedding_size, embedding_size], 0.0, 0.01),
                name="product_outer")

        weights['product_linear'] = tf.Variable(
            tf.truncated_normal([deep_init_size, field_size, embedding_size], 0.0, 0.01),
            name="product_linear")
        weights['product_bias'] = tf.Variable(
            tf.truncated_normal([deep_init_size, ], 0.0, 1.0),
            name="product_bias")

        return weights

    def init_saver(self):
        """Create the checkpoint saver, keeping at most ``max_to_keep`` checkpoints."""
        self.saver = tf.train.Saver(max_to_keep=self.config['max_to_keep'])

# 3. 训练类

In [17]:
class Trainer(BaseTrain):
    """Training / evaluation loop for the PNN model.

    Args:
        sess: session used to run the graph.
        model: model exposing the placeholders, ``loss``, ``predictions``,
            ``l2_loss``, ``train_op``, ``global_step_tensor`` and ``save``.
        data: pair ``(train_generator, eval_generator)``.
        config: mapping with ``batch_size``, ``evaluateEvery``,
            ``checkpointEvery`` and ``dropout_prob``.
        logger: object whose ``summarize`` method records scalar summaries.
    """

    def __init__(self, sess, model, data, config, logger):
        super().__init__(sess, model, data, config, logger)
        self.train = data[0]
        self.eval = data[1]

    def train_epoch(self):
        """Run one epoch of training steps, evaluating and checkpointing periodically."""
        # Number of iterations in this epoch
        num_iter_per_epoch = self.train.length // self.config['batch_size']

        for _ in tqdm(range(num_iter_per_epoch)):
            loss, metrics, step = self.train_step()
            train_acc, train_f_score = metrics['accuracy'], metrics['f_score']

            # Record the training loss and metrics
            summaries_dict = {"loss": loss,
                              "acc": np.array(train_acc),
                              "f_score": np.array(train_f_score)}
            self.logger.summarize(step, summarizer='train', scope="train_summary",
                                  summaries_dict=summaries_dict)

            if step % self.config['evaluateEvery'] == 0:
                print("Train - Step: {} | Loss: {} | Acc: {} | F1_score: {}".format(
                    step, loss, train_acc, train_f_score))
                self._run_evaluation(step)
            if step % self.config['checkpointEvery'] == 0:
                self.model.save(self.sess)

    def _run_evaluation(self, step):
        """Evaluate on the whole eval set, print the metrics and log them at ``step``."""
        weighted_loss = 0.0
        num_rows = 0
        eval_pred = []
        eval_true = []
        for batch_eval in self.eval.iter_all(self.config['batch_size']):
            rows = len(batch_eval[-1])
            if rows == 0:
                # An empty batch would give a NaN mean loss; skip it.
                continue
            batch_loss, predictions = self.eval_step(batch_eval)
            # Weight by batch size so a short final batch does not skew the mean.
            weighted_loss += float(batch_loss) * rows
            num_rows += rows
            eval_pred.extend(predictions)
            eval_true.extend(batch_eval[-1])

        if num_rows == 0:
            # Nothing to evaluate.
            return

        eval_metrics = Metric(np.array(eval_pred), np.array(eval_true), self.config).get_metrics()
        acc_mean = np.round(eval_metrics['accuracy'], 5)
        gini_mean = np.round(eval_metrics['gini_norm'], 5)
        loss_mean = np.round(weighted_loss / num_rows, 5)
        print("Eval | Loss: {} | Accuracy: {} | Gini: {}".format(
            loss_mean, acc_mean, gini_mean))
        summaries_dict = {"loss": np.array(loss_mean),
                          "accuracy": np.array(acc_mean),
                          "gini": np.array(gini_mean)}
        self.logger.summarize(step, summarizer="test", scope="test_summary",
                              summaries_dict=summaries_dict)

    def train_step(self):
        """Run one optimisation step on a freshly sampled batch.

        Returns:
            ``(loss, metrics, step)`` where ``metrics`` is the dict returned by
            ``Metric.get_metrics`` for this batch and ``step`` is the value of
            the model's global step tensor fetched in the same run.
        """
        batch_feat_i, batch_feat_v, batch_y = next(self.train.next_batch(self.config['batch_size']))
        feed_dict = {self.model.feat_index: batch_feat_i,
                     self.model.feat_value: batch_feat_v,
                     self.model.labels: batch_y,
                     self.model.dropout_keep_prob: self.config['dropout_prob'],
                     self.model.is_training: True}
        _, loss, predictions, step, l2_loss = self.sess.run([self.model.train_op,
                                                             self.model.loss,
                                                             self.model.predictions,
                                                             self.model.global_step_tensor,
                                                             self.model.l2_loss],
                                                            feed_dict=feed_dict)

        # Report numerical blow-ups as soon as they appear.
        if np.isnan(l2_loss) or np.isnan(loss):
            print("step: {} | loss: {} | l2_loss: {} ".format(step, loss, l2_loss))
        metrics = Metric(predictions, batch_y, self.config).get_metrics()
        return loss, metrics, step

    def eval_step(self, batch):
        """Compute loss and predictions for one ``[feat_index, feat_value, labels]`` batch.

        Dropout is disabled (keep probability 1.0 at every position) and the
        model is run with ``is_training`` set to False.
        """
        feed_dict = {self.model.feat_index: batch[0],
                     self.model.feat_value: batch[1],
                     self.model.labels: batch[2],
                     self.model.dropout_keep_prob: [1.0] * len(self.config['dropout_prob']),
                     self.model.is_training: False}
        loss, predictions = self.sess.run([self.model.loss, self.model.predictions],
                                          feed_dict=feed_dict)
        return loss, predictions

# 4. 训练

In [18]:
class DataGenerator:
    """Batch provider over aligned feature arrays and a label array.

    Args:
        labels: numpy array of labels; its first axis indexes samples.
        *features: arrays indexable along their first axis in step with ``labels``.

    Attributes:
        length: number of samples.
        ratio: list of ``(label_value, fraction_of_samples)`` pairs.
        indices: for each entry of ``ratio``, the ``np.where`` result locating
            the samples that carry that label value.
    """

    def __init__(self, labels, *features):
        self.features = features
        self.labels = labels
        self.length = len(labels)
        # Share of each label value in the data set
        flat_labels = self.labels.ravel()
        unique = Counter(flat_labels)
        self.ratio = [(key, value / self.length) for key, value in unique.items()]
        self.indices = [np.where(flat_labels == key) for key, _ in self.ratio]

    def next_batch(self, batch_size):
        """Yield one randomly sampled batch (a single-item generator).

        Each label value contributes ``int(batch_size * fraction)`` rows (at
        least 1, at most the number of rows with that label), drawn with
        replacement, so the batch can be slightly smaller than ``batch_size``.

        Yields:
            list ``[feature_0, ..., feature_k, labels]`` for the sampled rows.
        """
        picks = []
        for (rows,), (_, fraction) in zip(self.indices, self.ratio):
            # Sample in proportion to the label's share of the data set.
            count = max(1, min(len(rows), int(batch_size * fraction)))
            picks.append(np.random.choice(rows, count))
        choose = np.concatenate(picks) if picks else np.array([], dtype="int64")
        choose = np.random.permutation(choose).astype("int64")

        result = [feat[choose] for feat in self.features]
        result.append(self.labels[choose])
        yield result

    def iter_all(self, batch_size):
        """Iterate over the whole data set in order, ``batch_size`` rows at a time.

        The final batch may be shorter; an empty batch is never produced.

        Yields:
            list ``[feature_0, ..., feature_k, labels]`` for each slice.
        """
        # Ceiling division: `length // batch_size + 1` would add an empty
        # trailing batch whenever length is an exact multiple of batch_size.
        num_batches = (self.length + batch_size - 1) // batch_size
        for i in range(num_batches):
            start = i * batch_size
            end = min(start + batch_size, self.length)
            result = [np.asarray(feat[start:end]) for feat in self.features]
            result.append(np.asarray(self.labels[start:end]))
            yield result

In [19]:
# Split the data set: the leading 80% of rows is used for training and the
# remaining rows for validation (row order is kept as-is).
split_at = int(len(labels) * 0.8)
train_idx = slice(0, split_at)
val_idx = slice(split_at, int(len(labels)))

train_df_i = features["df_i"][train_idx]
train_df_v = features["df_v"][train_idx]
train_df_y = labels[train_idx]

val_df_i = features["df_i"][val_idx]
val_df_v = features["df_v"][val_idx]
val_df_y = labels[val_idx]

# DataGenerator takes the labels first, then the feature arrays.
train = DataGenerator(train_df_y, train_df_i, train_df_v)
val = DataGenerator(val_df_y, val_df_i, val_df_v)

In [20]:
def main():
    """Build the PNN graph and train it on the module-level ``train`` / ``val`` generators.

    Creates the summary and checkpoint directories, resets the default graph,
    builds the model and runs ``Trainer.train_all`` inside a session that is
    closed when training finishes or fails.
    """
    config = Config(field_handler)
    config['num_epochs'] = 2
    create_dirs([config['summary_dir'], config['checkpoint_dir']])
    tf.reset_default_graph()
    session_conf = tf.ConfigProto(allow_soft_placement=True, log_device_placement=True)
    session_conf.gpu_options.per_process_gpu_memory_fraction = 0.8
    session_conf.gpu_options.allow_growth = True

    model = PNN(config)
    # The context manager closes the session on exit, so running this cell
    # repeatedly does not leave old sessions holding device memory.
    with tf.Session(config=session_conf) as sess:
        sess.run(tf.global_variables_initializer())
        pack_data = [train, val]
        logger = Logger(sess, config)
        trainer = Trainer(sess, model, pack_data, config, logger)
        trainer.train_all()

In [21]:
# Run the training entry point defined in the previous cell.
main()


当前正处于第1次迭代


HBox(children=(IntProgress(value=0, max=6250), HTML(value='')))

Train - Step: 1000 | Loss: 0.4938005208969116 | Acc: 0.81102 | F1_score: 0.45455
Eval | Loss: 0.6075699925422668 | Accuracy: 0.74884 | Gini: 0.48812
Saving model...
Model saved
Train - Step: 2000 | Loss: 0.466257780790329 | Acc: 0.77953 | F1_score: 0.44
Eval | Loss: 0.5628700256347656 | Accuracy: 0.74884 | Gini: 0.50448
Saving model...
Model saved
Train - Step: 3000 | Loss: 0.4092872738838196 | Acc: 0.7874 | F1_score: 0.49057
Eval | Loss: 0.5942999720573425 | Accuracy: 0.74882 | Gini: 0.50261
Saving model...
Model saved
Train - Step: 4000 | Loss: 0.46430566906929016 | Acc: 0.8189 | F1_score: 0.56604
Eval | Loss: 0.6249200105667114 | Accuracy: 0.74884 | Gini: 0.51305
Saving model...
Model saved
Train - Step: 5000 | Loss: 0.5512120723724365 | Acc: 0.77953 | F1_score: 0.48148
Eval | Loss: 0.5398300290107727 | Accuracy: 0.7535 | Gini: 0.51335
Saving model...
Model saved
Train - Step: 6000 | Loss: 0.4614016115665436 | Acc: 0.75591 | F1_score: 0.39216
Eval | Loss: 0.5640100240707397 | Accura

HBox(children=(IntProgress(value=0, max=6250), HTML(value='')))

Train - Step: 7000 | Loss: 0.37560948729515076 | Acc: 0.85827 | F1_score: 0.67857
Eval | Loss: 0.5682600140571594 | Accuracy: 0.74882 | Gini: 0.50216
Saving model...
Model saved
Train - Step: 8000 | Loss: 0.47730201482772827 | Acc: 0.7874 | F1_score: 0.49057
Eval | Loss: 0.5453500151634216 | Accuracy: 0.75174 | Gini: 0.50011
Saving model...
Model saved
Train - Step: 9000 | Loss: 0.4802996814250946 | Acc: 0.82677 | F1_score: 0.56
Eval | Loss: 0.5201399922370911 | Accuracy: 0.75908 | Gini: 0.49285
Saving model...
Model saved
Train - Step: 10000 | Loss: 0.45368653535842896 | Acc: 0.80315 | F1_score: 0.54545
Eval | Loss: 0.5383599996566772 | Accuracy: 0.75314 | Gini: 0.50078
Saving model...
Model saved
Train - Step: 11000 | Loss: 0.3806925117969513 | Acc: 0.84252 | F1_score: 0.64286
Eval | Loss: 0.5150799751281738 | Accuracy: 0.75926 | Gini: 0.50125
Saving model...
Model saved
Train - Step: 12000 | Loss: 0.40783533453941345 | Acc: 0.81102 | F1_score: 0.53846
Eval | Loss: 0.4935399889945984

- 使用 Inner 方法（`use_inner=True`）时存在梯度消失和梯度爆炸的情况，导致参数变为 NaN。