In [1]:
import numpy as np 
import pandas as pd 
import tensorflow as tf 


import warnings
warnings.filterwarnings("ignore")

In [2]:
from utils import *

In [3]:
from tqdm.autonotebook import tqdm
from collections import Counter

# 1. 数据的准备

In [4]:
# Location of the Criteo sample; used both for the DataFrame and for the field handler.
criteo_csv_path = "../data/criteo/criteo_data.csv"
data = pd.read_csv(criteo_csv_path)

# Split the columns by name prefix: "I*" are continuous features, "C*" are categorical.
con = [col for col in data.columns if col.startswith("I")]
cat = [col for col in data.columns if col.startswith("C")]

# FieldHandler comes from `utils`; it is given the same CSV path plus the two column groups.
field_handler = FieldHandler(
    train_file_path=criteo_csv_path,
    continuation_columns=con,
    category_columns=cat,
)

In [5]:
# Build the model inputs and the label values from the raw frame.
# `features` is later indexed with the keys "df_i" and "df_v" (see the dataset split cell);
# `labels` is taken from the "Label" column named below.
# NOTE(review): the exact return types come from `utils.transformation_data` — confirm there.
features, labels = transformation_data(data, 
                                      field_handler=field_handler,
                                      label="Label")

cat:  C1
cat:  C2
cat:  C3
cat:  C4
cat:  C5
cat:  C6
cat:  C7
cat:  C8
cat:  C9
cat:  C10
cat:  C11
cat:  C12
cat:  C13
cat:  C14
cat:  C15
cat:  C16
cat:  C17
cat:  C18
cat:  C19
cat:  C20
cat:  C21
cat:  C22
cat:  C23
cat:  C24
cat:  C25
cat:  C26
con:  I1
con:  I2
con:  I3
con:  I4
con:  I5
con:  I6
con:  I7
con:  I8
con:  I9
con:  I10
con:  I11
con:  I12
con:  I13


## 1.1 基础参数的配置

In [62]:
# 基础参数设置
class Config(dict):
    """Dictionary of model, training and bookkeeping settings for the AFM run.

    Every setting is stored as a dict item so it can be read as ``config['key']``.
    ``threshold`` is additionally kept as an instance attribute (``config.threshold``)
    so that code written against the previous attribute-only form keeps working.

    Args:
        field_handler: object exposing ``field_dict`` (its length becomes
            ``field_size``) and ``feature_nums`` (becomes ``feature_size``).
    """

    def __init__(self, field_handler):
        super().__init__()

        # Model parameters
        self['field_size'] = len(field_handler.field_dict)
        self['feature_size'] = field_handler.feature_nums
        self['embedding_size'] = 50
        # Fed to the model's `dropout_keep_prob` placeholder, i.e. these are keep probabilities.
        self['dropout_prob'] = [0.8, 0.8, 0.8]
        self['attention_size'] = 10
        self['seed'] = 2019
        self['l2_reg'] = 0.001

        # Training parameters
        self['num_epochs'] = 5
        self['batch_size'] = 128
        self['evaluateEvery'] = 1000
        self['checkpointEvery'] = 1000
        self['lr'] = 0.01
        self['decay_steps'] = 200
        self['decay_rate'] = 0.9
        self['grad_clip'] = 5.0

        # Miscellaneous parameters
        self['num_classes'] = 1
        self['train_size'] = 0.8
        # Previously only set as an attribute, which made config['threshold'] raise KeyError.
        # Store it as a dict item like every other setting and keep the attribute as an alias.
        self['threshold'] = 0.5
        self.threshold = self['threshold']
        self['checkpoint_dir'] = "../model/AFM/checkpoint"
        self['summary_dir'] = "../model/AFM/summary"
        self['max_to_keep'] = 5

# 2. 定义模型

In [63]:
class AFM(BaseModel):
    """Attentional Factorization Machine expressed as a TensorFlow 1.x static graph.

    The graph combines three terms into a single logit per example:
    an attention-weighted sum of pairwise embedding products, a first-order
    (linear) term, and a global bias.

    Args:
        config: mapping with the keys read below ('seed', 'field_size',
            'feature_size', 'embedding_size', 'attention_size', 'num_classes',
            'l2_reg', 'lr', 'decay_steps', 'decay_rate', 'grad_clip',
            'max_to_keep').

    NOTE(review): `self.config` and `self.global_step_tensor` are not set in this
    class; presumably `BaseModel.__init__` provides them — confirm in `utils`.
    """

    def __init__(self, config):
        super().__init__(config)
        tf.set_random_seed(self.config['seed'])
        self.build_model()
        self.init_saver()

    def build_model(self):
        """Create placeholders, the forward pass, the loss and the training op."""
        self.feat_index = tf.placeholder(tf.int32, shape=[None, self.config['field_size']], name="feat_index")
        self.feat_value = tf.placeholder(tf.float32, shape=[None, self.config['field_size']], name="feat_value")
        self.labels = tf.placeholder(tf.float32, shape=[None, self.config['num_classes']], name="labels")
        # 1-D vector of keep probabilities; only element 0 is consumed by this graph.
        self.dropout_keep_prob = tf.placeholder(tf.float32, shape=[None], name="dropout_keep_prob")
        self.is_training = tf.placeholder(tf.bool, name="is_training")

        self.weights = self._init_weights()

        # Embedding layer
        with tf.name_scope("embedding"):
            self.embeddings = tf.nn.embedding_lookup(self.weights['feature_embeddings'], self.feat_index)
            feat_value = tf.expand_dims(self.feat_value, 2)
            ## Output shape: [batch, field_size, embed_size]
            self.embeddings = tf.multiply(self.embeddings, feat_value)

        # Attention layer
        with tf.name_scope("attention"):
            ## Element-wise product of every field with every later field
            element_wise_product_list = []
            for i in range(self.config['field_size']):
                for j in range(i+1, self.config['field_size']):
                    element_wise_product_list.append(tf.multiply(self.embeddings[:, i, :], self.embeddings[:, j, :]))
            ## Output shape: [(f*(f-1))/2, batch, embed_size]
            self.element_wise_product = tf.stack(element_wise_product_list)
            ## Output shape: [batch, (f*(f-1))/2, embed_size]
            self.element_wise_product = tf.transpose(self.element_wise_product, perm=[1, 0, 2],
                                                         name="element_wise_product")

            # Number of unordered field pairs; integer division keeps this exact.
            num_interactions = self.config["field_size"] * (self.config["field_size"] - 1) // 2

            ## Project each interaction vector onto attention_size dimensions
            self.attention_wx_plus_b = tf.reshape(tf.add(tf.matmul(tf.reshape(self.element_wise_product,
                                                                              [-1, self.config['embedding_size']]),
                                                                  self.weights['attention_w']),
                                                        self.weights['attention_b']),
                                                 shape=[-1, num_interactions, self.config['attention_size']])
            ## Output shape: [batch, num_inter]
            self.attention_activation = tf.reduce_sum(tf.multiply(tf.nn.relu(self.attention_wx_plus_b),
                                                                 self.weights['attention_h']), axis=2)

            ## Normalise to obtain one weight per pairwise interaction
            self.attention_alpha = tf.nn.softmax(self.attention_activation)
            ## Weight the second-order interactions with the attention scores
            ## Output shape: [batch, embed_size]
            self.attention_x_product = tf.reduce_sum(
                tf.multiply(tf.expand_dims(self.attention_alpha, 2), self.element_wise_product),
                axis=1, name="afm")
            self.attention_x_product = tf.nn.dropout(self.attention_x_product, self.dropout_keep_prob[0])

            self.attention_part_sum = tf.matmul(self.attention_x_product,
                                               self.weights['attention_p'])

        # First-order (linear) term
        with tf.name_scope("first_order"):
            self.y_first_order = tf.nn.embedding_lookup(self.weights['feature_weights'], self.feat_index)
            # Uses the expanded local `feat_value` ([batch, field_size, 1]) from the embedding scope.
            self.y_first_order = tf.reduce_sum(tf.multiply(self.y_first_order, feat_value), 2)

            ## Global bias broadcast to the shape of the labels
            self.y_bias = self.weights['bias'] * tf.ones_like(self.labels)

        # Output layer
        with tf.name_scope("output"):
            self.logits = tf.add_n([self.attention_part_sum,
                                    tf.reduce_sum(self.y_first_order, axis=1, keepdims=True),
                                    self.y_bias])
            self.predictions = tf.nn.sigmoid(self.logits)

        # Loss
        with tf.name_scope("loss"):
            losses = tf.nn.sigmoid_cross_entropy_with_logits(labels=self.labels, logits=self.logits)
            self.loss = tf.reduce_mean(losses)
            if self.config['l2_reg'] > 0:
                # NOTE(review): the name filter skips "bias" and anything containing "embedding",
                # but "attention_b" does not contain "bia" and is therefore regularised — confirm intent.
                l2_loss = tf.add_n([tf.nn.l2_loss(cand_var) for cand_var in tf.trainable_variables()
                                   if "bia" not in cand_var.name and "embedding" not in cand_var.name])
                self.loss += self.config['l2_reg'] * l2_loss

        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(update_ops):
            learning_rate = tf.train.exponential_decay(self.config['lr'],
                                                      self.global_step_tensor,
                                                      self.config['decay_steps'],
                                                      self.config['decay_rate'],
                                                      staircase=True)
            optimizer = tf.train.AdamOptimizer(learning_rate)

            grads_and_vars = optimizer.compute_gradients(self.loss)

            # Clip each gradient individually by its own norm; variables without a gradient are left as-is.
            for idx, (grad, var) in enumerate(grads_and_vars):
                if grad is not None:
                    grads_and_vars[idx] = (tf.clip_by_norm(grad, self.config['grad_clip']), var)
            self.train_op = optimizer.apply_gradients(grads_and_vars, global_step=self.global_step_tensor)

    def _init_weights(self):
        """Create all trainable variables and return them in a dict keyed by role.

        Returns:
            dict with keys 'feature_embeddings', 'feature_weights', 'bias',
            'attention_w', 'attention_b', 'attention_h' and 'attention_p'.
        """
        weights = dict()

        # Embedding table and first-order weights
        weights["feature_embeddings"] = tf.Variable(
                tf.random_normal([self.config['feature_size'], self.config['embedding_size']], 0.0, 0.01),
                name="feature_embeddings")
        weights["feature_weights"] = tf.Variable(tf.random_normal([self.config['feature_size'], 1], 0.0, 1.0),
                                                name="feature_weights")
        weights['bias'] = tf.Variable(tf.constant(0.1), name="bias")

        # Attention parameters.
        # tf.set_random_seed does not affect NumPy, so the NumPy-drawn initial values
        # are taken from a generator seeded with the configured seed to make runs repeatable.
        rng = np.random.RandomState(self.config['seed'])
        glorot = np.sqrt(2.0 / (self.config['attention_size']+self.config['embedding_size']))
        weights['attention_w'] = tf.Variable(rng.normal(loc=0, scale=glorot,
                                                        size=(self.config['embedding_size'], self.config['attention_size'])),
                                             dtype=tf.float32,
                                             name="attention_w")
        weights['attention_b'] = tf.Variable(rng.normal(loc=0, scale=glorot,
                                                        size=(self.config['attention_size'])),
                                            dtype=tf.float32, name="attention_b")
        weights['attention_h'] = tf.Variable(rng.normal(loc=0, scale=1,
                                                        size=(self.config['attention_size'])),
                                            dtype=tf.float32, name="attention_h")
        # Left unnamed on purpose: giving it a name would change the variable name
        # stored in checkpoints written by earlier runs.
        weights['attention_p'] = tf.Variable(np.ones((self.config['embedding_size'], 1)),
                                            dtype=tf.float32)
        return weights

    def init_saver(self):
        """Create the checkpoint saver, keeping at most `max_to_keep` checkpoints."""
        self.saver = tf.train.Saver(max_to_keep=self.config['max_to_keep'])

# 3. 训练类

In [64]:
class Trainer(BaseTrain):
    """Runs training steps on `data[0]` and periodic evaluation on `data[1]`.

    NOTE(review): `self.sess`, `self.model`, `self.config` and `self.logger` are
    presumably stored by `BaseTrain.__init__` — confirm in `utils`.
    """

    def __init__(self, sess, model, data, config, logger):
        super().__init__(sess, model, data, config, logger)
        self.train = data[0]
        self.eval = data[1]

    def train_epoch(self):
        """Run one epoch: train, log, and evaluate / checkpoint at the configured intervals."""
        # Number of optimisation steps that make up one epoch
        steps_per_epoch = self.train.length // self.config['batch_size']

        for _ in tqdm(range(steps_per_epoch)):
            train_loss, train_metrics, step = self.train_step()
            train_acc = train_metrics['accuracy']
            train_f_score = train_metrics['f_score']

            ## Record the training-side scalars for this step
            train_summary = {"loss": train_loss,
                             "acc": np.array(train_acc),
                             "f_score": np.array(train_f_score)}
            self.logger.summarize(step, summarizer='train', scope="train_summary",
                                  summaries_dict=train_summary)

            if step % self.config['evaluateEvery'] == 0:
                print("Train - Step: {} | Loss: {} | Acc: {} | F1_score: {}".format(
                    step, train_loss, train_acc, train_f_score))
                self._evaluate_and_log(step)

            if step % self.config['checkpointEvery'] == 0:
                self.model.save(self.sess)

    def _evaluate_and_log(self, step):
        """Score the whole evaluation set, print the aggregate metrics and log them."""
        batch_losses, all_pred, all_true = [], [], []
        for eval_batch in self.eval.iter_all(self.config['batch_size']):
            batch_loss, batch_pred = self.eval_step(eval_batch)
            batch_losses.append(batch_loss)
            all_pred.extend(batch_pred)
            # The labels are the last element of each evaluation batch
            all_true.extend(eval_batch[-1])

        eval_metrics = Metric(np.array(all_pred), np.array(all_true), self.config).get_metrics()
        acc_mean = np.round(eval_metrics['accuracy'], 5)
        gini_mean = np.round(eval_metrics['gini_norm'], 5)
        loss_mean = np.round(np.mean(batch_losses), 5)
        print("Eval | Loss: {} | Accuracy: {} | Gini: {}".format(
            loss_mean, acc_mean, gini_mean))

        eval_summary = {"loss": np.array(loss_mean),
                        "accuracy": np.array(acc_mean),
                        "gini": np.array(gini_mean)}
        self.logger.summarize(step, summarizer="test", scope="test_summary",
                              summaries_dict=eval_summary)

    def train_step(self):
        """Run one optimisation step; return (loss, metrics dict, global step)."""
        # NOTE(review): a fresh `next_batch` generator is created on every call and only its
        # first item is used; whether batches differ between calls depends on DataGenerator.
        feat_index, feat_value, targets = next(self.train.next_batch(self.config['batch_size']))
        feed = {
            self.model.feat_index: feat_index,
            self.model.feat_value: feat_value,
            self.model.labels: targets,
            self.model.dropout_keep_prob: self.config['dropout_prob'],
            self.model.is_training: True,
        }
        fetches = [self.model.train_op,
                   self.model.loss,
                   self.model.predictions,
                   self.model.global_step_tensor]
        _, loss, predictions, step = self.sess.run(fetches, feed_dict=feed)

        metrics = Metric(predictions, targets, self.config).get_metrics()
        return loss, metrics, step

    def eval_step(self, batch):
        """Score one evaluation batch with dropout disabled; return (loss, predictions)."""
        # Keep probability 1.0 for every dropout slot turns dropout off
        keep_all = [1.0] * len(self.config['dropout_prob'])
        feed = {
            self.model.feat_index: batch[0],
            self.model.feat_value: batch[1],
            self.model.labels: batch[2],
            self.model.dropout_keep_prob: keep_all,
            self.model.is_training: False,
        }
        loss, predictions = self.sess.run([self.model.loss, self.model.predictions],
                                          feed_dict=feed)
        return loss, predictions

# 4. 训练

In [65]:
# Split the dataset: the first 80% of rows are for training, the remainder for validation.
n_samples = len(labels)
split_at = int(n_samples * 0.8)
train_idx = slice(0, split_at)
val_idx = slice(split_at, n_samples)

# Training partition
train_df_i = features["df_i"][train_idx]
train_df_v = features["df_v"][train_idx]
train_df_y = labels[train_idx]

# Validation partition
val_df_i = features["df_i"][val_idx]
val_df_v = features["df_v"][val_idx]
val_df_y = labels[val_idx]

# DataGenerator takes the labels first, then the feature indices and feature values.
train = DataGenerator(train_df_y, train_df_i, train_df_v)
val = DataGenerator(val_df_y, val_df_i, val_df_v)

In [66]:
def main():
    """Build the AFM graph, open a session and run the full training loop.

    Reads the module-level `field_handler`, `train` and `val` objects. The number
    of epochs is overridden to 2 for this run. The session is always closed on
    exit, including when training raises, so repeated runs in the same kernel do
    not accumulate open sessions.
    """
    config = Config(field_handler)
    config['num_epochs'] = 2
    create_dirs([config['summary_dir'], config['checkpoint_dir']])
    # Start from an empty default graph so re-running the cell does not stack graphs.
    tf.reset_default_graph()
    session_conf = tf.ConfigProto(allow_soft_placement=True, log_device_placement=True)
    session_conf.gpu_options.per_process_gpu_memory_fraction = 0.8
    session_conf.gpu_options.allow_growth = True

    model = AFM(config)
    sess = tf.Session(config=session_conf)
    try:
        pack_data = [train, val]
        logger = Logger(sess, config)
        trainer = Trainer(sess, model, pack_data, config, logger)
        trainer.train_all()
    finally:
        # Release the resources held by the session even if training fails.
        sess.close()

In [67]:
# Start training; progress and periodic train/eval metrics are printed below.
main()


当前正处于第1次迭代


HBox(children=(IntProgress(value=0, max=6250), HTML(value='')))

Train - Step: 1000 | Loss: 0.5081796050071716 | Acc: 0.80315 | F1_score: 0.4898
Eval | Loss: 0.5459700226783752 | Accuracy: 0.77094 | Gini: 0.48676
Saving model...
Model saved
Train - Step: 2000 | Loss: 0.48729002475738525 | Acc: 0.77165 | F1_score: 0.38298
Eval | Loss: 0.5188999772071838 | Accuracy: 0.77526 | Gini: 0.5088
Saving model...
Model saved
Train - Step: 3000 | Loss: 0.46591147780418396 | Acc: 0.82677 | F1_score: 0.54167
Eval | Loss: 0.5061200261116028 | Accuracy: 0.77588 | Gini: 0.51464
Saving model...
Model saved
Train - Step: 4000 | Loss: 0.4764001965522766 | Acc: 0.77953 | F1_score: 0.41667
Eval | Loss: 0.4976100027561188 | Accuracy: 0.77785 | Gini: 0.52122
Saving model...
Model saved
Train - Step: 5000 | Loss: 0.4682236313819885 | Acc: 0.7874 | F1_score: 0.52632
Eval | Loss: 0.4929400086402893 | Accuracy: 0.77794 | Gini: 0.52438
Saving model...
Model saved
Train - Step: 6000 | Loss: 0.3941747546195984 | Acc: 0.83465 | F1_score: 0.55319
Eval | Loss: 0.4897400140762329 | A

HBox(children=(IntProgress(value=0, max=6250), HTML(value='')))

Train - Step: 7000 | Loss: 0.48639893531799316 | Acc: 0.76378 | F1_score: 0.375
Eval | Loss: 0.49160000681877136 | Accuracy: 0.77895 | Gini: 0.52886
Saving model...
Model saved
Train - Step: 8000 | Loss: 0.4770505428314209 | Acc: 0.80315 | F1_score: 0.46809
Eval | Loss: 0.49171000719070435 | Accuracy: 0.77922 | Gini: 0.52886
Saving model...
Model saved
Train - Step: 9000 | Loss: 0.42882412672042847 | Acc: 0.8189 | F1_score: 0.54902
Eval | Loss: 0.48583999276161194 | Accuracy: 0.77949 | Gini: 0.53109
Saving model...
Model saved
Train - Step: 10000 | Loss: 0.4669159948825836 | Acc: 0.77953 | F1_score: 0.41667
Eval | Loss: 0.48695001006126404 | Accuracy: 0.7795 | Gini: 0.53087
Saving model...
Model saved
Train - Step: 11000 | Loss: 0.4677627980709076 | Acc: 0.7874 | F1_score: 0.4
Eval | Loss: 0.4878099858760834 | Accuracy: 0.77954 | Gini: 0.53078
Saving model...
Model saved
Train - Step: 12000 | Loss: 0.41931575536727905 | Acc: 0.85039 | F1_score: 0.61224
Eval | Loss: 0.4857800006866455 |