In [1]:
import pandas as pd
import tensorflow as tf

from tensorflow.keras.layers import Layer
from tensorflow.keras.layers import Dense
from tensorflow.keras import Model
from tensorflow.keras.layers import Embedding

In [2]:
# Categorical columns; each one is fed to the model as its own single-value input.
categorical_features = [
    'phone_model', 'browser_family', 'os_family', 'device_brand',
    'city_code', 'province_code',
    'sex', 'hashouse', 'social', 'overdue', 'tax', 'married', 'benke',
    'kid', 'income', 'consumption', 'shebao',
]

# Text columns; each one is fed to the model as its own fixed-length sequence input.
text_features = [
    'clicked_products_0009', 'clicked_products_date_0009',
    'sms_sent_products_0009', 'sms_sent_products_1019',
    'sms_sent_products_date_0009', 'sms_sent_products_date_1019',
    'called_products_0009', 'called_products_1019',
    'called_products_date_0009', 'called_products_date_1019',
    'picked_products_0009', 'picked_products_date_0009',
    'outbound_sent_products_0009', 'outbound_sent_products_date_0009',
    'set_all_ins_host_180', 'set_all_ins_host_360',
    'keypress_30', 'keypress_60', 'keypress_90', 'keypress_120',
    'rule_name_30', 'rule_name_60', 'rule_name_90', 'rule_name_120',
    'semantic_30', 'semantic_60', 'semantic_90', 'semantic_120',
    'model_value',
]

# Names of the vocabulary groups for which a vocabulary is built.
text_feature_types = [
    'products',
    'keypress',
    'rules',
    'semantics',
    'insurances',
    'model_value',
]

In [3]:
# Load the training table; later cells read the feature and label columns from train_df.
train_df = pd.read_excel("../../data/train_data_ifh4.xlsx")

In [4]:
# Cast the text columns to str, overwriting them in train_df.
# NOTE(review): astype(str) turns missing values into the literal string 'nan' — confirm that is intended.
train_df[text_features] = train_df[text_features].astype(str)

In [5]:
train_df[categorical_features].head(5)

Unnamed: 0,phone_model,browser_family,os_family,device_brand,city_code,province_code,sex,hashouse,social,overdue,tax,married,benke,kid,income,consumption,shebao
0,2,0,1,0,24,5,2,0,1,0,0,0,0,0,0,0,0
1,2,1,0,1,28,2,1,0,0,0,0,0,0,0,0,0,0
2,3,0,1,0,15,3,1,0,0,0,0,0,1,0,0,0,0
3,10,1,0,1,103,5,1,0,0,0,0,0,1,0,0,0,0
4,5,0,1,0,1,0,2,0,0,0,0,0,0,0,0,0,0


In [6]:
train_df[text_features].head(5)

Unnamed: 0,clicked_products_0009,clicked_products_date_0009,sms_sent_products_0009,sms_sent_products_1019,sms_sent_products_date_0009,sms_sent_products_date_1019,called_products_0009,called_products_1019,called_products_date_0009,called_products_date_1019,...,keypress_120,rule_name_30,rule_name_60,rule_name_90,rule_name_120,semantic_30,semantic_60,semantic_90,semantic_120,model_value
0,,,,,,,,,,,...,小助理 运营商提示音,,,,baotai27_其他,,,,144 145 140 143,
1,,,IYBPAZX_ZTKMF_OPPOJX_BT IYBPAZX_ZTKMF_OPPOJX_BT,,2D 4D,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,baotai27_G,,,,7 9 140 150 151 7 131 131,,
3,FQL IYBPAZX_ZMF_CS_BT NWZAZX_ZNWZAMF_NEWOPPO_B...,12D 12D 15D 24D 30D 32D 36D 38D 41D 44D,IYBPAZX_ZTKMF_OPPOJX_BT NWZAZX_ZNWZAMF_NEWOPPO...,IYBPAZX_ZTKMF_OPPOJX_BT IYBPAZX_ZMF_CS_BT IYBP...,2D 4D 8D 10D 12D 15D 20D 24D 26D 28D,30D 32D 34D 36D 39D 43D 45D 47D 51D 53D,,,,,...,触发发短信 sendMessage_special 没输入手机号 输入手机号2 输入手机号3...,,,yingdian888_B jiyonghua661_B huirong888_D ying...,mayi02_D baotai14_B mayi02_D mayi02_D,,,2240 2572 2672 2313 2345 2672 2313 2313 2313 2...,23 24 46 3921 35 21 21 2576 2181 2189 24 23 23...,
4,,,IYBPAZX_ZTKMF_OPPOJX_BT IYBPAZX_ZMF_CS_BT IYBP...,,2D 4D 6D,,IYBPAZX_TKMF_GD_BZ_GZH_WH IYBPAZX_TKMF_GD_BZ_G...,,1D 4D 7D 52D 55D 58D,,...,触发发短信 sendMessage_special,,,baotai27_D,baotai27_其他,,,,,


In [7]:
def get_vocabulary(feature_type, min_count=5, df=None):
    """Build the vocabulary of one space-separated token column.

    Args:
        feature_type: Name of the column to tokenize.
        min_count: Frequency threshold. Only tokens that occur strictly more
            than ``min_count`` times are kept. Defaults to 5, the value that
            was previously hard-coded.
        df: DataFrame to read the column from. When omitted, the
            notebook-level ``train_df`` is used.

    Returns:
        A ``(vocabulary, vocab_size)`` tuple: the list of kept tokens as
        ordered by ``value_counts`` (most frequent first), and its length.
    """
    source = train_df if df is None else df
    # Rows with missing values are skipped; the rest are split on single spaces.
    token_lists = source[feature_type].dropna().str.split(' ')
    # One row per token, then count how often each token occurs.
    token_counts = token_lists.explode().value_counts()
    frequent = token_counts[token_counts > min_count]
    vocabulary = frequent.index.tolist()

    return vocabulary, len(vocabulary)

In [8]:
# One (vocabulary, vocab_size) pair per text feature type.
# The iteration variable is deliberately not named `type`: a top-level loop
# variable of that name would shadow the builtin for the rest of the notebook.
vocab_dict = {
    feature_type: get_vocabulary(feature_type)
    for feature_type in text_feature_types
}

In [9]:
# Per feature type: the vocabulary size plus a fixed margin of 100.
# NOTE(review): the reason for the margin is not visible here — presumably headroom for extra token ids; confirm.
vocab_size_dict = {
    feature_type: n_tokens + 100
    for feature_type, (_vocabulary, n_tokens) in vocab_dict.items()
}

In [10]:
vocab_size_dict

{'products': 273,
 'keypress': 207,
 'rules': 328,
 'semantics': 351,
 'insurances': 141,
 'model_value': 102}

In [11]:
import pickle

# Load the pickled object stored in token_dict.pkl.
# NOTE(review): pickle.load can execute arbitrary code — only load files from a trusted source.
# NOTE(review): token_dict is not referenced by the cells visible here; confirm it is still needed.
with open(f'../../data/dicts/token_dict.pkl', 'rb') as f:
    token_dict = pickle.load(f)

In [12]:
# Fixed sequence length per text column; cross_model uses it as the Input shape of that column.
# The dict also lists 'label_*' keys, which are not part of text_features.
max_len_dict = {
    'called_products_0009': 10,
    'called_products_1019': 6,
    'called_products_date_0009': 10,
    'called_products_date_1019': 6,
    'clicked_products_0009': 3,
    'clicked_products_date_0009': 3,
    'keypress_120': 3,
    'keypress_30': 3,
    'keypress_60': 3,
    'keypress_90': 3,
    'label_0009': 3,
    'label_1019': 3,
    'label_date_0009': 3,
    'label_date_1019': 3,
    'model_value': 3,
    'outbound_sent_products_0009': 3,
    'outbound_sent_products_date_0009': 3,
    'picked_products_0009': 3,
    'picked_products_date_0009': 3,
    'rule_name_120': 3,
    'rule_name_30': 3,
    'rule_name_60': 3,
    'rule_name_90': 3,
    'semantic_120': 5,
    'semantic_30': 3,
    'semantic_60': 3,
    'semantic_90': 5,
    'set_all_ins_host_180': 5,
    'set_all_ins_host_360': 4,
    'sms_sent_products_0009': 10,
    'sms_sent_products_1019': 9,
    'sms_sent_products_date_0009': 10,
    'sms_sent_products_date_1019': 9
 }

# Number of distinct values observed in each categorical column; cross_model sizes
# each categorical Embedding table as this count + 2.
# NOTE(review): that sizing assumes the codes are contiguous integers starting near 0.
# A code larger than nunique() + 1 would index past the embedding table — confirm the
# encoding, or size the tables from max() + 1 instead.
category_counts_dict = {col: train_df[col].nunique() for col in categorical_features}


def get_text_feature_type(col):
    """Map a text column name to the name of its feature-type group.

    The column name is tested against a fixed, ordered list of substrings;
    the first substring found decides the result. A name containing none of
    them falls back to 'model_value'.
    """
    # Order matters: 'set_all_ins_host_180' contains both 'set_' and 'host_'
    # and resolves to 'insurances' because 'set_' is tested first.
    marker_to_type = (
        ('set_', 'insurances'),
        ('host_', 'hosts'),
        ('products_', 'products'),
        ('label_', 'labels'),
        ('keypress_', 'keypress'),
        ('rule_name_', 'rules'),
        ('semantic_', 'semantics'),
    )
    for marker, feature_type in marker_to_type:
        if marker in col:
            return feature_type
    return 'model_value'

In [13]:
# Restore numerical features: every cell of the "numerical_features" column is parsed from
# its bracketed, whitespace-separated text form into one numeric column per value.
# NOTE(review): cross_model declares this input with 230 columns — confirm restored_raw.shape[1] == 230.
restored_raw = (
    train_df["numerical_features"]
    .astype(str)
    .str.replace("\n", " ", regex=False)  # treat line breaks inside a cell as separators
    .str.strip("[]")  # drop the surrounding brackets
    .str.strip()  # drop leading/trailing whitespace so the split yields no empty edge tokens
    .str.split(r"\s+", expand=True)  # one column per whitespace-separated token
    .apply(lambda col: pd.to_numeric(col.replace('', pd.NA), errors='coerce'))  # unparsable tokens -> NaN
    .fillna(0)  # missing / unparsable values become 0
    .to_numpy()
)

In [14]:
import numpy as np

# Labels: strip the brackets from the "label" strings, split them on single spaces and
# convert the pieces to an integer matrix (one column per label position).
y_train_cro = train_df['label'].str.strip("[]").str.split(" ", expand=True).astype(int).to_numpy()

# Model inputs, keyed by the Input names that cross_model declares.
X_train = {'numerical_features': restored_raw}

# Categorical columns are copied straight out of the DataFrame.
for col in categorical_features:
    X_train[col] = np.array(train_df[col])

# Text columns are loaded from one pickle file per column (not taken from train_df).
# NOTE(review): pickle.load can execute arbitrary code — only load files from a trusted source.
for col in text_features:
    with open(f'../../data/text_features/{col}.pkl', 'rb') as f:
        X_train[col] = pickle.load(f)

In [15]:
from sklearn.model_selection import train_test_split
import numpy as np


# Split row positions rather than the data itself, so that the same partition
# can be applied to the labels and to every input array.
indices = np.arange(len(y_train_cro))
train_idx, test_idx = train_test_split(
    indices,
    test_size=0.2,
    random_state=42,
    stratify=y_train_cro,
)

# Split y
y_train_split, y_test_split = y_train_cro[train_idx], y_train_cro[test_idx]

# Split X: index every input array with the same row positions.
X_train_split, X_test_split = {}, {}
for key, arr in X_train.items():
    X_train_split[key] = arr[train_idx]
    X_test_split[key] = arr[test_idx]

In [16]:
class CrossLayer(tf.keras.layers.Layer):
    """
    Cross network of a Deep & Cross Network (DCN), stacked inside one layer.

    Each of the ``num_layers`` steps computes
    ``x_next = x0 * (x_l @ w_l) + b_l + x_l``
    on a 2-D input of shape (batch, input_dim); the output has the same shape.
    """

    def __init__(self, num_layers=3, *args, **kwargs):
        # Only keyword arguments are forwarded to the base Layer. For a call such as
        # CrossLayer(1, ...), num_layers is 1 and any further positional arguments are ignored.
        super().__init__(**kwargs)
        self.num_layers = int(num_layers)
        self.input_spec = tf.keras.layers.InputSpec(ndim=2)

    def build(self, input_shape):
        assert len(input_shape) == 2
        self.input_dim = int(input_shape[1])
        # Every step owns a weight vector w of shape (input_dim, 1) and a bias b of shape (input_dim,).
        self.kernels = []
        self.biases = []
        for layer_idx in range(self.num_layers):
            kernel = self.add_weight(
                name=f'cross_w_{layer_idx}',
                shape=(self.input_dim, 1),
                initializer='glorot_uniform',
                trainable=True,
            )
            self.kernels.append(kernel)
            bias = self.add_weight(
                name=f'cross_b_{layer_idx}',
                shape=(self.input_dim,),
                initializer='zeros',
                trainable=True,
            )
            self.biases.append(bias)
        super().build(input_shape)

    def call(self, inputs):
        # inputs: (batch, input_dim)
        x0 = tf.cast(inputs, tf.float32)
        xl = x0
        for kernel, bias in zip(self.kernels, self.biases):
            # (batch, 1): projection of the current state onto this step's weight vector.
            projection = tf.matmul(xl, kernel)
            # x0 * projection broadcasts to (batch, input_dim); then add the bias and the residual xl.
            xl = x0 * projection + bias + xl
        return xl  # (batch, input_dim)

    def compute_output_shape(self, input_shape):
        # The output shape equals the 2-D input shape.
        return (input_shape[0], input_shape[1])

    def get_config(self):
        # Base layer config extended with the constructor argument of this layer.
        return {**super().get_config(), 'num_layers': self.num_layers}

In [17]:
def cross_model(output_units=1, dnn_hidden_units=(1024, 512, 256),
                dropout_rate_dnn_input=0.1, dropout_rate_dnn_logit=0.1, l2_reg_dnn=0.0,
                dropout_rate_cross_input=0.1, dropout_rate_cross_logit=0.1,
                l2_reg_text_embedding=0.0, l2_reg_categorical_embedding=0.0,
                numerical_dim=230, num_cross_layers=3):
    """Build the (uncompiled) 'DeepCross' Keras model.

    The model takes one sequence input per entry of ``text_features``, one
    single-value input per entry of ``categorical_features`` and one dense
    ``numerical_features`` input. Every text and categorical input gets its
    own embedding; the pooled / flattened embeddings are concatenated with
    the numerical input and fed to two parallel branches, an MLP and a
    ``CrossLayer``, whose outputs are concatenated into a sigmoid head.

    Args:
        output_units: Number of sigmoid output units.
        dnn_hidden_units: Sizes of the ReLU Dense layers of the MLP branch.
            The default is a tuple so that no mutable default is shared
            between calls; any iterable of ints is accepted.
        dropout_rate_dnn_input: Dropout rate on the MLP branch input.
        dropout_rate_dnn_logit: Dropout rate after every MLP Dense layer.
        l2_reg_dnn: L2 factor for the MLP Dense kernels.
        dropout_rate_cross_input: Dropout rate on the cross branch input.
        dropout_rate_cross_logit: Dropout rate on the cross branch output.
        l2_reg_text_embedding: L2 factor for the text embeddings.
        l2_reg_categorical_embedding: L2 factor for the categorical embeddings.
        numerical_dim: Width of the ``numerical_features`` input. Defaults to
            230, the value that was previously hard-coded.
        num_cross_layers: Number of cross steps in the ``CrossLayer``.
            Defaults to 3, the value that was previously hard-coded.

    Returns:
        A ``tf.keras.Model`` whose inputs are the text inputs, then the
        categorical inputs, then the numerical input.
    """
    # Resets the global Keras state before the new model is built.
    tf.keras.backend.clear_session()

    def embedding_dim(table_size):
        # Embedding width grows with the logarithm of the table size.
        return int(np.log1p(table_size) + 2)

    # text inputs: one embedding per column, sized from the vocabulary of the column's feature type
    text_inputs = [tf.keras.layers.Input(shape=(max_len_dict[col],), name=col) for col in text_features]
    text_embeddings = []
    for text_input, col in zip(text_inputs, text_features):
        vocab_size = vocab_size_dict[get_text_feature_type(col)]
        text_embeddings.append(
            tf.keras.layers.Embedding(vocab_size + 2,
                                      embedding_dim(vocab_size),
                                      embeddings_regularizer=tf.keras.regularizers.l2(l2_reg_text_embedding),
                                      name=col + '_embed')(text_input)
        )

    # Each sequence of embeddings is averaged over its positions.
    # NOTE(review): the embeddings do not set mask_zero, so every position, including
    # any padding ids, contributes to the average — confirm this is intended.
    text_logit = tf.keras.layers.Concatenate(name='text_concat')(
        [tf.keras.layers.GlobalAveragePooling1D()(text_emb) for text_emb in text_embeddings]
    )

    # categorical inputs: one embedding per column, sized from its number of distinct values
    categorical_inputs = [tf.keras.layers.Input(shape=(1,), name=col) for col in categorical_features]
    categorical_embeddings = []
    for categorical_input, col in zip(categorical_inputs, categorical_features):
        n_categories = category_counts_dict[col]
        categorical_embeddings.append(
            tf.keras.layers.Embedding(n_categories + 2, embedding_dim(n_categories),
                                      embeddings_regularizer=tf.keras.regularizers.l2(l2_reg_categorical_embedding),
                                      name=col + '_embed')(categorical_input)
        )

    categorical_logit = tf.keras.layers.Concatenate(name='categorical_concat')(
        [tf.keras.layers.Flatten()(cat_emb) for cat_emb in categorical_embeddings]
    )

    # numerical inputs
    numerical_input = tf.keras.layers.Input(shape=(numerical_dim,), name='numerical_features')

    # dnn branch
    dnn_input = tf.keras.layers.Concatenate(name='deep_concat')([text_logit, categorical_logit, numerical_input])
    dnn_logit = tf.keras.layers.Dropout(dropout_rate_dnn_input)(dnn_input)
    for n_unit in dnn_hidden_units:
        dnn_logit = tf.keras.layers.Dense(n_unit, activation='relu',
                                          kernel_regularizer=tf.keras.regularizers.l2(l2_reg_dnn))(dnn_logit)
        dnn_logit = tf.keras.layers.Dropout(dropout_rate_dnn_logit)(dnn_logit)

    # cross branch: same concatenated features as the dnn branch, under its own layer name
    cross_input = tf.keras.layers.Concatenate(name='cross_concat')([text_logit, categorical_logit, numerical_input])
    cross_logit = tf.keras.layers.Dropout(dropout_rate_cross_input)(cross_input)
    cross_logit = CrossLayer(num_layers=num_cross_layers)(cross_logit)
    cross_logit = tf.keras.layers.Dropout(dropout_rate_cross_logit)(cross_logit)

    # head: both branches concatenated into sigmoid units
    outputs = tf.keras.layers.Concatenate()([dnn_logit, cross_logit])
    outputs = tf.keras.layers.Dense(output_units, activation='sigmoid')(outputs)

    model = tf.keras.models.Model(inputs=text_inputs + categorical_inputs + [numerical_input], outputs=outputs, name='DeepCross')

    return model

In [18]:
# Training hyper-parameters read by train_model.
lr = 0.0001
n_epochs = 100
batch_size = 1024

# Stop training once val_loss has failed to improve for 2 epochs.
# NOTE(review): restore_best_weights is not set, so the model keeps the weights of the
# last epoch run rather than those of the best epoch — confirm this is intended.
es = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=2)
# Multiply the learning rate by 0.4 after 1 epoch without val_loss improvement, with a floor of 5e-5.
reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.4, patience=1, min_lr=0.00005, verbose=1)
callbacks = [es, reduce_lr]


def train_model(x_train, y_train, x_test, y_test):
    """Build, compile and fit the DeepCross model.

    The model is built with 3 sigmoid outputs and trained with binary
    cross-entropy using the notebook-level ``lr``, ``n_epochs``,
    ``batch_size`` and ``callbacks``. ``(x_test, y_test)`` is used as the
    validation data during fitting.

    Args:
        x_train: Training inputs, a dict keyed by the model's input names.
        y_train: Training labels.
        x_test: Validation inputs, same structure as ``x_train``.
        y_test: Validation labels.

    Returns:
        A ``(model, history)`` tuple: the fitted Keras model and the
        ``History`` object returned by ``model.fit``. Previously nothing was
        returned, so the trained model could not be evaluated or saved.
    """
    output_units = 3
    model = cross_model(output_units)
    model.compile(
        loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
        metrics=[tf.keras.metrics.BinaryAccuracy(name='acc'), tf.keras.metrics.AUC(name='auc')],
        optimizer=tf.keras.optimizers.Adam(learning_rate=lr))

    history = model.fit(x_train, y_train, validation_data=(x_test, y_test),
                        epochs=n_epochs, batch_size=batch_size, callbacks=callbacks, verbose=1)

    return model, history

In [19]:
train_model(X_train_split, y_train_split, X_test_split, y_test_split)


Epoch 1/100
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 494ms/step - acc: 0.7517 - auc: 0.7334 - loss: 2.6554 - val_acc: 0.8508 - val_auc: 0.8225 - val_loss: 1.4131 - learning_rate: 1.0000e-04
Epoch 2/100
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 162ms/step - acc: 0.8761 - auc: 0.8547 - loss: 2.4483 - val_acc: 0.9049 - val_auc: 0.8814 - val_loss: 1.1644 - learning_rate: 1.0000e-04
Epoch 3/100
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 123ms/step - acc: 0.9117 - auc: 0.8932 - loss: 2.2393 - val_acc: 0.9269 - val_auc: 0.9022 - val_loss: 0.9762 - learning_rate: 1.0000e-04
Epoch 4/100
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 111ms/step - acc: 0.9298 - auc: 0.9127 - loss: 1.4515 - val_acc: 0.9367 - val_auc: 0.9182 - val_loss: 0.8604 - learning_rate: 1.0000e-04
Epoch 5/100
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 117ms/step - acc: 0.9373 - auc: 0.9219 - loss: 1.7660 - val_acc: 0.9406 - val_au