In [None]:
import pandas as pd
import tensorflow as tf

from tensorflow.keras.layers import Layer
from tensorflow.keras.layers import Dense
from tensorflow.keras import Model
from tensorflow.keras.layers import Embedding

In [2]:
# Columns fed to the model as single-integer categorical inputs.
categorical_features = [
    'phone_model',
    'browser_family',
    'os_family',
    'device_brand',
    'city_code',
    'province_code',
    'sex',
    'hashouse',
    'social',
    'overdue',
    'tax',
    'married',
    'benke',
    'kid',
    'income',
    'consumption',
    'shebao',
]

# Space-separated token columns; each one is tokenized and padded separately.
text_features = [
    # product interaction histories and their date tokens
    'clicked_products_0009', 'clicked_products_date_0009',
    'sms_sent_products_0009', 'sms_sent_products_1019',
    'sms_sent_products_date_0009', 'sms_sent_products_date_1019',
    'called_products_0009', 'called_products_1019',
    'called_products_date_0009', 'called_products_date_1019',
    'picked_products_0009', 'picked_products_date_0009',
    'outbound_sent_products_0009', 'outbound_sent_products_date_0009',
    # insurance host sets
    'set_all_ins_host_180', 'set_all_ins_host_360',
    # keypress / rule / semantic windows
    'keypress_30', 'keypress_60', 'keypress_90', 'keypress_120',
    'rule_name_30', 'rule_name_60', 'rule_name_90', 'rule_name_120',
    'semantic_30', 'semantic_60', 'semantic_90', 'semantic_120',
    'model_value',
]

# One shared vocabulary / tokenizer is built per entry of this list.
text_feature_types = [
    'products',
    'keypress',
    'rules',
    'semantics',
    'insurances',
    'model_value',
]

In [3]:
# Load the training table from the Excel workbook (path is relative to the notebook's working directory).
train_df = pd.read_excel("../../data/train_data_ifh4.xlsx")

In [4]:
# Cast every text column to str while keeping missing cells as empty strings.
# A plain astype(str) turns NaN into the literal word 'nan', which would then be
# tokenized like a real token instead of producing an empty (all-padding) sequence.
train_df[text_features] = (
    train_df[text_features].astype(str).mask(train_df[text_features].isna(), "")
)

In [5]:
# Preview the first five rows of the categorical columns.
train_df[categorical_features].head(5)

Unnamed: 0,phone_model,browser_family,os_family,device_brand,city_code,province_code,sex,hashouse,social,overdue,tax,married,benke,kid,income,consumption,shebao
0,2,0,1,0,24,5,2,0,1,0,0,0,0,0,0,0,0
1,2,1,0,1,28,2,1,0,0,0,0,0,0,0,0,0,0
2,3,0,1,0,15,3,1,0,0,0,0,0,1,0,0,0,0
3,10,1,0,1,103,5,1,0,0,0,0,0,1,0,0,0,0
4,5,0,1,0,1,0,2,0,0,0,0,0,0,0,0,0,0


In [8]:
# Preview the first five rows of the text (space-separated token) columns.
train_df[text_features].head(5)

Unnamed: 0,clicked_products_0009,clicked_products_date_0009,sms_sent_products_0009,sms_sent_products_1019,sms_sent_products_date_0009,sms_sent_products_date_1019,called_products_0009,called_products_1019,called_products_date_0009,called_products_date_1019,...,keypress_120,rule_name_30,rule_name_60,rule_name_90,rule_name_120,semantic_30,semantic_60,semantic_90,semantic_120,model_value
0,,,,,,,,,,,...,小助理 运营商提示音,,,,baotai27_其他,,,,144 145 140 143,
1,,,IYBPAZX_ZTKMF_OPPOJX_BT IYBPAZX_ZTKMF_OPPOJX_BT,,2D 4D,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,baotai27_G,,,,7 9 140 150 151 7 131 131,,
3,FQL IYBPAZX_ZMF_CS_BT NWZAZX_ZNWZAMF_NEWOPPO_B...,12D 12D 15D 24D 30D 32D 36D 38D 41D 44D,IYBPAZX_ZTKMF_OPPOJX_BT NWZAZX_ZNWZAMF_NEWOPPO...,IYBPAZX_ZTKMF_OPPOJX_BT IYBPAZX_ZMF_CS_BT IYBP...,2D 4D 8D 10D 12D 15D 20D 24D 26D 28D,30D 32D 34D 36D 39D 43D 45D 47D 51D 53D,,,,,...,触发发短信 sendMessage_special 没输入手机号 输入手机号2 输入手机号3...,,,yingdian888_B jiyonghua661_B huirong888_D ying...,mayi02_D baotai14_B mayi02_D mayi02_D,,,2240 2572 2672 2313 2345 2672 2313 2313 2313 2...,23 24 46 3921 35 21 21 2576 2181 2189 24 23 23...,
4,,,IYBPAZX_ZTKMF_OPPOJX_BT IYBPAZX_ZMF_CS_BT IYBP...,,2D 4D 6D,,IYBPAZX_TKMF_GD_BZ_GZH_WH IYBPAZX_TKMF_GD_BZ_G...,,1D 4D 7D 52D 55D 58D,,...,触发发短信 sendMessage_special,,,baotai27_D,baotai27_其他,,,,,


In [9]:
def get_vocabulary(feature_type, count_threshold=5, df=None):
    """Collect the frequent tokens of one space-separated text column.

    Args:
        feature_type: Name of the column to read.
        count_threshold: A token is kept only if it occurs strictly more
            than this many times. Defaults to 5.
        df: DataFrame to read the column from. Defaults to the
            module-level ``train_df``.

    Returns:
        A ``(vocabulary, vocab_size)`` tuple: the kept tokens ordered by
        descending frequency, and how many there are.
    """
    frame = train_df if df is None else df

    # Missing cells are dropped; each remaining cell is split on single
    # spaces, so consecutive spaces yield empty-string tokens that are
    # counted like any other token.
    token_counts = (
        frame[feature_type]
        .dropna()
        .str.split(' ')
        .explode()
        .value_counts()
    )

    frequent = token_counts[token_counts > count_threshold]
    vocabulary = frequent.index.tolist()

    return vocabulary, len(vocabulary)

In [10]:
# Map each text feature type to its (vocabulary, vocab_size) pair.
# The loop variable is not called `type`: a module-level `type` would shadow
# the builtin for every later cell in the kernel session.
vocab_dict = {
    feature_type: get_vocabulary(feature_type)
    for feature_type in text_feature_types
}

In [11]:
# Tokenizer / embedding size per feature type: the frequent-token count plus 100
# (presumably head-room for rarer tokens — TODO confirm the choice of 100).
vocab_size_dict = {
    feature_type: size + 100
    for feature_type, (_, size) in vocab_dict.items()
}

In [12]:
# Display the per-type vocabulary sizes used for the tokenizers and embeddings.
vocab_size_dict

{'products': 273,
 'keypress': 207,
 'rules': 328,
 'semantics': 351,
 'insurances': 141,
 'model_value': 102}

In [None]:
# only first time
import pickle


max_processes = 5
oov_tok = '<OOV>'


def get_token(text_feature_type):
    """Fit a Keras Tokenizer on one column of ``train_df``.

    The tokenizer keeps at most ``vocab_size_dict[text_feature_type]`` word
    indices, maps other words to ``oov_tok`` and applies no character filters.

    Returns:
        A one-entry dict ``{text_feature_type: fitted_tokenizer}``.
    """
    print("fitting tokenizer:", text_feature_type)
    tokenizer = tf.keras.preprocessing.text.Tokenizer(
        num_words=vocab_size_dict[text_feature_type],
        oov_token=oov_tok,
        filters='',
    )
    corpus = train_df[text_feature_type].fillna("").astype(str)
    tokenizer.fit_on_texts(corpus)
    return {text_feature_type: tokenizer}


# Fit one tokenizer per feature type, then merge the one-entry dicts.
result_list = list(map(get_token, text_feature_types))
result_dict = {
    name: tokenizer
    for fitted in result_list
    for name, tokenizer in fitted.items()
}

# Re-key in the order of text_feature_types.
token_dict = {name: result_dict[name] for name in text_feature_types}

# Persist the fitted tokenizers so later runs can skip this cell.
with open('../../data/dicts/token_dict.pkl', 'wb') as token_file:
    pickle.dump(token_dict, token_file)

In [13]:
import pickle

# Reload the fitted tokenizers from disk.
# pickle can execute arbitrary code on load, so only read files this project wrote.
with open('../../data/dicts/token_dict.pkl', mode='rb') as token_file:
    token_dict = pickle.load(token_file)

In [14]:
# Padded / truncated sequence length for each text column (used as `maxlen`
# when padding and as the Input shape of the model).
max_len_dict = {
    # called products
    'called_products_0009': 10,
    'called_products_1019': 6,
    'called_products_date_0009': 10,
    'called_products_date_1019': 6,
    # clicked products
    'clicked_products_0009': 3,
    'clicked_products_date_0009': 3,
    # keypress windows
    'keypress_120': 3,
    'keypress_30': 3,
    'keypress_60': 3,
    'keypress_90': 3,
    # label columns
    'label_0009': 3,
    'label_1019': 3,
    'label_date_0009': 3,
    'label_date_1019': 3,
    # model value
    'model_value': 3,
    # outbound / picked products
    'outbound_sent_products_0009': 3,
    'outbound_sent_products_date_0009': 3,
    'picked_products_0009': 3,
    'picked_products_date_0009': 3,
    # rule-name windows
    'rule_name_120': 3,
    'rule_name_30': 3,
    'rule_name_60': 3,
    'rule_name_90': 3,
    # semantic windows
    'semantic_120': 5,
    'semantic_30': 3,
    'semantic_60': 3,
    'semantic_90': 5,
    # insurance host sets
    'set_all_ins_host_180': 5,
    'set_all_ins_host_360': 4,
    # SMS-sent products
    'sms_sent_products_0009': 10,
    'sms_sent_products_1019': 9,
    'sms_sent_products_date_0009': 10,
    'sms_sent_products_date_1019': 9,
}

# Number of distinct values observed per categorical column; the model adds 2
# to this when sizing its one-hot and embedding layers.
# NOTE(review): this only bounds the codes if they are contiguous from 0 —
# verify against how the columns were encoded.
category_counts_dict = {
    feature: train_df[feature].nunique()
    for feature in categorical_features
}


def get_text_feature_type(col):
    """Map a text column name to the feature type whose tokenizer it shares.

    The first substring rule that matches wins, so rule order matters
    (e.g. a name containing both 'set_' and 'host_' resolves to 'insurances').
    Names matching no rule fall back to 'model_value'.
    """
    substring_rules = (
        ('set_', 'insurances'),
        ('host_', 'hosts'),
        ('products_', 'products'),
        ('label_', 'labels'),
        ('keypress_', 'keypress'),
        ('rule_name_', 'rules'),
        ('semantic_', 'semantics'),
    )
    for marker, feature_type in substring_rules:
        if marker in col:
            return feature_type
    return 'model_value'

In [None]:
# tokenization and padding
padding_type = 'post'
truncate_type = 'post'


def tokenize(col):
    """Tokenize one text column of ``train_df``, pad it, and pickle the result.

    The tokenizer is looked up by the column's feature type, sequences are
    padded / truncated at the end to ``max_len_dict[col]``, and the resulting
    array is written to ``../../data/text_features/{col}.pkl``.
    """
    print(" tokenize :", col)
    tokenizer = token_dict[get_text_feature_type(col)]
    max_len = max_len_dict[col]
    sequences = tokenizer.texts_to_sequences(train_df[col])
    padded = tf.keras.preprocessing.sequence.pad_sequences(
        sequences,
        maxlen=max_len,
        padding=padding_type,
        truncating=truncate_type,
    )
    with open(f'../../data/text_features/{col}.pkl', 'wb') as out_file:
        pickle.dump(padded, out_file)


# Write one padded array per text column.
for text_col in text_features:
    tokenize(text_col)

 tokenize : clicked_products_0009
 tokenize : clicked_products_date_0009
 tokenize : sms_sent_products_0009
 tokenize : sms_sent_products_1019
 tokenize : sms_sent_products_date_0009
 tokenize : sms_sent_products_date_1019
 tokenize : called_products_0009
 tokenize : called_products_1019
 tokenize : called_products_date_0009
 tokenize : called_products_date_1019
 tokenize : picked_products_0009
 tokenize : picked_products_date_0009
 tokenize : outbound_sent_products_0009
 tokenize : outbound_sent_products_date_0009
 tokenize : set_all_ins_host_180
 tokenize : set_all_ins_host_360
 tokenize : keypress_30
 tokenize : keypress_60
 tokenize : keypress_90
 tokenize : keypress_120
 tokenize : rule_name_30
 tokenize : rule_name_60
 tokenize : rule_name_90
 tokenize : rule_name_120
 tokenize : semantic_30
 tokenize : semantic_60
 tokenize : semantic_90
 tokenize : semantic_120
 tokenize : model_value


In [15]:
# restore numerical features
# The column holds a bracketed, whitespace-separated list of numbers as text;
# strip the brackets and newlines, then split it into one column per number.
numeric_text = (
    train_df["numerical_features"]
    .astype(str)
    .str.replace("\n", " ", regex=False)
    .str.strip("[]")
    .str.strip()
)
numeric_tokens = numeric_text.str.split(r"\s+", expand=True)


def _to_numeric_column(column):
    """Parse one token column as numbers; blanks and unparsable tokens become NaN."""
    return pd.to_numeric(column.replace('', pd.NA), errors='coerce')


# Missing or unparsable entries end up as 0 in the final array.
restored_raw = numeric_tokens.apply(_to_numeric_column).fillna(0).to_numpy()

In [16]:
import numpy as np

# y: the label column holds a bracketed, space-separated list of integers as text.
label_tokens = train_df['label'].str.strip("[]").str.split(" ", expand=True)
y_train_cro = label_tokens.astype(int).to_numpy()

# X: one array per model input, keyed by the input layer name.
X_train = {'numerical_features': restored_raw}
X_train.update({col: np.array(train_df[col]) for col in categorical_features})

# Padded token arrays come from the pickles written by the tokenization cell.
for col in text_features:
    with open(f'../../data/text_features/{col}.pkl', 'rb') as in_file:
        X_train[col] = pickle.load(in_file)

In [17]:
from sklearn.model_selection import train_test_split
import numpy as np


# Split the row indices 80/20, stratified on the label rows
indices = np.arange(len(y_train_cro))
train_idx, test_idx = train_test_split(
    indices,
    test_size=0.2,
    random_state=42,
    stratify=y_train_cro,
)

# Split y
y_train_split, y_test_split = y_train_cro[train_idx], y_train_cro[test_idx]

# Split X: apply the same row indices to every input array
X_train_split = {key: arr[train_idx] for key, arr in X_train.items()}
X_test_split = {key: arr[test_idx] for key, arr in X_train.items()}

In [18]:
class Wide_layer(Layer):
    """Linear "wide" component: ``inputs @ w + b`` with a single output unit.

    The kernel ``w`` is L2-regularized (1e-4) and Glorot-normal initialized;
    the bias ``b`` starts at zero.

    Args:
        **kwargs: Standard Keras ``Layer`` options (e.g. ``name``, ``dtype``,
            ``trainable``), forwarded to the base class so the layer can be
            named and re-created from its config.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def build(self, input_shape):
        # Weight creation order (b, then w) is kept as-is.
        self.b = self.add_weight(name='b', shape=(1,), initializer='zeros', trainable=True)
        self.w = self.add_weight(name='w', shape=(input_shape[-1], 1), initializer='glorot_normal', trainable=True, regularizer=tf.keras.regularizers.l2(1e-4))

    def call(self, inputs, **kwargs):   # input is dense_inputs
        x = tf.matmul(inputs, self.w) + self.b     # shape: (batchsize, 1)
        return x

class Deep_layer(Layer):
    """Feed-forward "deep" component: a stack of Dense layers plus a linear head.

    Args:
        hidden_units: Iterable of unit counts, one hidden Dense layer each.
        output_dim: Number of units of the final Dense layer (no activation).
        activation: Activation used by every hidden Dense layer.
        **kwargs: Standard Keras ``Layer`` options (e.g. ``name``, ``dtype``,
            ``trainable``), forwarded to the base class.
    """

    def __init__(self, hidden_units, output_dim, activation, **kwargs):
        super().__init__(**kwargs)
        self.hidden_layer = [Dense(i, activation=activation) for i in hidden_units]
        self.output_layer = Dense(output_dim, activation=None)

    def call(self, inputs, **kwargs):
        # Apply the hidden layers in order, then the linear output layer.
        x = inputs
        for layer in self.hidden_layer:
            x = layer(x)
        output = self.output_layer(x)
        return output

In [19]:
from itertools import combinations

def wide_deep_model(output_unit, l2_reg_text_embedding=0.0, l2_reg_categorical_embedding=0.0,
                    num_numerical_features=230, cross_hash_bins=128):
    """Build the Wide & Deep Keras model over text, categorical and numerical inputs.

    Wide branch: the numerical features concatenated with a one-hot vector per
    categorical column and a hashed one-hot vector per pair of categorical
    columns, fed to ``Wide_layer``.
    Deep branch: the numerical features, average-pooled text embeddings and
    flattened categorical embeddings, fed to ``Deep_layer``
    (hidden units 256-128-64, ReLU, 8 outputs).
    Both branch outputs are concatenated and passed to a sigmoid Dense layer.

    Reads the module-level ``text_features``, ``categorical_features``,
    ``max_len_dict``, ``vocab_size_dict`` and ``category_counts_dict``.
    Calls ``tf.keras.backend.clear_session()`` before building.

    Args:
        output_unit: Number of sigmoid output units.
        l2_reg_text_embedding: L2 factor for the text embedding tables.
        l2_reg_categorical_embedding: L2 factor for the categorical embedding tables.
        num_numerical_features: Width of the ``numerical_features`` input.
        cross_hash_bins: Number of hash buckets used for each pairwise cross.

    Returns:
        An uncompiled ``tf.keras.Model`` named ``'base_model'`` whose inputs are
        the text inputs, then the categorical inputs, then the numerical input.
    """
    tf.keras.backend.clear_session()

    # text inputs
    text_inputs = [
        tf.keras.layers.Input(shape=(max_len_dict[col],), name=col)
        for col in text_features
    ]
    text_embeddings = []
    for col, text_input in zip(text_features, text_inputs):
        # Columns of the same feature type share a vocabulary size (not weights).
        vocab_size = vocab_size_dict[get_text_feature_type(col)]
        text_embeddings.append(
            tf.keras.layers.Embedding(
                vocab_size + 1,
                int(np.log1p(vocab_size) + 2),
                embeddings_regularizer=tf.keras.regularizers.l2(l2_reg_text_embedding),
                name=col + '_embed')(text_input)
        )

    text_logit = tf.keras.layers.Concatenate(name='text_concat')(
        [tf.keras.layers.GlobalAveragePooling1D()(text_emb) for text_emb in text_embeddings]
    )

    # categorical inputs, each also expanded to a one-hot vector for the wide branch
    categorical_inputs = []
    input_by_col = {}   # exact-name lookup used when building the crosses
    onehot_vectors = []
    for col in categorical_features:
        inp = tf.keras.layers.Input(shape=(1,), name=col)
        categorical_inputs.append(inp)
        input_by_col[col] = inp
        one_hot_vec = tf.keras.layers.CategoryEncoding(
            num_tokens=category_counts_dict[col] + 2, output_mode='one_hot', name=col + '_one_hot')(inp)
        onehot_vectors.append(tf.keras.layers.Flatten()(one_hot_vec))

    def _join_pair(pair):
        """Join two input tensors into one '<a>_<b>' string tensor."""
        return tf.strings.join([tf.strings.as_string(pair[0]), '_', tf.strings.as_string(pair[1])])

    # feature crossing: every pair of categorical columns is joined into a string
    # and hashed, which keeps each cross at `cross_hash_bins` dimensions
    crossed_vectors = []
    for a, b in combinations(categorical_features, 2):
        # Look the inputs up by exact column name; a prefix match would pick the
        # wrong input if one column name were a prefix of another.
        pair_str = tf.keras.layers.Lambda(_join_pair)([input_by_col[a], input_by_col[b]])
        hashed_idx = tf.keras.layers.Hashing(num_bins=cross_hash_bins, name=f'hash_{a}_{b}')(pair_str)
        cross_one_hot = tf.keras.layers.CategoryEncoding(
            num_tokens=cross_hash_bins, output_mode='one_hot', name=f'cross_oh_{a}_{b}')(hashed_idx)
        crossed_vectors.append(tf.keras.layers.Flatten()(cross_one_hot))

    # final one-hot / cross concatenation
    onehot_concat = tf.keras.layers.Concatenate(name='onehot_concat')(onehot_vectors + crossed_vectors)

    # categorical embeddings for the deep branch
    categorical_embeddings = []
    for col, cat_input in zip(categorical_features, categorical_inputs):
        n_categories = category_counts_dict[col]
        categorical_embeddings.append(
            tf.keras.layers.Embedding(n_categories + 2, int(np.log1p(n_categories) + 1),
                                      embeddings_regularizer=tf.keras.regularizers.l2(l2_reg_categorical_embedding),
                                      name=col + '_embed')(cat_input)
        )

    categorical_logit = tf.keras.layers.Concatenate(name='categorical_concat')(
        [tf.keras.layers.Flatten()(cat_emb) for cat_emb in categorical_embeddings]
    )

    # numerical inputs
    numerical_input = tf.keras.layers.Input(shape=(num_numerical_features,), name='numerical_features')

    # wide part
    wide_input = tf.keras.layers.Concatenate(name='wide_concat')([numerical_input, onehot_concat])
    wide_output = Wide_layer()(wide_input)

    # deep part
    deep_input = tf.keras.layers.Concatenate(name='deep_concat')([numerical_input, text_logit, categorical_logit])
    deep_output = Deep_layer(hidden_units=[256, 128, 64], output_dim=8, activation='relu')(deep_input)

    # output
    combined = tf.keras.layers.Concatenate()([wide_output, deep_output])
    outputs = tf.keras.layers.Dense(output_unit, activation='sigmoid')(combined)

    model = tf.keras.models.Model(inputs=text_inputs + categorical_inputs + [numerical_input], outputs=outputs, name='base_model')

    return model

In [20]:
# Optimisation hyperparameters
lr = 1e-4
n_epochs = 100
batch_size = 1024

# Both callbacks watch the validation loss: training stops after 2 epochs
# without improvement, and the learning rate is multiplied by 0.4 after 1 such
# epoch, never going below 5e-5.
es = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=2)
reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.4,
    patience=1,
    min_lr=5e-5,
    verbose=1,
)
callbacks = [es, reduce_lr]


def train_model(x_train, y_train, x_test, y_test):
    """Build, compile and fit the Wide & Deep model, then return it.

    The model has 3 sigmoid outputs and is trained with binary cross-entropy
    and Adam, using the module-level ``lr``, ``n_epochs``, ``batch_size`` and
    ``callbacks``.

    Args:
        x_train: Training inputs, a dict keyed by input layer name.
        y_train: Training targets.
        x_test: Validation inputs, same structure as ``x_train``.
        y_test: Validation targets.

    Returns:
        The fitted Keras model, so it can be evaluated, used for prediction
        or saved after training.
    """
    output_units = 3
    model = wide_deep_model(output_units)
    model.compile(
        loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
        metrics=[tf.keras.metrics.BinaryAccuracy(name='acc'), tf.keras.metrics.AUC(name='auc')],
        optimizer=tf.keras.optimizers.Adam(learning_rate=lr))

    model.fit(x_train, y_train, validation_data=(x_test, y_test),
              epochs=n_epochs, batch_size=batch_size, callbacks=callbacks, verbose=1)

    return model

In [21]:
# Train on the training split and validate on the held-out 20% split.
train_model(X_train_split, y_train_split, X_test_split, y_test_split)


Epoch 1/100
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 1s/step - acc: 0.9030 - auc: 0.9276 - loss: 0.5325 - val_acc: 0.9347 - val_auc: 0.9413 - val_loss: 0.4809 - learning_rate: 1.0000e-04
Epoch 2/100
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 107ms/step - acc: 0.9428 - auc: 0.9446 - loss: 0.4526 - val_acc: 0.9491 - val_auc: 0.9457 - val_loss: 0.4113 - learning_rate: 1.0000e-04
Epoch 3/100
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 103ms/step - acc: 0.9514 - auc: 0.9473 - loss: 0.3875 - val_acc: 0.9531 - val_auc: 0.9466 - val_loss: 0.3541 - learning_rate: 1.0000e-04
Epoch 4/100
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 106ms/step - acc: 0.9537 - auc: 0.9477 - loss: 0.3341 - val_acc: 0.9541 - val_auc: 0.9468 - val_loss: 0.3061 - learning_rate: 1.0000e-04
Epoch 5/100
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 92ms/step - acc: 0.9540 - auc: 0.9486 - loss: 0.2887 - val_acc: 0.9541 - val_auc: 0