In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from numba import jit
SEED = 3407
def seed_everything(seed=42):
    np.random.seed(seed)
    tf.random.set_seed(seed)

seed_everything(SEED)

In [2]:

def load_data():
    train_set = pd.read_csv('train_data.csv',iterator=True)
    train_set = train_set.get_chunk(size= 100000)
    train_lab = pd.read_csv('train_labels.csv')
    train_set = pd.merge(left=train_set,right=train_lab,on='customer_ID')
    return train_set
dataset = load_data()

In [3]:
from sklearn.preprocessing import OneHotEncoder
def preprocess(dataset):
    cus = dataset.groupby('customer_ID').count()
    cus = cus[['target']]
    dataset = pd.merge(left=dataset,right=cus,left_on='customer_ID',right_on=cus.index)
    dataset = dataset[dataset['target_y'] == 13]
    
    dataset['D_87'].fillna(0,inplace=True)
    cate = dataset[['B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126', 'D_63', 'D_64', 'D_66', 'D_68']]
    encoder = OneHotEncoder()
    oh = encoder.fit_transform(cate).toarray()
    one_hot = [f'category{i}' for i in range(len(oh[0]))]
    dataset = dataset.drop(columns=['B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126', 'D_63', 'D_64', 'D_66', 'D_68'])
    dataset[one_hot] = oh
    dataset.fillna(method='pad',inplace=True)
    dataset.fillna(value=1e-8,inplace=True)
    dataset['S_2'] = pd.to_datetime(dataset['S_2'])
    return dataset
dataset = preprocess(dataset)

In [4]:
from datetime import datetime
split_time = datetime(year=2018,month=1,day=1)
df_train = dataset[dataset['S_2']<split_time]
df_valid = dataset[dataset['S_2']>=split_time]
fea = [c for c in dataset.columns if c not in ['customer_ID','S_2','target_x','target_y','target_x']]

In [5]:
from sklearn.linear_model import LogisticRegression
logi_input = np.array(df_train[fea])
logi_label = np.array(df_train['target_x'])
valid_input = np.array(df_valid[fea])
valid_label = np.array(df_valid['target_x'])

logitstic = LogisticRegression(random_state=114514)
logitstic.fit(logi_input,logi_label)
logitstic.score(logi_input,logi_label)
#logitstic.score(valid_input,valid_label)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.8815479609419874

In [6]:

def split_feature_label(data,seq_len,batchisze=1741):
    num_sample = int(len(data)/seq_len)
    
    row = np.array(range(len(data))).reshape([num_sample,seq_len])[:,-1]
    label = np.array(data['target_x'])
    label = label[row]
    feature = np.array(data[fea])

    label = tf.constant(label,shape=[num_sample,1,1])
    feature = tf.constant(feature,shape=[num_sample,seq_len,len(fea)])
    return feature,label

tr_feature,tr_label = split_feature_label(df_train,10)
va_feature,va_label = split_feature_label(df_valid,3)
feature,label = split_feature_label(dataset,13)

In [13]:
from datetime import datetime
import tensorboard
OUT_STEPS = 4
num_features =len(fea)
multi_dense_model = tf.keras.Sequential([
    # 提取最后一时间步的数据

    # Shape [batch, time, features] => [batch, 1, features]
    tf.keras.layers.Lambda(lambda x: x[:, -1:, :]),
    # Shape => [batch, 1, dense_units]
    tf.keras.layers.Dense(512, activation='relu'),
    #tf.keras.layers.Dense(365, activation='relu'),
    tf.keras.layers.Dense(3*7, activation='relu'),
    # Shape => [batch, out_steps*features]
    tf.keras.layers.Dense(4,kernel_initializer=tf.initializers.zeros(), activation='sigmoid'),
    # Shape => [batch, out_steps, features]
    tf.keras.layers.Reshape([OUT_STEPS, 1])
])

early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss',
                                                    patience=5,
                                                    mode='min')

log_dir = "logs/mlp/" + datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

multi_dense_model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=False,label_smoothing=0.0,
                axis=-1,name='binary_crossentropy'),
                optimizer=tf.keras.optimizers.Adam(),
                metrics=[tf.keras.metrics.BinaryAccuracy(name='binary_accuracy', dtype=None, threshold=0.5)
])

history = multi_dense_model.fit(
    x=feature, y = label, 
    batch_size=256, 
    epochs=80, 
    verbose=1, 
    callbacks=[early_stopping,tensorboard_callback], 
    #valid_input = va_feature,valid_label = va_label,
    validation_split=0.3,  
    shuffle=True, 
    class_weight=None, 
    sample_weight=None, 
    initial_epoch=0, 
    steps_per_epoch=None, 
    validation_steps=None)

Epoch 1/80
Epoch 2/80
Epoch 3/80
Epoch 4/80
Epoch 5/80
Epoch 6/80
Epoch 7/80
Epoch 8/80
Epoch 9/80
Epoch 10/80
Epoch 11/80
Epoch 12/80
Epoch 13/80
Epoch 14/80
Epoch 15/80
Epoch 16/80
Epoch 17/80


In [14]:
CONV_WIDTH = 6
OUT_STEPS = 4
num_features = feature.shape[-1]
multi_conv_model = tf.keras.Sequential([
    # Shape [batch, time, features] => [batch, CONV_WIDTH, features]
    tf.keras.layers.Lambda(lambda x: x[:, -CONV_WIDTH:, :]),
    # Shape => [batch, 1, conv_units]
    tf.keras.layers.Conv1D(256, activation='sigmoid', kernel_size=(CONV_WIDTH)),

    # Shape => [batch, 1,  out_steps*features]
    tf.keras.layers.Dense(OUT_STEPS*num_features,
                          kernel_initializer=tf.initializers.zeros(),activation = 'sigmoid'),
    # Shape => [batch, out_steps, features]
    tf.keras.layers.Reshape([OUT_STEPS, num_features])
])

early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss',
                                                    patience=4, restore_best_weights=True,
                                                    mode='min')

lr_schedule = tf.keras.callbacks.LearningRateScheduler(
    lambda epoch: 1e-8 * 10**(epoch / 20))

log_dir = "logs/CNN/" + datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

multi_conv_model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=False,label_smoothing=0.0,
                axis=-1,name='binary_crossentropy'),
                optimizer=tf.keras.optimizers.Adam(),
                metrics=[tf.keras.metrics.BinaryAccuracy(name='binary_accuracy', dtype=None, threshold=0.5)
])

history = multi_conv_model.fit(
    x=feature, y = label, 
    batch_size=128, 
    epochs=100, 
    verbose=1, 
    callbacks=[early_stopping,tensorboard_callback], 
    validation_split=0.3,  
    shuffle=True, 
    class_weight=None, 
    sample_weight=None, 
    initial_epoch=0, 
    steps_per_epoch=None, 
    validation_steps=None)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100


In [19]:
lstm_model = tf.keras.models.Sequential([
  #tf.keras.layers.Lambda(lambda x: tf.expand_dims(x, axis=-1),input_shape=[None]),
  tf.keras.layers.LSTM(
    units = 21,
    activation="tanh",
    recurrent_activation="sigmoid",return_sequences= True),
  tf.keras.layers.LSTM(
    units = 7,
    activation="tanh",
    recurrent_activation="sigmoid"),
  tf.keras.layers.Dense(3,activation="sigmoid"),
  tf.keras.layers.Lambda(lambda x: x * 1)
])

early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss',
                                                    patience=5, restore_best_weights=True,
                                                    mode='min')

lr_schedule = tf.keras.callbacks.LearningRateScheduler(
    lambda epoch: 0.1 * 10**(-epoch / 20))

log_dir = "logs/LSTM/" + datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

lstm_model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=False,label_smoothing=0.0,
                axis=-1,name='binary_crossentropy'),
                optimizer=tf.keras.optimizers.Adam(),
                metrics=[tf.keras.metrics.BinaryAccuracy(name='binary_accuracy', dtype=None, threshold=0.5)
])
history = lstm_model.fit(
    x=feature, y = label, 
    batch_size=20, 
    epochs=100, 
    verbose=1, 
    callbacks=[early_stopping,tensorboard_callback], 
    validation_split=0.3,  
    shuffle=True, 
    class_weight=None, 
    sample_weight=None, 
    initial_epoch=0, 
    steps_per_epoch=None, 
    validation_steps=None)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100


In [22]:
import Tfm

tfm_model = Tfm.tsf_model(
    input_shape = feature.shape[1:],
    output_len = 4,
    head_size=64,
    num_heads=8,
    ff_dim=16,
    num_transformer_blocks=4,
    mlp_units=[128],
    #mlp_dropout=4,
    #dropout=2,
)

log_dir = "logs/Transformer/" + datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

tfm_model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=False,label_smoothing=0.0,
                axis=-1,name='binary_crossentropy'),
                optimizer=tf.keras.optimizers.Adam(),
                metrics=[tf.keras.metrics.BinaryAccuracy(name='binary_accuracy', dtype=None, threshold=0.5)
])

callbacks = [tf.keras.callbacks.EarlyStopping(monitor='val_loss',
                                                    patience=5,restore_best_weights=True,
                                                    mode='min'),tensorboard_callback]

history = tfm_model.fit(
    x = feature,
    y = label,
    validation_split=0.3,
    epochs=200,
    batch_size=64,
    callbacks=callbacks,
)

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
