In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from keras.models import Sequential, load_model, Model
from keras.layers import LSTM, GRU, CuDNNGRU, Bidirectional, Masking
from keras.layers import Dense, InputLayer, Dropout, BatchNormalization
from keras.layers import Reshape, Flatten, Conv1D, AveragePooling1D, Activation, Concatenate

from keras.optimizers import Adam, SGD
from keras.callbacks import TensorBoard, Callback
import keras.backend as K
from sklearn.model_selection import train_test_split

from tqdm import tqdm, trange, tnrange
from sklearn.preprocessing import StandardScaler, Normalizer, MinMaxScaler
from sklearn.decomposition import PCA
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


In [3]:
from data_gen import create_weekly_kplus, create_credit_seq, create_daily_kplus, create_demographic_data
from util import modified_SMAPE
from losses import SMAPE_loss
from callbacks import EvaluateSMAPE

load data gen


In [4]:
demograpgics = pd.read_csv('datasets/exam-1/demographics.csv')
cc = pd.read_csv('datasets/exam-1/cc.csv')
cc.sort_values(by=['cc_no', 'pos_dt'], inplace=True)
kplus = pd.read_csv('datasets/exam-1/kplus.csv')
kplus.sort_values(by=['id', 'sunday'], inplace=True)
train_set = pd.read_csv('datasets/exam-1/train.csv')
test_set = pd.read_csv('datasets/exam-1/test.csv')
raw_demographics = pd.read_csv('datasets/exam-1/raw_demograhgics.csv')

In [5]:
scaled_kplus = kplus.copy()
kplus_scaler =  StandardScaler()
scaled_kplus[['kp_txn_count', 'kp_txn_amt']] = kplus_scaler.fit_transform(kplus[['kp_txn_count', 'kp_txn_amt']])

In [6]:
cc_persons = pd.read_csv('datasets/exam-1/cc_persons.csv')
cc_scaler = StandardScaler()
cc_persons[['cc_txn_amt', 'count']] = cc_scaler.fit_transform(cc_persons[['cc_txn_amt', 'count']])

In [7]:
train_scaler = StandardScaler()
scaled_train_set = train_set.copy()
scaled_train_set.set_index('id', inplace=True)
scaled_train_set['income'] = train_scaler.fit_transform(np.expand_dims(scaled_train_set['income'], axis=2))

  after removing the cwd from sys.path.


# KPLUS

In [9]:
train_kplus = train_kplus[train_kplus['id'] <= 50000]

In [10]:
train_kplus.head()

Unnamed: 0,id,sunday,kp_txn_count,kp_txn_amt
822506,1,2018-06-03,-0.567504,-0.21137
822505,1,2018-06-17,-0.567504,-0.20746
100625,2,2018-01-07,-0.399415,-0.218408
100629,2,2018-01-14,-0.231325,-0.198075
100641,2,2018-01-21,-0.399415,-0.218408


In [11]:
padding_value = float(-100)
xs = create_weekly_kplus(train_kplus, padding_value)
ys = scaled_train_set.loc[train_kplus['id'].drop_duplicates().to_numpy()].to_numpy()

In [14]:
base_activation = 'tanh'
model = Sequential()
model.add(InputLayer((xs.shape[1], 2)))
model.add(Masking(padding_value, ))
model.add(GRU(50, activation=base_activation, return_sequences=True))
model.add(GRU(50, activation=base_activation, return_sequences=True))
model.add(Dropout(0.5))
model.add(GRU(25, activation=base_activation, return_sequences=True))
model.add(GRU(25, activation=base_activation, return_sequences=True))
model.add(Dropout(0.5))
model.add(GRU(25, activation=base_activation))
model.add(Dense(1, activation='linear'))

In [15]:
# opt = SGD(lr=4e-2, momentum=0.1)
opt = Adam()

In [17]:
model.compile(loss=SMAPE_loss(kplus_scaler), optimizer=opt )
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 25, 2)             0         
_________________________________________________________________
masking_2 (Masking)          (None, 25, 2)             0         
_________________________________________________________________
gru_6 (GRU)                  (None, 25, 50)            7950      
_________________________________________________________________
gru_7 (GRU)                  (None, 25, 50)            15150     
_________________________________________________________________
dropout_3 (Dropout)          (None, 25, 50)            0         
_________________________________________________________________
gru_8 (GRU)                  (None, 25, 25)            5700      
_________________________________________________________________
gru_9 (GRU)                  (None, 25, 25)            3825      
__________

In [18]:
x_train, x_val, y_train, y_val = train_test_split(xs, ys, test_size=0.2)

In [20]:
model.fit(x_train, y_train,
          validation_data=(x_val, y_val),
          shuffle=True,
          batch_size=32,
          callbacks=[EvaluateSMAPE(x_val, y_val, 'val', scaler_y=kplus_scaler, use_SMAPE_loss=True),
                     TensorBoard(log_dir='logs/gru')],
          initial_epoch=0,
         epochs=10000)

Instructions for updating:
Use tf.cast instead.
Train on 27725 samples, validate on 6932 samples
Epoch 1/10000
Epoch 1 | val-SMAPE: 91.72752684189692
Epoch 2/10000
Epoch 2 | val-SMAPE: 92.04023577019237
Epoch 3/10000
 4256/27725 [===>..........................] - ETA: 52s - loss: 0.0854

KeyboardInterrupt: 

# Credit

In [21]:
cc_persons.head()

Unnamed: 0,id,pos_dt,cc_no,cc_txn_amt,count,pos_dt_index
0,1,2018-01-20,98397,0.009926,-0.366991,19
1,1,2018-02-17,196794,0.421319,1.115008,47
2,1,2018-05-13,98397,0.152435,-0.366991,132
3,1,2018-06-14,98397,0.152435,-0.366991,164
4,2,2018-01-04,9740,-0.073428,-0.366991,3


In [25]:
train_cc_persons = cc_persons[cc_persons['id'] <= 50000]

In [62]:
def create_credit_x(cc_persons, padding_value):
    xs = [] # [[cc_txn_amt, count], ...]

    empty_seq = np.ones((max_len_seq, 2), dtype=np.float32) * padding_value
    for _id, group in cc_persons.groupby('id'):
        seq = empty_seq.copy()
        seq[group['pos_dt_index'].to_numpy()] = group[['cc_txn_amt', 'count']].to_numpy()

        xs.append(seq)
    xs = np.asarray(xs)
    return xs

In [28]:
padding_value = float(-100)
xs = create_credit_seq(train_cc_persons, padding_value)
ys = scaled_train_set.loc[train_cc_persons['id'].drop_duplicates().to_numpy()].to_numpy()

In [29]:
xs.shape

(39950, 181, 2)

In [30]:
ys.shape

(39950, 1)

In [66]:
base_activation = 'tanh'
RNN = GRU
model = Sequential()
model.add(InputLayer((xs.shape[1] ,2)))
model.add(Masking(padding_value))
model.add(Bidirectional(RNN(64, return_sequences=True)))
model.add(Activation(base_activation))
model.add(Bidirectional(RNN(64, return_sequences=True)))
model.add(Activation(base_activation))
model.add(Dropout(0.5))
model.add(Bidirectional(RNN(32, return_sequences=True)))
model.add(Activation(base_activation))
model.add(Bidirectional(RNN(32, return_sequences=True)))
model.add(Activation(base_activation))
model.add(Dropout(0.5))
model.add(Bidirectional(RNN(16, )))
model.add(Activation(base_activation))
model.add(Dense(1, activation='linear'))

In [22]:
model = load_model('logs/cc_gru_masking/weights.01-90.25.hdf5')

KeyError: 0

In [67]:
# opt = SGD(lr=4e-2, momentum=0.1)
opt = Adam()

In [68]:
model.compile(loss=SMAPE_loss(train_scaler), optimizer=opt )
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_4 (InputLayer)         (None, 181, 2)            0         
_________________________________________________________________
masking_4 (Masking)          (None, 181, 2)            0         
_________________________________________________________________
bidirectional_6 (Bidirection (None, 181, 128)          25728     
_________________________________________________________________
activation_6 (Activation)    (None, 181, 128)          0         
_________________________________________________________________
bidirectional_7 (Bidirection (None, 181, 128)          74112     
_________________________________________________________________
activation_7 (Activation)    (None, 181, 128)          0         
_________________________________________________________________
dropout_7 (Dropout)          (None, 181, 128)          0         
__________

In [69]:
x_train, x_val, y_train, y_val = train_test_split(xs, ys, test_size=0.2)

In [19]:
model.fit(x_train, y_train,
          validation_data=(x_val, y_val),
          shuffle=True,
          batch_size=32,
          callbacks=[EvaluateSMAPE(x_val, y_val, 'val', scaler_y=train_scaler, use_SMAPE_loss=True),
                     TensorBoard(log_dir='logs/cc_gru')],
          initial_epoch=0,
         epochs=10000)

NameError: name 'model' is not defined

In [74]:
model.save('logs/cc_gru_smape/smape:89.635.h5')

# KPLUS + Credit

In [8]:
kplus_ids = set(scaled_kplus[scaled_kplus['id'] <= 50000]['id'].drop_duplicates())
cc_ids = set(cc_persons[cc_persons['id'] <= 50000]['id'].drop_duplicates())
print(len(kplus_ids), len(cc_ids), len(kplus_ids.union(cc_ids)), len(kplus_ids.intersection(cc_ids)))
ids = kplus_ids.union(cc_ids)
ids_df = pd.DataFrame(sorted(list(ids)), columns=['id'])

38056 39950 47174 30832


In [9]:
train_kplus = pd.merge(scaled_kplus, ids_df, on='id', how='right')
train_cc_persons = pd.merge(cc_persons, ids_df, on='id', how='right')

In [15]:
padding_value = float(-100)
kplus_xs = create_daily_kplus(train_kplus, padding_value)
cc_persons_xs = create_credit_seq(train_cc_persons, padding_value)
xs = np.concatenate((kplus_xs, cc_persons_xs), axis=2)

In [17]:
ys = scaled_train_set.loc[ids]['income'].to_numpy()

In [18]:
base_activation = 'tanh'
RNN = LSTM
model = Sequential()
model.add(InputLayer(xs.shape[1:]))
model.add(Masking(padding_value, ))
model.add(RNN(50, activation=base_activation, return_sequences=True))
model.add(RNN(50, activation=base_activation, return_sequences=True))
model.add(Dropout(0.5))

model.add(RNN(25, activation=base_activation, return_sequences=True))
model.add(RNN(25, activation=base_activation, return_sequences=True))
model.add(Dropout(0.5))

model.add(RNN(25, activation=base_activation))
model.add(Dense(1, activation='linear'))

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


In [19]:
model.compile(loss=SMAPE_loss(train_scaler), optimizer=Adam() )
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 181, 4)            0         
_________________________________________________________________
masking_1 (Masking)          (None, 181, 4)            0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 181, 50)           11000     
_________________________________________________________________
lstm_2 (LSTM)                (None, 181, 50)           20200     
_________________________________________________________________
dropout_1 (Dropout)          (None, 181, 50)           0         
_________________________________________________________________
lstm_3 (LSTM)                (None, 181, 25)           7600      
_________________________________________________________________
lstm_4 (LSTM)                (None, 181, 25)           5100      
__________

In [20]:
x_train, x_val, y_train, y_val = train_test_split(xs, ys, test_size=0.2)

In [21]:
model.fit(x_train, y_train,
          validation_data=(x_val, y_val),
          shuffle=True,
          batch_size=32,
          callbacks=[EvaluateSMAPE(x_val, y_val, 'val', scaler_y=train_scaler, use_SMAPE_loss=True),
                     TensorBoard(log_dir='logs/kplus_cc_lstm')],
          initial_epoch=0,
         epochs=100)

Instructions for updating:
Use tf.cast instead.
Train on 37739 samples, validate on 9435 samples
Epoch 1/100
Epoch 1 | val-SMAPE: 89.6072398622041
Epoch 2/100
Epoch 2 | val-SMAPE: 90.02740939734598
Epoch 3/100
Epoch 3 | val-SMAPE: 90.04824645758055
Epoch 4/100
Epoch 4 | val-SMAPE: 90.1014030134482
Epoch 5/100
Epoch 5 | val-SMAPE: 90.08751664723299
Epoch 6/100
Epoch 6 | val-SMAPE: 90.30785536428071
Epoch 7/100

KeyboardInterrupt: 

In [22]:
model.save('logs/kplus_cc_lstm/ep6-val_SMAPE:90.3078.h5')

# DEMOGRAPHIC + CC + KPLUS

In [37]:
ids_df = pd.DataFrame(demograpgics[demograpgics['id'] <= 50000]['id'].drop_duplicates(), columns=['id'])

In [39]:
demographic_xs = create_demographic_data(raw_demographics[raw_demographics['id'] <= 50000])

In [38]:
train_kplus = pd.merge(scaled_kplus, ids_df, on='id', how='right')
train_cc_persons = pd.merge(cc_persons, ids_df, on='id', how='right')

padding_value = float(-100)
kplus_xs = create_daily_kplus(train_kplus, padding_value)
cc_persons_xs = create_credit_seq(train_cc_persons, padding_value)
transaction_xs = np.concatenate((kplus_xs, cc_persons_xs), axis=2)

In [41]:
ys = scaled_train_set.loc[:50001]['income'].to_numpy()

In [42]:
demographic_xs.shape, transaction_xs.shape, ys.shape

((50000, 21), (50000, 181, 4), (50000,))

In [45]:
demo_x_train, demo_x_val, transaction_x_train, transaction_x_val, y_train, y_val \
= train_test_split(demographic_xs, transaction_xs, ys, test_size=0.2)

In [8]:
transaction_model = load_model('logs/kplus_cc_lstm/ep6-val_SMAPE:90.3078.h5'
                               , custom_objects={'loss_func':SMAPE_loss(train_scaler)})

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Use tf.cast instead.


In [28]:
transaction_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 181, 4)            0         
_________________________________________________________________
masking_1 (Masking)          (None, 181, 4)            0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 181, 50)           11000     
_________________________________________________________________
lstm_2 (LSTM)                (None, 181, 50)           20200     
_________________________________________________________________
dropout_1 (Dropout)          (None, 181, 50)           0         
_________________________________________________________________
lstm_3 (LSTM)                (None, 181, 25)           7600      
_________________________________________________________________
lstm_4 (LSTM)                (None, 181, 25)           5100      
__________

In [49]:
for layer in transaction_model.layers:
    layer.trainable = True

In [18]:
demographic_model = Sequential()
demographic_model.add(InputLayer((21,), name='demo_input'))
demographic_model.add(Dense(32, activation='relu'))
demographic_model.add(Dense(16, activation='relu'))
demographic_model.add(Dense(1))

In [19]:
demographic_model.compile(optimizer='adam', loss='mse')
demographic_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
demo_input (InputLayer)      (None, 21)                0         
_________________________________________________________________
dense_15 (Dense)             (None, 32)                704       
_________________________________________________________________
dense_16 (Dense)             (None, 16)                528       
_________________________________________________________________
dense_17 (Dense)             (None, 1)                 17        
Total params: 1,249
Trainable params: 1,249
Non-trainable params: 0
_________________________________________________________________


In [23]:
x = Concatenate()([demographic_model.layers[-2].output, transaction_model.layers[-2].output])
x = Dense(64)(x)
x = Dense(64)(x)
x = Dense(1)(x)
model = Model(inputs=[demographic_model.input, transaction_model.input], outputs=x)

In [50]:
model.compile(optimizer='adam', loss=SMAPE_loss(train_scaler))
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 181, 4)       0                                            
__________________________________________________________________________________________________
masking_1 (Masking)             (None, 181, 4)       0           input_1[0][0]                    
__________________________________________________________________________________________________
lstm_1 (LSTM)                   (None, 181, 50)      11000       masking_1[0][0]                  
__________________________________________________________________________________________________
lstm_2 (LSTM)                   (None, 181, 50)      20200       lstm_1[0][0]                     
__________________________________________________________________________________________________
dropout_1 

In [53]:
model.fit([demo_x_train, transaction_x_train], y_train,
          validation_data=([demo_x_val, transaction_x_val], y_val),
          shuffle=True,
          batch_size=32,
          callbacks=[EvaluateSMAPE(None, y_val, 'val', scaler_y=train_scaler, use_SMAPE_loss=True),
                     TensorBoard(log_dir='logs/demo_kplus_cc_lstm')],
          initial_epoch=4,
         epochs=12)

Train on 40000 samples, validate on 10000 samples
Epoch 5/12
Epoch 5 | val-SMAPE: 91.1124360704422
Epoch 6/12
Epoch 6 | val-SMAPE: 91.08383689522744
Epoch 7/12
Epoch 7 | val-SMAPE: 91.15018265843392
Epoch 8/12
Epoch 8 | val-SMAPE: 91.2273552262783
Epoch 9/12
Epoch 9 | val-SMAPE: 90.73999512553215
Epoch 10/12
Epoch 10 | val-SMAPE: 91.06627345800399
Epoch 11/12
Epoch 11 | val-SMAPE: 91.20140267252923
Epoch 12/12
Epoch 12 | val-SMAPE: 91.10249617576599


<keras.callbacks.History at 0x1546ee250>

In [52]:
model.save('logs/demo_kplus_cc_lstm/ep4-val_SMAPE:91.1145.h5')

In [54]:
model.save('logs/demo_kplus_cc_lstm/saved.h5')