In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from keras.models import Sequential
from keras.layers import LSTM, GRU, CuDNNGRU, Bidirectional, Dense, InputLayer, Dropout, BatchNormalization, Reshape, Flatten, Conv1D, AveragePooling1D, Activation
from keras.optimizers import Adam, SGD
from keras.callbacks import TensorBoard, Callback
import keras.backend as K
from sklearn.model_selection import train_test_split
from util import modified_SMAPE
from losses import SMAPE_loss
from callbacks import EvaluateSMAPE
from tqdm import tqdm, trange, tnrange
from sklearn.preprocessing import StandardScaler, Normalizer, MinMaxScaler
from sklearn.decomposition import PCA
from keras.preprocessing.sequence import pad_sequences

In [4]:
demograpgics = pd.read_csv('datasets/exam-1/demographics.csv')
cc = pd.read_csv('datasets/exam-1/cc.csv')
cc.sort_values(by=['cc_no', 'pos_dt'], inplace=True)
kplus = pd.read_csv('datasets/exam-1/kplus.csv')
kplus.sort_values(by=['id', 'sunday'], inplace=True)
train_set = pd.read_csv('datasets/exam-1/train.csv')
test_set = pd.read_csv('datasets/exam-1/test.csv')

# KPLUS

In [5]:
n_transaction_threshold = 8

train_kplus = kplus.copy()
grouped = train_kplus.groupby('id')
trainable_ids = grouped.size().keys()[grouped.size() >= n_transaction_threshold]
train_kplus = train_kplus[train_kplus['id'].isin(trainable_ids)]

In [6]:
train_kplus.head()

Unnamed: 0,id,sunday,kp_txn_count,kp_txn_amt
100625,2,2018-01-07,2,600
100629,2,2018-01-14,3,3200
100641,2,2018-01-21,2,600
100644,2,2018-01-28,6,3000
100643,2,2018-02-04,4,13700


In [7]:
Scaler = StandardScaler

In [8]:
scaler_x = Scaler()
train_kplus[['kp_txn_count', 'kp_txn_amt']] = scaler_x.fit_transform(train_kplus[['kp_txn_count', 'kp_txn_amt']])
train_kplus = train_kplus[train_kplus['id'] <= 50000]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [9]:
train_kplus.head()

Unnamed: 0,id,sunday,kp_txn_count,kp_txn_amt
100625,2,2018-01-07,-0.404434,-0.218776
100629,2,2018-01-14,-0.237545,-0.198626
100641,2,2018-01-21,-0.404434,-0.218776
100644,2,2018-01-28,0.263124,-0.200176
100643,2,2018-02-04,-0.070655,-0.117253


In [10]:
padding_value = float(-100)
max_len = len(kplus.groupby('sunday').groups)
sunday_id_hash = {sunday:i for i, sunday in enumerate(kplus.groupby('sunday', sort=True).groups.keys())}

id_grouped = train_kplus.groupby('id')
accept_ids = pd.Series(list(id_grouped.groups.keys()))

def create_sequence(group):
    origin = sunday_id_hash[group.iloc[0]['sunday']]

    seq = group[['kp_txn_amt', 'kp_txn_count']].to_numpy()
    pre_padding = np.ones((origin ,2), dtype=np.float32) * padding_value
    post_padding = np.ones((max_len - origin - len(seq),2), dtype=np.float32) * padding_value
    return np.concatenate((pre_padding, seq, post_padding), axis=0)
xs = np.array([create_sequence(group) for _, group in id_grouped])

ys = train_set[train_set['id'].isin(accept_ids)]['income'].to_numpy()
income_mean = train_set['income'].mean()
income_std = train_set['income'].std()
# ys = (ys - income_mean) / income_std
# ys = (ys - ys.min()) / ys.max()
scaler_y = Scaler()
ys = np.squeeze(scaler_y.fit_transform(np.expand_dims(ys,axis=2)))



In [11]:
xs[-1].astype(int)

array([[   0,    0],
       [   0,    0],
       [   0,    0],
       [   0,    0],
       [   0,    0],
       [   0,    0],
       [   0,    0],
       [   0,    0],
       [   0,    0],
       [   0,    0],
       [   0,    0],
       [   0,    0],
       [   0,    0],
       [   0,    0],
       [   0,    0],
       [   0,    0],
       [   0,    0],
       [   0,    0],
       [   0,    0],
       [-100, -100],
       [-100, -100],
       [-100, -100],
       [-100, -100],
       [-100, -100],
       [-100, -100]])

In [12]:
ys[-1]

-0.6848973340576062

In [274]:
base_activation = 'tanh'
model = Sequential()
model.add(InputLayer((25,2)))
# model.add(BatchNormalization())
# model.add(LSTM(32, return_sequences=True, ))
# model.add(Dropout(0.5))
# model.add(LSTM(32, return_sequences=True))
# model.add(Dropout(0.5))
model.add(GRU(50, activation=base_activation, return_sequences=True))
model.add(GRU(50, activation=base_activation, return_sequences=True))
model.add(Dropout(0.5))
model.add(GRU(25, activation=base_activation, return_sequences=True))
model.add(GRU(25, activation=base_activation, return_sequences=True))
model.add(Dropout(0.5))

model.add(Flatten())
# model.add(Dense(100))
model.add(Dense(1, activation='linear'))

In [13]:
model = Sequential()
base_activation = 'linear'
model.add(Conv1D(32, kernel_size=3, activation=base_activation, input_shape=(25,2)))
model.add(Conv1D(32, kernel_size=3, activation=base_activation))
model.add(Dropout(0.5))
model.add(Conv1D(16, kernel_size=3, activation=base_activation))
model.add(Conv1D(16, kernel_size=3, activation=base_activation))
model.add(Dropout(0.5))
model.add(Conv1D(8, kernel_size=3, activation=base_activation))
model.add(Conv1D(8, kernel_size=3, activation=base_activation))
model.add(Dropout(0.5))
model.add(Conv1D(4, kernel_size=3, activation=base_activation))
model.add(Conv1D(4, kernel_size=3, activation=base_activation))
# model.add(Dropout(0.5))
model.add(Flatten())
model.add(Dense(1, activation='linear'))

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


In [258]:
model = Sequential()
model.add(InputLayer((25,2)))
model.add(Flatten())
model.add(Dense(100, activation='relu'))
model.add(Dense(100, activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(1))

In [14]:
# opt = SGD(lr=4e-2, momentum=0.1)
opt = Adam()

In [15]:
model.compile(loss=SMAPE_loss(scaler_y), optimizer=opt )
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_1 (Conv1D)            (None, 23, 32)            224       
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 21, 32)            3104      
_________________________________________________________________
dropout_1 (Dropout)          (None, 21, 32)            0         
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 19, 16)            1552      
_________________________________________________________________
conv1d_4 (Conv1D)            (None, 17, 16)            784       
_________________________________________________________________
dropout_2 (Dropout)          (None, 17, 16)            0         
_________________________________________________________________
conv1d_5 (Conv1D)            (None, 15, 8)             392       
__________

In [16]:
x_train, x_val, y_train, y_val = train_test_split(xs, ys, test_size=0.2)

In [18]:
model.fit(x_train, y_train,
          validation_data=(x_val, y_val),
          shuffle=True,
          batch_size=32,
          callbacks=[EvaluateSMAPE(x_val, y_val, 'val', scaler_y=scaler_y, use_SMAPE_loss=True),
                     TensorBoard(log_dir='logs/cnn-smape')],
          initial_epoch=0,
         epochs=10000)

Train on 27725 samples, validate on 6932 samples
Epoch 1/10000
Epoch 1 | val-SMAPE: 85.04502805701172
Epoch 2/10000
Epoch 2 | val-SMAPE: 87.984437422383
Epoch 3/10000
Epoch 3 | val-SMAPE: 88.2607059285638
Epoch 4/10000
Epoch 4 | val-SMAPE: 88.98974448502648
Epoch 5/10000
Epoch 5 | val-SMAPE: 88.96326039285242
Epoch 6/10000
Epoch 6 | val-SMAPE: 89.19485844822199
Epoch 7/10000
Epoch 7 | val-SMAPE: 88.30363807293699
Epoch 8/10000
Epoch 8 | val-SMAPE: 89.18695283147619
Epoch 9/10000
Epoch 9 | val-SMAPE: 89.33297288483165
Epoch 10/10000
Epoch 10 | val-SMAPE: 89.13977876890277
Epoch 11/10000
Epoch 11 | val-SMAPE: 89.51805483698158
Epoch 12/10000
Epoch 12 | val-SMAPE: 89.49005603618313
Epoch 13/10000
Epoch 13 | val-SMAPE: 89.53882894821065
Epoch 14/10000
Epoch 14 | val-SMAPE: 89.54107987308048
Epoch 15/10000
Epoch 15 | val-SMAPE: 89.55072437279223
Epoch 16/10000
Epoch 16 | val-SMAPE: 89.47588748739796
Epoch 17/10000
Epoch 17 | val-SMAPE: 89.58644758452071
Epoch 18/10000
 4832/27725 [====>....

KeyboardInterrupt: 

# Credit

In [3]:
cc_id = pd.merge(cc, demograpgics[['id','cc_no']], on='cc_no')

In [4]:
cc_id['count'] = 1

In [5]:
cc_id.head(10)

Unnamed: 0,cc_no,pos_dt,cc_txn_amt,id,count
0,2,2018-03-10,800,2,1
1,2,2018-03-12,3800,2,1
2,2,2018-04-27,14700,2,1
3,2,2018-04-29,4000,2,1
4,2,2018-05-07,800,2,1
5,2,2018-05-14,800,2,1
6,2,2018-06-04,1000,2,1
7,2,2018-06-11,1000,2,1
8,4,2018-05-11,20000,4,1
9,4,2018-05-11,30000,4,1


In [6]:
grouped = cc_id.groupby(['id', 'pos_dt'])

In [7]:
cc_persons = grouped.sum().reset_index()
cc_persons.head(10)

Unnamed: 0,id,pos_dt,cc_no,cc_txn_amt,count
0,1,2018-01-20,98397,4700,1
1,1,2018-02-17,196794,20000,2
2,1,2018-05-13,98397,10000,1
3,1,2018-06-14,98397,10000,1
4,2,2018-01-04,9740,1600,1
5,2,2018-01-28,29220,3500,3
6,2,2018-01-29,9740,1500,1
7,2,2018-02-04,9740,1100,1
8,2,2018-02-11,9740,800,1
9,2,2018-02-19,9740,800,1


In [8]:
Scaler = StandardScaler
cc_scaler = Scaler()
train_scaler = Scaler()
cc_persons[['cc_txn_amt', 'count']] = cc_scaler.fit_transform(cc_persons[['cc_txn_amt', 'count']])
scaled_train_set = train_set.copy()
scaled_train_set['income'] = train_scaler.fit_transform(np.expand_dims(scaled_train_set['income'].to_numpy(), axis=1))

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [9]:
cc_persons.head()

Unnamed: 0,id,pos_dt,cc_no,cc_txn_amt,count
0,1,2018-01-20,98397,0.009926,-0.366991
1,1,2018-02-17,196794,0.421319,1.115008
2,1,2018-05-13,98397,0.152435,-0.366991
3,1,2018-06-14,98397,0.152435,-0.366991
4,2,2018-01-04,9740,-0.073428,-0.366991


In [10]:
scaled_train_set.head()

Unnamed: 0,id,income
0,1,-0.525099
1,2,1.090529
2,3,-0.356022
3,4,0.245142
4,5,-0.562672


In [11]:
cc_persons = cc_persons[cc_persons['id'] <= 50000]

In [12]:
origin_timestamp = pd.Timestamp(cc['pos_dt'].min())
max_len_seq = (pd.Timestamp(cc['pos_dt'].max()) - origin_timestamp).days + 1
cc_persons['pos_dt_index'] = cc_persons.apply(lambda row: (pd.Timestamp(row['pos_dt']) - origin_timestamp).days, axis=1)

In [13]:
padding_value = float(-100)
xs = [] # [[cc_txn_amt, count], ...]
ys = []
train_id_set = scaled_train_set.set_index('id')
for _id, group in cc_persons.groupby('id'):
    seq = np.ones((max_len_seq, 2), dtype=np.float32) * padding_value
    seq[group['pos_dt_index'].to_numpy()] = group[['cc_txn_amt', 'count']].to_numpy()
    
    xs.append(seq)
    ys.append(train_id_set.loc[_id]['income'])
xs = np.asarray(xs)
ys = np.asarray(ys)

In [216]:
xs

array([[[-1.0000000e+02, -1.0000000e+02],
        [-1.0000000e+02, -1.0000000e+02],
        [-1.0000000e+02, -1.0000000e+02],
        ...,
        [-1.0000000e+02, -1.0000000e+02],
        [-1.0000000e+02, -1.0000000e+02],
        [-1.0000000e+02, -1.0000000e+02]],

       [[-1.0000000e+02, -1.0000000e+02],
        [-1.0000000e+02, -1.0000000e+02],
        [-1.0000000e+02, -1.0000000e+02],
        ...,
        [-6.8050250e-02, -3.6699140e-01],
        [-1.0000000e+02, -1.0000000e+02],
        [-1.4273417e-02,  1.1150075e+00]],

       [[-1.0000000e+02, -1.0000000e+02],
        [-1.0000000e+02, -1.0000000e+02],
        [-1.0000000e+02, -1.0000000e+02],
        ...,
        [-1.0000000e+02, -1.0000000e+02],
        [-1.0000000e+02, -1.0000000e+02],
        [-1.0000000e+02, -1.0000000e+02]],

       ...,

       [[-1.0000000e+02, -1.0000000e+02],
        [-1.0000000e+02, -1.0000000e+02],
        [-1.0000000e+02, -1.0000000e+02],
        ...,
        [-9.4938666e-02, -3.6699140e-01],
     

In [215]:
ys

array([-0.52509903,  1.09052903,  0.24514225, ..., -0.2057307 ,
       -0.54388541, -0.65660364])

In [48]:
base_activation = 'tanh'
model = Sequential()
model.add(InputLayer((max_len_seq,2)))
model.add(Bidirectional(CuDNNGRU(128, return_sequences=True)))
model.add(Activation(base_activation))
model.add(Bidirectional(CuDNNGRU(128, return_sequences=True)))
model.add(Activation(base_activation))
model.add(Dropout(0.5))
model.add(Bidirectional(CuDNNGRU(64, return_sequences=True)))
model.add(Activation(base_activation))
model.add(Bidirectional(CuDNNGRU(64, return_sequences=True)))
model.add(Activation(base_activation))
model.add(Dropout(0.5))
model.add(Bidirectional(CuDNNGRU(32, )))
model.add(Activation(base_activation))
# model.add(Flatten())
# model.add(Dense(100))
model.add(Dense(1, activation='linear'))

In [45]:
model = Sequential()
base_activation = 'relu'
model.add(Conv1D(32, kernel_size=3, activation=base_activation, input_shape=(max_len_seq,2)))
model.add(Conv1D(32, kernel_size=3, activation=base_activation))
model.add(Dropout(0.5))
model.add(Conv1D(16, kernel_size=3, activation=base_activation))
model.add(Conv1D(16, kernel_size=3, activation=base_activation))
model.add(Dropout(0.5))
model.add(Conv1D(8, kernel_size=3, activation=base_activation))
model.add(Conv1D(8, kernel_size=3, activation=base_activation))
model.add(Dropout(0.5))
model.add(Conv1D(4, kernel_size=3, activation=base_activation))
model.add(Conv1D(4, kernel_size=3, activation=base_activation))
# model.add(Dropout(0.5))
model.add(Flatten())
model.add(Dense(1, activation='linear'))

In [63]:
model = Sequential()
model.add(InputLayer((max_len_seq,2)))
model.add(Flatten())
model.add(Dense(100, activation='relu'))
model.add(Dense(100, activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(1))

In [50]:
# opt = SGD(lr=4e-2, momentum=0.1)
opt = Adam()

In [51]:
model.compile(loss=SMAPE_loss(train_scaler), optimizer=opt )
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bidirectional_1 (Bidirection (None, 181, 256)          101376    
_________________________________________________________________
activation_1 (Activation)    (None, 181, 256)          0         
_________________________________________________________________
bidirectional_2 (Bidirection (None, 181, 256)          296448    
_________________________________________________________________
activation_2 (Activation)    (None, 181, 256)          0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 181, 256)          0         
_________________________________________________________________
bidirectional_3 (Bidirection (None, 181, 128)          123648    
_________________________________________________________________
activation_3 (Activation)    (None, 181, 128)         

In [52]:
x_train, x_val, y_train, y_val = train_test_split(xs, ys, test_size=0.2)

In [55]:
model.fit(x_train, y_train,
          validation_data=(x_val, y_val),
          shuffle=True,
          batch_size=32,
          callbacks=[EvaluateSMAPE(x_val, y_val, 'val', scaler_y=train_scaler),
                     TensorBoard(log_dir='logs/cc_gru_smape')],
          initial_epoch=0,
         epochs=10000)

W1028 20:19:17.997876 140082931795776 deprecation.py:323] From /usr/local/lib/python3.6/dist-packages/tensorflow_core/python/ops/math_grad.py:1375: where (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
W1028 20:19:19.708583 140082931795776 module_wrapper.py:137] From /home/porlolicon/.local/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:422: The name tf.global_variables is deprecated. Please use tf.compat.v1.global_variables instead.

W1028 20:19:19.714822 140082931795776 module_wrapper.py:137] From /home/porlolicon/.local/lib/python3.6/site-packages/keras/callbacks/tensorboard_v1.py:200: The name tf.summary.merge_all is deprecated. Please use tf.compat.v1.summary.merge_all instead.



Train on 31960 samples, validate on 7990 samples
Epoch 1/10000
Epoch 1 | val-SMAPE: 90.57426353704828
Epoch 2/10000
Epoch 2 | val-SMAPE: 90.30955666196212
Epoch 3/10000
Epoch 3 | val-SMAPE: 90.65647471594993
Epoch 4/10000
Epoch 4 | val-SMAPE: 89.86831715661117
Epoch 5/10000

KeyboardInterrupt: 