In [1]:
import os
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"]="1,2"

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from keras.utils import to_categorical
from keras.models import Model, Sequential
from keras.layers import CuDNNLSTM, Dense, Input, Dropout, Activation
from keras.optimizers import Adam
from keras.utils import multi_gpu_model
from tqdm import tqdm

Using TensorFlow backend.


In [84]:
demo = pd.read_csv('dataset/demo.csv')
demo.rename({
    'c0':'gender',
    'c1':'ocp'}, axis=1, inplace=True)

txn = pd.read_csv('dataset/txn.csv')
_txn = txn.copy()
txn.rename({
    'n3':'date_idx'
}, axis=1, inplace=True)
txn.drop('t0', axis=1, inplace=True)

train_set = pd.read_csv('dataset/train.csv')
test_set = pd.read_csv('dataset/test.csv')

In [3]:
id_ccno = txn[['id', 'old_cc_no']].drop_duplicates()
train_id_ccno = pd.merge(id_ccno, train_set, on='id')

In [4]:
txn_n = txn[['id', 'old_cc_no', 'date_idx', 'n4', 'n5', 'n6', 'n7']].copy()
txn_n['count'] = 1

In [5]:
LENGTH_SEQ = 365
LENGTH_SEQ = 53

In [6]:
txn_n['date_idx'] = txn_n['date_idx'] // 7 + 1 # WEEKLY
txn_n = txn_n.groupby(['id', 'old_cc_no', 'date_idx']).sum()

In [7]:
scaled_txn_n = txn_n

In [8]:
scaled_txn_n.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,n4,n5,n6,n7,count
id,old_cc_no,date_idx,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,7051,1,89000,5181,25825,11475,51
1,7051,2,25000,4286,20996,9450,42
1,7051,3,53500,4155,20589,9225,41
1,7051,4,28300,2562,12798,5625,25
1,7051,5,18000,3284,16822,7200,32


In [9]:
txn_scaler = StandardScaler()
scaled_txn_n[['n4', 'n5', 'n6', 'n7', 'count']] = txn_scaler.fit_transform(txn_n)

In [10]:
scaled_txn_n = scaled_txn_n.reset_index().set_index('old_cc_no')

In [11]:
txn[['old_cc_label', 'c5', 'c6', 'c7']].head()

Unnamed: 0,old_cc_label,c5,c6,c7
0,10,14,10,58
1,10,11,-10,53
2,10,12,-10,58
3,10,11,-10,80
4,10,12,11,58


In [14]:
txn.head()

Unnamed: 0_level_0,id,old_cc_label,c5,c6,c7,date_idx,n4,n5,n6,n7
old_cc_no,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
256451,11592,10,14,10,58,296,500,159,600,3801
256451,11592,10,11,-10,53,218,500,97,800,3801
256451,11592,10,12,-10,58,133,700,101,300,3801
256451,11592,10,11,-10,80,340,4000,95,0,3801
256451,11592,10,12,11,58,306,400,124,300,3801


In [85]:
def preprocess_credit_class(old_cc_no_ids):

    txn_joined = _txn.drop_duplicates(subset='old_cc_no').set_index('old_cc_no')
    txn_joined = txn_joined.loc[old_cc_no_ids]

    txn_joined = txn_joined.drop(['n3', 'n4', 'n5', 'n6', 'n7', 't0'], axis=1)
    txn_joined = txn_joined.reset_index()

    _x = txn_joined[['old_cc_no','old_cc_label', 'c5', 'c6', 'c7']]


    df = pd.DataFrame(columns=['old_cc_no', 'onehot'])

    for (_, old_cc_no, old_cc_label, c5, c6, c7) in _x.itertuples():
        old_cc_label = to_categorical(old_cc_label, 13)
        c5 = to_categorical(c5, 100)
        c6 = to_categorical(c6, 79)
        c7 = to_categorical(c7, 95)
        onehot = np.concatenate((old_cc_label, c5, c6, c7))
        df = df.append({
            'old_cc_no': old_cc_no,
            'onehot': onehot
        }, ignore_index=True)

    return df

In [86]:
padding_value = txn_scaler.transform([[0, 0, 0, 0, 0]])
def create_seq_from_group(group):
    seq = np.ones((LENGTH_SEQ, 5)) * padding_value
    group = group.sort_values('date_idx')
    for (_, _, date_idx, n4, n5, n6, n7, count) in group.itertuples():
        seq[int(date_idx) - 1] = [n4, n5, n6, n7, count]
    return seq

def create_seq(txn):
    grouped = txn.groupby(['old_cc_no'])
    seqs = []
    for name, group in tqdm(grouped):
        seq = create_seq_from_group(group)
        seqs.append(seq)
    seqs = np.array(seqs)
    return seqs

def create_seq_dataframe(txn):
    grouped = txn.groupby(['old_cc_no'])
    output_df = pd.DataFrame(columns=['old_cc_no', 'seq'])
    for old_cc_no, group in tqdm(grouped):
        seq = create_seq_from_group(group)
        output_df = output_df.append({
            'old_cc_no': old_cc_no,
            'seq': seq
        }, ignore_index=True, )
    return output_df

In [102]:
batch_size = 32
def create_data(ids):
    
    train_old_cc_nos = pd.merge(train_id_ccno, ids, on='id')['old_cc_no']

    train_txn = scaled_txn_n.loc[train_old_cc_nos]
    train_txn = train_txn.sort_values(['old_cc_no', 'date_idx'])

    seqs = create_seq(train_txn)

    onehot = preprocess_credit_class(train_old_cc_nos.to_list())
    onehot = np.array([v for v in onehot['onehot'].to_numpy()])
    labels = train_id_ccno.set_index('old_cc_no').loc[train_old_cc_nos].sort_values('old_cc_no')['label']

    labels = np.array([to_categorical(label, 13) if not np.isnan(label) else to_categorical(0, 13) for label in labels])
    return onehot, seqs, labels

In [3]:
seqs = np.load('data_saved/seq-weekly.npy')
labels = np.load('data_saved/label-weekly.npy')

In [103]:

ids = train_set[train_set['id'] < 500]['id']
ids

75       245
419      144
516      277
731      378
1227     426
1277      43
1306     116
1751      53
1877      77
1945     304
2373      59
2517     384
2546       2
2550     129
2571     190
2760     239
2791      56
2805     187
2846      41
3015     164
3032     463
3244     126
3378      16
3399      39
3540     261
4078      12
4572     473
4701     309
5131     435
5140     252
        ... 
94307    199
94598     31
94744    490
94899    203
94963    433
95128    451
95133     68
95506    464
95641    194
95643    183
96013    443
96111     48
96283    436
96604      8
96762    150
96848    328
96903     98
97927     33
98068    395
98204     13
98224    273
98573    294
98642    406
98771    151
98837     88
98846    210
99384     25
99604    275
99643    386
99986     92
Name: id, Length: 499, dtype: int64

In [104]:

onehot, seqs, labels = create_data(ids)
# train_old_cc_nos = create_data(ids)


  0%|          | 0/1133 [00:00<?, ?it/s][A
  7%|▋         | 76/1133 [00:00<00:01, 756.45it/s][A
 13%|█▎        | 152/1133 [00:00<00:01, 755.04it/s][A
 20%|██        | 228/1133 [00:00<00:01, 754.27it/s][A
 27%|██▋       | 304/1133 [00:00<00:01, 755.29it/s][A
 33%|███▎      | 379/1133 [00:00<00:01, 752.18it/s][A
 40%|████      | 455/1133 [00:00<00:00, 752.29it/s][A
 47%|████▋     | 531/1133 [00:00<00:00, 752.09it/s][A
 53%|█████▎    | 606/1133 [00:00<00:00, 749.35it/s][A
 60%|██████    | 682/1133 [00:00<00:00, 749.87it/s][A
 67%|██████▋   | 758/1133 [00:01<00:00, 750.02it/s][A
 74%|███████▎  | 833/1133 [00:01<00:00, 748.30it/s][A
 80%|████████  | 908/1133 [00:01<00:00, 748.39it/s][A
 87%|████████▋ | 983/1133 [00:01<00:00, 747.79it/s][A
 93%|█████████▎| 1058/1133 [00:01<00:00, 747.12it/s][A
100%|██████████| 1133/1133 [00:01<00:00, 743.21it/s][A

In [106]:
onehot.shape, seqs.shape, labels.shape

((1133, 287), (1133, 53, 5), (1133, 13))

In [96]:
np.save('data_saved/seq-weekly.npy', seqs)
np.save('data_saved/label-weekly.npy', labels)


In [4]:
model = Sequential()
model.add(CuDNNLSTM(32, return_sequences=True, input_shape=(LENGTH_SEQ, 5)))
model.add(CuDNNLSTM(32, return_sequences=True))
model.add(CuDNNLSTM(32, return_sequences=True))
model.add(CuDNNLSTM(32))
model.add(Dense(13, activation='softmax'))
model.summary()
# model = multi_gpu_model(model, gpus=2)
model.compile(optimizer=Adam(),
              loss='categorical_crossentropy',
             metrics=['acc'])


NameError: name 'LENGTH_SEQ' is not defined

In [15]:
class_weights = [1.09518711e+01, 3.07683547e+00, 3.58078232e-01, 2.61727260e-01,
       4.38833908e-01, 1.35073077e+03, 6.72563039e+00, 9.89546351e+00,
       3.99230769e+00, 1.03902367e+01, 4.49295078e-01, 4.26546559e+01,
       1.18329459e+00]

model.fit(seqs,
          labels,
          shuffle=True,
          class_weight=class_weights,
          validation_split=0.3,
          epochs=100)

W1130 18:19:07.400505 140353152771904 module_wrapper.py:137] From /home/porlolicon/.local/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:422: The name tf.global_variables is deprecated. Please use tf.compat.v1.global_variables instead.



Train on 105357 samples, validate on 45154 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100

KeyboardInterrupt: 