# Model

In [0]:
import pandas as pd
import numpy as np
import tensorflow as tf

print(tf.__version__)
print(tf.test.is_gpu_available())

2.0.0
True


## 2. Data

In [0]:
transaction = pd.read_csv('../data/transaction.csv',dtype=str)

In [0]:
transaction.sort_values(['clnt_id', 'trans_id', 'trans_seq'], inplace=True)
transaction = transaction.loc[transaction['pd_c'] != 'unknown']
transaction['pd_c'] = transaction['pd_c'].astype(int)
transaction[['buy_am', 'buy_ct']] = transaction[['buy_am', 'buy_ct']].astype(int)
transaction = transaction.loc[(transaction['buy_am'] > 0) & (transaction['buy_ct'] > 0)]
transaction['biz_unit'] = transaction['biz_unit'].apply(lambda x : x[0])

In [0]:
transaction.head()

Unnamed: 0,clnt_id,trans_id,trans_seq,biz_unit,pd_c,de_dt,de_tm,buy_am,buy_ct
583390,2,42449,1,A02,1015,20190704,15:34,46430,1
114604,2,62037,1,A03,92,20190729,23:47,36000,20
118652,2,64691,1,A03,186,20190731,21:25,3790,1
107209,2,64691,2,A03,151,20190731,21:25,3990,1
113315,2,64691,3,A03,351,20190731,21:25,4690,1


In [0]:
split_date = '20190915'
train = transaction.loc[transaction['de_dt'] < split_date]
test = transaction.loc[transaction['de_dt'] >= split_date]

In [0]:
train_online = train.loc[train['biz_unit'] == 'A']
train_offline = train.loc[train['biz_unit'] == 'B']
test_online = test.loc[test['biz_unit'] == 'A']
test_offline = test.loc[test['biz_unit'] == 'B']

## 3. Sequential Model

In [0]:
def prep(data, train):
    data = data.groupby(['clnt_id', 'trans_id'])['pd_c'].apply(list).reset_index()
    data = data.groupby('clnt_id')['pd_c'].apply(list).reset_index()
    
    if train == 'train':
        data = data.loc[data['pd_c'].apply(len) > 1]
    else:
        data['pd_c'] = data['pd_c'].apply(lambda x : list(set(x[0])))
    return data

In [0]:
train_online = prep(train_online, 'train')
train_offline = prep(train_offline, 'train')
test_online = prep(test_online, 'test')
test_offline = prep(test_offline, 'test')

In [0]:
def prep_batch(batch, maxlen):
    batch = [[np.random.choice(s, len(s), replace=False) for s in t] for t in batch]
    batch = [np.concatenate(t)[-maxlen:] for t in batch]
    x = [t[:-1] for t in batch]
    x = np.array([np.pad(t, (maxlen-len(t), 0), 'constant') for t in x])
    y = np.array([np.pad(t, (maxlen-len(t), 0), 'constant') for t in batch])
    return x, y

def build_generator(data, maxlen, batch_size):
    while True:
        batch_idx = np.random.randint(low=0, high=len(data), size=batch_size)
        batch = data.iloc[batch_idx]['pd_c']
        x, y = prep_batch(batch, maxlen)
        yield x, y

In [0]:
# test clnt의 train 기간 구매 횟수
train_num = train.loc[[i in test['clnt_id'].tolist() for i in train['clnt_id']], 'pd_c'].apply(len)
train_num = train_num.value_counts()
train_num.sort_values(ascending=False)

3      421
5      403
4      366
2      349
6      342
      ... 
108      1
92       1
76       1
139      1
189      1
Name: pd_c, Length: 92, dtype: int64

### Network

In [0]:
def build_network(config, model):
    inputs = tf.keras.layers.Input((config['maxlen'],))
    x = tf.keras.layers.Embedding(config['n_item'], config['d_model'], input_length=config['maxlen'])(inputs)
    
    for _ in range(config['n_layer']):
        if model == 'rnn':
            x = tf.keras.layers.SimpleRNN(config['d_model'], return_sequences=True)(x)
        elif model == 'lstm':
            x = tf.keras.layers.LSTM(config['d_model'], return_sequences=True)(x)
        elif model == 'gru':
            x = tf.keras.layers.GRU(config['d_model'], return_sequences=True)(x)
        elif model == 'transformer':
            look_ahead_mask = make_look_ahead_mask(config['maxlen'])
            x = Transformer(2, config['d_model'], 128, 0.2)(x, look_ahead_mask)

    outputs = tf.keras.layers.Dense(config['n_item'], activation='softmax')(x)
    network = tf.keras.Model(inputs, outputs)
    
    network.compile(
        loss = 'sparse_categorical_crossentropy',
        optimizer = 'adam',
        metrics = ['accuracy']
    )
    print(network.summary())
    return network

In [0]:
def train(data, model, config):
    gen = build_generator(data, config['maxlen'], config['batch_size'])
    net = build_network(config, model)
    net.fit_generator(
        gen,
        epochs = 10,
        steps_per_epoch = data.shape[0] // config['batch_size']
    )
    return net

def get_recall_n(y_test, rank, n=10):
    recall_n = []
    for i in range(len(y_test)):
        recall_tmp = np.mean([rank[i][j] <= n for j in y_test[i]])
        recall_n.append(recall_tmp)
    recall_n = np.mean(recall_n)
    recall_n = round(recall_n, 3)
    return recall_n

def evaluate(network, train, test, maxlen):
    data = pd.merge(test, train, how='left', on='clnt_id', suffixes=['_test', '_train'])
    data.dropna(inplace=True)

    x = data['pd_c_train']
    x = [[np.random.choice(s, len(s), replace=False) for s in t] for t in x]
    x = [np.concatenate(t)[-maxlen:] for t in x]
    x = np.array([np.pad(t, (maxlen-len(t), 0), 'constant') for t in x])
    y = data['pd_c_test'].tolist()

    pred = network.predict(x)
    pred = pred[:,-1,:]
    rank = (-pred).argsort().argsort() + 1
    
    recall = []
    for i in [1, 5, 10]:
        recall.append(get_recall_n(y, rank, i))
    print(f'rec@1 : {recall[0]} / rec@5 : {recall[1]} / rec@10 : {recall[2]}')

In [0]:
config = {
    'batch_size' : 32,
    'maxlen' : 100,
    'n_item' : 1669,
    'd_model' : 32,
    'n_layer' : 2
}

In [0]:
rnn_online = train(train_online, 'rnn', config)

Model: "model_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         [(None, 100)]             0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 100, 32)           53408     
_________________________________________________________________
simple_rnn_4 (SimpleRNN)     (None, 100, 32)           2080      
_________________________________________________________________
simple_rnn_5 (SimpleRNN)     (None, 100, 32)           2080      
_________________________________________________________________
dense_2 (Dense)              (None, 100, 1669)         55077     
Total params: 112,645
Trainable params: 112,645
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [0]:
evaluate(rnn_online, train_online, test_online, config['maxlen'])

rec@1 : 0.026 / rec@5 : 0.124 / rec@10 : 0.186


In [0]:
rnn_offline = train(train_offline, 'rnn', config)

Model: "model_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_4 (InputLayer)         [(None, 100)]             0         
_________________________________________________________________
embedding_3 (Embedding)      (None, 100, 32)           53408     
_________________________________________________________________
simple_rnn_6 (SimpleRNN)     (None, 100, 32)           2080      
_________________________________________________________________
simple_rnn_7 (SimpleRNN)     (None, 100, 32)           2080      
_________________________________________________________________
dense_3 (Dense)              (None, 100, 1669)         55077     
Total params: 112,645
Trainable params: 112,645
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [0]:
evaluate(rnn_offline, train_offline, test_offline, config['maxlen'])

rec@1 : 0.027 / rec@5 : 0.12 / rec@10 : 0.196


In [0]:
gru_online = train(train_online, 'gru', config)

Model: "model_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_5 (InputLayer)         [(None, 100)]             0         
_________________________________________________________________
embedding_4 (Embedding)      (None, 100, 32)           53408     
_________________________________________________________________
gru (GRU)                    (None, 100, 32)           6336      
_________________________________________________________________
gru_1 (GRU)                  (None, 100, 32)           6336      
_________________________________________________________________
dense_4 (Dense)              (None, 100, 1669)         55077     
Total params: 121,157
Trainable params: 121,157
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [0]:
evaluate(gru_online, train_online, test_online, config['maxlen'])

rec@1 : 0.022 / rec@5 : 0.109 / rec@10 : 0.177


In [0]:
gru_offline = train(train_offline, 'gru', config)

Model: "model_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_6 (InputLayer)         [(None, 100)]             0         
_________________________________________________________________
embedding_5 (Embedding)      (None, 100, 32)           53408     
_________________________________________________________________
gru_2 (GRU)                  (None, 100, 32)           6336      
_________________________________________________________________
gru_3 (GRU)                  (None, 100, 32)           6336      
_________________________________________________________________
dense_5 (Dense)              (None, 100, 1669)         55077     
Total params: 121,157
Trainable params: 121,157
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [0]:
evaluate(gru_offline, train_offline, test_offline, config['maxlen'])

rec@1 : 0.03 / rec@5 : 0.126 / rec@10 : 0.209
