## Batch experiments, same as in Leonardo's paper

    1. Pre-train UID ELM on 50 users
    
    2. Repeat for each batch of 5 users:
        3. Take batch of 5 users, LOO single signature for validation
        4. Train on 50+5 users
        5. Predict on LOO single signature of 5 users

    6. Repeat on true/fake classification ELM

Use the ELM implementation in `HPELM @ GPU` for speed.

In [1]:
from hpelm import HPELM

In [2]:
import os
import pickle
import random
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sn
%matplotlib inline

In [3]:
# Root directory that holds the signature data. The default is the original
# author's local path; set the SIGNATURES_DATA_ROOT environment variable to run
# the notebook on another machine without editing this cell.
data_root = os.environ.get(
    "SIGNATURES_DATA_ROOT", "/Users/akusok/wrkdir/research-signatures-data"
)

data_folder = data_root + "/MCYTDB"
overlap = '50p'  # substituted into the data file name below
samples = 10000000 // 30  # NOTE(review): not referenced by any cell visible in this notebook

# Pickled dataset that the loading cell opens.
data_file = "{}/MCYTD_overlap{}_n200.pkl".format(data_root, overlap)

# 

In [4]:
# Read the pickled dataset and give it a fresh 0..n-1 row index.
# NOTE: pickle.load can execute arbitrary code - only open trusted files.
with open(data_file, 'rb') as fh:
    X0 = pickle.load(fh).reset_index(drop=True)

# One-hot table of user ids: one column per distinct `uid`, rows aligned with X0.
U0 = pd.get_dummies(X0['uid'])

Make a `users` list of user batches with sizes `[50, 5, 5, 5, 5, 5]`.

In [5]:
# Experiment layout: one pre-training batch followed by equal fine-tuning batches.
N_PRETRAIN = 50
BATCH_SIZE = 5
N_BATCHES = 5
SPLIT_SEED = 0  # fixed so that "Restart & Run All" reproduces the same user split

uu = list(X0.uid.unique())
n_needed = N_PRETRAIN + N_BATCHES * BATCH_SIZE
if len(uu) < n_needed:
    # Without this check short or empty batches would be created silently.
    raise ValueError("need at least {} users, found {}".format(n_needed, len(uu)))

# A private generator keeps the shuffle independent of the global `random`
# state, so re-running this cell always yields the same batches.
random.Random(SPLIT_SEED).shuffle(uu)

# users[0] is the pre-training batch, users[1:] are the fine-tuning batches.
users = [uu[:N_PRETRAIN]]
users.extend(uu[i:i + BATCH_SIZE] for i in range(N_PRETRAIN, n_needed, BATCH_SIZE))

Make a data loader that returns the requested batch, optionally setting aside one file per user.

In [6]:
def load_batch(j, loo=False, random_state=None):
    """Load batch `j` of `users` list, as Numpy arrays.

    Only rows of `X0` with `sig_true == 1` whose `uid` belongs to `users[j]`
    are used. Features (columns '0'..'1023') are standardised with a single
    scalar mean and standard deviation computed over the returned training
    features, then clipped to [-5, 5].

    j (int): index into the global `users` list.
    loo (bool, default=False): whether to return single LOO file per user separately.
    random_state (default=None): forwarded to `DataFrame.sample` when the
        held-out files are drawn; pass an int for a reproducible split.

    Returns `(Z, Y)`, or `(Z, Y, Z_loo, Y_loo)` when `loo` is true. `Z*` are
    the feature arrays and `Y*` the matching rows of the one-hot table `U0`.
    """
    data = X0[X0.uid.isin(users[j]) & (X0.sig_true == 1)]

    if loo:
        # Draw one file per user: shuffle the distinct (uid, fid) pairs and
        # keep the first pair of every user. De-duplicating first makes each
        # file equally likely regardless of how many rows it contributes.
        files = data[['uid', 'fid']].drop_duplicates()
        files = files.sample(frac=1.0, replace=False, random_state=random_state)
        fid_loo = files.groupby('uid')['fid'].first()

        # Match on the (uid, fid) pair, not on fid alone: if file ids repeat
        # across users, a file chosen for one user must not also remove the
        # same-numbered file of another user from the training data.
        held_out = data['uid'].map(fid_loo) == data['fid']
        data_loo = data[held_out]
        data = data[~held_out]
        Y_loo = U0.loc[data_loo.index].to_numpy()
        Z_loo = data_loo.loc[:, '0':'1023'].to_numpy()

    Y = U0.loc[data.index].to_numpy()
    Z = data.loc[:, '0':'1023'].to_numpy()

    # Scalar statistics over all training features; the std is floored at 0.5
    # so that near-constant data is not blown up by the division.
    zm = Z.mean()
    zs = Z.std().clip(min=0.5)
    Z = (Z - zm) / zs
    Z = Z.clip(min=-5, max=5)

    if loo:
        # Held-out rows reuse the training statistics.
        Z_loo = (Z_loo - zm) / zs
        Z_loo = Z_loo.clip(min=-5, max=5)
        return Z, Y, Z_loo, Y_loo

    return Z, Y

#  

## pre-train

In [81]:
%%time
# Build the ELM: 10000 hidden neurons in total, split into
# (10000 - 1024) tanh neurons and 1024 linear ones.
n_inputs, n_outputs, n_hidden = 1024, 75, 10000
model = HPELM(n_inputs, n_outputs, norm=1e3, batch=10000)
for count, kind in ((n_hidden - n_inputs, 'tanh'), (n_inputs, 'lin')):
    model.add_neurons(count, kind)

CPU times: user 365 ms, sys: 66.7 ms, total: 431 ms
Wall time: 441 ms


In [82]:
# Batch 0 is the pre-training batch; no validation file is held out here.
Xp, Yp = load_batch(0)

In [83]:
# Feed the pre-training batch to the model; %time reports how long the call takes.
%time model.add_data(Xp, Yp)

processing batch 1/15, eta 0:03:44
processing batch 2/15, eta 0:03:17
processing batch 3/15, eta 0:03:00
processing batch 4/15, eta 0:02:45
processing batch 5/15, eta 0:02:31
processing batch 6/15, eta 0:02:14
processing batch 7/15, eta 0:01:59
processing batch 8/15, eta 0:01:44
processing batch 9/15, eta 0:01:31
processing batch 10/15, eta 0:01:16
processing batch 11/15, eta 0:01:01
processing batch 12/15, eta 0:00:46
processing batch 13/15, eta 0:00:31
processing batch 14/15, eta 0:00:15
processing batch 15/15, eta 0:00:00
CPU times: user 11min 32s, sys: 41.2 s, total: 12min 13s
Wall time: 3min 56s


#### fine-tune

In [84]:
res = []
top_ks = (1, 3, 5, 10)

for k in range(1, 6):
    print(k)
    x_train, y_train, x_val, y_val = load_batch(k, loo=True)

    # Add this batch's users to the data already given to the model, then solve.
    model.add_data(x_train, y_train)
    model.nnet.solve()

    # Class index per validation row: predicted vs. true.
    predicted = model.predict(x_val).argmax(1)
    actual = y_val.argmax(1)

    # For each true class, rank the predicted classes by how often they occur
    # among that class's validation rows (most frequent first, at most 10).
    votes = pd.DataFrame({0: actual, 1: predicted}).groupby(0)
    hits = {}
    for label, rows in votes:
        ranked = list(rows[1].value_counts().index)[:10]
        # 1 if the true class is among the i most frequent predictions, else 0.
        hits[label] = [int(label in ranked[:i]) for i in top_ks]

    res_k = pd.DataFrame.from_dict(hits, orient='index', columns=['top1', 'top3', 'top5' ,'top10'])
    res.append(res_k)

1
processing batch 1/2, eta 0:00:19
processing batch 2/2, eta 0:00:00
2
processing batch 1/2, eta 0:00:16
processing batch 2/2, eta 0:00:00
3
processing batch 1/2, eta 0:00:17
processing batch 2/2, eta 0:00:00
4
processing batch 1/2, eta 0:00:16
processing batch 2/2, eta 0:00:00
5
processing batch 1/2, eta 0:00:16
processing batch 2/2, eta 0:00:00


#### read results

In [85]:
# Stack the per-batch tables row-wise: one row per validated class,
# one column per top-k hit flag.
R = pd.concat(res)
R

Unnamed: 0,top1,top3,top5,top10
5,1,1,1,1
9,1,1,1,1
16,1,1,1,1
33,1,1,1,1
39,1,1,1,1
1,1,1,1,1
6,1,1,1,1
11,1,1,1,1
14,1,1,1,1
43,1,1,1,1


In [86]:
# Column-wise mean of the hit flags, i.e. the top-k accuracy over all validated classes.
R.mean()

top1     1.0
top3     1.0
top5     1.0
top10    1.0
dtype: float64

In [80]:
# NOTE(review): `pass` produces no output, so the table shown under this cell
# presumably survives from an earlier run (its execution count, 80, precedes
# the cells above) - confirm whether it is kept on purpose as a reference result.
pass

top1     0.84
top3     0.96
top5     1.00
top10    1.00
dtype: float64