# Classify images using data of only 1 user

Take all data from 1 user and split it into groups by image id.

Learn to classify True/Fake patches, then combine them into True/Fake images by majority vote.

In [1]:
import os
import numpy as np
import pandas as pd

import skelm

from matplotlib import pyplot as plt
import seaborn as sn
%matplotlib inline

In [2]:
import warnings
from sklearn.exceptions import DataConversionWarning
# Suppress scikit-learn's DataConversionWarning for the rest of the session.
warnings.filterwarnings('ignore', category=DataConversionWarning)

In [3]:
from sklearn.model_selection import GroupKFold, cross_val_score, GridSearchCV, GroupShuffleSplit, RandomizedSearchCV
import scipy
from sklearn.utils.fixes import loguniform
from time import time
import pickle

In [4]:
# Experiment settings. The paths are absolute and machine-specific.
# `overlap` selects which pre-built pickle is loaded (it is substituted into
# the file name below) -- presumably the patch overlap setting; confirm.
overlap = '50p'
# NOTE(review): `samples` and `data_folder` are not used in the visible cells.
samples = 10_000_000 // 30
data_folder = "/Users/akusok/wrkdir/research-signatures-data/MCYTDB"

data_file = "/Users/akusok/wrkdir/research-signatures-data/MCYTD_overlap{}.pkl".format(overlap)

# 

## Load data and run default ELM

In [5]:
# Load the pickled dataset and replace its index with a fresh 0..n-1 range
# (the old index is dropped, not kept as a column).
# NOTE(review): unpickling can execute arbitrary code -- only load trusted files.
with open(data_file, 'rb') as fin:
    data = pickle.load(fin).reset_index(drop=True)

In [6]:
# Target vector taken from the 'sig_true' column
# (presumably 1 = genuine signature, 0 = forgery -- confirm against the dataset).
Y = data['sig_true'].to_numpy()

In [7]:
# Feature matrix: all columns from label '0' through label '1023'
# (a label-based .loc slice, so both end labels are included).
X = data.loc[:, '0':'1023'].to_numpy()

In [8]:
# Group key per row, taken from the 'uid' column (presumably the user id);
# used as the `groups` argument of the grouped cross-validation split.
G = data['uid'].to_numpy()

In [9]:
# Global normalisation: a single mean and a single std over ALL elements of X
# (not per feature).
X = X - X.mean()
# Floor the scale at 0.5 so a small std cannot blow the values up.
X = X / np.clip(X.std(), 0.5, None)
# Limit outliers to the range [-5, 5].
X = np.clip(X, -5, 5)

In [10]:
# Per-row 'fid' value (presumably the source image/file id); patch-level
# results are later aggregated per distinct value of this array.
F = data['fid'].to_numpy()

In [11]:
# Drop the DataFrame reference now that Y, X, G and F have been extracted.
del data

#  

## Training 

In [12]:
# ELM classifier with hand-picked hyperparameters.
# NOTE(review): presumably alpha is the regularisation strength and n_neurons
# the hidden-layer size -- confirm against the skelm documentation.
model = skelm.ELMClassifier(alpha=0.8, n_neurons=10000)

In [13]:
# 25-fold cross-validation with rows grouped by G, so that a group never
# appears on both the training and the validation side of the same fold.
gkf = GroupKFold(n_splits=25)
res = []
for i, (ti, vi) in enumerate(gkf.split(X, Y, G)):
    model.fit(X[ti], Y[ti])
    yh = model.predict(X[vi])
    yv, gv, fv = Y[vi], G[vi], F[vi]

    # One row per distinct file id in this fold:
    # [file id, mean true label, mean predicted label] over that file's rows.
    for j in set(fv):
        mask = fv == j
        res.append([j, yv[mask].mean(), yh[mask].mean()])

    # Checkpoint: `res` is cumulative, so file i holds folds 0..i.
    with open("res_gkf_{}.pkl".format(i), "wb") as fout:
        pickle.dump(res, fout)

    # Progress indicator: the most recently appended row.
    print(res[-1])

[2157, 1.0, 0.71900826446281]
[330, 0.0, 0.42699724517906334]
[900, 1.0, 0.6391184573002755]
[1440, 0.0, 0.4462809917355372]
[960, 1.0, 0.7823691460055097]
[2047, 1.0, 0.8264462809917356]
[1020, 1.0, 0.8154269972451791]
[1980, 0.0, 0.15151515151515152]
[1950, 1.0, 0.7603305785123967]
[990, 1.0, 0.6418732782369146]
[450, 1.0, 0.628099173553719]
[2040, 0.0, 0.05234159779614325]
[1530, 1.0, 0.8953168044077136]
[1350, 1.0, 0.6831955922865014]
[1535, 1.0, 0.22038567493112948]
[1589, 1.0, 0.5344352617079889]
[2010, 0.0, 0.24242424242424243]
[510, 1.0, 0.4380165289256198]
[1470, 0.0, 0.27823691460055094]
[480, 1.0, 0.928374655647383]
[870, 0.0, 0.2231404958677686]
[1023, 1.0, 0.7768595041322314]
[511, 1.0, 0.44077134986225897]
[1320, 1.0, 0.2727272727272727]
[930, 1.0, 0.1349862258953168]


In [5]:
# Reload the checkpoint written after the last fold (index 25-1 = 24). The
# training loop dumps the cumulative `res` list each fold, so this file
# contains the rows of every fold.
with open("res_gkf_{}.pkl".format(25-1), "rb") as fin:
    res = pickle.load(fin)

In [9]:
# Convert the list of [file id, mean true label, mean predicted label] rows
# into a 2-D array so the columns can be sliced.
res = np.array(res)

In [10]:
# Column 1: mean true label per file, cast to integer class labels.
# `np.int` was deprecated in NumPy 1.20 and removed in 1.24; the builtin `int`
# is the exact replacement and works on every NumPy version.
y = res[:, 1].astype(int)
# Column 2: mean predicted label over the file's rows, used as the file score.
y_pred = res[:, 2]

#  

## EER 

In [13]:
from sklearn.metrics import roc_curve

In [17]:
# ROC curve over the per-file scores, with label 1 as the positive class.
fpr, tpr, threshold = roc_curve(y, y_pred, pos_label=1)
# False-negative rate is the complement of the true-positive rate.
fnr = 1 - tpr

In [19]:
# Threshold at the ROC point where FNR and FPR are closest to each other
# (the equal-error-rate operating point); NaN differences are ignored.
eer_threshold = threshold[np.nanargmin(np.abs(fnr - fpr))]

In [21]:
# Display the selected threshold.
eer_threshold

0.48760330578512395

In [23]:
# EER estimate read from the FPR curve at the point where FNR and FPR are
# closest; on a discrete ROC curve the two rates need not be exactly equal.
EER = fpr[np.nanargmin(np.absolute((fnr - fpr)))]
EER

0.32355555555555554

In [25]:
# Same operating point, read from the FNR curve instead.
# NOTE: this overwrites the FPR-based value previously stored in `EER`.
EER = fnr[np.nanargmin(np.absolute((fnr - fpr)))]
EER

0.32533333333333336

In [None]:
# Display the per-file results.
res

In [18]:
# Distinct group values in `gv`. The training loop reassigns `gv` every fold,
# so this shows the validation groups of the last fold that ran.
set(gv)

{5, 23, 35, 50, 72, 87, 100, 115}

# 90p