# Import packages and modules

In [None]:
import time
import numpy as np
import matplotlib.pyplot as plt
import random
from sklearn.metrics import roc_curve
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import regularizers
from tensorflow.keras.layers import Input, Masking, Dense, Dropout, Flatten, Lambda, LSTM, BatchNormalization, Activation, Concatenate, concatenate
import pandas as pd
from tensorflow.keras import backend as K
from tensorflow.keras.models import Model, load_model
from sklearn.utils import shuffle
from tensorflow.keras.optimizers import RMSprop, SGD
# !pip install 'fsspec>=0.3.3'
import os
from tensorflow.keras.callbacks import ModelCheckpoint
import numpy as np
import pandas as pd
from numpy import random as rd
import random

'''
Author: Ahmed Anu Wahab
Date: February 2022
'''

# Select Usable Users

In [None]:
# Directory that is scanned for the per-user feature CSV files.
in_path = 'ml_features/'
# Directory that receives the usable/unusable user archives (.npz).
out_path = 'triplets/'

'''
This function gets the usable Users.
NOTE: Usable users must have 15 sentences
'''
def get_usable_participants(in_path, out_path):
    """Split the users found in ``in_path`` into usable and unusable lists.

    A user is usable when the ``sentence_number`` column of their CSV holds
    exactly the sentence numbers 0..14 (15 sentences), in any order.  The
    user name is the file name without its extension.

    Args:
        in_path: Directory containing one CSV file per user.
        out_path: Directory in which ``usable.npz`` and ``unusable.npz`` are
            written (created if it does not exist).

    Returns:
        None. The two counts are printed and the two lists are saved to disk.
    """
    required_sentences = set(range(15))  # Reference set of sentence numbers
    usable_par, unusable_par = [], []
    # Sorted so that the saved lists do not depend on directory listing order.
    for user_file in sorted(os.listdir(in_path)):
        try:
            # Only the sentence numbers are needed to decide usability.
            data = pd.read_csv(os.path.join(in_path, user_file),
                               usecols=['sentence_number'])
        except (OSError, ValueError) as err:
            # Unreadable/malformed file or missing column: report and move on.
            print('Error found, file skipped!!!', user_file, err)
            continue
        user_sentences = set(data['sentence_number'].unique().tolist())
        user_name = os.path.splitext(user_file)[0]
        # Set comparison: the order in which sentences appear must not matter.
        if user_sentences == required_sentences:
            usable_par.append(user_name)
        else:
            unusable_par.append(user_name)
    print('USABLE: ', len(usable_par))
    print('UNUSABLE: ', len(unusable_par))
    # Compress and export
    if out_path:
        os.makedirs(out_path, exist_ok=True)
    np.savez_compressed(os.path.join(out_path, 'usable.npz'), usable_par)
    np.savez_compressed(os.path.join(out_path, 'unusable.npz'), unusable_par)

# Scan in_path and write usable.npz / unusable.npz into out_path -------
get_usable_participants(in_path, out_path)
#----------------------------------------

# Select Train and Test Users

In [None]:
# Select Train and Test users from the usable users.
all_users = np.load('triplets/usable.npz')['arr_0']
# In-place shuffle so that the split below is random.
np.random.shuffle(all_users)
# The first 68,000 shuffled users train the model; the remainder are held out for testing.
train_users, test_users = np.split(all_users, [68000])
# Persist both user lists.
np.savez_compressed('triplets/train_users.npz', train_users)
np.savez_compressed('triplets/test_users.npz', test_users)

# Data Generator

In [None]:
def _clean_sentence(user_df, sentence_number):
    """Return the feature rows of one sentence, outliers removed, as a 2-D array."""
    # Columns from the fourth one onward are the features.
    rows = user_df[user_df.sentence_number == sentence_number].iloc[:, 3:]
    # A row is an outlier when any timing feature exceeds 5 in absolute value.
    is_outlier = rows[['m', 'ud', 'dd', 'uu']].abs().gt(5).any(axis=1)
    return rows[~is_outlier].values


def _fit_to_length(arr, seq_length, features):
    """Truncate or zero-pad ``arr`` along axis 0 to exactly ``seq_length`` rows."""
    if len(arr) >= seq_length:
        return arr[:seq_length, :]
    padding = np.zeros((seq_length - len(arr), features))
    return np.concatenate([arr, padding])


def get_sample(gen_user, imp_user, seq_length, features, data_path='ml_features/'):
    """Build one (anchor, positive, negative) triplet of keystroke sequences.

    The anchor and the positive are two different random sentences of the
    genuine user; the negative is one random sentence of the imposter.
    Rows with a timing feature (m, ud, dd, uu) above 5 in absolute value are
    treated as outliers and removed before each sequence is truncated or
    zero-padded to ``seq_length`` rows.

    Args:
        gen_user: Name of the genuine user (CSV file name without ``.csv``).
        imp_user: Name of the imposter user (CSV file name without ``.csv``).
        seq_length: Number of rows every returned sequence has.
        features: Number of feature columns; used as the padding width.
        data_path: Directory containing the per-user CSV files.

    Returns:
        Tuple ``(A, P, N)`` of arrays with ``seq_length`` rows each.
    """
    # Two distinct sentence numbers for the genuine user, one for the imposter.
    A_sentence, P_sentence = random.sample(range(15), 2)
    N_sentence = rd.randint(15)

    gen_df = pd.read_csv(os.path.join(data_path, gen_user + '.csv'))
    A_arr = _clean_sentence(gen_df, A_sentence)
    P_arr = _clean_sentence(gen_df, P_sentence)

    imp_df = pd.read_csv(os.path.join(data_path, imp_user + '.csv'))
    N_arr = _clean_sentence(imp_df, N_sentence)

    A = _fit_to_length(A_arr, seq_length, features)
    P = _fit_to_length(P_arr, seq_length, features)
    N = _fit_to_length(N_arr, seq_length, features)
    return A, P, N


def gen_batch(batch_size, seq_length, features=5, users_path='triplets/train_users.npz'):
    """Endlessly yield batches of (anchor, positive, negative) triplets.

    Args:
        batch_size: Number of triplets per batch.
        seq_length: Number of rows in every sequence.
        features: Number of feature columns in every sequence.
        users_path: ``.npz`` archive whose ``arr_0`` entry lists the train users.

    Yields:
        ``((anchors, positives, negatives), y)`` where each of the three
        arrays has shape ``(batch_size, seq_length, features)`` and ``y`` is
        an all-zero array of shape ``(batch_size,)``.

    Raises:
        ValueError: If fewer than two train users are available.
    """
    # Convert once: doing it per sample would copy the whole user list each time.
    train_users = np.load(users_path)['arr_0'].tolist()
    if len(train_users) < 2:
        raise ValueError('At least two train users are required to build a triplet')
    # Label placeholder; it is never written to, so one array is shared by all batches.
    y = np.zeros(shape=(batch_size,))
    # Forever loop for generating batch data
    while True:
        # Fresh buffer per batch so arrays yielded earlier are never overwritten.
        batch = np.zeros(shape=(batch_size, 3, seq_length, features))  # Example (512, 3, 70, 5)
        for i in range(batch_size):
            # First pick is the genuine user, second pick is the imposter.
            gen_user, imp_user = random.sample(train_users, 2)
            if gen_user == imp_user:  # Sanity check
                print('Same user was chosen for gen and imp')
            A, P, N = get_sample(gen_user, imp_user, seq_length, features)
            batch[i, 0, :, :] = A
            batch[i, 1, :, :] = P
            batch[i, 2, :, :] = N
        yield (batch[:, 0, :, :], batch[:, 1, :, :], batch[:, 2, :, :]), y

# Create Base Model

In [None]:
def create_base_network(input_shape):
    """Build the base model that maps one input sequence to its LSTM output.

    Layer order: masking of zero rows, batch normalisation, LSTM(128)
    returning the full sequence, dropout(0.5), batch normalisation, and a
    final LSTM(128) returning only its last output.
    """
    sequence_in = Input(shape=input_shape)
    # Layers are instantiated in the order in which they are applied.
    layer_stack = [
        Masking(mask_value=0.),
        BatchNormalization(),
        LSTM(128, recurrent_dropout=0.20, return_sequences=True),
        Dropout(0.5),
        BatchNormalization(),
        LSTM(128, recurrent_dropout=0.20),
    ]
    hidden = sequence_in
    for layer in layer_stack:
        hidden = layer(hidden)
    return Model(sequence_in, hidden)

def triplet_loss(y_true, y_pred):
    """Triplet loss over stacked (anchor, positive, negative) outputs.

    Args:
        y_true: Unused; accepted only to match the ``loss(y_true, y_pred)``
            signature.
        y_pred: Tensor whose first axis indexes, in order, the anchor, the
            positive and the negative outputs.

    Returns:
        The sum of ``max(0, margin + d(anchor, positive) - d(anchor, negative))``
        where ``d`` is the squared Euclidean distance along the last axis.
    """
    margin = 1.5
    anchor, positive, negative = y_pred[0], y_pred[1], y_pred[2]
    # TensorFlow ops are used directly instead of the Keras backend helpers.
    positive_dist = tf.reduce_sum(tf.square(anchor - positive), axis=-1)
    negative_dist = tf.reduce_sum(tf.square(anchor - negative), axis=-1)
    return tf.reduce_sum(tf.nn.relu(margin + positive_dist - negative_dist))

'''
 Allocate GPU space for training.
 This allocates 80% of GPU memory (a fraction of 0.8). This fraction can be changed as needed.
'''
# Fraction (0.8) of GPU memory this process is allowed to use.
gpu_options = tf.compat.v1.GPUOptions(per_process_gpu_memory_fraction=0.8)
# TF1-compat session built with that limit; device placement logging is switched on.
sess = tf.compat.v1.Session(config=tf.compat.v1.ConfigProto(log_device_placement=True, gpu_options=gpu_options))

# Train the Model

In [None]:
# Checkpoint callback: keep only the weights with the lowest training loss.
checkpoint = ModelCheckpoint('triplets/new_model/triplet_model(alpha=1.5).h5', monitor='loss', verbose=1, save_best_only=True, mode='min')

# Training configuration
length = 70            # rows per input sequence
steps_per_epoch = 100  # batches drawn from the generator per epoch
epoch = 150
seq_per_batch = 512    # triplets per batch
input_shape = (length, 5)

# One shared base network embeds the anchor, positive and negative inputs.
base_network = create_base_network(input_shape)
input_anc, input_pos, input_neg = Input(shape=input_shape), Input(shape=input_shape), Input(shape=input_shape)
processed_anc, processed_pos, processed_neg = base_network(input_anc), base_network(input_pos), base_network(input_neg)
# Stacked on a new first axis, which is how triplet_loss indexes y_pred.
output = tf.stack([processed_anc, processed_pos, processed_neg])
model = Model([input_anc, input_pos, input_neg], output)

adam = keras.optimizers.Adam(learning_rate=0.05, beta_1=0.9, beta_2=0.999, epsilon=1e-08)
# Pass the configured optimizer object: the string 'adam' would silently
# ignore the settings above and train with the default Adam parameters.
model.compile(loss=triplet_loss, optimizer=adam)
model.summary()

hst = model.fit(gen_batch(seq_per_batch, length), steps_per_epoch=steps_per_epoch, epochs=epoch, verbose=1, callbacks=[checkpoint])

# Plot the training loss
train_loss = next(iter(hst.history.values()))  # first series stored in the history
plt.plot(train_loss, '-b', label="Train loss")
plt.xlabel('epoch')
plt.ylabel('loss')
plt.title('model loss')
plt.legend(['train'], loc='upper left')
plt.show()