# Grid search code

In [9]:
import random
import json
import os
import numpy as np
import tensorflow as tf
import tensorflow_addons as tfa
from time import time
from tqdm import tqdm
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorboard.plugins.hparams import api as hp
os.environ['XLA_FLAGS'] = '--xla_gpu_cuda_data_dir=/opt/conda'

def shuffle_batch(features, labels):
    # Calculate batch size
    batch_size = tf.shape(features)[0]
    # Create an index to shuffle features and labels in the same order
    shuffled_indices = tf.random.shuffle(tf.range(start=0, limit=batch_size))
    # Apply gathered indices to shuffle the batch
    shuffled_features = tf.gather(features, shuffled_indices)
    shuffled_labels = tf.gather(labels, shuffled_indices)
    return shuffled_features, shuffled_labels


def read_text_files(directory):
    labels = []
    features = []
    # List all files in the directory and sort them
    file_paths = sorted([os.path.join(directory, f) for f in os.listdir(directory) if f.endswith('.txt')])
    # Process each file
    for file_path in tqdm(file_paths, total=len(file_paths), desc="Processing text files"):
        # Extract user ID (label) from the filename
        user_id = int(os.path.basename(file_path).split('_')[0])
        # Read the contents of the file, skipping the first line
        with open(file_path, 'r') as file:
            lines = file.readlines()[1:]  # Skip the "TIME_DELTA" line
            keystrokes = [int(line.strip()) for line in lines]  # Convert to integers
        # Append the extracted data to the lists
        labels.append(user_id)
        features.append(keystrokes)
    return labels, features

def calculate_rank_n_accuracy(embeddings1, embeddings2, labels1, labels2, n):
    correct_matches = 0
    for i in range(len(embeddings1)):
        # Compute Euclidean distances from embeddings1[i] to all embeddings2
        distances = np.linalg.norm(embeddings2 - embeddings1[i], axis=1)
        
        # Get the indices of the top 10 closest embeddings in embeddings2
        closest_indices = np.argsort(distances)[:n]
        
        # Check if the correct label is within these top 10 closest embeddings
        if labels1[i] in labels2[closest_indices]:
            correct_matches += 1
    
    # Calculate accuracy
    accuracy = correct_matches / len(embeddings1)
    return accuracy

def create_model():
    model = tf.keras.Sequential([
        tf.keras.layers.LSTM(256, input_shape=(None, 1)),
        tf.keras.layers.Dense(128, activation=None),
        tf.keras.layers.Lambda(lambda x: tf.math.l2_normalize(x, axis=1))
    ])
    return model


def create_dataset(batch_size, shuffle_set):
    # Example usage
    train_labels, train_features = read_text_files('keystrokes-training')
    test_labels, test_features = read_text_files('keystrokes-training')
    train_intervals_tensor = tf.constant(train_features)
    train_labels_tensor = tf.constant(train_labels)
    dataset = tf.data.Dataset.from_tensor_slices((train_intervals_tensor, train_labels_tensor))
    if (shuffle_set == True): 
        dataset = dataset.map(shuffle_batch)
    elif (shuffle_set == False):
        pass
    dataset = dataset.prefetch(buffer_size=tf.data.AUTOTUNE)
    #return dataset, test_labels, test_features

In [10]:
create_dataset(256, False)

Processing text files: 100%|██████████| 233840/233840 [02:46<00:00, 1408.43it/s]
Processing text files: 100%|██████████| 233840/233840 [00:41<00:00, 5665.32it/s]


In [2]:

def evaluate(test_features, test_labels, optimizer,learning_rate):
    intervals_tensor_eval = tf.constant(test_features).numpy()
    labels_tensor_eval = tf.constant(test_labels).numpy()
    labels1 = labels_tensor_eval[::2]  
    labels2 = labels_tensor_eval[1::2]
    input_sequences1 = intervals_tensor_eval[::2]  
    input_sequences2 = intervals_tensor_eval[1::2]

    # Recreate the model architecture
    recreated_model = tf.keras.Sequential([
        tf.keras.layers.LSTM(256, input_shape=(None, 1)),
        #tf.keras.layers.Dropout(.2),
        tf.keras.layers.Dense(128, activation=None),
        tf.keras.layers.Lambda(lambda x: tf.math.l2_normalize(x, axis=1))
    ])
    opt = optimizer(learning_rate=learning_rate)
    # Compile the recreated model
    recreated_model.compile(
        optimizer=opt,
        loss=tfa.losses.TripletSemiHardLoss()
    )
                    # Load the weights
    recreated_model.load_weights('./saved_model/taunet_model_test.h5')

    # Generate embeddings
    embeddings1 = recreated_model.predict(input_sequences1)
    embeddings2 = recreated_model.predict(input_sequences2)
    accuracies = calculate_rank_n_accuracy(embeddings1, embeddings2, labels1, labels2, 1)
    return accuracies

In [None]:
def grid_search(param_grid, model_checkpoint_callback):
    best_accuracy = 0
    best_params = {}
    
    for learning_rate in param_grid['learning_rate']:
        for shuffle_set in param_grid['shuffle']:
            for batch_size in param_grid['batch_size']:
                for optimizer in param_grid['optimizer']:
                    #print(f"Trying batch size {batch_size}, learning rate {learning_rate}, optimizer {optimizer}")
                    dataset, test_labels, test_features = create_dataset(batch_size, shuffle_set)
                    model = create_model()
                    opt = optimizer(learning_rate=learning_rate)
                    model.compile(optimizer=opt, loss=tfa.losses.TripletSemiHardLoss())
                    history = model.fit(dataset, epochs=50, verbose=True, callbacks=[model_checkpoint_callback])  # Adjust epochs as needed
                    # Define your parameter grid
                    val_accuracy = (evaluate(test_features, test_labels, optimizer, learning_rate))
                    if val_accuracy > best_accuracy:
                        best_accuracy = val_accuracy
                        best_params = {'shuffle': shuffle_set, 'batch_size': batch_size, 'learning_rate': learning_rate, 'optimizer': optimizer}
            # Writing best parameters and accuracy to a text file
    with open('./best_params_and_accuracy.txt', 'w') as f:
        f.write(f"Best Parameters: {best_params}\n")
        f.write(f"Best Accuracy: {best_accuracy}\n")
        
    return best_params, best_accuracy
param_grid = {
    'shuffle' : [True, False],
    'batch_size': [32, 64, 128, 256, 512],
    'learning_rate': [1e-2, 1e-3, 1e-4],
    'optimizer': [tf.keras.optimizers.Adam, tf.keras.optimizers.SGD]
}

model_checkpoint_callback = ModelCheckpoint(
    filepath='./saved_model/taunet_model_test.h5',
    save_weights_only=True,
    monitor='loss',  # Change this to 'loss'
    mode='min',
    save_best_only=False,
    verbose=0)

grid_search(param_grid, model_checkpoint_callback)

Processing text files: 100%|██████████| 25000/25000 [00:03<00:00, 6482.29it/s]
Processing text files: 100%|██████████| 2500/2500 [00:00<00:00, 6583.38it/s]


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50