# Setup

In [1]:
import os
import re
import numpy as np
import tensorflow as tf
import tensorflow_addons as tfa
from tqdm import tqdm
from tensorflow.keras.callbacks import ModelCheckpoint
# Pass --xla_gpu_cuda_data_dir=/opt/conda to XLA through the XLA_FLAGS
# environment variable.
# NOTE(review): this runs after `import tensorflow`; presumably XLA reads the
# flag lazily on first compilation — confirm, or move it above the imports.
os.environ['XLA_FLAGS'] = '--xla_gpu_cuda_data_dir=/opt/conda'

2024-04-11 20:38:37.470978: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-04-11 20:38:37.515502: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-11 20:38:37.515531: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-11 20:38:37.517118: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-04-11 20:38:37.524860: I tensorflow/core/platform/cpu_feature_guar

# Training

#### Define Training Directory

In [2]:
# Folder (relative to the notebook's working directory) holding the training .txt files
directory = "keystrokes-training"

#### Defining Functions

In [3]:
# Function to read in the training textfiles
def read_text_files(directory):
    """Load every keystroke ``.txt`` file found directly inside ``directory``.

    The integer before the first underscore in each file name is used as the
    label (user ID). The first line of each file is treated as a header and
    skipped; every remaining non-blank line is parsed as one integer.

    Args:
        directory: Path of the folder to scan (not recursive).

    Returns:
        A ``(labels, features)`` tuple of two parallel lists ordered by sorted
        file path: ``labels[i]`` is the user ID of the i-th file and
        ``features[i]`` is the list of integers read from that file.

    Raises:
        ValueError: If a file-name prefix or a non-blank data line is not a
            valid integer.
        OSError: If the directory or one of its files cannot be read.
    """
    labels = []
    features = []
    # Sort the paths so labels/features come back in a deterministic order
    file_paths = sorted(
        os.path.join(directory, name)
        for name in os.listdir(directory)
        if name.endswith('.txt')
    )
    for file_path in tqdm(file_paths, total=len(file_paths), desc="Processing text files"):
        # User ID (label) is the part of the file name before the first '_'
        user_id = int(os.path.basename(file_path).split('_')[0])
        with open(file_path, 'r') as file:
            # Drop the header line; the default keeps an empty file from raising
            next(file, None)
            # Stream the rest and ignore blank lines (e.g. a trailing empty
            # line), which would otherwise make int() raise ValueError
            keystrokes = [int(line.strip()) for line in file if line.strip()]
        labels.append(user_id)
        features.append(keystrokes)
    return labels, features

# Function instance of Taunet
def create_model(lstm_units=256, embedding_dim=128):
    """Build the (uncompiled) Taunet embedding network.

    The network is a ``tf.keras.Sequential`` stack of an LSTM layer, a Dense
    layer with no activation, and a Lambda layer that L2-normalises its input
    along axis 1.

    Args:
        lstm_units: Number of units in the LSTM layer. Defaults to 256.
        embedding_dim: Number of units in the Dense layer, i.e. the size of
            the vector the model outputs. Defaults to 128.

    Returns:
        The assembled ``tf.keras.Sequential`` model. Compiling and training
        are left to the caller.
    """
    return tf.keras.Sequential([
        # input_shape=(None, 1): the timestep dimension is left unspecified,
        # with one feature per timestep
        tf.keras.layers.LSTM(lstm_units, input_shape=(None, 1)),
        # activation=None keeps this projection linear
        tf.keras.layers.Dense(embedding_dim, activation=None),
        # Scale each output vector to unit L2 norm
        tf.keras.layers.Lambda(lambda x: tf.math.l2_normalize(x, axis=1)),
    ])

# Function to shuffle batches if needed
def shuffle_batch(features, labels):
    """Apply one random permutation to both ``features`` and ``labels``.

    The permutation length is the size of ``features`` along axis 0, and the
    same index order is used for both tensors so pairs stay aligned.

    Returns:
        A ``(shuffled_features, shuffled_labels)`` tuple.
    """
    # Number of rows along the first axis of `features`
    row_count = tf.shape(features)[0]
    # Random ordering of the indices 0 .. row_count - 1
    permutation = tf.random.shuffle(tf.range(row_count))
    # Reorder both tensors with the identical permutation
    return tf.gather(features, permutation), tf.gather(labels, permutation)

#### Loading In Training Data

In [4]:
# Parse all training files into parallel lists of user IDs and keystroke values
labels, features = read_text_files(directory)

# Turn the Python lists into tensors
intervals_tensor = tf.constant(features)
labels_tensor = tf.constant(labels)

# Input pipeline: one (intervals, label) pair per example, grouped into
# batches of 256 (adjust batch size as needed), with prefetching tuned
# automatically.
# NOTE(review): no shuffle is applied, so batches follow the sorted file
# order returned by read_text_files — confirm this is intended.
dataset = (
    tf.data.Dataset.from_tensor_slices((intervals_tensor, labels_tensor))
    .batch(256)
    .prefetch(buffer_size=tf.data.AUTOTUNE)
)

Processing text files: 100%|██████████| 233840/233840 [00:40<00:00, 5825.51it/s]
2024-04-11 20:39:21.128340: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1929] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 43457 MB memory:  -> device: 0, name: NVIDIA A40, pci bus id: 0000:65:00.0, compute capability: 8.6


#### Set Up Training Checkpoint

In [5]:
# Checkpoint callback: whenever the monitored training loss reaches a new
# minimum, write the model weights (weights only) to taunet_weights.h5
model_checkpoint_callback = ModelCheckpoint(
    filepath="taunet_weights.h5",
    monitor="loss",
    mode="min",
    save_best_only=True,
    save_weights_only=True,
    verbose=1,
)

#### Fitting Model

In [None]:
# Build a fresh, untrained Taunet network
model = create_model()

# Compile with Adam (hyper-parameters spelled out explicitly) and the
# semi-hard triplet loss from TensorFlow Addons
model.compile(
    loss=tfa.losses.TripletSemiHardLoss(),
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.999),
)

# Train for 100 epochs (adjust as needed); the checkpoint callback runs
# during training
history = model.fit(
    dataset,
    epochs=100,
    verbose=True,
    callbacks=[model_checkpoint_callback],
)

Epoch 1/100


2024-04-11 20:39:27.643212: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:454] Loaded cuDNN version 8902
2024-04-11 20:39:29.751510: I external/local_xla/xla/service/service.cc:168] XLA service 0x7f02a849f4a0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2024-04-11 20:39:29.751544: I external/local_xla/xla/service/service.cc:176]   StreamExecutor device (0): NVIDIA A40, Compute Capability 8.6
2024-04-11 20:39:29.756629: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
I0000 00:00:1712867969.899787   63226 device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Epoch 1: loss improved from inf to 0.97418, saving model to taunet_weights.h5
Epoch 2/100
Epoch 2: loss improved from 0.97418 to 0.95710, saving model to taunet_weights.h5
Epoch 3/100