# Create models from *Ergo* data
This notebook contains the code for the models that classify gestures from the *Ergo* data. See the [report](https://git.cs.sun.ac.za/Computer-Science/rw771/2022/26723077-TG7-doc) or the [source code behind the data](https://git.cs.sun.ac.za/Computer-Science/rw771/2022/26723077-TG7-src).

## Imports and constants

In [None]:
from common_utils import *
# Per-gesture metadata from `common_utils`; later cells look up a
# "description" entry for each gesture key.
gesture_info = get_gesture_info()
# Presumably a directory -> files listing from `common_utils`.
# NOTE(review): `dir_files` is reassigned from a fresh `os.listdir` scan in the
# preprocessing cell below, so this value appears to be unused -- confirm.
dir_files = get_dir_files()

## Preprocessing data

TODO:
- Create some method to delete bad observations

In [None]:
# Root directory holding one sub-directory of recordings per gesture
TRAIN_DIR = '../gesture_data/train'

# Get a listing of all gesture directories and their files. Only real
# directories are scanned (calling `os.listdir` on a stray file would raise
# `NotADirectoryError`), and hidden entries such as `.DS_Store` are skipped at
# both levels so OS metadata files are never counted as observations. The file
# names are sorted so that re-running the notebook gives the same ordering.
dir_files = {
    d: sorted(
        f for f in os.listdir(os.path.join(TRAIN_DIR, d))
        if not f.startswith('.')
    )
    for d in os.listdir(TRAIN_DIR)
    if not d.startswith('.') and os.path.isdir(os.path.join(TRAIN_DIR, d))
}
# Filter out all directories which don't have any files in them
dir_files = {d: files for d, files in dir_files.items() if files}
# `default=0` keeps this cell from raising when no data has been recorded yet
max_files = max((len(files) for files in dir_files.values()), default=0)
# Sorted gesture directory names; `len(dirs)` is used as the number of classes
dirs = sorted(dir_files)

format_string = "\n- ".join([
    f'{k}: {gesture_info.get(k, {}).get("description", "")} ({len(v)} files)'
    for k, v in dir_files.items()
])
print(f'The following gestures have data recorded for them:\n- {format_string}')

## Plot an example gesture observation to check everything's working

In [None]:
# Take the first gesture in the listing and the first file recorded for it
gesture = next(iter(dir_files))
first_file = dir_files[gesture][0]
filename = f'../gesture_data/train/{gesture}/{first_file}'

# Load the observation (normalised) and plot it as a sanity check
df = read_to_df(filename, normalise=True)
plot_raw_gesture(df, filename)

## Train basic model

In [None]:
import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras import layers
import wandb
from wandb.keras import WandbCallback

In [None]:
# Start the Weights & Biases run. The hyperparameters are passed via `config=`
# so that they are recorded with the run; assigning a plain dict to
# `wandb.config` afterwards would replace the run's config object and nothing
# would be logged. Later cells read the values back as `wandb.config['<key>']`.
wandb.init(
    project="ergo",
    entity="beyarkay",
    config={
        # "learning_rate": 0.001,
        "epochs": 10,
        "batch_size": 32,
        # Spacing, in milliseconds, of the resampled sensor readings
        "resample_period": 25,
    },
)

In [None]:
# https://www.tensorflow.org/tutorials/load_data/csv#multiple_files_2

# Dataset of file paths: one element per `.txt` file found one directory level
# below the training directory.
# NOTE(review): per the TF docs `list_files` shuffles the file order by default
# -- confirm this is intended, since later cells iterate the derived dataset
# more than once.
list_ds = tf.data.Dataset.list_files('../gesture_data/train/*/*.txt')

def preprocess_features(numbers, resample_period=None):
    """Resample and normalise the raw readings of one observation.

    Args:
        numbers: Object exposing `.numpy()` (an eager tensor when called via
            `tf.py_function`) that yields a 2D array. Column 0 holds the
            milliseconds since the start of the gesture; the remaining
            columns hold the sensor readings.
        resample_period: Spacing of the output rows in milliseconds. Defaults
            to `wandb.config['resample_period']`.

    Returns:
        A 2D float `np.ndarray` with one row per `resample_period` step and
        one column per sensor, normalised over all values to zero mean and
        unit variance. Rows at 0ms and 975ms are always present.
    """
    if resample_period is None:
        resample_period = wandb.config['resample_period']

    # Convert the np.array to a pd.DataFrame
    df = pd.DataFrame(data=numbers.numpy())

    # Set the index to be column 0 (the column containing the milliseconds
    # since the start of the gesture). `pd.to_timedelta` is used because the
    # `unit` argument of the `TimedeltaIndex` constructor is deprecated.
    offsets_ms = df[0].to_numpy(dtype='float64')
    df.index = pd.TimedeltaIndex(
        pd.to_timedelta(offsets_ms, unit='ms'), name='offset_ms'
    )

    # Delete the milliseconds column (we won't use it for training)
    del df[0]

    # If the start and end items don't explicitly exist => add them as
    # all-NaN rows so that the resampled frame always spans the full window
    start = pd.Timedelta('0 days 00:00:00.000')
    end = pd.Timedelta('0 days 00:00:00.975')
    if start not in df.index:
        df.loc[start] = pd.Series(dtype='float64')
    if end not in df.index:
        df.loc[end] = pd.Series(dtype='float64')

    # The padding rows were appended at the bottom, so restore time order
    df = df.sort_index()

    # Resample the data so we've got values exactly every `resample_period`
    # milliseconds. Forward-fill covers gaps between readings; the back-fill
    # covers leading bins that precede the first reading, which `ffill` alone
    # would leave as NaN.
    df = df.resample(f"{resample_period}ms").mean().ffill().bfill()

    # Normalise the data to have zero-mean and unit-variance (sample standard
    # deviation over every value in the frame)
    values = df.to_numpy(dtype='float64')
    mean = np.nanmean(values)
    std = np.nanstd(values, ddof=1)
    # A constant signal has zero spread; fall back to 1 to avoid dividing by 0
    if not np.isfinite(std) or std == 0:
        std = 1.0
    return (values - mean) / std
    
    
def preprocess_label(label):
    """One-hot encode a numeric gesture label.

    The label is floor-divided by two and then shifted down by one to obtain
    the class index, which is expanded into a one-hot vector with `len(dirs)`
    entries.

    NOTE(review): the halving presumably reflects how the gesture directories
    are numbered -- confirm against the data layout.
    """
    class_index = (label // 2) - 1
    return tf.keras.utils.to_categorical(class_index, num_classes=len(dirs))


@tf.function
def process_path(file_path):
    """Turn the path of one observation file into a `(features, label)` pair.

    The label is derived from the name of the file's parent directory (with
    the text "gesture" removed) and one-hot encoded by `preprocess_label`.
    The features are the comma-separated numbers in the file, passed through
    `preprocess_features`. Static shapes are attached to both results because
    `tf.py_function` outputs carry no shape information.

    Returns:
        A tuple `(nums, label)` with static shapes `(40, 30)` and
        `(len(dirs),)` respectively.
    """
    # --- Label: parse the number out of the parent directory name ---
    path_parts = tf.strings.split(file_path, os.sep)
    gesture_number = tf.strings.to_number(
        tf.strings.regex_replace(path_parts[-2], "gesture", ""),
        tf.int32,
    )
    label = tf.py_function(preprocess_label, [gesture_number], [tf.float32])[0]
    label.set_shape(len(dirs))

    # --- Features: file -> lines -> comma-separated fields -> float32 ---
    raw_text = tf.io.read_file(file_path)
    fields = tf.strings.split(tf.strings.split(raw_text, "\n"), ',')
    readings = tf.strings.to_number(fields, out_type=tf.dtypes.float32)
    # The resampling/normalisation is done with pandas, outside the graph
    nums = tf.py_function(preprocess_features, [readings], [tf.float32])[0]
    nums.set_shape((40, 30))

    return nums, label

# Build the input pipeline: parse every file, shuffle, then group into batches
labeled_ds = (
    list_ds
    .map(process_path)
    .shuffle(buffer_size=1000)
    .batch(wandb.config['batch_size'])
)

# Print the first (features, label) pair of one batch as a sanity check
for sensor_data, label in labeled_ds.take(1):
    print(next(zip(sensor_data.numpy(), label.numpy())))


In [None]:
# A small fully-connected classifier: the (40, 30) observation is flattened
# and passed through one hidden layer to one output unit per gesture class.
model = tf.keras.Sequential([
    keras.layers.Flatten(input_shape=(40, 30), name='input'),
    keras.layers.Dense(128, activation='relu'),
    # Softmax so the model emits class probabilities: the
    # `categorical_crossentropy` loss and the `CategoricalCrossentropy` metric
    # below both default to `from_logits=False` and would misinterpret the raw
    # logits of a linear output layer.
    keras.layers.Dense(len(dirs), activation='softmax', name='output'),
], name='Ergo')

model.summary()

metrics = [
    tf.keras.metrics.CategoricalAccuracy(),
    tf.keras.metrics.CategoricalCrossentropy(),
]

model.compile(
    optimizer='adam',
    # Use `categorical_crossentropy` because the labels are one-hot-encoded
    loss='categorical_crossentropy',
    metrics=metrics,
)
# Train on the full dataset, logging each epoch to Weights & Biases
history = model.fit(
    labeled_ds,
    epochs=wandb.config['epochs'],
    callbacks=[WandbCallback()]
)

# # You can also evaluate or predict on a dataset.
# print("Evaluate")
# result = model.evaluate(labeled_ds)
# dict(zip(model.metrics_names, result))


In [None]:
# Run the trained model over every batch of the dataset.
# NOTE(review): `labeled_ds` contains a shuffle step, so the row order of
# `predictions` presumably does not match a later iteration over the same
# dataset -- confirm before pairing these rows with labels.
predictions = model.predict(labeled_ds)
# Display the model output for the first predicted observation
predictions[0]

In [None]:
# Print "predicted-class true-class" for every observation in five batches.
# Predictions are made on the very batch whose labels are printed: the dataset
# is shuffled, so the rows of an earlier `model.predict(labeled_ds)` pass do
# not line up with the labels yielded by a second iteration over it.
for batch_features, batch_labels in labeled_ds.take(5):
    batch_predictions = model.predict(batch_features, verbose=0)
    for predicted, actual in zip(batch_predictions, batch_labels.numpy()):
        print(np.argmax(predicted), np.argmax(actual))
        

In [None]:
# Index of the highest-scoring class (argmax along axis 1) for every row of
# `predictions`, displayed as a list
list(np.argmax(predictions, axis=1))