In [1]:
import os
import glob
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
from scipy.signal import resample, hilbert, find_peaks
from skimage import util
from tqdm.notebook import tqdm
from collections import Counter
from itertools import chain

In [2]:
# Signal-processing settings. get_data_from_folder reads these as module-level
# globals, so rebinding any of them changes how later loads behave.
sampling_rate = 44100  # audio samples per second; converts seconds <-> sample indices
frame_bits = 16  # PCM bit depth; raw samples are divided by 2**(frame_bits - 1)
left_trim_sec = 0.15  # seconds cut from the start of each recording
right_trim_sec = 0.15  # seconds cut from the end of each recording
peak_thresh = 0.15  # minimum envelope height for a keypress peak
peak_sep_sec = 0.15  # minimum spacing between detected peaks, in seconds
left_segment_sec = 0.005  # seconds of audio kept before each peak
right_segment_sec = 0.02  # seconds of audio kept after each peak
window_sec = 0.02  # FFT window length in seconds (rounded up to a power-of-two sample count)
window_stride = 0.001  # hop between successive FFT windows, in seconds
sensor_samples = 256  # every accelerometer trace is resampled to this many samples
train_folder = 'datasets/atm_pad_9(726,acce)'  # folder with one sub-directory per recording

def get_audio_signal_from_file(filepath, frame_bits):
    """Load a PCM recording stored as CSV and scale it to the range [-1, 1].

    Args:
        filepath: Path to a header-less CSV; the second column holds the samples.
        frame_bits: Bit depth of the PCM samples.

    Returns:
        1-D array of samples divided by 2**(frame_bits - 1).

    Raises:
        AssertionError: If any scaled sample lies outside [-1, 1].
    """
    full_scale = 2 ** (frame_bits - 1)
    raw_samples = pd.read_csv(filepath, header=None).values[:, 1]
    signal = raw_samples / full_scale
    within_range = np.logical_and(signal >= -1, signal <= 1)
    assert np.all(within_range), 'Invalid PCM data'
    return signal

def get_sensor_signal_from_file(filepath):
    """Load a sensor trace from a header-less CSV, dropping its first column.

    Args:
        filepath: Path to the CSV file.

    Returns:
        2-D array holding every column except the first, one row per CSV line.
    """
    table = pd.read_csv(filepath, header=None).values
    return table[:, 1:]

def trim_signal(signal, left_time, right_time, sampling_rate):
    """Cut a fixed duration off both ends of a signal.

    Args:
        signal: Sequence of samples (anything supporting len() and slicing).
        left_time: Seconds to remove from the start.
        right_time: Seconds to remove from the end.
        sampling_rate: Samples per second, used to convert seconds to samples.

    Returns:
        The slice of `signal` that remains after trimming. It is empty when the
        two trims together cover the whole signal.
    """
    start = int(round(left_time * sampling_rate))
    trim_right = int(round(right_time * sampling_rate))
    # Use an absolute end index: the former `signal[start:-trim_right]` became
    # `signal[start:0]` (empty) whenever the right trim rounded to zero samples.
    end = max(len(signal) - trim_right, 0)
    return signal[start:end]

def get_peaks(signal, num_peaks, peak_thresh, peak_sep_sec, sampling_rate):
    """Find keypress peaks in the amplitude envelope of a signal.

    The envelope is the magnitude of the analytic signal (Hilbert transform).

    Args:
        signal: 1-D audio signal.
        num_peaks: Number of peaks that must be found.
        peak_thresh: Minimum envelope height for a peak.
        peak_sep_sec: Minimum spacing between peaks, in seconds.
        sampling_rate: Samples per second.

    Returns:
        Array of peak positions in seconds (sample index / sampling_rate).

    Raises:
        AssertionError: If the number of detected peaks differs from num_peaks.
    """
    min_gap = peak_sep_sec * sampling_rate
    analytic = hilbert(signal)
    peak_idx = find_peaks(np.abs(analytic), height=peak_thresh, distance=min_gap)[0]
    found = len(peak_idx)
    assert found == num_peaks, "Found {} instead of {} peaks".format(found, num_peaks)
    return peak_idx / sampling_rate

def get_audio_segments_from_peaks(signal, peaks, left_segment_sec, right_segment_sec, sampling_rate):
    """Cut one fixed-length window out of the signal around every peak.

    Args:
        signal: 1-D audio signal.
        peaks: Peak positions in seconds.
        left_segment_sec: Seconds to keep before each peak.
        right_segment_sec: Seconds to keep after each peak.
        sampling_rate: Samples per second.

    Returns:
        List with one slice of `signal` per peak, inclusive of the end sample.
        A segment is shorter than nominal when its window runs past either end
        of the signal.
    """
    segments = []
    for peak in peaks:
        sample_start = int(round((peak - left_segment_sec) * sampling_rate))
        sample_end = int(round((peak + right_segment_sec) * sampling_rate))
        # Clamp at zero: a negative start would be read by Python slicing as an
        # offset from the END of the signal and return unrelated (or no) samples.
        sample_start = max(sample_start, 0)
        segments.append(signal[sample_start:sample_end + 1])

    return segments

def get_fft_features(segment, window_sec, window_stride, sampling_rate):
    """Average magnitude spectrum of a segment over sliding Hann windows.

    Args:
        segment: 1-D audio segment.
        window_sec: Window length in seconds; the FFT size is the next power of
            two at or above window_sec * sampling_rate.
        window_stride: Hop between consecutive windows, in seconds.
        sampling_rate: Samples per second.

    Returns:
        1-D array of N // 2 values: the mean magnitude of FFT bins 1..N/2
        (bin 0 is dropped) taken across all windows.

    Raises:
        AssertionError: If the segment is shorter than one FFT window.
    """
    exponent = int(np.ceil(np.log2(window_sec * sampling_rate)))
    N = 2 ** exponent
    assert N <= len(segment), "Segment too short"
    hop = int(round(window_stride * sampling_rate))
    frames = util.view_as_windows(segment, window_shape=N, step=hop)
    tapered = np.hanning(N) * frames
    magnitudes = np.abs(np.fft.fft(tapered))
    # Keep bins 1 .. N/2 inclusive, then average over the window axis.
    return np.mean(magnitudes[:, 1:N // 2 + 1], axis=0)

def get_data_from_folder(folder, debug_level=0):
    """Load audio and accelerometer features for every recording in a folder.

    Each entry of `folder` is treated as one recording directory holding a
    `*.wave.csv` audio file, whose name before the first dot gives the typed
    characters, and any number of `*.*.*.*.acce.csv` accelerometer files, whose
    third and fourth dot-separated name fields give the integer (dx, dy) target.
    A recording that raises any exception while being processed is discarded as
    a whole.

    The processing parameters (frame_bits, trim/segment/window settings,
    peak_thresh, peak_sep_sec, sampling_rate, sensor_samples) are read from
    module-level globals.

    Args:
        folder: Directory containing one sub-directory per recording.
        debug_level: 0 is silent; above 0 prints the reason for each discarded
            recording; above 1 also plots that recording's audio signal when it
            was loaded successfully.

    Returns:
        Tuple (data, labels, sensor_data, sensor_targets) of lists with one
        entry per kept recording: per-key FFT feature vectors, per-key character
        labels, resampled accelerometer traces and their [dx, dy] targets.
    """
    data = []
    labels = []
    sensor_data = []
    sensor_targets = []
    filepaths = glob.glob(os.path.join(folder, '*'))
    total = len(filepaths)
    discarded = 0
    for filepath in tqdm(filepaths):
        # Reset per recording so the debug plot never shows a signal left over
        # from a previous iteration and never hits an undefined name.
        audio_signal = None
        try:
            audio_file = glob.glob(os.path.join(filepath, '*.wave.csv'))[0]
            lnac_files = glob.glob(os.path.join(filepath, '*.*.*.*.acce.csv'))

            chars = Path(audio_file).stem.split('.')[0]

            # Temporary lists: only committed once the whole recording succeeds
            curr_features = []
            curr_labels = []
            curr_lnac_features = []
            curr_lnac_labels = []

            # Process audio
            audio_signal = get_audio_signal_from_file(audio_file, frame_bits)
            audio_signal = trim_signal(audio_signal, left_trim_sec, right_trim_sec, sampling_rate)
            peaks = get_peaks(audio_signal, len(chars), peak_thresh, peak_sep_sec, sampling_rate)
            segments = get_audio_segments_from_peaks(
                audio_signal, peaks, left_segment_sec, right_segment_sec, sampling_rate)
            for i, segment in enumerate(segments):
                features = get_fft_features(segment, window_sec, window_stride, sampling_rate)
                curr_features.append(features)
                curr_labels.append(chars[i])

            # Process accelerometer data (sorted so traces follow file-name order)
            for lnac_file in sorted(lnac_files):
                sensor_signal = get_sensor_signal_from_file(lnac_file)
                lnac_features = resample(sensor_signal, sensor_samples)
                curr_lnac_features.append(lnac_features)
                dx, dy = Path(lnac_file).stem.split('.')[2:4]
                curr_lnac_labels.append([int(dx), int(dy)])

            # No errors encountered; append data to main lists
            data.append(curr_features)
            labels.append(curr_labels)
            sensor_data.append(curr_lnac_features)
            sensor_targets.append(curr_lnac_labels)

        except Exception as e:
            if debug_level > 0:
                print('Error in {}: {}'.format(filepath, e))
                if debug_level > 1 and audio_signal is not None:
                    plt.plot(audio_signal)
                    plt.show()

            discarded += 1

    # Guard the percentage against an empty folder (total == 0).
    discarded_pct = discarded / total * 100 if total else 0.0
    print('{} of {} ({:.1f}%) samples discarded'.format(discarded, total, discarded_pct))
    return data, labels, sensor_data, sensor_targets

# Build per-recording feature/label groups from train_folder.
# debug_level=1 prints why each discarded recording was dropped.
print('Loading train data...')
audio_data_groups, audio_label_groups, sensor_data_groups, sensor_target_groups = get_data_from_folder(train_folder, debug_level=1)

Loading train data...


HBox(children=(FloatProgress(value=0.0, max=726.0), HTML(value='')))

Error in datasets/atm_pad_9(726,acce)/0320002713.0066: Found 5 instead of 6 peaks
Error in datasets/atm_pad_9(726,acce)/0320002713.0087: Found 5 instead of 6 peaks
Error in datasets/atm_pad_9(726,acce)/0314003037.0067: Segment too short
Error in datasets/atm_pad_9(726,acce)/0320011427.0006: Found 5 instead of 6 peaks
Error in datasets/atm_pad_9(726,acce)/0315000919.0084: Found 7 instead of 6 peaks
Error in datasets/atm_pad_9(726,acce)/0320002713.0024: Found 5 instead of 6 peaks
Error in datasets/atm_pad_9(726,acce)/0320011427.0009: Found 5 instead of 6 peaks
Error in datasets/atm_pad_9(726,acce)/0315000919.0001: Segment too short

8 of 726 (1.1%) samples discarded


In [3]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer, StandardScaler

# Fractions of the recordings held out for validation and for testing.
val_size = 0.15
test_size = 0.15

def train_val_test_split(*x, val_size, test_size, random_state=None):
    """Split several parallel sequences into train, validation and test parts.

    The inputs are first split into train vs. hold-out, then the hold-out part
    is split again into validation vs. test.

    Args:
        *x: Parallel indexable sequences of equal length.
        val_size: Fraction of all samples assigned to validation.
        test_size: Fraction of all samples assigned to test.
        random_state: Optional seed forwarded to both underlying splits so the
            partition can be reproduced. The default None keeps the previous
            behaviour of a different shuffle on every call.

    Returns:
        A lazy iterator yielding, for each input in order, its train, validation
        and test parts: a_train, a_val, a_test, b_train, b_val, b_test, ...
    """
    holdout_size = val_size + test_size
    x_train_test = train_test_split(*x, test_size=holdout_size, random_state=random_state)
    # Odd positions are the hold-out halves; split them into val and test.
    x_val_test = train_test_split(
        *x_train_test[1::2],
        test_size=test_size / holdout_size,
        random_state=random_state)
    x_train = x_train_test[0::2]
    x_val = x_val_test[0::2]
    x_test = x_val_test[1::2]
    return chain.from_iterable(zip(x_train, x_val, x_test))

def flatten_groups(*groups):
    """Flatten each nested sequence by one level and convert it to an array.

    Args:
        *groups: Any number of sequences of sequences.

    Returns:
        List with one numpy array per argument, made of the concatenated items
        of that argument's inner sequences.
    """
    flattened = []
    for group in groups:
        items = [item for member in group for item in member]
        flattened.append(np.array(items))
    return flattened

def sensor_fit(sensor_scaler, sensor_x_in):
    """Fit a scaler on sensor data, pooling all traces and time steps.

    Args:
        sensor_scaler: Object with a fit(X) method.
        sensor_x_in: 3-D array; the last axis is kept as the feature axis and
            the first two axes are merged into one sample axis before fitting.
    """
    _steps, num_dims = sensor_x_in.shape[1:]
    pooled = np.reshape(sensor_x_in, (-1, num_dims))
    sensor_scaler.fit(pooled)
    
def sensor_transform(sensor_scaler, sensor_x_in):
    """Apply a fitted scaler to 3-D sensor data and restore its shape.

    Args:
        sensor_scaler: Object with a transform(X) method.
        sensor_x_in: 3-D array; the first two axes are merged before scaling.

    Returns:
        Scaled array reshaped back to (-1, second axis, last axis) of the input.
    """
    num_features, num_dims = sensor_x_in.shape[1:]
    pooled = np.reshape(sensor_x_in, (-1, num_dims))
    scaled = sensor_scaler.transform(pooled)
    return np.reshape(scaled, (-1, num_features, num_dims))

# One-hot encode the key labels. The encoder is fit on the labels of ALL loaded
# recordings (before any split) and then applied group by group.
audio_labels_all, = flatten_groups(audio_label_groups)
encoder = LabelBinarizer()
encoder.fit(audio_labels_all)
audio_target_groups = [encoder.transform(group) for group in audio_label_groups]

# Split by recording (group), so all keys and movements of one recording land in
# the same subset. No random_state is passed, so the split differs between runs.
audio_data_train, audio_data_val, audio_data_test, \
audio_labels_train, audio_labels_val, audio_labels_test, \
audio_targets_train, audio_targets_val, audio_targets_test, \
sensor_data_train, sensor_data_val, sensor_data_test, \
sensor_targets_train, sensor_targets_val, sensor_targets_test = train_val_test_split(
    audio_data_groups, audio_label_groups, audio_target_groups,
    sensor_data_groups, sensor_target_groups,
    val_size=val_size, test_size=test_size)

# Flatten the per-recording groups into per-sample arrays for model training.
audio_x_train, audio_x_val, audio_x_test, audio_y_train, audio_y_val, audio_y_test, \
sensor_x_train, sensor_x_val, sensor_x_test, sensor_y_train, sensor_y_val, sensor_y_test = flatten_groups(
    audio_data_train, audio_data_val, audio_data_test,
    audio_targets_train, audio_targets_val, audio_targets_test,
    sensor_data_train, sensor_data_val, sensor_data_test,
    sensor_targets_train, sensor_targets_val, sensor_targets_test)

# Standardise audio features; statistics come from the training subset only.
audio_scaler = StandardScaler()
audio_scaler.fit(audio_x_train)
audio_x_train = audio_scaler.transform(audio_x_train)
audio_x_val = audio_scaler.transform(audio_x_val)
audio_x_test = audio_scaler.transform(audio_x_test)

# Standardise sensor traces per axis, again fitting on the training subset only.
sensor_scaler = StandardScaler()
sensor_fit(sensor_scaler, sensor_x_train)
sensor_x_train = sensor_transform(sensor_scaler, sensor_x_train)
sensor_x_val = sensor_transform(sensor_scaler, sensor_x_val)
sensor_x_test = sensor_transform(sensor_scaler, sensor_x_test)

# Sanity-check shapes and subset sizes.
print('audio_x_train shape:', audio_x_train.shape)
print('audio_y_train shape:', audio_y_train.shape)
print('sensor_x_train shape:', sensor_x_train.shape)
print('sensor_y_train shape:', sensor_y_train.shape)
print(len(audio_x_train), 'train samples')
print(len(audio_x_val), 'val samples')
print(len(audio_x_test), 'test samples')

audio_x_train shape: (3012, 512)
audio_y_train shape: (3012, 10)
sensor_x_train shape: (2510, 256, 3)
sensor_y_train shape: (2510, 2)
3012 train samples
648 val samples
648 test samples


In [4]:
def inspect_labels(labels):
    """Print how often each label occurs, one `label: count` line per label.

    Args:
        labels: Iterable of hashable, mutually orderable labels.
    """
    counts = Counter(labels)
    for label in sorted(counts):
        print('{}: {}'.format(repr(label), counts[label]))

# NOTE: audio_labels_all holds the labels of every recording loaded from
# train_folder (it is built before the train/val/test split), so these counts
# cover the train, validation and test subsets together.
print('Train label distribution:')
inspect_labels(audio_labels_all)

Train label distribution:
'0': 426
'1': 388
'2': 469
'3': 446
'4': 441
'5': 462
'6': 423
'7': 398
'8': 454
'9': 401


In [5]:
import keras
from sklearn.metrics import accuracy_score
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.models import load_model
from keras.metrics import top_k_categorical_accuracy
from keras.utils import to_categorical
from keras import backend as K
from tqdm.keras import TqdmCallback

import tensorflow as tf
gpus = tf.config.experimental.list_physical_devices('GPU')
# Let TensorFlow grow GPU memory on demand instead of reserving it all up front.
# Looping over the (possibly empty) device list keeps this cell runnable on
# CPU-only machines, where the former `gpus[0]` raised IndexError.
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)

# Training hyper-parameters, read as globals by train_model / train_ensemble.
batch_size = 128
epochs = 100
# NOTE: patience equals the epoch budget, so early stopping cannot trigger
# before the last epoch.
patience = 100
num_models = 5  # ensemble size
model_name = 'models/numpad_model'  # checkpoint path prefix ('<prefix>_<i>.h5')

def key_model(num_classes):
    """Build and compile the dense classifier used for key recognition.

    Args:
        num_classes: Size of the softmax output layer.

    Returns:
        Compiled Sequential model: two 64-unit ReLU layers, dropout of 0.5 and a
        softmax output, trained with categorical cross-entropy and Adam.
    """
    layers = [
        Dense(64, activation='relu', kernel_initializer='he_normal'),
        Dense(64, activation='relu', kernel_initializer='he_normal'),
        Dropout(0.5),
        Dense(num_classes, activation='softmax', kernel_initializer='he_normal'),
    ]
    classifier = Sequential()
    for layer in layers:
        classifier.add(layer)

    classifier.compile(optimizer='adam',
                       loss='categorical_crossentropy',
                       metrics=['accuracy'])
    return classifier

def rounded_acc(y_true, y_pred):
    """Keras metric: fraction of elements equal after rounding both tensors.

    Args:
        y_true: Target tensor.
        y_pred: Prediction tensor of the same shape.

    Returns:
        Scalar tensor with the mean of the element-wise equality.
    """
    matches = K.equal(K.round(y_true), K.round(y_pred))
    return K.mean(matches)

def train_model(model, x_train, y_train, x_val, y_val, filename):
    """Fit a model and return the checkpoint with the lowest validation loss.

    Reads batch_size, epochs and patience from module-level globals.

    Args:
        model: Compiled Keras model.
        x_train, y_train: Training inputs and targets.
        x_val, y_val: Validation inputs and targets, monitored via 'val_loss'.
        filename: Path where the best checkpoint is written and re-read from.

    Returns:
        The model reloaded from `filename` after training.
    """
    callbacks = [
        EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=patience),
        ModelCheckpoint(filename, monitor='val_loss', mode='min', verbose=0, save_best_only=True),
        TqdmCallback(verbose=0),
    ]
    model.fit(
        x_train,
        y_train,
        validation_data=(x_val, y_val),
        epochs=epochs,
        batch_size=batch_size,
        verbose=0,
        callbacks=callbacks,
    )
    return load_model(filename)

def train_ensemble(model_spec, x_train, y_train, x_val, y_val, name_prefix=None):
    """Train `num_models` independently initialised models of one architecture.

    Reads num_models and model_name from module-level globals.

    Args:
        model_spec: Callable taking the output size and returning a compiled model.
        x_train, y_train: Training inputs and targets.
        x_val, y_val: Validation inputs and targets.
        name_prefix: Optional checkpoint path prefix; files are written as
            '<prefix>_<i>.h5'. Defaults to the global `model_name`. Pass a
            distinct prefix per ensemble, otherwise a later ensemble overwrites
            the checkpoint files of an earlier one.

    Returns:
        List of trained models, each reloaded from its best checkpoint.
    """
    prefix = model_name if name_prefix is None else name_prefix
    # Create the checkpoint directory up front; saving fails if it is missing.
    checkpoint_dir = os.path.dirname(prefix)
    if checkpoint_dir:
        os.makedirs(checkpoint_dir, exist_ok=True)

    models = []

    for i in range(num_models):
        filename = '{}_{}.h5'.format(prefix, i)
        print('Training model {} of {}...'.format(i + 1, num_models))
        # Output size is taken from the width of one target row.
        num_classes = len(y_train[0])
        model = model_spec(num_classes)
        model = train_model(model, x_train, y_train, x_val, y_val, filename)
        models.append(model)

    print('Finished training')
    return models
    
# Train the audio (key classification) ensemble on the flattened per-key samples.
models = train_ensemble(key_model, audio_x_train, audio_y_train, audio_x_val, audio_y_val)

Using TensorFlow backend.


Training model 1 of 5...


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


Training model 2 of 5...


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


Training model 3 of 5...


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


Training model 4 of 5...


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


Training model 5 of 5...


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


Finished training


In [6]:
def top_k_accuracy_score(y_pred, y_test, k):
    """Fraction of samples whose true class is among the k highest predictions.

    Args:
        y_pred: Predicted class scores, one row per sample.
        y_test: One-hot targets, one row per sample.
        k: Number of top-scoring classes considered a hit.

    Returns:
        Mean of the per-sample hit indicators.
    """
    hits = top_k_categorical_accuracy(y_test, y_pred, k=k)
    num_samples = len(hits)
    return sum(hits) / num_samples

def rounded_acc_score(y_pred, y_test):
    """Fraction of elements that agree once both arrays are rounded.

    Args:
        y_pred: Predicted values.
        y_test: Target values of the same shape.

    Returns:
        Mean of the element-wise equality of the rounded arrays.
    """
    rounded_pred = np.round(y_pred)
    rounded_true = np.round(y_test)
    return np.mean(np.equal(rounded_pred, rounded_true))

def predict_ensemble(models, x):
    """Average the predictions of several models on the same input.

    Args:
        models: Sequence of objects exposing predict(x).
        x: Input batch passed unchanged to every model.

    Returns:
        Element-wise mean of the individual predictions.
    """
    total = sum(member.predict(x) for member in models)
    return total / len(models)

def evaluate_ensemble(models, x, y, k):
    """Top-1 and top-k accuracy of an averaged classifier ensemble.

    Args:
        models: Classifiers whose predictions are averaged.
        x: Input batch.
        y: One-hot targets; y.shape[1] is the number of classes.
        k: Number of top classes for the top-k score.

    Returns:
        Tuple (accuracy, top_k_accuracy).
    """
    probs = predict_ensemble(models, x)
    winners = np.argmax(probs, axis=1)
    one_hot_pred = to_categorical(winners, y.shape[1])
    return accuracy_score(one_hot_pred, y), top_k_accuracy_score(probs, y, k)

def evaluate_ensemble_reg(models, x, y):
    """Rounded accuracy and mean squared error of an averaged regressor ensemble.

    Args:
        models: Regressors whose predictions are averaged.
        x: Input batch.
        y: Targets with the same shape as the predictions.

    Returns:
        Tuple (rounded_accuracy, mse).
    """
    prediction = predict_ensemble(models, x)
    residual = prediction - y
    return rounded_acc_score(prediction, y), np.mean(residual**2)

# Validation accuracy of the averaged audio ensemble (top-1 and top-k).
# `k` is a module-level name and is rebound by a later cell.
k = 3
score, top_k_score = evaluate_ensemble(models, audio_x_val, audio_y_val, k)

print('Val accuracy: {:.1f}%'.format(score * 100))
print('Top-{} accuracy: {:.1f}%'.format(k, top_k_score * 100))

Val accuracy: 84.0%
Top-3 accuracy: 96.3%


In [7]:
# Validation accuracy of each ensemble member on its own.
# score[1] is the first compiled metric ('accuracy'); score[0] is the loss.
for i, model in enumerate(models):
    score = model.evaluate(audio_x_val, audio_y_val, verbose=0)
    print('Model {} val accuracy: {:.1f}%'.format(i + 1, score[1] * 100))

Model 1 val accuracy: 80.9%
Model 2 val accuracy: 79.9%
Model 3 val accuracy: 81.3%
Model 4 val accuracy: 77.9%
Model 5 val accuracy: 79.9%


In [8]:
# Test accuracy of the averaged audio ensemble, reusing the current global k.
score, top_k_score = evaluate_ensemble(models, audio_x_test, audio_y_test, k)

print('Test accuracy: {:.1f}%'.format(score * 100))
print('Top-{} accuracy: {:.1f}%'.format(k, top_k_score * 100))

Test accuracy: 88.1%
Top-3 accuracy: 96.9%


In [9]:
# Test accuracy of each audio ensemble member on its own.
for i, model in enumerate(models):
    score = model.evaluate(audio_x_test, audio_y_test, verbose=0)
    print('Model {} test accuracy: {:.1f}%'.format(i + 1, score[1] * 100))

Model 1 test accuracy: 83.0%
Model 2 test accuracy: 83.3%
Model 3 test accuracy: 84.7%
Model 4 test accuracy: 83.5%
Model 5 test accuracy: 85.3%


In [10]:
from keras.layers import Flatten

def dir_model(num_classes):
    """Build and compile the dense regressor used for movement estimation.

    Args:
        num_classes: Number of linear output units.

    Returns:
        Compiled Sequential model: flatten, two 64-unit ReLU layers, dropout of
        0.5 and a linear output, trained with mean squared error and Adam.
    """
    layers = [
        Flatten(),
        Dense(64, activation='relu', kernel_initializer='he_normal'),
        Dense(64, activation='relu', kernel_initializer='he_normal'),
        Dropout(0.5),
        Dense(num_classes),
    ]
    regressor = Sequential()
    for layer in layers:
        regressor.add(layer)

    # Attach the custom metric to keras.metrics before compiling with its name;
    # presumably this lets the string 'rounded_acc' resolve (also on load_model).
    keras.metrics.rounded_acc = rounded_acc
    regressor.compile(optimizer='adam',
                      loss='mean_squared_error',
                      metrics=['rounded_acc'])
    return regressor

# Training budget for the sensor models (same values as for the audio models).
epochs = 100
patience = 100
# NOTE: train_ensemble derives checkpoint names from the global model_name only,
# so this run writes to the same '<model_name>_<i>.h5' files as the audio
# ensemble; only the in-memory audio models remain valid afterwards.
sensor_models = train_ensemble(dir_model, sensor_x_train, sensor_y_train, sensor_x_val, sensor_y_val)

Training model 1 of 5...


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


Training model 2 of 5...


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


Training model 3 of 5...


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


Training model 4 of 5...


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


Training model 5 of 5...


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


Finished training


In [11]:
# Validation performance of the averaged sensor ensemble: share of outputs that
# round to the target value, and mean squared error.
score, loss = evaluate_ensemble_reg(sensor_models, sensor_x_val, sensor_y_val)
print('Val accuracy: {:.1f}%'.format(score * 100))
print('Val loss: {:.3g}\n'.format(loss))

# Same rounded-accuracy metric for each member; score[1] is the compiled metric.
for i, model in enumerate(sensor_models):
    score = model.evaluate(sensor_x_val, sensor_y_val, verbose=0)
    print('Model {} val accuracy: {:.1f}%'.format(i + 1, score[1] * 100))

Val accuracy: 80.3%
Val loss: 0.166

Model 1 val accuracy: 77.8%
Model 2 val accuracy: 77.8%
Model 3 val accuracy: 78.6%
Model 4 val accuracy: 76.8%
Model 5 val accuracy: 78.2%


In [12]:
# Spot check: averaged predictions next to the integer targets for five test traces.
print(predict_ensemble(sensor_models, sensor_x_test[:5]))
print(sensor_y_test[:5])

[[ 0.7682274   0.7900537 ]
 [-0.842597    0.67386925]
 [ 0.01621329 -1.5949844 ]
 [ 1.7678831   0.29708225]
 [-0.426646    0.60147953]]
[[ 1  1]
 [-1  1]
 [ 0 -2]
 [ 2  0]
 [ 0  0]]


In [13]:
# Test performance of the averaged sensor ensemble (rounded accuracy and MSE).
score, loss = evaluate_ensemble_reg(sensor_models, sensor_x_test, sensor_y_test)
print('Test accuracy: {:.1f}%'.format(score * 100))
print('Test loss: {:.3g}\n'.format(loss))

# Rounded accuracy of each sensor ensemble member on the test subset.
for i, model in enumerate(sensor_models):
    score = model.evaluate(sensor_x_test, sensor_y_test, verbose=0)
    print('Model {} test accuracy: {:.1f}%'.format(i + 1, score[1] * 100))

Test accuracy: 80.8%
Test loss: 0.155

Model 1 test accuracy: 78.6%
Model 2 test accuracy: 78.6%
Model 3 test accuracy: 78.8%
Model 4 test accuracy: 77.5%
Model 5 test accuracy: 77.4%


In [14]:
def key_disp(x, y):
    """Displacement from key `x` to key `y` on the numeric pad.

    Keys 1-9 sit on a 3-column grid at column (k - 1) % 3 and row (k - 1) // 3.
    Key 0 falls one row before key 1's row, and its column is shifted by one so
    that it lines up with the middle column.

    Args:
        x: Source key (integer 0-9).
        y: Destination key (integer 0-9).

    Returns:
        Array [horizontal, vertical] with the displacement in grid units.
    """
    src_row, src_col = divmod(x - 1, 3)
    dst_row, dst_col = divmod(y - 1, 3)
    h_dist = dst_col - src_col
    # Key 0 maps to column 2 by the formula above; move it to the middle column.
    if x == 0:
        h_dist += 1
    if y == 0:
        h_dist -= 1

    v_dist = dst_row - src_row
    return np.array([h_dist, v_dist])

def gaussian_rbf(disp1, disp2):
    """Gaussian similarity exp(-c * squared distance) between two displacements.

    The width factor `c` is read from the module-level global of that name.

    Args:
        disp1: First displacement vector.
        disp2: Second displacement vector.

    Returns:
        Similarity value; 1 when the two vectors are equal.
    """
    delta = disp1 - disp2
    squared_distance = np.sum(delta**2)
    return np.exp(-c * squared_distance)

def evaluate_ensemble_fused_old(models, audio_data_test, audio_labels_test, sensor_data_test):
    """Per-key accuracy of audio-only decoding vs. greedy audio+sensor fusion.

    For every recording the audio ensemble scores each key. The fused variant
    then walks through the keys in order and re-weights the scores of key i by
    how well the displacement from the previously decided key matches the
    sensor-predicted movement (gaussian_rbf), deciding each key greedily.

    Besides its arguments this function reads the globals audio_scaler,
    sensor_scaler, encoder, sensor_models and (through gaussian_rbf) c.

    Args:
        models: Audio models passed to predict_ensemble.
        audio_data_test: Per-recording lists of unscaled audio feature vectors.
        audio_labels_test: Per-recording sequences of true key characters.
        sensor_data_test: Per-recording lists of unscaled sensor traces.

    Returns:
        Tuple (audio_acc, fused_acc) of per-key accuracies.
    """
    correct = 0
    correct2 = 0

    for audio_data_group, audio_label_group, sensor_data_group in zip(
        audio_data_test, audio_labels_test, sensor_data_test):
        x = np.array(audio_data_group)
        x = audio_scaler.transform(x)
        y_pred = predict_ensemble(models, x)
        # Audio-only decoding, taken before y_pred is modified below.
        labels_pred = encoder.inverse_transform(y_pred)
        string_pred = ''.join(labels_pred)
        string_true = ''.join(audio_label_group)

        # The class index is used directly as the key digit in key_disp;
        # assumes encoder class i corresponds to digit i -- TODO confirm.
        curr_key = np.argmax(y_pred[0])
        sensor_x = np.array(sensor_data_group)
        sensor_x = sensor_transform(sensor_scaler, sensor_x)
        sensor_y_pred = predict_ensemble(sensor_models, sensor_x)
        for i in range(1, len(audio_label_group)):
            # Movement i - 1 is the transition from key i - 1 to key i.
            disp_probs = np.array([gaussian_rbf(key_disp(curr_key, k), sensor_y_pred[i - 1]) for k in range(10)])
            # In-place re-weighting of the audio scores for key i.
            y_pred[i] *= disp_probs
            curr_key = np.argmax(y_pred[i])

        labels_pred = encoder.inverse_transform(y_pred)
        string_pred2 = ''.join(labels_pred)

        correct += sum(c1 == c2 for c1, c2 in zip(string_pred, string_true))
        correct2 += sum(c1 == c2 for c1, c2 in zip(string_pred2, string_true))

    # The denominator uses the first recording's key count for all recordings,
    # so it is exact only when every recording has the same number of keys.
    audio_acc = correct / len(audio_data_test) / len(audio_data_test[0])
    fused_acc = correct2 / len(audio_data_test) / len(audio_data_test[0])
    return audio_acc, fused_acc

# `c` is the global weight read by gaussian_rbf (and later by log_ll).
c = 1
audio_acc_old, fused_acc_old = evaluate_ensemble_fused_old(models, audio_data_test, audio_labels_test, sensor_data_test)
print('Accuracy (audio only):  {:.1f}%'.format(audio_acc_old * 100))
print('Accuracy (audio+acce):  {:.1f}%'.format(fused_acc_old * 100))

Accuracy (audio only):  88.1%
Accuracy (audio+acce):  91.5%


In [15]:
# Per-recording ("grouped") versions of the scaled inputs and targets, keeping a
# leading recording axis. Stacking with np.array presumably relies on every
# recording having the same number of keys and movements -- TODO confirm.
ag_x_train = np.array([audio_scaler.transform(group) for group in audio_data_train])
sg_x_train = np.array([sensor_transform(sensor_scaler, np.array(group)) for group in sensor_data_train])
ag_y_train = np.array(audio_targets_train)
sg_y_train = np.array(sensor_targets_train)
ag_x_val = np.array([audio_scaler.transform(group) for group in audio_data_val])
sg_x_val = np.array([sensor_transform(sensor_scaler, np.array(group)) for group in sensor_data_val])
ag_y_val = np.array(audio_targets_val)
sg_y_val = np.array(sensor_targets_val)
ag_x_test = np.array([audio_scaler.transform(group) for group in audio_data_test])
sg_x_test = np.array([sensor_transform(sensor_scaler, np.array(group)) for group in sensor_data_test])
ag_y_test = np.array(audio_targets_test)
sg_y_test = np.array(sensor_targets_test)

In [16]:
def key_pos(k):
    """Grid coordinates of one or more keys on the numeric pad.

    Key 0 is at [0, 0]; keys 1-9 fill rows 1-3 with columns -1, 0 and 1.

    Args:
        k: Key index or array/list of key indices (0-9).

    Returns:
        Array [x, y] for a single key, or one [x, y] row per key.
    """
    grid = [[(digit - 1) % 3 - 1, (digit - 1) // 3 + 1] for digit in range(1, 10)]
    key_locs = np.array([[0, 0]] + grid)
    return key_locs[k]

def log_ll(keys_pred, dirs_pred, guess):
    """Score a (possibly partial) key sequence against audio and sensor output.

    The score is the sum of the log of the row-normalised audio scores of the
    guessed keys, minus c times the squared error between the displacements
    implied by consecutive guessed keys and the sensor-predicted movements.
    The weight `c` is read from the module-level global of that name.

    Args:
        keys_pred: Array (num_keys, num_classes) of audio scores. It is NOT
            modified; earlier versions normalised it in place, silently
            altering the caller's array.
        dirs_pred: Array (num_keys - 1, 2) of predicted movements.
        guess: Integer key indices for the first len(guess) positions.

    Returns:
        The combined log-likelihood score.
    """
    # Normalise each row to sum to one, into a new array.
    row_sums = np.sum(keys_pred, axis=1, keepdims=True)
    key_probs_all = keys_pred / row_sums
    key_probs = key_probs_all[range(len(guess)), guess]
    log_key_prob = np.sum(np.log(key_probs))

    key_locs = key_pos(guess)
    # One displacement per pair of consecutive guessed keys.
    disps = np.diff(key_locs, axis=0)
    log_dir_prob = -c * np.sum((disps - dirs_pred[:len(guess) - 1])**2)
    return log_key_prob + log_dir_prob

def max_log_ll(keys_pred, dirs_pred):
    """Search for the key sequence with the highest log_ll score.

    Depth-first search over key prefixes, starting from the per-key argmax as
    the incumbent. A prefix is abandoned when its score does not exceed the
    incumbent's, which relies on log_ll never increasing as a guess is extended
    (each added term is a log of a normalised score or -c times a squared
    error, so this holds for c >= 0).

    Args:
        keys_pred: Array (num_keys, num_classes) of audio scores.
        dirs_pred: Array of predicted movements between consecutive keys.

    Returns:
        Array of key indices for the best sequence found.
    """
    best_guess = np.argmax(keys_pred, axis=1)
    # NOTE: this local shadows the function's own name inside its body.
    max_log_ll = log_ll(keys_pred, dirs_pred, best_guess)
    
    def search_starting_from(guess):
        nonlocal best_guess, max_log_ll
        if len(guess) > 0:
            curr_log_ll = log_ll(keys_pred, dirs_pred, guess)
            # Prune: this prefix already scores no better than the incumbent.
            if curr_log_ll <= max_log_ll:
                return
            
            if len(guess) == len(keys_pred):
                # Always true here because of the early return above.
                if curr_log_ll > max_log_ll:
                    max_log_ll = curr_log_ll
                    best_guess = guess
                    
                return
        
        # Extend the prefix by every possible next key, in index order.
        new_guesses = [np.append(guess, x).astype(int) for x in range(keys_pred.shape[1])]
        for new_guess in new_guesses:
            search_starting_from(new_guess)
        
    search_starting_from([])
    return best_guess

def evaluate_ensemble_fused(audio_models, sensor_models, audio_data, sensor_data, audio_targets):
    """Per-key accuracy of audio-only decoding vs. max_log_ll sequence decoding.

    Args:
        audio_models: Audio classifiers to average.
        sensor_models: Movement regressors to average.
        audio_data: Per-recording scaled audio features.
        sensor_data: Per-recording scaled sensor traces.
        audio_targets: Per-recording one-hot key targets.

    Returns:
        Tuple (audio_acc, fused_acc). Both are normalised by the number of
        recordings times the key count of the first recording.
    """
    num_groups = len(audio_data)
    hits_audio = 0
    hits_fused = 0
    for group_idx in range(min(num_groups, len(sensor_data), len(audio_targets))):
        key_scores = predict_ensemble(audio_models, audio_data[group_idx])
        move_pred = predict_ensemble(sensor_models, sensor_data[group_idx])
        # Audio-only guess is taken before the sequence search runs.
        guess_audio = np.argmax(key_scores, axis=-1)
        guess_fused = max_log_ll(key_scores, move_pred)
        truth = np.argmax(audio_targets[group_idx], axis=-1)

        hits_audio += sum(guess_audio == truth)
        hits_fused += sum(guess_fused == truth)

    keys_per_group = len(audio_data[0])
    audio_acc = hits_audio / num_groups / keys_per_group
    fused_acc = hits_fused / num_groups / keys_per_group
    return audio_acc, fused_acc

In [17]:
def max_log_ll_top_k(keys_pred, dirs_pred, k):
    """Search for up to k key sequences with the highest log_ll scores.

    Same depth-first search as max_log_ll, but it keeps a list of the best
    candidates, sorted ascending by score so that index 0 is the weakest one
    currently kept. Pruning starts only once k candidates are held.

    Args:
        keys_pred: Array (num_keys, num_classes) of audio scores.
        dirs_pred: Array of predicted movements between consecutive keys.
        k: Maximum number of candidates to return.

    Returns:
        List of at most k tuples (score, visit_counter, guess), ascending by
        score; the per-key argmax sequence is the initial entry.
    """
    best_guess = np.argmax(keys_pred, axis=1)
    max_log_ll = log_ll(keys_pred, dirs_pred, best_guess)
    best_guesses = [(max_log_ll, 0, best_guess)]
    # Counts visited nodes; stored in each tuple so that sorting ties on the
    # score are resolved by this integer before the guess arrays are compared.
    iterations = 0
    
    def search_starting_from(guess):
        nonlocal best_guesses, iterations
        iterations += 1
        if len(guess) > 0:
            curr_log_ll = log_ll(keys_pred, dirs_pred, guess)
            # Prune only when the list is full and this prefix cannot beat the
            # weakest kept candidate.
            if len(best_guesses) == k and curr_log_ll <= best_guesses[0][0]:
                return
            
            if len(guess) == len(keys_pred):
                # Skip the argmax sequence: it is already in the list.
                if (len(best_guesses) < k or curr_log_ll > best_guesses[0][0]) and not np.array_equal(guess, best_guess):
                    best_guesses.append((curr_log_ll, iterations, guess))
                    best_guesses = sorted(best_guesses)[-k:]
                    
                return
        
        # Extend the prefix by every possible next key, in index order.
        new_guesses = [np.append(guess, x).astype(int) for x in range(keys_pred.shape[1])]
        for new_guess in new_guesses:
            search_starting_from(new_guess)
        
    search_starting_from([])
    return best_guesses

def evaluate_ensemble_fused_top_k(audio_models, sensor_models, audio_data, sensor_data, audio_targets, k):
    """Share of recordings whose full true sequence is among the top-k guesses.

    Args:
        audio_models: Audio classifiers to average.
        sensor_models: Movement regressors to average.
        audio_data: Per-recording scaled audio features.
        sensor_data: Per-recording scaled sensor traces.
        audio_targets: Per-recording one-hot key targets.
        k: Number of candidate sequences requested from max_log_ll_top_k.

    Returns:
        Number of matching candidates divided by the number of recordings.
    """
    fused_correct = 0
    for audio_x, sensor_x, audio_y in zip(audio_data, sensor_data, audio_targets):
        key_scores = predict_ensemble(audio_models, audio_x)
        move_pred = predict_ensemble(sensor_models, sensor_x)
        candidates = max_log_ll_top_k(key_scores, move_pred, k)
        truth = np.argmax(audio_y, axis=-1)

        # Each candidate is (score, visit counter, key sequence).
        matches = [np.all(candidate[2] == truth) for candidate in candidates]
        fused_correct += sum(matches)

    return fused_correct / len(audio_data)

# Sequence-level fusion on the held-out test recordings. `k` is rebound here
# (it was 3 for the top-k classifier accuracy above). The top-k figure counts a
# recording as correct when any returned candidate equals the full true sequence.
k = 5
audio_acc, fused_acc = evaluate_ensemble_fused(models, sensor_models, ag_x_test, sg_x_test, ag_y_test)
fused_acc_top_k = evaluate_ensemble_fused_top_k(models, sensor_models, ag_x_test, sg_x_test, ag_y_test, k)
print('Accuracy (audio only, individual key):         {:.1f}%'.format(audio_acc * 100))
print('Accuracy (audio+acce, MLTS, individual key):   {:.1f}%'.format(fused_acc * 100))
print('Accuracy (audio+acce, MLTS, full PIN, top {}):  {:.1f}%'.format(k, fused_acc_top_k * 100))

Accuracy (audio only, individual key):         88.1%
Accuracy (audio+acce, MLTS, individual key):   93.7%
Accuracy (audio+acce, MLTS, full PIN, top 5):  90.7%


In [18]:
# Second dataset, used only for testing.
test_folder = 'datasets/atm_pad_7(11,acce)'
# NOTE: these rebind the module-level peak-detection settings that
# get_data_from_folder reads, so re-running the training-data load after this
# cell would use these values instead of the originals.
peak_thresh = 0.8
peak_sep_sec = 0.2

print('Loading test data...')
# NOTE: overwrites the *_groups variables that held the training-folder data.
audio_data_groups, audio_label_groups, sensor_data_groups, sensor_target_groups = get_data_from_folder(test_folder, debug_level=1)

# Reuse the encoder and scalers fitted on the training data. The ag_*/sg_* test
# arrays from the earlier split are replaced by this dataset.
audio_target_groups = [encoder.transform(group) for group in audio_label_groups]
ag_x_test = np.array([audio_scaler.transform(group) for group in audio_data_groups])
sg_x_test = np.array([sensor_transform(sensor_scaler, np.array(group)) for group in sensor_data_groups])
ag_y_test = np.array(audio_target_groups)
sg_y_test = np.array(sensor_target_groups)

print('ag_x_test shape:', ag_x_test.shape)
print('ag_y_test shape:', ag_y_test.shape)
print('sg_x_test shape:', sg_x_test.shape)
print('sg_y_test shape:', sg_y_test.shape)

Loading test data...


HBox(children=(FloatProgress(value=0.0, max=11.0), HTML(value='')))


0 of 11 (0.0%) samples discarded
ag_x_test shape: (11, 6, 512)
ag_y_test shape: (11, 6, 10)
sg_x_test shape: (11, 5, 256, 3)
sg_y_test shape: (11, 5, 2)


In [19]:
# Flatten the grouped arrays of the second dataset back to one row per movement.
# NOTE: this overwrites sensor_x_test / sensor_y_test from the original split.
sensor_x_test, sensor_y_test = flatten_groups(sg_x_test, sg_y_test)

score, loss = evaluate_ensemble_reg(sensor_models, sensor_x_test, sensor_y_test)
print('Test accuracy: {:.1f}%'.format(score * 100))
print('Test loss: {:.3g}\n'.format(loss))

# Rounded accuracy of each sensor ensemble member on the second dataset.
for i, model in enumerate(sensor_models):
    score = model.evaluate(sensor_x_test, sensor_y_test, verbose=0)
    print('Model {} test accuracy: {:.1f}%'.format(i + 1, score[1] * 100))

Test accuracy: 54.5%
Test loss: 0.427

Model 1 test accuracy: 56.6%
Model 2 test accuracy: 43.8%
Model 3 test accuracy: 56.6%
Model 4 test accuracy: 41.8%
Model 5 test accuracy: 47.5%


In [20]:
# Compare two weights for the movement term. `c` is the module-level global read
# by log_ll, so it must be set before each evaluation call.
c = 1
audio_acc, fused_acc_1 = evaluate_ensemble_fused(models, sensor_models, ag_x_test, sg_x_test, ag_y_test)

c = 100
# The audio-only accuracy does not depend on c, so it is discarded here.
_, fused_acc_100 = evaluate_ensemble_fused(models, sensor_models, ag_x_test, sg_x_test, ag_y_test)
print('Accuracy (audio only):              {:.1f}%'.format(audio_acc * 100))
print('Accuracy (audio+acce, MLTS, c=1):   {:.1f}%'.format(fused_acc_1 * 100))
print('Accuracy (audio+acce, MLTS, c=100): {:.1f}%'.format(fused_acc_100 * 100))

  del sys.path[0]


Accuracy (audio only):              12.1%
Accuracy (audio+acce, MLTS, c=1):   19.7%
Accuracy (audio+acce, MLTS, c=100): 27.3%
