## Setup

In [1]:
import os
from copy import deepcopy
# Clear CUDA_VISIBLE_DEVICES so no GPU is exposed to this process; this is done
# before `import torch` below so that it is already in effect when PyTorch loads.
os.environ['CUDA_VISIBLE_DEVICES'] = ''

import numpy as np
import sklearn.decomposition
import matplotlib.pyplot as plt
from hdf5storage import loadmat, savemat

import torch
import torch.nn as nn
import torch.nn.functional as F

from api.preprocessing import sound2coch
from api.librispeech import LibriDataset
from api.model import SpeechRecognitionCTC

# Device used for the model and its inputs. It is fixed to the CPU
# (CUDA_VISIBLE_DEVICES is cleared earlier in this cell), so no GPU is used.
device = torch.device("cpu")
print(device)

cpu


## Load trained model

In [2]:
# Frequency dimension of the model input (passed to the model constructor later on).
freq_bins = 65

# Select the transcription unit first, then read the resulting vocabulary size.
LibriDataset.set_mode('grapheme') # Should be one of 'grapheme', 'phoneme', 'word'
size_vocab = LibriDataset.vocab_size()

print(f'Frequency dimension: {freq_bins}, alphabet size: {size_vocab}')

Frequency dimension: 65, alphabet size: 31


In [3]:
# Number of recurrent layers and hidden units per layer.
N_LAYER = 5
N_NODES = 500

# Identifier combining the dataset mode with the network size; it also names the
# checkpoint file under models/.
model_id = f'{LibriDataset.MODE}-{N_LAYER}-{N_NODES}'
model_name = 'models/model-ctc-{:s}'.format(model_id)

# Architecture keyword arguments forwarded to the model constructor.
model_arch = {
    'rnn_hidden_size': N_NODES,
    'nb_layers': N_LAYER,
    'window_size': 10,
    'rnn_stride': 2,
}

print(model_id)

grapheme-5-500


In [4]:
# Instantiate the GRU-based CTC network from the architecture settings and move it to `device`.
model = SpeechRecognitionCTC(
    rnn_type=nn.GRU,
    labels=LibriDataset.alphabet(),
    **model_arch,
    freq_bins=freq_bins,
).to(device)

# Load the saved parameters (mapped onto `device`) and switch to evaluation mode.
# The trailing semicolon suppresses the module repr in the cell output.
checkpoint_path = f'{model_name}.pt'
model.load_state_dict(torch.load(checkpoint_path, map_location=device))
model.eval();

## Inference

In [5]:
# Run the model's own inference helpers on a single local recording.
alphabet = LibriDataset.alphabet()
sound_file = './unattended_story_2.wav'
cochleagram = sound2coch(sound_file)  # model input computed from the audio file

# `pred` is the decoded output printed below. `actv` holds the activations and is
# indexable (actv[0] is inspected in the next cell) — presumably one array per
# layer; confirm against api.model.
pred = model.get_prediction(cochleagram)
actv = model.get_activation(cochleagram)

print(pred)

here
H O W <SPACE> T O <SPACE> I D E N T I F I <SPACE> A <SPACE> B I R D <SPACE> H A V E <SPACE> Y O U <SPACE> H A V E <SPACE> H E R S E E N <SPACE> A <SPACE> B I R D <SPACE> A T <SPACE> Y O U R <SPACE> F E E T E R <SPACE> T H A T <SPACE> Y O U <SPACE> R E A L L Y <SPACE> L I K E <SPACE> B U T <SPACE> J U S T <SPACE> C A N N O T <SPACE> S E E M <SPACE> T O <SPACE> F I G U R E <SPACE> O U T <SPACE> W H A T <SPACE> I T <SPACE> I S <SPACE> Y O U <SPACE> S H O U L D <SPACE> L E A R N <SPACE> H O <SPACE> T O <SPACE> I D E N T I F Y <SPACE> A <SPACE> B I R D <SPACE> A N D <SPACE> I <SPACE> W I L L <SPACE> E X P L A I N <SPACE> H O W <SPACE> T O <SPACE> D O <SPACE> S O <SPACE> F I R S T <SPACE> Y O U <SPACE> N E E <SPACE> T O <SPACE> C A T A G A R E Z E <SPACE> T H E <SPACE> B I R D <SPACE> T H E R E <SPACE> A R E <SPACE> E I T <SPACE> D I F F E R E N T <SPACE> C A B T A G O R I E S <SPACE> T H A T <SPACE> T H E Y K I N <SPACE> G O I N G <SPACE> T O <SPACE> A S <SPACE> S O O N <SPACE> A S <SP

In [6]:
# Shape of the first activation array — presumably (time frames, hidden units); confirm.
actv[0].shape

(8923, 500)

## Custom inference

In [7]:
# NOTE: disabled experiment. It would fit one 65-component TruncatedSVD per layer.
# `nb_layers` and `acts_libri` are not defined in the cells shown in this notebook,
# so this cannot be re-enabled without providing them first.
# trans = []
# for layer in range(nb_layers):
#     svd = sklearn.decomposition.TruncatedSVD(n_components=65, n_iter=25)
#     svd.fit(acts_libri[layer])
#     trans.append(svd)

In [8]:
# Manual greedy CTC decoding and per-layer activation extraction for one
# LibriSpeech utterance, using the lower-level model API.
alphabet = LibriDataset.alphabet()

# The corpus location is machine-specific. Set the LIBRISPEECH_ROOT environment
# variable to point at a local copy; the default is the location this notebook
# was originally run against.
librispeech_root = os.environ.get(
    'LIBRISPEECH_ROOT', '/archive/menoua/Data/LibriSpeech/LibriSpeech')
sound_file = os.path.join(
    librispeech_root, 'train-clean-100', '1098', '133695', '1098-133695-0005.flac')
if not os.path.isfile(sound_file):
    raise FileNotFoundError(
        f'{sound_file} not found; set LIBRISPEECH_ROOT to your LibriSpeech directory')
cochleagram = sound2coch(sound_file)

model.eval()
with torch.no_grad():
    # Add a leading batch axis of size 1; xlen is the length along axis 1 of the batch.
    xs = torch.as_tensor(cochleagram, dtype=torch.float32, device=device).unsqueeze(0)
    xlen = torch.LongTensor([xs.shape[1]])
    z, zlen = model.predict(xs, xlen)

    # Best class per output frame. The class scores are on the last axis; taking
    # the argmax there and flattening also works when only one frame is returned
    # (a bare squeeze() would drop the frame axis in that case).
    best = z.argmax(dim=-1).reshape(-1).tolist()

    # Greedy CTC decoding: merge consecutive repeats in one pass, then drop
    # index 0 (presumably the CTC blank — confirm against LibriDataset.alphabet()).
    collapsed = [idx for pos, idx in enumerate(best)
                 if pos == 0 or idx != best[pos - 1]]
    zi = [alphabet[idx] for idx in collapsed if idx > 0]

    # Render the dataset's space symbol as an actual space and join into one string.
    zi = ''.join(' ' if sym == LibriDataset.SYM_SPACE else sym for sym in zi)

    print(zi)
    print()

    # Activations returned by the model, with the batch axis removed and
    # converted to float32 NumPy arrays. detach().cpu() keeps the conversion
    # valid if `device` is ever changed.
    activation = model.activations(xs)
    activation = [x.squeeze(0).detach().cpu().numpy().astype('float32') for x in activation]
    # activation = [trans[layer].transform(activation[layer]) for layer in range(nb_layers)]



FileNotFoundError: [Errno 2] No such file or directory: '/archive/menoua/Data/LibriSpeech/LibriSpeech/train-clean-100/1098/133695/1098-133695-0005.flac'

In [None]:
# One shape per entry of `activation`, space-separated on a single line.
print(' '.join(str(layer_act.shape) for layer_act in activation))

In [None]:
# NOTE: disabled export. `acts_neural` and `stim_neural` are not defined in the
# cells shown in this notebook, so they must be provided before re-enabling this.
# savemat(f'activations-ctc-{model_id}-100hz-20ms.mat', mdict={'acts_neural': acts_neural, 'stim_neural': stim_neural})