## Setup

In [1]:
import os
from copy import deepcopy
# Blank out CUDA_VISIBLE_DEVICES before torch is imported further down
# (presumably so CUDA sees no GPUs and everything runs on the CPU -- confirm).
os.environ["CUDA_VISIBLE_DEVICES"] = ""

import numpy as np
import sklearn.decomposition
import matplotlib.pyplot as plt
from hdf5storage import loadmat, savemat

import torch
import torch.nn as nn
import torch.nn.functional as F

from api.preprocessing import sound2coch
from api.librispeech import LibriDataset
from api.model import SpeechRecognitionCTC

# Compute device used throughout the notebook: fixed to the CPU.
device = torch.device('cpu')
print(str(device))

cpu


## Define model

In [2]:
# Configure LibriDataset at class level (label mode and spacing flag) before
# asking it for the vocabulary size.
LibriDataset.set_mode('phoneme')  # Should be one of 'grapheme', 'phoneme', 'word'
LibriDataset.set_spacing(True)

# Size of the label vocabulary for the mode selected above, and the frequency
# dimension that is later handed to the model constructor as `freq_bins`.
size_vocab = LibriDataset.vocab_size()
freq_bins = 65
print('Frequency dimension: {}, alphabet size: {}'.format(freq_bins, size_vocab))

Frequency dimension: 65, alphabet size: 42


In [3]:
# Recurrent stack size: number of layers and hidden units per layer.
N_LAYER = 3
N_NODES = 500

# Identifier built from the dataset mode and the stack size; it is embedded in
# the checkpoint path below.
model_id = '{}-spaced-{}-{}'.format(LibriDataset.MODE, N_LAYER, N_NODES)

# Keyword arguments forwarded to the model constructor.
model_arch = {
    'rnn_hidden_size': N_NODES,
    'nb_layers': N_LAYER,
    'window_size': 10,
    'rnn_stride': 2,
}

# Checkpoint path without its file extension.
model_name = f'models/model-ctc-{model_id}'

print(model_id)

phoneme-spaced-3-500


In [4]:
# Build the recognizer with nn.GRU as its recurrent type, the dataset alphabet
# as labels and the architecture settings from `model_arch`, then move it to
# `device`.
model = SpeechRecognitionCTC(
    rnn_type=nn.GRU,
    labels=LibriDataset.alphabet(),
    **model_arch,
    freq_bins=freq_bins,
).to(device)

# Restore the saved weights (mapped onto `device`) and switch to eval mode.
# NOTE(review): torch.load unpickles the file -- only load trusted checkpoints.
checkpoint_path = model_name + '.pt'
model.load_state_dict(torch.load(checkpoint_path, map_location=device))
# The trailing ';' keeps the notebook from echoing the value returned by eval().
model.eval();

## Inference

In [5]:
# Run one LibriSpeech recording through the model's own helper methods.
sound_file = '/archive/menoua/Data/LibriSpeech/LibriSpeech/train-clean-100/1098/133695/1098-133695-0005.flac'
cochleagram = sound2coch(sound_file)  # model input computed from the audio file
alphabet = LibriDataset.alphabet()    # label set; not used further in this cell

# Prediction first, then activations, both from the same input.
pred = model.get_prediction(cochleagram)
actv = model.get_activation(cochleagram)

# Show the predicted label sequence.
print(pred)

IH Z AH B EH L <SPACE> P R ER Z AH N T L IY <SPACE> F AW N D <SPACE> HH ER S EH L F <SPACE> IH N <SPACE> DH AH <SPACE> S IH NG G Y AH L ER <SPACE> S IH CH UW EY SH AH N <SPACE> AH V <SPACE> D IH F V EH N D IH NG <SPACE> DH AH <SPACE> B R IH T IH SH <SPACE> K AA N S T AH T UW SH AH N <SPACE> AH G EH N S T <SPACE> HH ER <SPACE> AE N T


## Custom inference

In [6]:
# Disabled: fits one TruncatedSVD (65 components, 25 iterations) per layer and
# collects the fitted objects in `trans`.
# NOTE(review): `nb_layers` and `acts_libri` are not defined in any cell shown
# above -- define them before re-enabling this cell.
# trans = []
# for layer in range(nb_layers):
#     svd = sklearn.decomposition.TruncatedSVD(n_components=65, n_iter=25)
#     svd.fit(acts_libri[layer])
#     trans.append(svd)

In [10]:
# Manual version of the inference cell: run the model on one recording, decode
# the output greedily (per-frame argmax, merge repeats, drop index 0), print the
# decoded sequence, and collect the per-layer activations as float32 arrays.
alphabet = LibriDataset.alphabet()
sound_file = '/archive/menoua/Data/LibriSpeech/LibriSpeech/train-clean-100/1098/133695/1098-133695-0005.flac'
cochleagram = sound2coch(sound_file)

model.eval()
with torch.no_grad():
    # Add a leading batch axis of size 1, then cast and move in a single call.
    xs = torch.Tensor(cochleagram).unsqueeze(0)
    xs = xs.to(device=device, dtype=torch.float32)
    # Length of the single input along axis 1 of `xs`.
    xlen = torch.LongTensor([xs.shape[1]])
    z, zlen = model.predict(xs, xlen)

    # Per-frame argmax as plain Python ints, so the decoding below does not
    # compare and index with 0-dim tensors one element at a time.
    # NOTE(review): squeeze() drops every size-1 axis; assumes batch size 1 and
    # more than one output frame -- confirm against model.predict.
    frame_ids = z.squeeze().argmax(dim=1).tolist()

    # Merge runs of identical consecutive ids in one pass (the previous
    # slice-and-rebuild loop was quadratic in the number of frames).
    zi = [cur for prev, cur in zip([None] + frame_ids, frame_ids) if cur != prev]

    # Index 0 is skipped (presumably the CTC blank -- confirm); every other id
    # is mapped to its symbol.
    zi = [alphabet[idx] for idx in zi if idx > 0]

    # Make the space symbol visible in the printed sequence.
    zi = ['<SPACE>' if sym == LibriDataset.SYM_SPACE else sym for sym in zi]
    zi = ' '.join(zi)

    print(zi)
    print()

    # One array per element returned by model.activations, with the leading
    # batch axis removed. detach().cpu() is a no-op here (CPU, no_grad) but
    # keeps .numpy() valid if `device` is ever switched to a GPU.
    activation = model.activations(xs)
    activation = [x.squeeze(0).detach().cpu().numpy().astype('float32') for x in activation]
    # activation = [trans[layer].transform(activation[layer]) for layer in range(nb_layers)]

IH Z AH B EH L <SPACE> P R ER Z AH N T L IY <SPACE> F AW N D <SPACE> HH ER S EH L F <SPACE> IH N <SPACE> DH AH <SPACE> S IH NG G Y AH L ER <SPACE> S IH CH UW EY SH AH N <SPACE> AH V <SPACE> D IH F V EH N D IH NG <SPACE> DH AH <SPACE> B R IH T IH SH <SPACE> K AA N S T AH T UW SH AH N <SPACE> AH G EH N S T <SPACE> HH ER <SPACE> AE N T



In [11]:
# Print the shape of every activation array on one line, separated by spaces.
print(*(str(layer_act.shape) for layer_act in activation))

(348, 500) (348, 500) (348, 500)


In [9]:
# Disabled: would write `acts_neural` and `stim_neural` to a .mat file named
# after `model_id`, using hdf5storage's savemat.
# NOTE(review): `acts_neural` and `stim_neural` are not defined in any cell
# shown here -- confirm where they come from before re-enabling.
# savemat(f'activations-ctc-{model_id}-100hz-20ms.mat', mdict={'acts_neural': acts_neural, 'stim_neural': stim_neural})