In [1]:
import numpy as np
import scipy.io.wavfile as sio
import os

In [2]:
# Corpus data directory, relative to the notebook's working directory.
# Each sub-directory is scanned for "wav" and "lab" folders, and the exported
# .npy arrays are written here as well.
ROOT = "./datasets/clarin-long/data"

In [3]:
def get_fnames_from_clarin_corpus(ROOT):
    """Collect matching recording / transcript paths from a CLARIN corpus tree.

    Every sub-directory of ``ROOT`` that contains a ``wav`` folder is scanned.
    A recording ``wav/<name>.wav`` is kept only when the transcript
    ``lab/<name>.hlb`` exists next to it.

    Files in ``wav`` that do not end in ``.wav`` are skipped, and names that
    contain additional dots (``a.v2.wav``) are handled. Sessions and files are
    visited in sorted order so the result does not depend on the arbitrary
    order returned by ``os.listdir``.

    Args:
        ROOT: Path of the corpus data directory.

    Returns:
        A tuple ``(rec_fnames, trans_fnames)`` of two equally long lists;
        entries at the same position belong to the same utterance.
    """
    wav_ext = ".wav"
    rec_fnames, trans_fnames = [], []
    for session in sorted(os.listdir(ROOT)):
        recordings = os.path.join(ROOT, session, "wav")
        transcripts = os.path.join(ROOT, session, "lab")
        # Also filters out plain files living directly under ROOT.
        if not os.path.isdir(recordings):
            continue
        for fname in sorted(os.listdir(recordings)):
            if not fname.endswith(wav_ext):
                continue
            # Strip only the final extension so dotted names stay intact.
            core = fname[:-len(wav_ext)]
            transcript = os.path.join(transcripts, core + ".hlb")
            if os.path.isfile(transcript):
                rec_fnames.append(os.path.join(recordings, fname))
                trans_fnames.append(transcript)
    return rec_fnames, trans_fnames

# Parallel lists: rec_fnames[k] is the recording whose transcript is trans_fnames[k].
rec_fnames, trans_fnames = get_fnames_from_clarin_corpus(ROOT)

In [5]:
def get_recording_lengths(fnames, expected_sr=16000):
    """Return the number of samples of every WAV file in ``fnames``.

    Each file is read completely and validated on the way: it must be
    single-channel and, unless ``expected_sr`` is ``None``, sampled at
    ``expected_sr`` Hz.

    Args:
        fnames: Iterable of paths to WAV files.
        expected_sr: Required sample rate in Hz (default 16000). Pass ``None``
            to skip the sample-rate check.

    Returns:
        List of sample counts, in the same order as ``fnames``.

    Raises:
        AssertionError: If a file has an unexpected sample rate or more than
            one channel; the message names the offending file.
    """
    lens = []
    for f in fnames:
        sr, data = sio.read(f)
        if expected_sr is not None:
            assert sr == expected_sr, "{}: sample rate is {} Hz, expected {} Hz".format(
                f, sr, expected_sr
            )
        # Multi-channel files come back as 2-D arrays (samples x channels).
        assert data.ndim == 1, "{}: expected mono audio, got array of shape {}".format(
            f, data.shape
        )
        lens.append(len(data))
    return lens

# Sample count per recording, aligned with rec_fnames (reads every WAV file once).
lens = get_recording_lengths(rec_fnames)

In [6]:
# Sort the recording lengths so cut points can be taken at evenly spaced positions.
sorted_lens = np.array(sorted(lens))
# Distance, in recordings, between two consecutive cut points (ten strata).
step = len(sorted_lens) // 10
# Positions in `sorted_lens` at the multiples 0..10 of `step`.
delimiters = [k * step for k in range(11)]
# Recording length found at the first ten of those positions.
delimiter_lens = [sorted_lens[pos] for pos in delimiters[:10]]

In [7]:
# Close the last stratum with a sentinel above the longest recording.
# The list is rebuilt from its first ten entries instead of appended to, so
# re-running this cell does not keep growing `delimiter_lens`.
# NOTE(review): the margin of 128 presumably mirrors the hop length used by
# the spectral callbacks further down; confirm.
delimiter_lens = list(delimiter_lens[:10]) + [sorted_lens[-1] + 128]

# Convert once instead of twice per loop iteration.
lens_arr = np.array(lens)

# indices[i] is a boolean mask over the recordings whose length falls into
# the half-open interval [delimiter_lens[i], delimiter_lens[i + 1]).
indices = []
for i in range(10):
    lb, hb = delimiter_lens[i], delimiter_lens[i + 1]
    indices.append((lb <= lens_arr) & (lens_arr < hb))

In [8]:
def get_phones_clarin(fname):
    """Extract the phone labels listed in a transcript file.

    The file is read as UTF-8. Only the text after the first
    ``"Phoneme Phoneme"`` marker and before the next blank line is used.
    From every non-blank line of that section the second space-separated
    field is taken and cut at its first underscore.

    Args:
        fname: Path of the transcript file.

    Returns:
        List of phone label strings in file order.

    Raises:
        IndexError: If the marker is missing or a line has no second field.
    """
    with open(fname, "r", encoding="utf-8") as handle:
        contents = handle.read()

    after_marker = contents.split("Phoneme Phoneme")[1]
    phoneme_section = after_marker.split("\n\n")[0]

    phones = []
    for row in phoneme_section.split('\n'):
        if not row.strip():
            continue
        label = row.split(' ')[1]
        # Keep only the part in front of the first underscore.
        phones.append(label.split('_')[0])
    return phones

# Lazy generator: a transcript is only parsed when the generator is consumed
# while building trans_lens below.
transes = (get_phones_clarin(x) for x in trans_fnames)
# Collects every distinct phone label; filled as a side effect of len_add_set.
all_phones = set()

def len_add_set(trans):
    """Register the phones of one transcript and report its length.

    Side effect: every label in ``trans`` is added, in place, to the
    module-level set ``all_phones``.

    Args:
        trans: Sequence of phone labels.

    Returns:
        Number of labels in ``trans`` (duplicates included).
    """
    all_phones.update(trans)
    return len(trans)

# Consuming the generator parses each transcript once; trans_lens[k] is the
# phone count of trans_fnames[k] and all_phones ends up holding the inventory.
trans_lens = [len_add_set(x) for x in transes]
# Display: phone inventory, longest transcript (in phones), inventory size.
list(all_phones), max(trans_lens), len(list(all_phones))

(['dz',
  'b',
  'zi',
  'dZ',
  'Z',
  't',
  'f',
  'ts',
  'i',
  'I',
  'o',
  'si',
  'j',
  's',
  'g',
  'on',
  'r',
  'ni',
  'sil',
  'z',
  'a',
  'v',
  'e',
  'd',
  'm',
  'k',
  'u',
  'S',
  'w',
  'x',
  'l',
  'tsi',
  'n',
  'tS',
  'dzi',
  'p',
  'en'],
 423,
 37)

In [10]:
import librosa


# Phone inventory in a fixed, sorted order. Iterating a set of strings directly
# yields an order that can change between interpreter runs (string hash
# randomisation), which would make the integer labels written below
# irreproducible.
phone_dict = sorted(all_phones)
# Label one past the last real phone; used to pad transcripts.
phone_zero = len(phone_dict)
# Constant-time phone -> label lookup (replaces repeated list.index scans).
phone_index = {phone: label for label, phone in enumerate(phone_dict)}

# Feature extractor applied to every waveform; must return an array of shape
# (frames, BIN_SIZE). The active variant keeps the raw samples as one column.
callback = lambda x: x.reshape(-1, 1)

# Alternative extractors (set BIN_SIZE accordingly):
# callback = lambda x: np.log(np.abs(librosa.stft(x, n_fft=512, hop_length=128).T) ** 2 + 2e-12)
# callback = lambda x: librosa.feature.mfcc(S=librosa.feature.melspectrogram(y=x, sr=16000, n_fft=512, hop_length=128), sr=16000).T

BIN_SIZE = 1
# BIN_SIZE = 257
# BIN_SIZE = 20
# Exclusive upper length bound of each stratum; sizes the padded buffers.
LENGTHS = list(delimiter_lens[1:])

for ix, stratum in enumerate(indices):
    ixes = np.where(stratum)[0]
    shape = [len(ixes), LENGTHS[ix] + 4, BIN_SIZE]
    phones_shape = [len(ixes), max(trans_lens[x] for x in ixes) + 1]
    # Release the previous stratum's buffers before allocating the next ones
    # so two strata are never held in memory at once.
    specs, transes = None, None
    specs = np.zeros(shape, np.float32)
    transes = np.ones(phones_shape, np.uint16) * phone_zero
    for num, rec_ix in enumerate(ixes):
        fname = rec_fnames[rec_ix]
        # Scale to roughly [-1, 1); assumes 16-bit PCM samples - TODO confirm.
        data = sio.read(fname)[1].astype(np.float32) / 2**15
        stft = callback(data)
        # Rows beyond the recording's own length stay zero (padding).
        specs[num, :stft.shape[0], :BIN_SIZE] = stft
        trans = get_phones_clarin(trans_fnames[rec_ix])
        trans = np.array([phone_index[x] for x in trans])
        # Positions beyond the transcript keep the phone_zero padding label.
        transes[num, :len(trans)] = trans
    np.save(os.path.join(ROOT, "clarin-mfcc-rec-pure-timedomain-{}".format(ix)), specs)
    np.save(os.path.join(ROOT, "clarin-mfcc-trans-pure-timedomain-{}".format(ix)), transes)
    print("Saved batch", ix)

Saved batch 0
Saved batch 1
Saved batch 2
Saved batch 3
Saved batch 4
Saved batch 5
Saved batch 6
Saved batch 7
Saved batch 8
Saved batch 9


In [139]:
! top

[?1h=[H[2J[mtop - 10:56:44 up 1 day, 16:41,  0 users,  load average: 1.33, 1.59, 1.68[m[m[m[m[K
Tasks:[m[m[1m   6 [m[mtotal,[m[m[1m   1 [m[mrunning,[m[m[1m   5 [m[msleeping,[m[m[1m   0 [m[mstopped,[m[m[1m   0 [m[mzombie[m[m[m[m[K
%Cpu(s):[m[m[1m  2.1 [m[mus,[m[m[1m  0.3 [m[msy,[m[m[1m  0.0 [m[mni,[m[m[1m 97.6 [m[mid,[m[m[1m  0.0 [m[mwa,[m[m[1m  0.0 [m[mhi,[m[m[1m  0.0 [m[msi,[m[m[1m  0.0 [m[mst[m[m[m[m[K
KiB Mem :[m[m[1m 32931504 [m[mtotal,[m[m[1m 16264216 [m[mfree,[m[m[1m   488724 [m[mused,[m[m[1m 16178564 [m[mbuff/cache[m[m[m[m[K
KiB Swap:[m[m[1m        0 [m[mtotal,[m[m[1m        0 [m[mfree,[m[m[1m        0 [m[mused.[m[m[1m 31869656 [m[mavail Mem [m[m[m[m[K
[K
[7m  PID USER      PR  NI    VIRT    RES    SHR S  %CPU %MEM     TIME+ COMMAND     [m[m[K
[m   15 root      20   0  329408  73140  13128 S   6.7  0.2   1:05.93 jupyter-no+ [m[m[K
[m  9

In [3]:
import librosa
# Scratch check: display where the mfcc function lives inside librosa.
librosa.feature.mfcc

<function librosa.feature.spectral.mfcc>

In [24]:
# Interactive IPython help lookup; a development leftover that does not affect any result.
librosa.feature.mfcc?

In [5]:
import numpy as np

In [16]:
# Dummy signal for shape experiments: 32000 Gaussian noise samples, i.e. two
# seconds at the 16 kHz rate passed to librosa in the cells below.
# NOTE(review): the name shadows the stdlib `time` module and no seed is set;
# harmless here because only output shapes are inspected.
time = np.random.normal(size=[32000,])

In [15]:
# MFCC shape with librosa's default framing. The audio is passed as `y=`
# because recent librosa releases accept it as a keyword argument only.
librosa.feature.mfcc(y=time, sr=16000).shape

(20, 63)

In [23]:
# Only the shape is of interest (the recorded output is a shape tuple);
# without `.shape` the cell would dump the whole complex STFT matrix.
librosa.stft(y=time, n_fft=512).shape

(257, 251)

In [29]:
# MFCCs computed from an explicit mel spectrogram (n_fft=512, hop 128), the
# same recipe as the commented-out MFCC callback of the export cell. The audio
# is passed as `y=` because recent librosa releases require a keyword here.
librosa.feature.mfcc(S=librosa.feature.melspectrogram(y=time, sr=16000, n_fft=512, hop_length=128), sr=16000).shape

(20, 251)

In [16]:
# Read one raw label file in full to inspect its format.
# NOTE(review): absolute, machine-specific path - presumably the same corpus
# that ROOT points to; confirm and derive the path from ROOT instead.
# NOTE(review): this is a ".plb" file, whereas get_phones_clarin is fed ".hlb"
# files; confirm which format is the intended one.
with open("/pictec/datasets/clarin-long/data/SES0001/lab/sent001.plb", encoding="utf-8") as f:
    text = f.read()

In [18]:
# Dump the whole label file; each data line prints as "<number> <number> <phone>".
print(text)

#
0 100 #
0.52 100 sil
0.59 100 s
0.66 100 t
0.66 100 sil
0.69 100 f
0.75 100 j
0.8 100 e
0.86 100 r
0.91 100 dz
0.98 100 a
1.04 100 m
1.26 100 sil
1.38 100 Z
1.44 100 e
1.55 100 s
1.59 100 e
1.65 100 n
1.73 100 a
1.79 100 t
1.83 100 o
1.92 100 r
2.08 100 s
2.14 100 t
2.19 100 a
2.25 100 ni
2.3 100 i
2.42 100 s
2.46 100 w
2.51 100 a
2.56 100 f
2.56 100 sil
2.72 100 x
2.76 100 u
2.83 100 s
2.83 100 sil
2.94 100 k
3.0 100 o
3.06 100 f
3.16 100 s
3.24 100 k
3.4 100 i
3.57 100 sil
3.72 100 z
3.72 100 sil
3.9 100 w
3.95 100 o
3.95 100 sil
4.04 100 Z
4.1 100 I
4.28 100 w
4.28 100 sil
4.45 100 si
4.49 100 l
4.53 100 u
4.59 100 b
4.66 100 o
4.72 100 v
4.8 100 a
4.88 100 ni
4.95 100 e
5.66 100 sil
5.72 100 p
5.77 100 a
5.85 100 ni
5.88 100 e
5.99 100 s
6.02 100 e
6.05 100 n
6.11 100 a
6.11 100 sil
6.19 100 t
6.25 100 o
6.32 100 Z
6.37 100 e
6.45 100 g
6.5 100 r
6.56 100 a
6.64 100 t
6.68 100 u
6.73 100 l
6.73 100 sil
6.82 100 u
6.86 100 j
6.93 100 e
7.71 100 sil
7.79 100 o
7.86 100 k
7.9 100 l
