In [5]:
import numpy as np
import scipy.io.wavfile as sio
import os

In [6]:
ROOT = "./datasets/clarin-long/data"

In [7]:
def get_fnames_from_clarin_corpus(ROOT):
    rec_fnames, trans_fnames = [], []
    for i in [x for x in os.listdir(ROOT) if os.path.isdir(os.path.join(ROOT, x))]:
        recordings = os.path.join(ROOT, i, "wav")
        transcripts = os.path.join(ROOT, i, "lab")
        for fname in os.listdir(recordings):
            core, extension = fname.split(".")
            assert extension == "wav"
            if os.path.isfile(os.path.join(transcripts, core + ".plb")):
                rec_fnames.append(os.path.join(recordings, fname))
                trans_fnames.append(os.path.join(transcripts, core + ".plb"))
    return rec_fnames, trans_fnames

rec_fnames, trans_fnames = get_fnames_from_clarin_corpus(ROOT)

In [8]:
def get_recording_lengths(fnames):
    lens = []
    for f in fnames:
        sr, data = sio.read(f)
        assert sr == 16000
        assert len(data.shape) == 1
        lens.append(len(data))
    return lens

lens = get_recording_lengths(rec_fnames)

In [9]:
sorted_lens = np.array(sorted(lens))
step = len(sorted_lens) // 10
delimiter_lens = [sorted_lens[step * i] for i in range(10)]
delimiters = [step * i for i in range(11)]

In [10]:
delimiter_lens.append(sorted_lens[-1] + 128)
indices = []
for i in range(10):
    lb, hb = delimiter_lens[i], delimiter_lens[i+1]
    indices.append((lb <= np.array(lens)) * (np.array(lens) < hb))

In [11]:
timescale = 512 / 16000
hop = 128 / 16000

def get_phones_clarin(fname):
    with open(fname, "r", encoding="utf-8") as f:
        s = f.read()
        s = s.split("\n")[2:]
        s = [(
            int((float(x.split(' ')[0]) - (timescale - hop)) / hop + 0.00001) + 1,
            x.split(' ')[2].split('_')[0]) for x in s if x]
        s = [x for x in s if x[1] != "sil"]
    return s

transes = ([y[1] for y in get_phones_clarin(x)] for x in trans_fnames)
all_phones = set()

def len_add_set(trans):
    global all_phones
    all_phones |= set(trans)
    return len(trans)

trans_lens = [len_add_set(x) for x in transes]
list(all_phones), max(trans_lens), len(list(all_phones))

(['S',
  'g',
  'r',
  'tsi',
  'ts',
  'zi',
  'a',
  'on',
  'o',
  'v',
  'n',
  'm',
  'dz',
  'e',
  's',
  'f',
  'dZ',
  'I',
  'Z',
  'w',
  'i',
  'en',
  'b',
  'ni',
  'x',
  'u',
  'd',
  'l',
  'z',
  't',
  'si',
  'p',
  'k',
  'j',
  'tS',
  'dzi'],
 350,
 36)

In [16]:
trans_lens

[144,
 137,
 135,
 174,
 184,
 161,
 71,
 99,
 203,
 111,
 118,
 155,
 125,
 201,
 212,
 91,
 90,
 192,
 150,
 168,
 184,
 140,
 117,
 152,
 202,
 150,
 164,
 120,
 119,
 138,
 147,
 65,
 136,
 111,
 139,
 198,
 57,
 71,
 163,
 51,
 100,
 72,
 137,
 104,
 117,
 136,
 101,
 81,
 94,
 96,
 78,
 147,
 76,
 70,
 68,
 253,
 93,
 132,
 71,
 85,
 67,
 92,
 210,
 88,
 181,
 113,
 119,
 95,
 128,
 110,
 134,
 99,
 221,
 81,
 65,
 109,
 53,
 100,
 141,
 78,
 76,
 77,
 134,
 105,
 106,
 82,
 149,
 79,
 110,
 89,
 112,
 110,
 115,
 98,
 177,
 155,
 199,
 147,
 118,
 119,
 66,
 106,
 114,
 123,
 138,
 150,
 143,
 134,
 199,
 164,
 183,
 189,
 208,
 201,
 215,
 185,
 99,
 132,
 145,
 105,
 114,
 162,
 161,
 171,
 131,
 139,
 126,
 125,
 170,
 104,
 135,
 73,
 192,
 174,
 190,
 155,
 93,
 179,
 161,
 108,
 123,
 111,
 137,
 152,
 147,
 159,
 190,
 183,
 168,
 165,
 149,
 166,
 117,
 140,
 80,
 99,
 87,
 104,
 45,
 79,
 136,
 79,
 84,
 56,
 75,
 85,
 105,
 109,
 137,
 68,
 159,
 88,
 117,
 87,
 109,
 

In [14]:
import librosa

phone_dict = list(all_phones)
phone_zero = len(phone_dict)

def pcen(S, sr=22050, hop_length=512, gain=0.98, bias=2, power=0.5,
         time_constant=0.400, eps=1e-6, b=None, max_size=1, ref=None,
         axis=-1, max_axis=None):
    """
    Adapted from librosa
    """
    if b is None:
        t_frames = time_constant * sr / float(hop_length)
        # By default, this solves the equation for b:
        #   b**2  + (1 - b) / t_frames  - 2 = 0
        # which approximates the full-width half-max of the
        # squared frequency response of the IIR low-pass filter
        b = (np.sqrt(1 + 4 * t_frames**2) - 1) / (2 * t_frames**2)

    if np.issubdtype(S.dtype, np.complexfloating):
        warnings.warn('pcen was called on complex input so phase '
                      'information will be discarded. To suppress this warning, '
                      'call pcen(np.abs(D)) instead.')
        S = np.abs(S)

    if ref is None:
        if max_size == 1:
            ref = S
        elif S.ndim == 1:
            raise ParameterError('Max-filtering cannot be applied to 1-dimensional input')
        else:
            if max_axis is None:
                if S.ndim != 2:
                    raise ParameterError('Max-filtering a {:d}-dimensional spectrogram '
                                         'requires you to specify max_axis'.format(S.ndim))
                # if axis = 0, max_axis=1
                # if axis = +- 1, max_axis = 0
                max_axis = np.mod(1 - axis, 2)

            ref = scipy.ndimage.maximum_filter1d(S, max_size, axis=max_axis)

    S_smooth = scipy.signal.lfilter([b], [1, b - 1], ref, axis=axis)

    # Working in log-space gives us some stability, and a slight speedup
    smooth = np.exp(-gain * (np.log(eps) + np.log1p(S_smooth / eps)))
    return (S * smooth + bias)**power - bias**power

def callback(data):
    spec = librosa.stft(data, fft_size=512, hop_length=128)
    spec = pcen(spec, sr=16000, hop_length=128)
    return spec

# callback = lambda x: np.log(np.abs(librosa.stft(x, n_fft=512, hop_length=128).T) ** 2) + 2e-12)
# callback = lambda x: librosa.feature.mfcc(S=librosa.feature.melspectrogram(x, sr=16000, n_fft=512, hop_length=128), sr=16000).T

BIN_SIZE = 512
LENGTHS = [1 + (x - 512) // 128 for x in delimiter_lens[1:]]
N_PHONES = len(list(all_phones))

for ix, stratum in enumerate(indices):
    ixes = np.where(stratum)[0]
    shape = [len(ixes), LENGTHS[ix] + 4, BIN_SIZE]
    phones_shape = [len(ixes), (LENGTHS[ix] + 4)  // 2, N_PHONES]
    specs, transes = None, None
    specs = np.zeros(shape, np.float32)
    transes = np.zeros(phones_shape, np.uint16)    
    for num, rec_ix in enumerate(ixes):
        fname = rec_fnames[rec_ix]
        data = sio.read(fname)[1].astype(np.float32) / 2**15
        stft = callback(data)
        specs[num, :stft.shape[0], :BIN_SIZE] = stft
        trans = np.zeros(phones_shape[1:])
        trans_phones = get_phones_clarin(trans_fnames[rec_ix])
        trans_phones = [(x[0], phone_dict.index(x[1])) for x in trans_phones]
        for where, what in trans_phones:
            trans[(where // 2), what] = 1
        transes[num, :, :] = trans
    np.save(os.path.join(ROOT, "clarin-mfcc-rec-aligned-timedomain-{}".format(ix)), specs)
    np.save(os.path.join(ROOT, "clarin-mfcc-trans-aligned-timedomain-{}".format(ix)), transes)
    print("Saved batch", ix)

Saved batch 0
Saved batch 1
Saved batch 2
Saved batch 3
Saved batch 4
Saved batch 5
Saved batch 6
Saved batch 7
Saved batch 8


MemoryError: 

In [15]:
import gc
gc.collect()

175

In [None]:
import librosa
librosa.feature.mfcc

In [None]:
librosa.feature.mfcc?

In [None]:
import numpy as np

In [None]:
time = np.random.normal(size=[32000,])

In [None]:
librosa.feature.mfcc(time, sr=16000).shape

In [None]:
librosa.stft(time, n_fft=512)

In [None]:
librosa.feature.mfcc(S=librosa.feature.melspectrogram(time, sr=16000, n_fft=512, hop_length=128), sr=16000).shape

In [None]:
with open("/pictec/datasets/clarin-long/data/SES0001/lab/sent001.plb", encoding="utf-8") as f:
    text = f.read()

In [None]:
print(text)

In [4]:
import librosa
librosa.__version__

'0.6.0'

In [None]:
def pcen(S, sr=22050, hop_length=512, gain=0.98, bias=2, power=0.5,
         time_constant=0.400, eps=1e-6, b=None, max_size=1, ref=None,
         axis=-1, max_axis=None):
    """
    Adapted from librosa
    """
    if b is None:
        t_frames = time_constant * sr / float(hop_length)
        # By default, this solves the equation for b:
        #   b**2  + (1 - b) / t_frames  - 2 = 0
        # which approximates the full-width half-max of the
        # squared frequency response of the IIR low-pass filter
        b = (np.sqrt(1 + 4 * t_frames**2) - 1) / (2 * t_frames**2)

    if np.issubdtype(S.dtype, np.complexfloating):
        warnings.warn('pcen was called on complex input so phase '
                      'information will be discarded. To suppress this warning, '
                      'call pcen(np.abs(D)) instead.')
        S = np.abs(S)

    if ref is None:
        if max_size == 1:
            ref = S
        elif S.ndim == 1:
            raise ParameterError('Max-filtering cannot be applied to 1-dimensional input')
        else:
            if max_axis is None:
                if S.ndim != 2:
                    raise ParameterError('Max-filtering a {:d}-dimensional spectrogram '
                                         'requires you to specify max_axis'.format(S.ndim))
                # if axis = 0, max_axis=1
                # if axis = +- 1, max_axis = 0
                max_axis = np.mod(1 - axis, 2)

            ref = scipy.ndimage.maximum_filter1d(S, max_size, axis=max_axis)

    S_smooth = scipy.signal.lfilter([b], [1, b - 1], ref, axis=axis)

    # Working in log-space gives us some stability, and a slight speedup
    smooth = np.exp(-gain * (np.log(eps) + np.log1p(S_smooth / eps)))
    return (S * smooth + bias)**power - bias**power