In [2]:
import numpy as np

def x(X, order=24):
    """Fit linear-prediction coefficients to the 1-D signal ``X``.

    Every sample ``X[t]`` (for ``t >= order``) is modelled as a linear
    combination of the ``order`` samples preceding it, and the weights are
    found by least squares.

    Parameters
    ----------
    X : array-like, 1-D
        The signal to analyse. It must contain more than ``order`` samples.
    order : int, optional
        Number of past samples used to predict the next one (default 24).

    Returns
    -------
    numpy.ndarray of shape ``(order,)``
        Minimum-norm least-squares predictor weights; ``coeffs[k]`` multiplies
        ``X[t - order + k]``.

    Raises
    ------
    ValueError
        If ``X`` is not one-dimensional or is too short to form a single
        (window, target) pair.
    """
    signal = np.asarray(X, dtype=float)
    if signal.ndim != 1:
        raise ValueError("X must be one-dimensional")
    # Use every available (window, next-sample) pair, including the last one.
    n_rows = len(signal) - order
    if n_rows < 1:
        raise ValueError("X must contain more than `order` samples")
    windows = np.stack([signal[start:start + order] for start in range(n_rows)])
    targets = signal[order:]
    # lstsq solves the same problem as pinv(A.T @ A) @ A.T @ b without
    # squaring the condition number of the design matrix.
    coeffs = np.linalg.lstsq(windows, targets, rcond=None)[0]
    return coeffs

In [3]:
import numpy as np
import scipy.io.wavfile as sio
import os

In [4]:
# Corpus root directory. The loader below expects every sub-directory of it
# to hold a "wav" folder (recordings) and a "lab" folder (transcripts).
ROOT = "./datasets/clarin-long/data"

In [5]:
def get_fnames_from_clarin_corpus(ROOT):
    """Collect matching recording / transcript paths from the corpus tree.

    Each sub-directory of ``ROOT`` is expected to contain a ``wav`` folder
    with recordings and a ``lab`` folder with transcripts. A recording is kept
    only when a transcript with the same base name and the ``.hlb`` extension
    exists.

    Parameters
    ----------
    ROOT : str
        Path of the corpus root directory.

    Returns
    -------
    (list of str, list of str)
        Recording paths and transcript paths; element ``k`` of both lists
        refers to the same utterance. Entries are in sorted directory / file
        name order so repeated runs yield the same ordering.

    Raises
    ------
    AssertionError
        If a file inside a ``wav`` folder does not have the ``.wav`` extension.
    """
    rec_fnames, trans_fnames = [], []
    # os.listdir returns entries in arbitrary order; sort for reproducibility.
    for session in sorted(os.listdir(ROOT)):
        session_dir = os.path.join(ROOT, session)
        if not os.path.isdir(session_dir):
            continue
        recordings = os.path.join(session_dir, "wav")
        transcripts = os.path.join(session_dir, "lab")
        for fname in sorted(os.listdir(recordings)):
            # splitext only splits at the last dot, so base names that
            # themselves contain dots no longer break the unpacking.
            core, extension = os.path.splitext(fname)
            assert extension == ".wav", "unexpected file in wav folder: " + fname
            transcript = os.path.join(transcripts, core + ".hlb")
            if os.path.isfile(transcript):
                rec_fnames.append(os.path.join(recordings, fname))
                trans_fnames.append(transcript)
    return rec_fnames, trans_fnames

# Parallel lists: rec_fnames[k] and trans_fnames[k] belong to the same utterance.
rec_fnames, trans_fnames = get_fnames_from_clarin_corpus(ROOT)

In [6]:
def get_recording_lengths(fnames):
    """Return the number of samples in each WAV file listed in ``fnames``.

    Every file is read in full. A file that is not sampled at 16000 Hz, or
    whose data array is not one-dimensional, triggers an ``AssertionError``.
    The result is a list with one length per input path, in input order.
    """
    def _sample_count(path):
        # Read one file and validate it before reporting its length.
        rate, samples = sio.read(path)
        assert rate == 16000
        assert samples.ndim == 1
        return len(samples)

    return [_sample_count(path) for path in fnames]

# Length (in samples) of every recording, in the same order as rec_fnames.
lens = get_recording_lengths(rec_fnames)

In [7]:
# Cut the sorted recording lengths into ten equally populated strata.
# `delimiters` holds the 11 cut positions in the sorted array and
# `delimiter_lens` the length found at each of the first 10 positions
# (i.e. the lower bound of every stratum).
step = len(lens) // 10
sorted_lens = np.sort(np.array(lens))
delimiters = [step * k for k in range(11)]
delimiter_lens = [sorted_lens[pos] for pos in delimiters[:10]]

In [8]:
# Close the last stratum with an exclusive upper bound above the longest
# recording. The list is rebuilt from its first 10 entries instead of being
# appended to, so re-running this cell does not keep growing it.
delimiter_lens = list(delimiter_lens[:10]) + [sorted_lens[-1] + 128]

# One boolean mask per stratum, selecting recordings with lb <= length < hb.
lens_arr = np.array(lens)  # converted once instead of twice per stratum
indices = []
for i in range(10):
    lb, hb = delimiter_lens[i], delimiter_lens[i + 1]
    indices.append((lb <= lens_arr) & (lens_arr < hb))

In [11]:
# Inspect the stratum boundaries (recording lengths in samples).
delimiter_lens

[64269,
 141920,
 166560,
 186240,
 205120,
 223520,
 243600,
 266000,
 293600,
 338880,
 1585088]

In [None]:
def get_phones_clarin(fname):
    """Extract the phone labels from one transcript file.

    The file is read as UTF-8. The text between the ``Phoneme Phoneme`` marker
    and the next blank line is taken; from every non-blank line in it the
    second space-separated field is kept, truncated at its first underscore.
    Returns the resulting labels as a list of strings, in file order.
    """
    with open(fname, "r", encoding="utf-8") as handle:
        contents = handle.read()
    tier = contents.split("Phoneme Phoneme")[1].split("\n\n")[0]
    phones = []
    for row in tier.split('\n'):
        if not row.strip():
            continue
        label = row.split(' ')[1]
        phones.append(label.split('_')[0])
    return phones

# Lazy generator: each transcript file is parsed only when it is consumed.
transes = (get_phones_clarin(x) for x in trans_fnames)
# Accumulator for every distinct phone label seen (filled by len_add_set).
all_phones = set()

def len_add_set(trans):
    """Record the labels of ``trans`` in the global ``all_phones`` set.

    Returns the number of labels in ``trans`` (duplicates included).
    """
    all_phones.update(trans)
    return len(trans)

# Consume the transcript generator: collects the length of every transcript
# and, as a side effect of len_add_set, fills all_phones.
trans_lens = list(map(len_add_set, transes))
list(all_phones), max(trans_lens), len(all_phones)

In [None]:
import librosa

# Number of features stored per frame (length of the vector returned by reg).
NFEAT = 24

# Index -> phone table. Sorted so that the integer id of each phone is the
# same in every kernel session; iterating a set of strings directly gives an
# order that changes with Python's per-process hash randomisation.
phone_dict = sorted(all_phones)
# Id one past the last real phone; used as the padding value in transcripts.
phone_zero = len(phone_dict)

def reg(X, nfeat=None):
    """Fit linear-prediction coefficients to the 1-D frame ``X``.

    Every sample ``X[t]`` (for ``t >= nfeat``) is modelled as a linear
    combination of the ``nfeat`` samples preceding it; the weights are found
    by least squares and serve as the feature vector of the frame.

    Parameters
    ----------
    X : array-like, 1-D
        The frame to analyse. It must contain more than ``nfeat`` samples.
    nfeat : int, optional
        Number of past samples used for the prediction. Defaults to the
        module-level ``NFEAT``.

    Returns
    -------
    numpy.ndarray of shape ``(nfeat,)``
        Minimum-norm least-squares predictor weights; ``coeffs[k]`` multiplies
        ``X[t - nfeat + k]``.

    Raises
    ------
    ValueError
        If ``X`` is not one-dimensional or is too short to form a single
        (window, target) pair.
    """
    if nfeat is None:
        nfeat = NFEAT
    # Fit on the frame that was passed in; a random vector here would make
    # the result independent of the audio.
    signal = np.asarray(X, dtype=float)
    if signal.ndim != 1:
        raise ValueError("X must be one-dimensional")
    # Use every available (window, next-sample) pair, including the last one.
    n_rows = len(signal) - nfeat
    if n_rows < 1:
        raise ValueError("X must contain more than `nfeat` samples")
    windows = np.stack([signal[start:start + nfeat] for start in range(n_rows)])
    targets = signal[nfeat:]
    # lstsq solves the same problem as pinv(A.T @ A) @ A.T @ b without
    # squaring the condition number of the design matrix.
    return np.linalg.lstsq(windows, targets, rcond=None)[0]

def callback(data):
    """Turn a 1-D signal into a (frames, NFEAT) feature matrix.

    The signal is cut into frames of 512 samples taken every 128 samples, and
    ``reg`` is applied to each frame; only complete frames are used.

    Parameters
    ----------
    data : 1-D sequence supporting ``len`` and slicing
        The samples of one recording.

    Returns
    -------
    numpy.ndarray of shape ``(n_frames, NFEAT)``
        One row per frame. ``n_frames`` is 0 when ``data`` is shorter than a
        single frame.
    """
    frame, hop = 512, 128
    # Clamp at zero: for inputs much shorter than one frame the floor
    # division goes negative, which np.zeros would reject.
    length = max(1 + (len(data) - frame) // hop, 0)
    rec = np.zeros([length, NFEAT])
    for i in range(length):
        subbin = data[i * hop : i * hop + frame]
        rec[i, :NFEAT] = reg(subbin)
    return rec

# Alternative feature extractors tried earlier (kept for reference):
# callback = lambda x: np.log(np.abs(librosa.stft(x, n_fft=512, hop_length=128).T) ** 2 + 2e-12)
# callback = lambda x: librosa.feature.mfcc(S=librosa.feature.melspectrogram(y=x, sr=16000, n_fft=512, hop_length=128), sr=16000).T

# N_SIZE = 1
# BIN_SIZE = 257
# BIN_SIZE = 20

# Frame count corresponding to the upper length bound of every stratum.
# The entries of delimiter_lens are sample counts (integers), so they are
# used directly; calling len() on them raises TypeError.
LENGTHS = [1 + (int(x) - 512) // 128 for x in delimiter_lens[1:]]

# Phone -> integer id, built once instead of scanning phone_dict per phone.
phone_index = {phone: idx for idx, phone in enumerate(phone_dict)}

# For every stratum, build one zero-padded feature tensor and one padded
# transcript matrix, then save both as .npy files next to the corpus.
for ix, stratum in enumerate(indices):
    ixes = np.where(stratum)[0]
    if len(ixes) == 0:
        # Nothing to export; max() below would fail on an empty sequence.
        print("Skipping empty batch", ix)
        continue
    # NOTE(review): the reason for the 4 extra frames is not visible here.
    shape = [len(ixes), LENGTHS[ix] + 4, NFEAT]
    print(shape)
    phones_shape = [len(ixes), max(trans_lens[x] for x in ixes) + 1]
    # Drop the previous batch before allocating the next one.
    specs, transes = None, None
    specs = np.zeros(shape, np.float32)
    # Transcripts are padded with phone_zero.
    transes = np.full(phones_shape, phone_zero, np.uint16)
    for num, rec_ix in enumerate(ixes):
        print(num)
        fname = rec_fnames[rec_ix]
        # Divide the raw samples by 2**15 after converting to float32.
        data = sio.read(fname)[1].astype(np.float32) / 2**15
        stft = callback(data)
        # print(stft.shape, specs.shape)
        specs[num, :stft.shape[0], :NFEAT] = stft
        trans = get_phones_clarin(trans_fnames[rec_ix])
        trans = np.array([phone_index[x] for x in trans])
        transes[num, :len(trans)] = trans
    np.save(os.path.join(ROOT, "clarin-mfcc-rec-aligned-PLC-{}".format(ix)), specs)
    np.save(os.path.join(ROOT, "clarin-mfcc-trans-aligned-PLC-{}".format(ix)), transes)
    print("Saved batch", ix)

In [15]:
import gc
# Force a garbage-collection pass; the displayed value is gc.collect()'s return.
# NOTE(review): presumably meant to release the large batch arrays — confirm.
gc.collect()

175

In [None]:
import librosa
# Scratch: display the function object (exploration leftover).
librosa.feature.mfcc

In [None]:
# Scratch: IPython help lookup (`?` is notebook-only syntax, not plain Python).
librosa.feature.mfcc?

In [None]:
import numpy as np

In [None]:
# Two seconds of white Gaussian noise at 16 kHz, used as a dummy test signal.
time = np.random.normal(loc=0.0, scale=1.0, size=32000)

In [None]:
# Shape of the MFCC matrix for the dummy signal. The audio is passed as the
# keyword `y`, which librosa >= 0.10 requires and older versions also accept.
librosa.feature.mfcc(y=time, sr=16000).shape

In [None]:
# Scratch: short-time Fourier transform of the dummy signal with a 512-point FFT.
librosa.stft(time, n_fft=512)

In [None]:
# Shape of the MFCCs computed from an explicit mel spectrogram (512-point FFT,
# hop of 128 samples). The audio is passed as the keyword `y`, which
# librosa >= 0.10 requires and older versions also accept.
librosa.feature.mfcc(S=librosa.feature.melspectrogram(y=time, sr=16000, n_fft=512, hop_length=128), sr=16000).shape

In [None]:
# Scratch: load one raw transcript file for manual inspection.
# NOTE(review): this is an absolute, machine-specific path and a ".plb" file,
# whereas the loader above reads ".hlb" files under ROOT — confirm intended.
with open("/pictec/datasets/clarin-long/data/SES0001/lab/sent001.plb", encoding="utf-8") as f:
    text = f.read()

In [None]:
# Show the raw transcript contents.
print(text)