In [40]:
import os
from os.path import join
import glob
import numpy as np
import pandas as pd
import torchaudio
import torch
import matplotlib.pyplot as plt

In [10]:
def get_name(path):
    """Return the last component of ``path`` with its final extension removed.

    Example: ``'datasets/khanty_data/rec_01.wav'`` -> ``'rec_01'``.
    Only the last extension is stripped (``'a.tar.gz'`` -> ``'a.tar'``).
    """
    filename = os.path.basename(path)
    stem, _extension = os.path.splitext(filename)
    return stem

In [7]:
# Every file matching datasets/khanty_data/*.wav, sorted by path string.
paths = glob.glob('datasets/khanty_data/*.wav')
paths.sort()

In [62]:
# Label table parsed from gender.txt with pandas' default CSV settings.
# Later cells read its 'gender' and 'audio' columns.
labels = pd.read_csv('datasets/khanty_data/gender.txt')

In [106]:
def get_mfcc_vector(indices, mfcc_dim=40, data_dir='datasets/khanty_data'):
    """Load recordings by id and stack their MFCC frames into one matrix.

    Each recording is read from ``{data_dir}/{idx}.wav`` and only its first
    channel (``waveform[0]``) is transformed. The per-file MFCC matrices are
    concatenated along dim 1 and the result is transposed.

    Args:
        indices: Iterable of recording ids (file names without ``.wav``).
        mfcc_dim: Number of coefficients, passed to torchaudio as ``n_mfcc``.
        data_dir: Directory holding the ``.wav`` files. The default is the
            location that used to be hard-coded here.

    Returns:
        A 2-D tensor. Given torchaudio's documented ``(n_mfcc, time)`` MFCC
        output for a 1-D waveform, this is ``(total_frames, mfcc_dim)``.

    Raises:
        ValueError: If ``indices`` yields no ids.
    """
    # Build the transform once per distinct sample rate instead of once per
    # file; files sharing a rate reuse the same (stateless) transform.
    transforms_by_rate = {}
    features = []
    for idx in indices:
        waveform, sample_rate = torchaudio.load(f'{data_dir}/{idx}.wav')
        mfcc_transform = transforms_by_rate.get(sample_rate)
        if mfcc_transform is None:
            mfcc_transform = torchaudio.transforms.MFCC(
                sample_rate=sample_rate, n_mfcc=mfcc_dim
            )
            transforms_by_rate[sample_rate] = mfcc_transform
        features.append(mfcc_transform(waveform[0]))

    # Checked after the loop so that any iterable (list, ndarray, generator)
    # is supported; torch.cat on an empty list fails with an obscure message.
    if not features:
        raise ValueError('indices must contain at least one recording id')
    return torch.cat(features, dim=1).T

def normalize(x, dim=None):
    """Centre ``x`` by subtracting its mean.

    Args:
        x: Tensor to centre.
        dim: ``None`` (default) subtracts the single mean over all elements,
            which is the original behaviour. An int or tuple of ints
            subtracts the mean taken along those dims instead; for a
            ``(samples, features)`` matrix, ``dim=0`` removes each feature's
            own mean, which is the centring a PCA-style SVD expects.

    Returns:
        A new tensor with the same shape as ``x``.
    """
    if dim is None:
        return x - x.mean()
    # keepdim=True keeps the reduced dims so the subtraction broadcasts.
    return x - x.mean(dim=dim, keepdim=True)

In [119]:
# Recordings labelled gender == 'F': the last one is held out for testing,
# all earlier ones are used for training.
audio_f = labels.loc[labels['gender'] == 'F', 'audio'].values
indices_train = audio_f[:-1]
indices_test = audio_f[-1:]

x_train = normalize(get_mfcc_vector(indices_train))
x_test = normalize(get_mfcc_vector(indices_test))



In [120]:
# Reduced SVD of the F training matrix; keep the first right-singular vector.
# torch.svd is deprecated in favour of torch.linalg.svd, which returns the
# right-singular vectors as rows (Vh) instead of columns (V).
# NOTE: singular vectors are only defined up to sign, so h and -h describe
# the same direction.
U, S, Vh = torch.linalg.svd(x_train, full_matrices=False)
V = Vh.T  # column layout that torch.svd returned, so `V` keeps its meaning
h_f_train = V[:, 0]
h_f_train

tensor([ 0.9770, -0.1534,  0.0454, -0.0322, -0.0078, -0.0229, -0.0053, -0.0112,
        -0.0115, -0.0238, -0.0262, -0.0288, -0.0282, -0.0244, -0.0260, -0.0227,
        -0.0199, -0.0267, -0.0231, -0.0240, -0.0244, -0.0251, -0.0275, -0.0247,
        -0.0255, -0.0318, -0.0298, -0.0261, -0.0221, -0.0169, -0.0152, -0.0180,
        -0.0224, -0.0264, -0.0254, -0.0220, -0.0194, -0.0172, -0.0191, -0.0253])

In [121]:
# Reduced SVD of the F test matrix; keep the first right-singular vector.
# torch.svd is deprecated in favour of torch.linalg.svd, which returns the
# right-singular vectors as rows (Vh) instead of columns (V).
# NOTE: singular vectors are only defined up to sign, so h and -h describe
# the same direction.
U, S, Vh = torch.linalg.svd(x_test, full_matrices=False)
V = Vh.T  # column layout that torch.svd returned, so `V` keeps its meaning
h_f_test = V[:, 0]
h_f_test

tensor([ 9.7826e-01, -8.6711e-02,  5.5314e-02, -6.1813e-02,  2.6816e-04,
        -5.3532e-02,  2.3406e-03, -1.8332e-02, -1.1466e-03, -5.4392e-02,
        -2.3299e-02, -2.1411e-02, -3.1435e-02, -1.9401e-02, -2.7597e-02,
        -3.1296e-02, -1.7533e-02, -3.4167e-02, -2.8201e-02, -3.3922e-02,
        -1.5393e-02, -2.7294e-02, -1.7583e-02, -1.9748e-02, -2.3089e-02,
        -2.9150e-02, -3.1969e-02, -2.1351e-02, -2.5039e-02, -3.8233e-02,
        -3.1472e-02, -4.4096e-02, -4.6368e-02, -3.6007e-02, -3.5726e-02,
        -1.9718e-02, -7.7559e-03, -4.3249e-03, -1.9154e-03, -1.2160e-02])

In [122]:
# Recordings labelled gender == 'M': the last one is held out for testing,
# all earlier ones are used for training.
audio_m = labels.loc[labels['gender'] == 'M', 'audio'].values
indices_train = audio_m[:-1]
indices_test = audio_m[-1:]

x_train = normalize(get_mfcc_vector(indices_train))
x_test = normalize(get_mfcc_vector(indices_test))

In [124]:
# Reduced SVD of the M training matrix; keep the first right-singular vector.
# torch.svd is deprecated in favour of torch.linalg.svd, which returns the
# right-singular vectors as rows (Vh) instead of columns (V).
# NOTE: singular vectors are only defined up to sign, so h and -h describe
# the same direction.
U, S, Vh = torch.linalg.svd(x_train, full_matrices=False)
V = Vh.T  # column layout that torch.svd returned, so `V` keeps its meaning
h_m_train = V[:, 0]
h_m_train

tensor([ 9.8406e-01, -9.9265e-02, -2.0072e-04, -3.0784e-02, -3.7083e-03,
        -3.1992e-02, -3.0225e-02, -2.5942e-02, -1.3084e-02, -3.7729e-02,
        -2.2513e-02, -3.2091e-02, -2.9919e-02, -2.6609e-02, -2.8249e-02,
        -2.5583e-02, -1.4551e-02, -2.5640e-02, -2.2220e-02, -2.6551e-02,
        -1.9533e-02, -2.1328e-02, -1.7966e-02, -2.1634e-02, -2.0535e-02,
        -2.4924e-02, -2.2526e-02, -2.4307e-02, -2.0531e-02, -2.2504e-02,
        -2.2246e-02, -2.9002e-02, -2.8157e-02, -3.0201e-02, -2.7383e-02,
        -2.0272e-02, -1.3626e-02, -1.3753e-02, -1.4903e-02, -2.1272e-02])

In [125]:
# Reduced SVD of the M test matrix; keep the first right-singular vector.
# torch.svd is deprecated in favour of torch.linalg.svd, which returns the
# right-singular vectors as rows (Vh) instead of columns (V).
# NOTE: singular vectors are only defined up to sign, so h and -h describe
# the same direction.
U, S, Vh = torch.linalg.svd(x_test, full_matrices=False)
V = Vh.T  # column layout that torch.svd returned, so `V` keeps its meaning
h_m_test = V[:, 0]
h_m_test

tensor([ 0.9816, -0.0487,  0.0412, -0.0430,  0.0315, -0.0383, -0.0396, -0.0411,
        -0.0107, -0.0400, -0.0264, -0.0321, -0.0392, -0.0328, -0.0321, -0.0301,
        -0.0168, -0.0306, -0.0283, -0.0289, -0.0219, -0.0226, -0.0171, -0.0210,
        -0.0219, -0.0282, -0.0254, -0.0256, -0.0235, -0.0288, -0.0310, -0.0403,
        -0.0420, -0.0419, -0.0346, -0.0217, -0.0096, -0.0072, -0.0092, -0.0185])

In [126]:
# Euclidean distance from the F-train vector to the F-test vector (same
# gender) and to the M-test vector (other gender).
dist_f_to_f = torch.linalg.norm(h_f_train - h_f_test, ord=2)
dist_f_to_m = torch.linalg.norm(h_f_train - h_m_test, ord=2)
dist_f_to_f, dist_f_to_m

(tensor(0.1056), tensor(0.1331))

In [127]:
# Euclidean distance from the M-train vector to the M-test vector (same
# gender) and to the F-test vector (other gender).
dist_m_to_m = torch.linalg.norm(h_m_train - h_m_test, ord=2)
dist_m_to_f = torch.linalg.norm(h_m_train - h_f_test, ord=2)
dist_m_to_m, dist_m_to_f

(tensor(0.0840), tensor(0.0905))

In [78]:
# idx = 0
# filename = paths[idx]
# waveform, sample_rate = torchaudio.load(filename, )

# print("Shape of waveform: {}".format(waveform.size()))
# print("Sample rate of waveform: {}".format(sample_rate))

# plt.figure()
# plt.plot(waveform.t().numpy())