In [4]:
import librosa
import os
import numpy as np
import scipy
from scipy.io import wavfile
import scipy.fftpack as fft
from scipy.signal import get_window
import matplotlib.pyplot as plt
from python_speech_features import mfcc
import  torch

ann_folder = '/content/drive/My Drive/SSP/EmoDB/lablaut/'
wav_folder = '/content/drive/My Drive/SSP/EmoDB/wav/'

In [5]:
# Assign data folders

ann_folder = 'lablaut/'
wav_folder = 'wav/'

In [6]:
emotions={'N':"Neutral",'W':"Ärger",'L':"Langeweile",'E':"Ekel",'A':"Angst",'F':"Freude",'T':"Trauer"}
emocoding = list(emotions.keys())
X=[]
y=[]
for filename in os.listdir(wav_folder):
    (Fs,sig) = wavfile.read(wav_folder+filename)
    mfcc_feat = mfcc(sig[:8000],Fs,winfunc=np.hanning,numcep=8)
    mfcc_feat = np.array(mfcc_feat)
    mfcc_feat = mfcc_feat.flatten('C')
    X.append(list(mfcc_feat))
    y.append(emocoding.index(filename[5]))

  


In [20]:
# Alternate feature extraction

filename = os.listdir(wav_folder)[0]


n_mfcc_coeffs = 12
n_formant_lpc_coeffs = 12
n_formants = 3

length_of_frame_in_seconds = 20 / 1000
# Hop len is same as frame len for now

X_formants = []
X_energy = []
X_pitch = []

# filename = os.listdir(wav_folder)[2]

for filename in os.listdir(wav_folder):

    path = wav_folder+filename
    x, sr = librosa.load(path)
    frame_len = int(length_of_frame_in_seconds * sr)
    hop_len = frame_len

    # MFCC
    mfccs = librosa.feature.mfcc(x, sr=sr, n_mfcc=n_mfcc_coeffs, n_fft=frame_len, hop_length=hop_len)
    mfccs = mfccs.T

    # Formants
    frames = librosa.util.frame(x, frame_length=frame_len, hop_length=hop_len)
    frames = frames.T

    n_coeff = n_formant_lpc_coeffs

    formants = []

    for frame in frames:
        
        if np.all((frame == 0)):
            formants.append([0.] * n_formants)
            break

        w = np.hamming(len(frame))
        f = (frame * w).astype(np.float64)
        A = librosa.core.lpc(f, n_coeff)

        rts = np.roots(A)
        rts = rts[np.imag(rts) >= 0]
        angz = np.arctan2(np.imag(rts), np.real(rts))
        frqs = angz * (sr / (2 *  np.pi))
        frqs.sort()

        frqs=[i for i in frqs if(i>0.0)]

        if(len(frqs)>=3):
            forms = frqs[0:n_formants]
        else:
            forms = frqs

        assert len(forms) == 3

        formants.append(forms)

    # Pitch
    pitches = []

    for frame in frames:
        max_s = int(sr * 1/50)
        min_s = int(sr * 1/500)

        # frame = [i/32767 for i in frame]

        c = librosa.autocorrelate(frame, max_s)

        z = c[round(len(c)/2):]
        z = z[min_s : max_s]
        zmax = max(z)

        index = np.where(z==zmax)
        index = index[0][0]

        pitch = sr/(min_s + index + 2)

        pitches.append(pitch)

    # Energy
    energies = []

    for frame in frames:
        energy = librosa.feature.rms(frame, frame_length=frame_len, hop_length=frame_len)
        energies.append(energy[0, 0])

    X.append(formants)


In [21]:
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
clf = MLPClassifier(solver='adam', alpha=1,hidden_layer_sizes=(100,50,25,8), random_state=1, max_iter = 1000)



ValueError: Found input variables with inconsistent numbers of samples: [115, 535]

In [None]:
clf.fit(X_train,y_train)
clf.score(X_test,y_test)

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
clf = MLPClassifier(solver='adam', alpha=1,hidden_layer_sizes=(100,50,25,8), random_state=1, max_iter = 1000)

clf.fit(X_train,y_train)
clf.score(X_test,y_test)



0.3954802259887006

In [None]:
from sklearn.model_selection import train_test_split

X = []
lengths = []
y = []
emotions={'N':"Neutral",'W':"Ärger",'L':"Langeweile",'E':"Ekel",'A':"Angst",'F':"Freude",'T':"Trauer"}
emocoding = list(emotions.keys())
for filename in os.listdir(wav_folder):
    (Fs,sig) = wavfile.read(wav_folder+filename)
    mfcc_feat = mfcc(sig,Fs,winfunc=np.hanning)
    X.append(torch.tensor(mfcc_feat).float())
    print(mfcc_feat.shape)
    lengths.append(len(mfcc_feat))
    empty = [0]*7
    empty[emocoding.index(filename[5])]=1
    y.append(empty)

(678, 13)
(278, 13)
(143, 13)
(224, 13)
(535, 13)
(390, 13)
(245, 13)
(260, 13)
(339, 13)
(265, 13)
(321, 13)
(533, 13)
(223, 13)
(589, 13)
(215, 13)
(897, 13)
(373, 13)
(379, 13)
(306, 13)
(318, 13)
(308, 13)
(348, 13)
(147, 13)
(216, 13)
(339, 13)
(345, 13)
(317, 13)
(401, 13)
(419, 13)
(297, 13)
(262, 13)
(269, 13)
(155, 13)
(188, 13)
(213, 13)
(335, 13)
(301, 13)
(207, 13)
(405, 13)
(520, 13)
(157, 13)
(168, 13)
(201, 13)
(595, 13)
(173, 13)
(407, 13)
(270, 13)
(246, 13)
(223, 13)
(162, 13)
(299, 13)
(331, 13)
(386, 13)
(221, 13)
(279, 13)
(365, 13)
(163, 13)
(175, 13)
(313, 13)
(334, 13)
(330, 13)
(236, 13)
(147, 13)
(242, 13)
(200, 13)
(284, 13)
(208, 13)
(209, 13)
(220, 13)
(219, 13)
(244, 13)
(212, 13)
(211, 13)
(237, 13)
(149, 13)
(149, 13)
(190, 13)
(178, 13)
(266, 13)
(364, 13)
(161, 13)
(334, 13)
(199, 13)
(270, 13)
(540, 13)
(291, 13)
(242, 13)
(403, 13)
(317, 13)
(318, 13)
(393, 13)
(277, 13)
(279, 13)
(153, 13)
(264, 13)
(391, 13)
(271, 13)
(206, 13)
(261, 13)
(216, 13)


  if __name__ == '__main__':


(304, 13)
(292, 13)
(304, 13)
(159, 13)
(304, 13)
(233, 13)
(374, 13)
(159, 13)
(294, 13)
(398, 13)
(289, 13)
(343, 13)
(295, 13)
(310, 13)
(271, 13)
(254, 13)
(247, 13)
(251, 13)
(269, 13)
(218, 13)
(200, 13)
(294, 13)
(145, 13)
(353, 13)
(392, 13)
(360, 13)
(309, 13)
(225, 13)
(504, 13)
(234, 13)
(258, 13)
(151, 13)
(343, 13)
(147, 13)
(176, 13)
(150, 13)
(207, 13)
(146, 13)
(165, 13)
(236, 13)
(383, 13)
(212, 13)
(213, 13)
(244, 13)
(296, 13)
(206, 13)
(232, 13)
(621, 13)
(278, 13)
(248, 13)
(348, 13)
(194, 13)
(331, 13)
(180, 13)
(512, 13)
(241, 13)
(406, 13)
(267, 13)
(376, 13)
(264, 13)
(418, 13)
(275, 13)
(179, 13)
(235, 13)
(317, 13)
(221, 13)
(394, 13)
(234, 13)
(203, 13)
(152, 13)
(311, 13)
(153, 13)
(357, 13)
(274, 13)
(331, 13)
(167, 13)
(228, 13)
(382, 13)
(286, 13)
(324, 13)
(147, 13)
(298, 13)
(317, 13)
(228, 13)
(354, 13)
(198, 13)
(276, 13)
(196, 13)
(320, 13)
(239, 13)
(288, 13)
(282, 13)
(310, 13)
(209, 13)
(276, 13)
(235, 13)
(177, 13)
(244, 13)
(363, 13)
(303, 13)


In [None]:
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence

params = {'batch_size': 64,
          'shuffle': True,
          'num_workers': 6}
max_epochs = 100

class MinimalDataset(Dataset):
    def __init__(self, data, labels):
        super().__init__()
        self.data = pad_sequence(data)
        self.labels = torch.tensor(labels)

    def __getitem__(self, index):
        return self.data[index], self.labels[index]

    def __len__(self):
        return len(self.labels)
print(len(X),len(y))
dataset = MinimalDataset(X,y)
data_loader = DataLoader(dataset, **params)
# encoder = GRU(input_size=pad_sequence.data.size,hidden_size=3)

535 535


In [None]:
print(dataset[0][0].size())
from torch import nn
class Net(nn.Module):

    def __init__(self):
        super(Net, self).__init__()

        self.fc1 = nn.Linear(13,120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, len(emocoding))
    
    def forward(self,x):
        x = self.fc1(x)
        x = self.fc2(x)
        x = self.fc3(x)
        return x

net = Net()
print(Net)
import torch.optim as optim

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)


torch.Size([535, 13])
<class '__main__.Net'>


In [None]:
from tqdm import tqdm, tqdm_notebook, tnrange
num_batch = len(data_loader)
for epoch in tnrange(max_epochs):
    running_loss=0.0
    t = tqdm_notebook(iter(data_loader), leave=False, total=num_batch)
    for input, label in t:
        optimizer.zero_grad()

        outputs = net(input)
        loss = criterion(outputs, label)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    print(running_loss)

  This is separate from the ipykernel package so we can avoid doing imports until


HBox(children=(FloatProgress(value=0.0), HTML(value='')))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  """


HBox(children=(FloatProgress(value=0.0, max=9.0), HTML(value='')))

66.89940071105957


HBox(children=(FloatProgress(value=0.0, max=9.0), HTML(value='')))

54.55457401275635


HBox(children=(FloatProgress(value=0.0, max=9.0), HTML(value='')))

50.036741733551025


HBox(children=(FloatProgress(value=0.0, max=9.0), HTML(value='')))

48.25227689743042


HBox(children=(FloatProgress(value=0.0, max=9.0), HTML(value='')))

47.582398414611816


HBox(children=(FloatProgress(value=0.0, max=9.0), HTML(value='')))

47.311996936798096


HBox(children=(FloatProgress(value=0.0, max=9.0), HTML(value='')))

47.09434938430786


HBox(children=(FloatProgress(value=0.0, max=9.0), HTML(value='')))

47.00727462768555


HBox(children=(FloatProgress(value=0.0, max=9.0), HTML(value='')))

47.00074481964111


HBox(children=(FloatProgress(value=0.0, max=9.0), HTML(value='')))

47.07336902618408


HBox(children=(FloatProgress(value=0.0, max=9.0), HTML(value='')))

46.95512580871582


HBox(children=(FloatProgress(value=0.0, max=9.0), HTML(value='')))

47.166707038879395


HBox(children=(FloatProgress(value=0.0, max=9.0), HTML(value='')))

47.11082935333252


HBox(children=(FloatProgress(value=0.0, max=9.0), HTML(value='')))

46.84037208557129


HBox(children=(FloatProgress(value=0.0, max=9.0), HTML(value='')))

46.88788461685181


HBox(children=(FloatProgress(value=0.0, max=9.0), HTML(value='')))

46.95160484313965


HBox(children=(FloatProgress(value=0.0, max=9.0), HTML(value='')))

46.97392797470093


HBox(children=(FloatProgress(value=0.0, max=9.0), HTML(value='')))

47.148868560791016


HBox(children=(FloatProgress(value=0.0, max=9.0), HTML(value='')))

47.078054904937744


HBox(children=(FloatProgress(value=0.0, max=9.0), HTML(value='')))

46.93372869491577


HBox(children=(FloatProgress(value=0.0, max=9.0), HTML(value='')))

46.94037199020386


HBox(children=(FloatProgress(value=0.0, max=9.0), HTML(value='')))

46.85188961029053


HBox(children=(FloatProgress(value=0.0, max=9.0), HTML(value='')))

47.05171012878418


HBox(children=(FloatProgress(value=0.0, max=9.0), HTML(value='')))

47.127456188201904


HBox(children=(FloatProgress(value=0.0, max=9.0), HTML(value='')))

47.12570762634277


HBox(children=(FloatProgress(value=0.0, max=9.0), HTML(value='')))

46.97748041152954


HBox(children=(FloatProgress(value=0.0, max=9.0), HTML(value='')))

46.945209980010986


HBox(children=(FloatProgress(value=0.0, max=9.0), HTML(value='')))

46.867578983306885


HBox(children=(FloatProgress(value=0.0, max=9.0), HTML(value='')))

47.052844524383545


HBox(children=(FloatProgress(value=0.0, max=9.0), HTML(value='')))

46.96768236160278


HBox(children=(FloatProgress(value=0.0, max=9.0), HTML(value='')))

46.84799909591675


HBox(children=(FloatProgress(value=0.0, max=9.0), HTML(value='')))

47.07292413711548


HBox(children=(FloatProgress(value=0.0, max=9.0), HTML(value='')))

46.95099401473999


HBox(children=(FloatProgress(value=0.0, max=9.0), HTML(value='')))

46.905049324035645


HBox(children=(FloatProgress(value=0.0, max=9.0), HTML(value='')))

46.876280307769775


HBox(children=(FloatProgress(value=0.0, max=9.0), HTML(value='')))

46.66964912414551


HBox(children=(FloatProgress(value=0.0, max=9.0), HTML(value='')))

46.9922456741333


HBox(children=(FloatProgress(value=0.0, max=9.0), HTML(value='')))

47.05826950073242


HBox(children=(FloatProgress(value=0.0, max=9.0), HTML(value='')))

46.830543518066406


HBox(children=(FloatProgress(value=0.0, max=9.0), HTML(value='')))

46.882962703704834


HBox(children=(FloatProgress(value=0.0, max=9.0), HTML(value='')))

47.296199798583984


HBox(children=(FloatProgress(value=0.0, max=9.0), HTML(value='')))

46.95458650588989


HBox(children=(FloatProgress(value=0.0, max=9.0), HTML(value='')))

46.84868288040161


HBox(children=(FloatProgress(value=0.0, max=9.0), HTML(value='')))

47.039557456970215


HBox(children=(FloatProgress(value=0.0, max=9.0), HTML(value='')))

47.11465406417847


HBox(children=(FloatProgress(value=0.0, max=9.0), HTML(value='')))

46.88615274429321


HBox(children=(FloatProgress(value=0.0, max=9.0), HTML(value='')))

47.06224298477173


HBox(children=(FloatProgress(value=0.0, max=9.0), HTML(value='')))

46.688106536865234


HBox(children=(FloatProgress(value=0.0, max=9.0), HTML(value='')))

47.058189392089844


HBox(children=(FloatProgress(value=0.0, max=9.0), HTML(value='')))

46.68193006515503


HBox(children=(FloatProgress(value=0.0, max=9.0), HTML(value='')))

46.87245559692383


HBox(children=(FloatProgress(value=0.0, max=9.0), HTML(value='')))

46.96115779876709


HBox(children=(FloatProgress(value=0.0, max=9.0), HTML(value='')))

47.13846158981323


HBox(children=(FloatProgress(value=0.0, max=9.0), HTML(value='')))

46.84791564941406


HBox(children=(FloatProgress(value=0.0, max=9.0), HTML(value='')))

46.98373222351074


HBox(children=(FloatProgress(value=0.0, max=9.0), HTML(value='')))

46.85948324203491


HBox(children=(FloatProgress(value=0.0, max=9.0), HTML(value='')))

46.92319202423096


HBox(children=(FloatProgress(value=0.0, max=9.0), HTML(value='')))

46.96033191680908


HBox(children=(FloatProgress(value=0.0, max=9.0), HTML(value='')))

46.895328521728516


HBox(children=(FloatProgress(value=0.0, max=9.0), HTML(value='')))

46.95472478866577


HBox(children=(FloatProgress(value=0.0, max=9.0), HTML(value='')))

47.09593152999878


HBox(children=(FloatProgress(value=0.0, max=9.0), HTML(value='')))

46.968453884124756


HBox(children=(FloatProgress(value=0.0, max=9.0), HTML(value='')))

47.189430236816406


HBox(children=(FloatProgress(value=0.0, max=9.0), HTML(value='')))

46.860297203063965


HBox(children=(FloatProgress(value=0.0, max=9.0), HTML(value='')))

46.76505470275879


HBox(children=(FloatProgress(value=0.0, max=9.0), HTML(value='')))

47.02474308013916


HBox(children=(FloatProgress(value=0.0, max=9.0), HTML(value='')))

47.06626844406128


HBox(children=(FloatProgress(value=0.0, max=9.0), HTML(value='')))

46.92842483520508


HBox(children=(FloatProgress(value=0.0, max=9.0), HTML(value='')))

46.949130058288574


HBox(children=(FloatProgress(value=0.0, max=9.0), HTML(value='')))

47.07561159133911


HBox(children=(FloatProgress(value=0.0, max=9.0), HTML(value='')))

46.81739950180054


HBox(children=(FloatProgress(value=0.0, max=9.0), HTML(value='')))

46.92650270462036


HBox(children=(FloatProgress(value=0.0, max=9.0), HTML(value='')))

46.97770118713379


HBox(children=(FloatProgress(value=0.0, max=9.0), HTML(value='')))

47.02189254760742


HBox(children=(FloatProgress(value=0.0, max=9.0), HTML(value='')))

46.94827842712402


HBox(children=(FloatProgress(value=0.0, max=9.0), HTML(value='')))

47.08847951889038


HBox(children=(FloatProgress(value=0.0, max=9.0), HTML(value='')))

47.07813119888306


HBox(children=(FloatProgress(value=0.0, max=9.0), HTML(value='')))

47.00288438796997


HBox(children=(FloatProgress(value=0.0, max=9.0), HTML(value='')))

47.02345943450928


HBox(children=(FloatProgress(value=0.0, max=9.0), HTML(value='')))

46.954697608947754


HBox(children=(FloatProgress(value=0.0, max=9.0), HTML(value='')))

47.1409649848938


HBox(children=(FloatProgress(value=0.0, max=9.0), HTML(value='')))

47.21754264831543


HBox(children=(FloatProgress(value=0.0, max=9.0), HTML(value='')))

46.91550636291504


HBox(children=(FloatProgress(value=0.0, max=9.0), HTML(value='')))

46.93730115890503


HBox(children=(FloatProgress(value=0.0, max=9.0), HTML(value='')))

47.04266834259033


HBox(children=(FloatProgress(value=0.0, max=9.0), HTML(value='')))

47.074326515197754


HBox(children=(FloatProgress(value=0.0, max=9.0), HTML(value='')))

46.887606143951416


HBox(children=(FloatProgress(value=0.0, max=9.0), HTML(value='')))

46.90839862823486


HBox(children=(FloatProgress(value=0.0, max=9.0), HTML(value='')))

46.929442405700684


HBox(children=(FloatProgress(value=0.0, max=9.0), HTML(value='')))

46.810033321380615


HBox(children=(FloatProgress(value=0.0, max=9.0), HTML(value='')))

46.80276346206665


HBox(children=(FloatProgress(value=0.0, max=9.0), HTML(value='')))

46.88158369064331


HBox(children=(FloatProgress(value=0.0, max=9.0), HTML(value='')))

46.92300033569336


HBox(children=(FloatProgress(value=0.0, max=9.0), HTML(value='')))

47.119328022003174


HBox(children=(FloatProgress(value=0.0, max=9.0), HTML(value='')))

47.16599082946777


HBox(children=(FloatProgress(value=0.0, max=9.0), HTML(value='')))

47.004939556121826


HBox(children=(FloatProgress(value=0.0, max=9.0), HTML(value='')))

46.98560810089111


HBox(children=(FloatProgress(value=0.0, max=9.0), HTML(value='')))

47.12592887878418


HBox(children=(FloatProgress(value=0.0, max=9.0), HTML(value='')))

46.82814311981201


HBox(children=(FloatProgress(value=0.0, max=9.0), HTML(value='')))

46.942604064941406



In [None]:
from sklearn.metrics import accuracy_score, f1_score
# for i in range(100):
#     input = X[i]
#     label = torch.tensor(y[i])
#     output = net(input)
#     print(output.size())
#     print(torch.argmax(label))
#     if(torch.argmax(output)==torch.argmax(label)):
#         accuracy+=1
# print(accuracy)
print(f1_score(output))