In [1]:
import os
import torch
import torchaudio
torchaudio.USE_SOUNDFILE_LEGACY_INTERFACE = False
import pandas as pd
import numpy as np
import torch.nn.functional as F
import scipy
from scipy import signal
import librosa.display


## Feature extractors for emoDB

In [2]:
from torchaudio.transforms import MFCC

def MFCC_Extractor(waveform, DEVICE, sample_rate=16000):
    """Compute 20 MFCCs for ``waveform`` with torchaudio's ``MFCC`` transform.

    Args:
        waveform: Audio tensor accepted by ``torchaudio.transforms.MFCC``.
        DEVICE: ``torch.device`` on which the transform is evaluated.
        sample_rate: Sampling rate of ``waveform`` in Hz. Defaults to 16000,
            the value that used to be hard-coded here.

    Returns:
        The MFCC tensor produced by the transform, located on ``DEVICE``.
    """
    transform = MFCC(
        sample_rate=sample_rate,
        n_mfcc=20,
        melkwargs={"n_fft": 2048, "hop_length": 512, "power": 2},
    ).to(DEVICE)
    # The transform was moved to DEVICE, so the input has to follow it;
    # otherwise a CUDA DEVICE combined with a CPU waveform fails with a
    # device-mismatch error. For a CPU DEVICE this is a no-op.
    return transform(waveform.to(DEVICE))

In [3]:
root = './Dataset/emodb'
target_location = './Dataset/emodb_MFCC'
# Extraction runs on the CPU. The original CUDA auto-detection was overwritten
# by this assignment on the very next line, so only the effective value is kept.
DEVICE = torch.device("cpu")
# Every clip is zero-padded or cut to this many samples so that all saved
# feature tensors have the same shape.
EMODB_NUM_SAMPLES = 143652

# torch.save does not create missing directories.
os.makedirs(target_location, exist_ok=True)

# `dirpath` (instead of re-binding `root`) keeps the dataset path intact.
for dirpath, _dirs, files in os.walk(root):
    for file in files:
        audio = os.path.join(dirpath, file)
        # torchaudio is used only to obtain the file's sample rate, which is
        # then passed to librosa so that it does not resample the audio.
        _, torch_sr = torchaudio.load(audio)
        # Named `samples` so the imported `scipy.signal` name is not clobbered.
        samples, _ = librosa.load(audio, sr=torch_sr)
        # Trim leading/trailing silence using a 25 dB threshold.
        trimmed_signal, _ = librosa.effects.trim(samples, top_db=25)
        # Wiener-filter the clip, then shape it as a (1, n) float32 tensor.
        signal_wiener = scipy.signal.wiener(trimmed_signal)
        signal_wiener = torch.from_numpy(signal_wiener).unsqueeze(0).float()

        num_samples = signal_wiener.shape[1]
        if num_samples < EMODB_NUM_SAMPLES:
            signal_wiener_padded = F.pad(signal_wiener,
                                         (0, EMODB_NUM_SAMPLES - num_samples),
                                         mode='constant', value=0)
        else:
            # Previously an over-long clip left `signal_wiener_padded` holding
            # the previous file's data (or undefined on the first file), so
            # the wrong features were saved under this file's name. Over-long
            # clips are now truncated to the fixed length instead.
            signal_wiener_padded = signal_wiener[:, :EMODB_NUM_SAMPLES]

        mfcc = MFCC_Extractor(signal_wiener_padded, DEVICE)
        # splitext drops the extension whatever its length.
        file_name = os.path.splitext(file)[0]
        torch.save(mfcc, os.path.join(target_location, file_name + '.pt'))

  res *= (1 - noise / lVar)
  res *= (1 - noise / lVar)


In [4]:
def LMS_Extractor(waveform, DEVICE, sample_rate=16000):
    """Compute a log-mel spectrogram (mel power spectrogram converted to dB).

    Args:
        waveform: Audio tensor accepted by
            ``torchaudio.transforms.MelSpectrogram``.
        DEVICE: ``torch.device`` on which the transform is evaluated.
        sample_rate: Sampling rate of ``waveform`` in Hz. Defaults to 16000,
            the value that used to be hard-coded here.

    Returns:
        The dB-scaled mel spectrogram tensor, located on ``DEVICE``.
    """
    mel_transform = torchaudio.transforms.MelSpectrogram(
        sample_rate=sample_rate,
        n_fft=2048,
        hop_length=512,
        power=2,
    ).to(DEVICE)
    to_db = torchaudio.transforms.AmplitudeToDB()
    # Move the input next to the transform; without this a CUDA DEVICE with a
    # CPU waveform fails with a device-mismatch error. No-op for a CPU DEVICE.
    mel = mel_transform(waveform.to(DEVICE))
    return to_db(mel)

In [5]:
root = './Dataset/emodb'
target_location = './Dataset/emodb_LMS'
DEVICE = torch.device("cpu")
# Fixed clip length in samples; shorter clips are zero-padded and longer ones
# are cut so that every saved spectrogram has the same shape.
EMODB_NUM_SAMPLES = 143652

# torch.save does not create missing directories.
os.makedirs(target_location, exist_ok=True)

# A separate `dirpath` name avoids overwriting the dataset `root` path.
for dirpath, _dirs, files in os.walk(root):
    for file in files:
        audio = os.path.join(dirpath, file)
        # Only the sample rate from torchaudio is needed; it is handed to
        # librosa so the audio is loaded without resampling.
        _, torch_sr = torchaudio.load(audio)
        # `samples` rather than `signal`, to leave the imported
        # `scipy.signal` name untouched.
        samples, _ = librosa.load(audio, sr=torch_sr)
        # Trim leading/trailing silence using a 25 dB threshold.
        trimmed_signal, _ = librosa.effects.trim(samples, top_db=25)
        # Denoise with a Wiener filter and convert to a (1, n) float32 tensor.
        signal_wiener = scipy.signal.wiener(trimmed_signal)
        signal_wiener = torch.from_numpy(signal_wiener).unsqueeze(0).float()

        num_samples = signal_wiener.shape[1]
        if num_samples < EMODB_NUM_SAMPLES:
            signal_wiener_padded = F.pad(signal_wiener,
                                         (0, EMODB_NUM_SAMPLES - num_samples),
                                         mode='constant', value=0)
        else:
            # Without this branch an over-long clip silently reused the
            # previous file's padded tensor (NameError on the first file).
            signal_wiener_padded = signal_wiener[:, :EMODB_NUM_SAMPLES]

        lms = LMS_Extractor(signal_wiener_padded, DEVICE)
        # splitext handles extensions of any length, unlike file[:-4].
        file_name = os.path.splitext(file)[0]
        torch.save(lms, os.path.join(target_location, file_name + '.pt'))

In [6]:
from torchaudio.compliance.kaldi import fbank
from torchaudio.functional import compute_kaldi_pitch
def LogMFB_Energy_Pitch_NCCF_Extractor(waveform, sr):
    """Stack Kaldi filterbank(+energy) features with Kaldi pitch features.

    Args:
        waveform: Audio tensor passed unchanged to ``fbank`` and
            ``compute_kaldi_pitch``.
        sr: Sampling rate of ``waveform``, forwarded to both calls.

    Returns:
        A tensor of shape (features, frames): the per-frame outputs of both
        extractors concatenated along the feature axis, then transposed.
    """
    logmfb_w_energy = fbank(waveform=waveform, sample_frequency=sr,
                            frame_length=40, frame_shift=10,
                            num_mel_bins=40, use_energy=True)
    # Add a leading axis so the filterbank output is 3-D like the pitch
    # output and the two can be concatenated on the last (feature) axis.
    logmfb_w_energy = torch.unsqueeze(logmfb_w_energy, 0)
    pitch = compute_kaldi_pitch(waveform=waveform, sample_rate=sr,
                                frame_length=40, frame_shift=10)
    # (1, frames, features) -> (1, features, frames)
    x = torch.cat((logmfb_w_energy, pitch), 2).permute(0, 2, 1)
    # Remove only the leading axis added above. A bare squeeze() would also
    # collapse the frame axis if a clip happened to produce a single frame.
    return x.squeeze(0)

In [7]:
root = './Dataset/emodb'
target_location = './Dataset/emodb_LogMFB_Energy_Pitch_NCCF'
# Fixed clip length in samples; clips are zero-padded or cut to this length
# so that every saved feature tensor has the same number of frames.
EMODB_NUM_SAMPLES = 143652

# torch.save does not create missing directories.
os.makedirs(target_location, exist_ok=True)

# Walk with `dirpath` so the dataset `root` path is not overwritten.
for dirpath, _dirs, files in os.walk(root):
    for file in files:
        audio = os.path.join(dirpath, file)
        # The sample rate is needed twice: to stop librosa from resampling,
        # and as the rate passed to the Kaldi-style extractor below.
        _, torch_sr = torchaudio.load(audio)
        # `samples` avoids shadowing the imported `scipy.signal` name.
        samples, _ = librosa.load(audio, sr=torch_sr)
        # Trim leading/trailing silence using a 25 dB threshold.
        trimmed_signal, _ = librosa.effects.trim(samples, top_db=25)
        # Wiener-filter the clip and convert it to a (1, n) float32 tensor.
        signal_wiener = scipy.signal.wiener(trimmed_signal)
        signal_wiener = torch.from_numpy(signal_wiener).unsqueeze(0).float()

        num_samples = signal_wiener.shape[1]
        if num_samples < EMODB_NUM_SAMPLES:
            signal_wiener_padded = F.pad(signal_wiener,
                                         (0, EMODB_NUM_SAMPLES - num_samples),
                                         mode='constant', value=0)
        else:
            # Formerly an over-long clip fell through and the previous file's
            # padded tensor was processed and saved under this file's name.
            signal_wiener_padded = signal_wiener[:, :EMODB_NUM_SAMPLES]

        logmfb_w_energy_pitch = LogMFB_Energy_Pitch_NCCF_Extractor(
            signal_wiener_padded, torch_sr)
        # splitext drops the extension whatever its length.
        file_name = os.path.splitext(file)[0]
        torch.save(logmfb_w_energy_pitch,
                   os.path.join(target_location, file_name + '.pt'))

## MFCC, LMS, and LogMFB-Energy-Pitch-NCCF Generation for RAVDESS

In [8]:
def MFCC_Extractor(waveform, DEVICE, sample_rate=48000):
    """Compute 20 MFCCs for ``waveform`` with torchaudio's ``MFCC`` transform.

    Args:
        waveform: Audio tensor accepted by ``torchaudio.transforms.MFCC``.
        DEVICE: ``torch.device`` on which the transform is evaluated.
        sample_rate: Sampling rate of ``waveform`` in Hz. Defaults to 48000,
            the value that used to be hard-coded in this definition.

    Returns:
        The MFCC tensor produced by the transform, located on ``DEVICE``.
    """
    mfcc_transform = MFCC(
        sample_rate=sample_rate,
        n_mfcc=20,
        melkwargs={"n_fft": 2048, "hop_length": 512, "power": 2},
    ).to(DEVICE)
    # Input and transform must share a device; moving the waveform here
    # prevents a device-mismatch error when DEVICE is a GPU.
    return mfcc_transform(waveform.to(DEVICE))

In [9]:
root = './Dataset/ravdess'
target_location = './Dataset/ravdess_MFCC'
DEVICE = torch.device("cpu")
# Fixed clip length in samples; clips are zero-padded or cut to this length
# so that all saved MFCC tensors have the same shape.
RAVDESS_NUM_SAMPLES = 169472

# torch.save does not create missing directories.
os.makedirs(target_location, exist_ok=True)

# `dirpath` keeps the dataset `root` path from being overwritten by the walk.
for dirpath, _dirs, files in os.walk(root):
    for file in files:
        audio = os.path.join(dirpath, file)
        # torchaudio supplies the file's sample rate so that librosa loads
        # the audio without resampling.
        _, torch_sr = torchaudio.load(audio)
        # `samples` avoids shadowing the imported `scipy.signal` name.
        samples, _ = librosa.load(audio, sr=torch_sr)
        # Trim leading/trailing silence using a 25 dB threshold.
        trimmed_signal, _ = librosa.effects.trim(samples, top_db=25)
        # Wiener-filter the clip and convert it to a float32 tensor with a
        # new leading axis of size 1. The former `shape[0] > 1` check came
        # right after this unsqueeze and so could never fire; it is removed.
        signal_wiener = scipy.signal.wiener(trimmed_signal)
        signal_wiener = torch.from_numpy(signal_wiener).unsqueeze(0).float()

        num_samples = signal_wiener.shape[1]
        if num_samples < RAVDESS_NUM_SAMPLES:
            signal_wiener_padded = F.pad(signal_wiener,
                                         (0, RAVDESS_NUM_SAMPLES - num_samples),
                                         mode='constant', value=0)
        else:
            # Previously an over-long clip left `signal_wiener_padded` holding
            # the previous file's data (or undefined on the first file), so
            # wrong features were saved under this file's name. Truncate.
            signal_wiener_padded = signal_wiener[:, :RAVDESS_NUM_SAMPLES]

        mfcc = MFCC_Extractor(signal_wiener_padded, DEVICE)
        # splitext drops the extension whatever its length.
        file_name = os.path.splitext(file)[0]
        torch.save(mfcc, os.path.join(target_location, file_name + '.pt'))

In [10]:
def LMS_Extractor(waveform, DEVICE, sample_rate=48000):
    """Compute a log-mel spectrogram (mel power spectrogram converted to dB).

    Args:
        waveform: Audio tensor accepted by
            ``torchaudio.transforms.MelSpectrogram``.
        DEVICE: ``torch.device`` on which the transform is evaluated.
        sample_rate: Sampling rate of ``waveform`` in Hz. Defaults to 48000,
            the value that used to be hard-coded in this definition.

    Returns:
        The dB-scaled mel spectrogram tensor, located on ``DEVICE``.
    """
    mel_transform = torchaudio.transforms.MelSpectrogram(
        sample_rate=sample_rate,
        n_fft=2048,
        hop_length=512,
        power=2,
    ).to(DEVICE)
    # Keep the input on the same device as the transform; otherwise a GPU
    # DEVICE with a CPU waveform raises a device-mismatch error.
    mel = mel_transform(waveform.to(DEVICE))
    return torchaudio.transforms.AmplitudeToDB()(mel)

In [11]:
root = './Dataset/ravdess'
target_location = './Dataset/ravdess_LMS'
DEVICE = torch.device("cpu")
# Fixed clip length in samples; shorter clips are zero-padded and longer ones
# are cut so that every saved spectrogram has the same shape.
RAVDESS_NUM_SAMPLES = 169472

# torch.save does not create missing directories.
os.makedirs(target_location, exist_ok=True)

# Walk with `dirpath` so the dataset `root` path is not overwritten.
for dirpath, _dirs, files in os.walk(root):
    for file in files:
        audio = os.path.join(dirpath, file)
        # Only the sample rate from torchaudio is used; passing it to librosa
        # loads the audio without resampling.
        _, torch_sr = torchaudio.load(audio)
        # `samples` rather than `signal`, to leave the imported
        # `scipy.signal` name untouched.
        samples, _ = librosa.load(audio, sr=torch_sr)
        # Trim leading/trailing silence using a 25 dB threshold.
        trimmed_signal, _ = librosa.effects.trim(samples, top_db=25)
        # Denoise, then convert to float32 with a new leading axis of size 1.
        # The old `shape[0] > 1` branch followed this unsqueeze and was
        # therefore unreachable; it has been dropped.
        signal_wiener = scipy.signal.wiener(trimmed_signal)
        signal_wiener = torch.from_numpy(signal_wiener).unsqueeze(0).float()

        num_samples = signal_wiener.shape[1]
        if num_samples < RAVDESS_NUM_SAMPLES:
            signal_wiener_padded = F.pad(signal_wiener,
                                         (0, RAVDESS_NUM_SAMPLES - num_samples),
                                         mode='constant', value=0)
        else:
            # Without this branch an over-long clip silently reused the
            # previous file's padded tensor (NameError on the first file).
            signal_wiener_padded = signal_wiener[:, :RAVDESS_NUM_SAMPLES]

        lms = LMS_Extractor(signal_wiener_padded, DEVICE)
        # splitext handles extensions of any length, unlike file[:-4].
        file_name = os.path.splitext(file)[0]
        torch.save(lms, os.path.join(target_location, file_name + '.pt'))

In [12]:
from torchaudio.compliance.kaldi import fbank
from torchaudio.functional import compute_kaldi_pitch
def LogMFB_Energy_Pitch_NCCF_Extractor(waveform, sr):
    """Concatenate Kaldi filterbank(+energy) features with Kaldi pitch features.

    Args:
        waveform: Audio tensor handed as-is to ``fbank`` and
            ``compute_kaldi_pitch``.
        sr: Sampling rate of ``waveform``, forwarded to both calls.

    Returns:
        A (features, frames) tensor holding both extractors' per-frame
        outputs joined along the feature axis and then transposed.
    """
    fbank_feats = fbank(waveform=waveform, sample_frequency=sr,
                        frame_length=40, frame_shift=10,
                        num_mel_bins=40, use_energy=True)
    pitch_feats = compute_kaldi_pitch(waveform=waveform, sample_rate=sr,
                                      frame_length=40, frame_shift=10)
    # The filterbank output gets a leading axis so that both tensors are 3-D
    # and can be joined on the last (feature) axis.
    stacked = torch.cat((fbank_feats.unsqueeze(0), pitch_feats), 2)
    # (1, frames, features) -> (1, features, frames) -> (features, frames).
    # squeeze(0) removes only the leading axis; a bare squeeze() would also
    # drop the frame axis for an input that yields a single frame.
    return stacked.permute(0, 2, 1).squeeze(0)

In [13]:
root = './Dataset/ravdess'
target_location = './Dataset/ravdess_LogMFB_Energy_Pitch_NCCF'
# Fixed clip length in samples; clips are zero-padded or cut to this length
# so that every saved feature tensor has the same number of frames.
RAVDESS_NUM_SAMPLES = 169472

# torch.save does not create missing directories.
os.makedirs(target_location, exist_ok=True)

# A separate `dirpath` name avoids overwriting the dataset `root` path.
for dirpath, _dirs, files in os.walk(root):
    for file in files:
        audio = os.path.join(dirpath, file)
        # The sample rate is used to stop librosa from resampling and is also
        # passed to the Kaldi-style extractor below.
        _, torch_sr = torchaudio.load(audio)
        # `samples` avoids shadowing the imported `scipy.signal` name.
        samples, _ = librosa.load(audio, sr=torch_sr)
        # Trim leading/trailing silence using a 25 dB threshold.
        trimmed_signal, _ = librosa.effects.trim(samples, top_db=25)
        # Wiener-filter, then convert to float32 with a new leading axis of
        # size 1. The old `shape[0] > 1` check came directly after this
        # unsqueeze, so it could never be true and has been removed.
        signal_wiener = scipy.signal.wiener(trimmed_signal)
        signal_wiener = torch.from_numpy(signal_wiener).unsqueeze(0).float()

        num_samples = signal_wiener.shape[1]
        if num_samples < RAVDESS_NUM_SAMPLES:
            signal_wiener_padded = F.pad(signal_wiener,
                                         (0, RAVDESS_NUM_SAMPLES - num_samples),
                                         mode='constant', value=0)
        else:
            # Formerly an over-long clip fell through and the previous file's
            # padded tensor was processed and saved under this file's name.
            signal_wiener_padded = signal_wiener[:, :RAVDESS_NUM_SAMPLES]

        logmfb_w_energy_pitch = LogMFB_Energy_Pitch_NCCF_Extractor(
            signal_wiener_padded, torch_sr)
        # splitext drops the extension whatever its length.
        file_name = os.path.splitext(file)[0]
        torch.save(logmfb_w_energy_pitch,
                   os.path.join(target_location, file_name + '.pt'))