In [33]:
import numpy as np
import sounddevice as snddev
import soundfile as sndfl
import librosa
import os
from collections import namedtuple
from random import sample 
from tqdm import tqdm
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

# Question 1

## Train, dev, test split

In [2]:
def resample(file_name, target_sr):
    sound, samplerate = sndfl.read(file_name)
    return librosa.core.resample(sound, samplerate, target_sr)   

In [3]:
female = 'Speech Data/IEEE/IEEE_female'
male = 'Speech Data/IEEE/IEEE_male'

In [4]:
def train_test_dev_split(path, target_sr):
    
    def put_to_dir(paths, root, part, target_sr):
        
        for path in tqdm(paths):
            resampled = resample(path, target_sr)
            file_name =  os.path.split(path)[-1]
            new_path = os.path.join(root+'_split', part)
    
            if os.path.exists(new_path):
                librosa.output.write_wav(new_path + '/' + file_name, resampled, target_sr)
            else:
                os.makedirs(new_path)
                librosa.output.write_wav(new_path + '/' + file_name, resampled, target_sr)
        
    paths = []
    root = ''
 
    files = []
    
    for root, dirs, files in os.walk(path):
        for file in files:
            paths.append(os.path.join(root, file))
    
            
    train = paths[:500]
    dev = paths[500:600]
    test = paths[600:700]
    
    put_to_dir(train, root, 'train', target_sr)
    put_to_dir(dev, root, 'dev', target_sr)
    put_to_dir(test, root, 'test', target_sr) 

In [5]:
train_test_dev_split(female, 16000)

100%|██████████| 500/500 [00:36<00:00, 13.61it/s]
100%|██████████| 100/100 [00:07<00:00, 13.51it/s]
100%|██████████| 100/100 [00:07<00:00, 13.36it/s]


In [6]:
train_test_dev_split(male, 16000)

100%|██████████| 500/500 [00:51<00:00,  9.69it/s]
100%|██████████| 100/100 [00:09<00:00, 10.49it/s]
100%|██████████| 100/100 [00:06<00:00, 14.44it/s]


## Split noise data

In [4]:
#Librosa reads instantly with a desired SR, no resampling needed

noise1 = librosa.core.load('Noise_Data/Live_Restaurant.wav', 16000)
noise2 = librosa.core.load('Noise_Data/adtCafe.wav', 16000)
noise3 = librosa.core.load('Noise_Data/adtBabble2.wav', 16000)

In [5]:
def split_noise_half(noise_files):
    
    Data = namedtuple('Data', 'train test')
    
    noises_split = []
    
    for n in noise_files:
        div_index = len(n)//2
        noise_data = Data(n[:div_index], n[div_index:])
        noises_split.append(noise_data)
        
    return noises_split

In [6]:
#Split noise for train, dev, test sets
noises_split = split_noise_half([noise1[0], noise2[0], noise3[0]])

In [7]:
SNRS = [-3, 0, 3]

In [8]:
def generate_noisy(speech, noise, desired_snr):   
    #calculate energies
    E_speech = np.sum(np.power(speech, 2))
    E_noise = np.sum(np.power(noise, 2))
    
    #calculate b coeff
    b = np.sqrt((E_speech/(np.power(10, (desired_snr/10))))/E_noise)    
    return speech + b*noise

def spit(file_name, signal):
    librosa.output.write_wav(file_name, signal, 16000)

In [9]:
def generate_train_data(noise_signals, snrs, speech_dir, output_dir):
    
    #generate paths to read
    paths = []    
    for root, dirs, files in os.walk(speech_dir):
        for file in files:
            paths.append(os.path.join(root, file))
            
    #make correspondence df
    correspondence = {}
                
    for path in tqdm(paths):
        speech = librosa.core.load(path, 16000)
        #noise index will be used for file names
        noise_index = 0
        noisy_signals = []
        for n in noise_signals:
            noise_index+=1
            for s in snrs:
                # select random values from noise vector
                noise = np.random.choice(n, size=len(speech[0]))
                noisy_speech = generate_noisy(speech[0], noise, s)
                file_name = os.path.split(path)[-1][:-4]+'-'+str(noise_index)+str(s)+'.wav'
                noisy_signals.append(output_dir + '/' + file_name)

                if os.path.exists(output_dir):
                    spit(output_dir + '/' + file_name, noisy_speech)
                else:
                    os.makedirs(output_dir)
                    spit(output_dir + '/' + file_name, noisy_speech)
        correspondence[path] = '|'.join(noisy_signals)
    return correspondence

In [11]:
train1 = generate_train_data([n.train for n in noises_split], SNRS,\
                    'Speech Data/IEEE/IEEE_female_split/train/',\
                   'Speech Data/IEEE/train_noisy')

100%|██████████| 500/500 [00:25<00:00, 19.47it/s]


In [12]:
train2 = generate_train_data([n.train for n in noises_split], SNRS,\
                    'Speech Data/IEEE/IEEE_male_split/train/',\
                   'Speech Data/IEEE/train_noisy')

100%|██████████| 500/500 [00:25<00:00, 19.88it/s]


In [13]:
dev1 = generate_train_data([n.train for n in noises_split], SNRS,\
                    'Speech Data/IEEE/IEEE_female_split/dev/',\
                   'Speech Data/IEEE/dev_noisy')

dev2 = generate_train_data([n.train for n in noises_split], SNRS,\
                    'Speech Data/IEEE/IEEE_male_split/dev/',\
                   'Speech Data/IEEE/dev_noisy')

100%|██████████| 100/100 [00:07<00:00, 12.67it/s]
100%|██████████| 100/100 [00:05<00:00, 17.09it/s]


In [14]:
# select test noise data for test files
test1 = generate_train_data([n.test for n in noises_split], SNRS,\
                    'Speech Data/IEEE/IEEE_female_split/test/',\
                   'Speech Data/IEEE/test_noisy')

test2 = generate_train_data([n.test for n in noises_split], SNRS,\
                    'Speech Data/IEEE/IEEE_male_split/test/',\
                   'Speech Data/IEEE/test_noisy')

100%|██████████| 100/100 [00:08<00:00, 11.72it/s]
100%|██████████| 100/100 [00:16<00:00,  6.05it/s]


In [15]:
train1.update(train2)
dev1.update(dev2)
test1.update(test2)

In [19]:
#save dictionaries to pickle
import pickle

with open('train.p', 'wb') as fp:
    pickle.dump(train1, fp, protocol=pickle.HIGHEST_PROTOCOL)
    
with open('dev.p', 'wb') as fp:
    pickle.dump(dev1, fp, protocol=pickle.HIGHEST_PROTOCOL)
    
with open('test.p', 'wb') as fp:
    pickle.dump(test1, fp, protocol=pickle.HIGHEST_PROTOCOL)

In [5]:
import pickle

In [6]:
with open('mapping_dicts/train.p', 'rb') as fp:
    data = pickle.load(fp)

# Question 2. Data loading and preprocessing

In [6]:
import ffmpeg
import torch
import torch.nn.functional as Func
import torch.nn as nn
from torch.autograd import Variable
import torch.optim as optim
import torch.utils.data as data
from torch.nn.utils.rnn import pack_padded_sequence
from torch.nn.utils.rnn import pad_packed_sequence
import copy

### a) Training Dataloader
The input X for the DNN is a single time frame of the STFT log-magnitude response of a noisy speech signal.
The label M for the DNN is a single time from from the corresponding STFT magnitude response of the clean speech signal.

In the following sections, we first paired each noisy speech signal with its clean speech by filename and save their STFTs in .npy format for Training data. 

You'll need to change the data paths throughout, so that they match your file structure.

In [20]:
train_frame_path = 'Speech Data/IEEE/npy/train_frame/'

def save_npy(paths_dict, train_frame_path):
    #were k is the path to clean speech file
    
    counter = 0
    
    for k in tqdm(paths_dict.keys()):
        #make stft for the clean file
        clean_speech, sr = librosa.load(k,sr=None)
        stft_clean = librosa.stft(clean_speech, n_fft=512,hop_length=160,win_length=320)
        stft_clean = 10*np.log10(np.abs(stft_clean))
        
        # make stfts for noisy files
        for n in paths_dict[k].split('|'):
            noisy_speech, sr = librosa.load(n, sr=None)
            stft_noisy = librosa.stft(noisy_speech, n_fft=512, hop_length=160, win_length=320)
            stft_noisy = 10*np.log10(np.abs(stft_noisy))
            
            for j in range(stft_clean.shape[1]):
                Xfile = train_frame_path + 'x' + str(counter) +'.npy'
                Mfile = train_frame_path + 'm' + str(counter) +'.npy'
                #X is the magnitude STFT for noisy speech, M is the magnitude STFT for clean speech
                np.save(Xfile, stft_noisy[:,j])
                np.save(Mfile,stft_clean[:,j])
                counter+=1

In [21]:
save_npy(train1, train_frame_path)

  if sys.path[0] == '':
 26%|██▌       | 255/1000 [11:05<32:23,  2.61s/it]ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.


KeyboardInterrupt



In [7]:
class trainDataLoader(data.Dataset):
    def __init__(self):
        self.dataPath = '/N/u/anakuzne/Carbonate/dl_for_speech/\
        HW3_II/IEEE/npy/train_frame/'
    def __getitem__(self, index):
        xFile = self.dataPath + 'x' + str(index) + '.npy'
        mFile = self.dataPath + 'm' + str(index) + '.npy'
        X = np.load(xFile)
        M_truth = np.load(mFile)
        return torch.from_numpy(X),torch.from_numpy(M_truth)
    def __len__(self):
        #Number of files
        return 4638636 # CORRECT THIS CODE

In [9]:
class valDataLoader(data.Dataset):
    def __init__(self):
        self.dataPath = '/N/u/anakuzne/Carbonate/dl_for_speech/\
        HW3_II/IEEE/npy/dev_frame/'
    def __getitem__(self, index):
        xFile = self.dataPath + 'x' + str(index) + '.npy'
        mFile = self.dataPath + 'm' + str(index) + '.npy'
        X = np.load(xFile)
        M_truth = np.load(mFile)
        return torch.from_numpy(X),torch.from_numpy(M_truth)
    def __len__(self):
        #Number of files
        return 936144

In [11]:
testPath = '/N/u/anakuzne/Carbonate/dl_for_speech/HW3_II/\
            IEEE/test_noisy/'
testMixedList = os.listdir(testPath)
testLength = len(testMixedList)
testPyPath ='/N/u/anakuzne/Carbonate/dl_for_speech/\
        HW3_II/IEEE/npy/test_frame/'
teList = os.listdir(testPyPath)

FileNotFoundError: [Errno 2] No such file or directory: '/N/u/anakuzne/Carbonate/dl_for_speech/HW3_II/            IEEE/test_noisy/'

In [12]:
for index in range(0, testLength):
    xFile = testMixedList[index]
    sx, sr = librosa.load(testPath + xFile, sr=None)
    X = librosa.stft(sx,n_fft=512,hop_length=160,win_length=320)
    X = 10*np.log10(np.abs(X))
    a = xFile.split('.',1)
    Xfile = testPyPath + a[0] + '.npy'
    np.save(Xfile, X)

NameError: name 'testLength' is not defined

In [14]:
class testDataLoader(data.Dataset):
    def __init__(self):
        self.dataPath = '/N/u/anakuzne/Carbonate/dl_for_speech/\
        HW3_II/IEEE/npy/test_frame/'
        self.dataList = teList
    def __getitem__(self, index):
        xFile = self.dataPath + self.dataList[index]
        X = np.load(xFile)
        return torch.from_numpy(X).t()
    def __len__(self):
        return 1800

In [None]:
trainData = data.DataLoader(trainDataLoader(),batch_size = 10000,shuffle=True,drop_last = True) 
valData = data.DataLoader(valDataLoader(),batch_size = 10000,shuffle=True,drop_last = True) 

In [1]:
import pickle
import librosa
import numpy as np
from tqdm import tqdm

import ffmpeg
import os
import torch
import torch.nn.functional as Func
import torch.nn as nn
import numpy as np
from torch.autograd import Variable
import torch.optim as optim
import torch.utils.data as data
from torch.nn.utils.rnn import pack_padded_sequence
from torch.nn.utils.rnn import pad_packed_sequence
import copy

In [5]:
def open_map_dicts(path):
    with open(path, 'rb') as fp:
        f = pickle.load(fp)
    return f

In [7]:
DICT_PATH = '/home/anakuz/data/docs/iu_courses/dl_for_speech/hw3/II/mapping_dicts/train.p'
DICT = open_map_dicts(DICT_PATH)

In [11]:
clean_file = list(DICT.keys())[0]
noisy_data = DICT[list(DICT.keys())[0]].split('|')

In [17]:
len(train_tups)

2403

In [18]:
train_tups = []
for i in DICT:
    clean_file = i
    clean_speech, sr = librosa.load(clean_file,sr=None)

    stft_clean = librosa.stft(clean_speech, n_fft=512,hop_length=160,win_length=320)
    stft_clean = np.abs(stft_clean)

    #noisy_stfts = []
    for p in DICT[i]:
        noisy_speech, sr = librosa.load(p ,sr=None)
        stft_noisy = librosa.stft(noisy_speech, n_fft=512,hop_length=160,win_length=320)
        stft_noisy = 10*np.log10(np.abs(stft_noisy))
        #noisy_stfts.append(stft_noisy)
        for j in range(stft_clean.shape[1]):
            train_tups.append((stft_noisy[:,j], stft_clean[:,j]))

FileNotFoundError: [Errno 2] No such file or directory: 'Speech Data/IEEE/train_noisy/l40s10-1-3.wav|Speech Data/IEEE/train_noisy/l40s10-10.wav|Speech Data/IEEE/train_noisy/l40s10-13.wav|Speech Data/IEEE/train_noisy/l40s10-2-3.wav|Speech Data/IEEE/train_noisy/l40s10-20.wav|Speech Data/IEEE/train_noisy/l40s10-23.wav|Speech Data/IEEE/train_noisy/l40s10-3-3.wav|Speech Data/IEEE/train_noisy/l40s10-30.wav|Speech Data/IEEE/train_noisy/l40s10-33.wav'

In [9]:
def save_data(DICT):
    
    train_tups = []

    for i in range(len(DICT)):
        clean_file = list(DICT.keys())[i]
        clean_speech, sr = librosa.load(clean_file,sr=None)

        stft_clean = librosa.stft(clean_speech, n_fft=512,hop_length=160,win_length=320)
        stft_clean = np.abs(stft_clean)

        noisy_stfts = []
        for p in noisy_data:
            noisy_speech, sr = librosa.load(p ,sr=None)
            stft_noisy = librosa.stft(noisy_speech, n_fft=512,hop_length=160,win_length=320)
            stft_noisy = 10*np.log10(np.abs(stft_noisy))
            noisy_stfts.append(stft_noisy)
            
            for j in range(stft_clean.shape[1]):
                train_tups.append((noisy_stft[:,j], stft_clean[:,j]))

        
    np.save('out_train.npy', np.array(train_tups))

In [10]:
save(DICT)

'' was not found in history, as a file, url, nor in the user namespace.


In [49]:
class trainDataLoader(data.Dataset):
    def __init__(self):
        self.dataFile = train_tups
    def __getitem__(self, index):
        X = self.dataFile[index][1]
        M_truth = self.dataFile[index][0]
        return torch.from_numpy(X),torch.from_numpy(M_truth)
    def __len__(self):
        #Number of files
        return 641601 # CORRECT THIS CODE

In [43]:
trainData = data.DataLoader(trainDataLoader(),batch_size = 10000, shuffle=True,drop_last = True) 

In [44]:
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()

        self.fc1 = nn.Linear(257,1024)
        self.fc2 = nn.Linear(1024,1024)
        self.fc3 = nn.Linear(1024,1024)
        self.fc4 = nn.Linear(1024,257)
        
    def forward(self,audio):
        audio = Func.relu(self.fc1(audio))
        audio = Func.relu(self.fc2(audio))
        audio = Func.relu(self.fc3(audio))
        audio = self.fc4(audio)
        return audio

In [45]:
def weights(m):
    if isinstance(m,nn.Linear):
        nn.init.xavier_normal(m.weight.data)
        nn.init.constant(m.bias.data,0.1)

In [46]:
model = Net()
model.apply(weights)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters())

  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.


In [47]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.cuda()
model = model.to(device)
criterion.cuda()

AssertionError: 
Found no NVIDIA driver on your system. Please check that you
have an NVIDIA GPU and installed a driver from
http://www.nvidia.com/Download/index.aspx

In [48]:
num_epochs = 20
best_model = copy.deepcopy(model.state_dict())
best_loss = 9999

for epoch in range(num_epochs):
    print('Epoch {}/{}'.format(epoch, num_epochs - 1))
    loss = 0.0 
    vali_loss = 0.0
    for step, (audio, target) in enumerate(trainData): 
        audio = audio.to(device)
        target = target.to(device)
        model.train()
        output = model(audio)
        newLoss = criterion(output,target)
        loss += newLoss.data
        #print(step,loss)
        optimizer.zero_grad()
        newLoss.backward()
        optimizer.step()
        print("Train step:"+str(step)+"/"+str(len(trainData)))
    for step, (audio, target) in enumerate(valData): 
        audio = audio.to(device)
        target = target.to(device)        
        model.eval()
        output = model(audio)
        new_valiLoss = criterion(output,target)
        vali_loss += new_valiLoss.data
        #print(step,vali_loss)
        if vali_loss < best_loss:
                best_loss = vali_loss
                best_model = copy.deepcopy(model.state_dict())
        print("Valid step:"+str(step)+"/"+str(len(valData)))
    print('Epoch:{:2},Loss:{:>.5f}'.format(epoch,loss))
    print('Epoch:{:2},Loss:{:>.5f}'.format(epoch,vali_loss)) 

Epoch 0/19


AttributeError: 'trainDataLoader' object has no attribute 'self'