# Mixing different signals for input data creation

## Imports and drive mounting

In [0]:
from google.colab import drive
drive.mount('/content/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/gdrive


In [0]:
import librosa
import numpy as np
import os
from os import listdir
from os.path import isfile, join

In [0]:
import sys
sys.path.append('/content/gdrive/My Drive/Nagy házi/')
from audio_masking import *

In [0]:
baseFolder = "/content/gdrive/My Drive/Nagy házi/audiobooks/single_stft"
baseFileNames = [(baseFolder + os.sep + f) for f in listdir(baseFolder) if isfile(join(baseFolder, f))]
print(len(baseFileNames))

3387


## Extracting speakers

Identifying the 10 different speakers and storing their names

In [0]:
speakers = []

for bfn in baseFileNames:
  speaker = "_".join(bfn.split("__")[-1].split("_")[:-1])
  if(speaker not in speakers):
    speakers.append(speaker)
    print("Added: " + speaker)

print(speakers)

Added: Joseph_Ugoretz
Added: Rosalind_Wills
Added: Debra_Lynn
Added: Stewart_Wills
Added: Paul_Hansen
Added: Zachary_Brewster-Geisz
Added: Jemma_Blythe
Added: Kristen_Ferreri
Added: Graham_Thomsen
Added: Stuart_Bell
['Joseph_Ugoretz', 'Rosalind_Wills', 'Debra_Lynn', 'Stewart_Wills', 'Paul_Hansen', 'Zachary_Brewster-Geisz', 'Jemma_Blythe', 'Kristen_Ferreri', 'Graham_Thomsen', 'Stuart_Bell']


## Creating pairs

The following code is for pairing randomly selected 5s length signals from different speakers. Duplicate pairing is taken into consideration and is prohibited. 

The workflow of the program:


1.   Choosing a speaker form the ten
2.   Selecting N random utterances
3.   Choosing M utterances to each one from all the other speakers
4.   Creating pairs 
5.   Continue with next speaker (number of options decreases in each iteration)



In [0]:
N = 10
M = 11

In [0]:
import random
random.seed(123)
np.random.seed(123)

pairs = []
alreadySelected = []
for speaker1 in speakers:
  selected1 = [bfn for bfn in baseFileNames if (speaker1 in bfn)]
  alreadySelected.append(speaker1)
  indices1 = random.sample(range(len(selected1)), N)
  for i in range(N):
    utterance1 = selected1[indices1[i]]
    for speaker2 in [s for s in speakers if(s not in alreadySelected)]:
      selected2 = [bfn for bfn in baseFileNames if (speaker2 in bfn)]
      indices2 = random.sample(range(len(selected2)), M)
      for j in range(M):
        utterance2 = selected2[indices2[j]]
        pairs.append([utterance1, utterance2])

len(pairs)

4950

In [0]:
pairs[0]

['/content/gdrive/My Drive/Nagy házi/audiobooks/single_stft/hardtimes_04_dickens__Joseph_Ugoretz_192.npy',
 '/content/gdrive/My Drive/Nagy házi/audiobooks/single_stft/hardtimes_03_dickens__Rosalind_Wills_118.npy']

Loading first pair for later shaping

In [0]:
X0=np.load(pairs[0][0], allow_pickle=False, fix_imports=True)
outputShape = len(pairs), 3, *X0.shape
outputArray = np.ndarray(outputShape, dtype=np.abs(X0).dtype)
outputArray.shape

(4950, 3, 257, 431)

In [0]:
np.abs(X0).dtype

dtype('float32')

Loading each pair to the output array

In [0]:
from math import floor
counter = 0
percentage = 0
for pair in pairs:
  X1 = np.load(pair[0], allow_pickle=False, fix_imports=True)
  X2 = np.load(pair[1], allow_pickle=False, fix_imports=True)
  YAbs = np.abs(X1 + X2)
  outputArray[counter] = np.array([YAbs, np.abs(X1), np.abs(X2)])
  counter += 1
  
  if floor(counter/len(pairs)*100) > percentage:
    percentage = floor(counter/len(pairs)*100)
    print("\r", str(percentage)+'%', end="")

 100%

In [0]:
outputArray[0,0].dtype

dtype('float32')

In [0]:
print(outputArray[0].shape)
print(outputArray[0, 0].shape)
print(np.atleast_3d(outputArray[0, 0]).shape)

(3, 257, 431)
(257, 431)
(257, 431, 1)


## Saving dataset

Splitting data to train, validation and test sets

In [0]:
indices = random.sample(range(len(outputArray)), len(outputArray))
trainIndices = indices[:int(len(indices)*0.9)]
validIndices = indices[int(len(indices)*0.9):int(len(indices)*0.95)]
testIndices = indices[int(len(indices)*0.95):len(indices)]

Creating datasets and saving them in hdf5 format

In [0]:
import h5py
h5Path = "/content/gdrive/My Drive/Nagy házi/audiobooks/train_data/data2.hdf5"

if os.path.isfile(h5Path):
  os.remove(h5Path)

counter = 0
percentage = 0
with h5py.File(h5Path, "a") as f:
  trainInputSet = f.create_dataset("trainInput", shape=(len(trainIndices), *X0.shape, 1), dtype = outputArray[0,0].dtype, chunks=(5, *X0.shape, 1))
  trainOutputSet = f.create_dataset("trainOutput", shape=(len(trainIndices), *X0.shape, 3), dtype = outputArray[0,1].dtype, chunks=(5, *X0.shape, 3))
  validInputSet = f.create_dataset("validInput", shape=(len(validIndices), *X0.shape, 1), dtype = outputArray[0,0].dtype, chunks=(5, *X0.shape, 1))
  validOutputSet = f.create_dataset("validOutput", shape=(len(validIndices), *X0.shape, 3), dtype = outputArray[0,1].dtype, chunks=(5, *X0.shape, 3))
  testInputSet = f.create_dataset("testInput", shape=(len(testIndices), *X0.shape, 1), dtype = outputArray[0,0].dtype, chunks=(5, *X0.shape, 1))
  testOutputSet = f.create_dataset("testOutput", shape=(len(testIndices), *X0.shape, 3), dtype = outputArray[0,1].dtype, chunks=(5, *X0.shape, 3))

  for i in range(len(trainIndices)):
    trainInputSet[i] = np.atleast_3d(outputArray[trainIndices[i], 0])
    # trainOutputSet[i] = outputArray[trainIndices[i]]
    toPersist = np.ndarray((257, 431, 3), dtype=np.abs(X0).dtype)
    for x in range(257):
      for y in range(431):
        for z in range(3):
          toPersist[x, y, z] = outputArray[trainIndices[i], z, x, y]
    trainOutputSet[i] = toPersist
    counter += 1
    if floor(counter/len(outputArray)*100) > percentage:
      percentage = floor(counter/len(outputArray)*100)
      print("\r", str(percentage)+'%', end="")

  for i in range(len(validIndices)):
    validInputSet[i] = np.atleast_3d(outputArray[validIndices[i], 0])
    #validOutputSet[i] = outputArray[validIndices[i]]
    toPersist = np.ndarray((257, 431, 3), dtype=np.abs(X0).dtype)
    for x in range(257):
      for y in range(431):
        for z in range(3):
          toPersist[x, y, z] = outputArray[validIndices[i], z, x, y]
    validOutputSet[i] = toPersist
    counter += 1
    if floor(counter/len(outputArray)*100) > percentage:
      percentage = floor(counter/len(outputArray)*100)
      print("\r", str(percentage)+'%', end="")

  for i in range(len(testIndices)):
    testInputSet[i] = np.atleast_3d(outputArray[testIndices[i], 0])
    #testOutputSet[i] = outputArray[testIndices[i]]
    toPersist = np.ndarray((257, 431, 3), dtype=np.abs(X0).dtype)
    for x in range(257):
      for y in range(431):
        for z in range(3):
          toPersist[x, y, z] = outputArray[testIndices[i], z, x, y]
    testOutputSet[i] = toPersist
    counter += 1
    if floor(counter/len(outputArray)*100) > percentage:
      percentage = floor(counter/len(outputArray)*100)
      print("\r", str(percentage)+'%', end="")


 100%

Saving the relationship between dataset indices and original files

In [0]:
import pandas as pd
import os

trainPairIndicesPath = "/content/gdrive/My Drive/Nagy házi/audiobooks/train_data/train_pair_indices.csv"
validPairIndicesPath = "/content/gdrive/My Drive/Nagy házi/audiobooks/train_data/valid_pair_indices.csv"
testPairIndicesPath = "/content/gdrive/My Drive/Nagy házi/audiobooks/train_data/test_pair_indices.csv"

if os.path.isfile(trainPairIndicesPath):
  os.remove(trainPairIndicesPath)

if os.path.isfile(validPairIndicesPath):
  os.remove(validPairIndicesPath)

if os.path.isfile(testPairIndicesPath):
  os.remove(testPairIndicesPath)


df = pd.DataFrame(data=None)
for i in range(len(trainIndices)):
  df = df.append([pairs[trainIndices[i]]])
df.to_csv(trainPairIndicesPath)

df = pd.DataFrame(data=None)
for i in range(len(validIndices)):
  df = df.append([pairs[validIndices[i]]])
df.to_csv(validPairIndicesPath)

df = pd.DataFrame(data=None)
for i in range(len(testIndices)):
  df = df.append([pairs[testIndices[i]]])
df.to_csv(testPairIndicesPath)

In [0]:
drive.flush_and_unmount()