# **Drive Mount**

In [0]:
# Mount Google Drive into the Colab filesystem at /content/drive. The rest of
# the notebook reads the dataset archives from, and writes the generated
# datasets to, paths under "/content/drive/My Drive/". Mounting is interactive:
# it prompts for an authorization code.
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


# **Unzip & Format Change**

In [0]:
import os

kor_pcm_path = "./kor_pcm_dataset"
kor_wav_path = "./kor_wav_dataset"

eng_pcm_path = "./eng_pcm_dataset"
eng_wav_path = "./eng_wav_dataset"

# mkdir
if not os.path.exists(kor_pcm_path): os.mkdir(kor_pcm_path)
if not os.path.exists(kor_wav_path): os.mkdir(kor_wav_path)

if not os.path.exists(eng_pcm_path): os.mkdir(eng_pcm_path)
if not os.path.exists(eng_wav_path): os.mkdir(eng_wav_path)

# unsip
!unzip -q "/content/drive/My Drive/ETRI/9_voice_download_4_Korean_English_by_Korean_part2.zip" -d "./kor_pcm_dataset"
!unzip -q "/content/drive/My Drive/ETRI/8_voice_download_4_Korean_English_by_Korean_part1.zip" -d "./eng_pcm_dataset"

In [0]:
# Sanity check: show the first five entries and the total entry count of each
# extracted PCM directory (Korean first, then English).
for pcm_root in (kor_pcm_path, eng_pcm_path):
    entries = os.listdir(pcm_root)
    print(entries[:5], len(entries))

['GSF09025WYY0', 'I4F07852PKJ0', 'GSF08233NEH0', 'GSF05134JHK0', 'GSF07132LCY0'] 50
['GSF09025WYY0', 'I4F07852PKJ0', 'GSF08233NEH0', 'GSF05134JHK0', 'GSF07132LCY0'] 50


In [0]:
import wave

def convert_pcm_tree_to_wav(pcm_root, wav_root, sample_rate=16000, n_channels=1, sample_width=2):
    """Wrap every raw ``*.pcm`` file under ``pcm_root`` in a WAV container.

    ``pcm_root`` is expected to contain one sub-directory per label. The
    sub-directories are visited in sorted order and numbered 0, 1, 2, ...;
    within each one the ``*.pcm`` files are visited in sorted order and
    numbered the same way. Each file is written to
    ``wav_root/{label}_{index}.wav``.

    The PCM bytes are copied unmodified; only a WAV header is added. The
    header declares ``n_channels`` channels, ``sample_width`` bytes per sample
    and ``sample_rate`` Hz (defaults: mono, 16-bit, 16 kHz). The raw data is
    assumed to already be in that format -- this is not verified.

    Entries of ``pcm_root`` that are not directories are skipped. Previously
    such an entry made the nested ``os.listdir`` call fail with
    ``NotADirectoryError``.
    """
    label_dirs = sorted(
        entry for entry in os.listdir(pcm_root)
        if os.path.isdir(os.path.join(pcm_root, entry))
    )

    for label, label_dir in enumerate(label_dirs):
        label_path = os.path.join(pcm_root, label_dir)
        pcm_names = sorted(name for name in os.listdir(label_path) if name.endswith(".pcm"))

        for index, pcm_name in enumerate(pcm_names):
            with open(os.path.join(label_path, pcm_name), "rb") as pcm_file:
                pcm_data = pcm_file.read()

            # save as {label}_{features}.wav format
            wav_name = "{}_{}.wav".format(label, index)
            with wave.open(os.path.join(wav_root, wav_name), 'wb') as wav_file:
                # nframes is passed as 0; the wave module fixes up the header
                # with the real frame count when the file is closed.
                wav_file.setparams((n_channels, sample_width, sample_rate, 0, 'NONE', 'NONE'))
                wav_file.writeframes(pcm_data)


# Convert the Korean tree, then the English tree.
for pcm_path, wav_path in zip([kor_pcm_path, eng_pcm_path], [kor_wav_path, eng_wav_path]):
    convert_pcm_tree_to_wav(pcm_path, wav_path)

# **Generate Dataset**

In [0]:
import numpy as np

# Dataset dimensions: number of labels (one per top-level dataset folder) and
# number of recordings used per label.
num_labels = 50
num_features = 100

# Lengths, in samples, of the three center crops taken from each recording.
data_length_1 = 3 ** 8        # 6561
data_length_2 = 2 * (3 ** 8)  # 13122
data_length_3 = 3 ** 9        # 19683

# Fraction of each label's recordings assigned to the training split.
# NOTE(review): an earlier comment here claimed 80 training recordings, but
# int(100 * 0.05) evaluates to 5 -- confirm whether train_rate = 0.8 was meant.
train_rate = 0.05
num_train_data = int(num_features * train_rate) # 5

# Boolean masks over the recordings of a label: the first num_train_data
# positions belong to the training split, the remainder to the test split.
select_train_data = [position < num_train_data for position in range(num_features)]
select_test_data = [not is_train for is_train in select_train_data]

In [0]:
from scipy.io import wavfile

# Pre-allocate one (label, recording, sample) buffer per crop length, plus the
# matching label matrix. np.zeros defaults to float64.
data1 = np.zeros((num_labels , num_features, data_length_1))
data2 = np.zeros((num_labels , num_features, data_length_2))
data3 = np.zeros((num_labels , num_features, data_length_3))
labels = np.zeros((num_labels , num_features))

# Each recording is center-cropped once per (buffer, length) pair.
crop_targets = [(data1, data_length_1), (data2, data_length_2), (data3, data_length_3)]

# korean waves
for i in range(num_labels):
    for j in range(num_features):
        file_name = "{}_{}.wav".format(i, j)
        wav_file_path = os.path.join(kor_wav_path, file_name)
        sample_rate, samples = wavfile.read(wav_file_path)

        for target, crop_length in crop_targets:
            # Guard against recordings shorter than the crop. Without this
            # check a negative start index produces a truncated slice; when
            # that slice has exactly one element NumPy silently broadcasts it
            # over the whole row, otherwise it raises an opaque shape error.
            if len(samples) < crop_length:
                raise ValueError(
                    "{} has {} samples; at least {} are required for the center crop".format(
                        wav_file_path, len(samples), crop_length))

            start = (len(samples) - crop_length) // 2
            target[i, j] = samples[start:start + crop_length]

        labels[i, j] = i

In [0]:
# save files
# makedirs also creates the parent "Speaker Recognition" folder when it is
# missing (os.mkdir would raise FileNotFoundError in that case), and
# exist_ok=True makes the cell safe to re-run.
output_dir = "/content/drive/My Drive/Speaker Recognition/dataset"
os.makedirs(output_dir, exist_ok=True)

# Written as kor_dataset.npz (NumPy appends the extension); each array is
# stored under a key naming its crop length.
np.savez_compressed(
    os.path.join(output_dir, "kor_dataset"),
    data_6561 = data1,
    data_13122 = data2,
    data_19683 = data3,
    labels = labels)

In [0]:
# english waveform
# This cell reuses the data1/data2/data3/labels buffers allocated for the
# Korean data and overwrites every entry in place. The Korean arrays must be
# saved before this cell runs; re-running the Korean save cell afterwards
# would write English data under the Korean file name.
crop_targets = [(data1, data_length_1), (data2, data_length_2), (data3, data_length_3)]

for i in range(num_labels):
    for j in range(num_features):
        file_name = "{}_{}.wav".format(i, j)
        wav_file_path = os.path.join(eng_wav_path, file_name)
        sample_rate, samples = wavfile.read(wav_file_path)

        for target, crop_length in crop_targets:
            # Guard against recordings shorter than the crop. Without this
            # check a negative start index produces a truncated slice; when
            # that slice has exactly one element NumPy silently broadcasts it
            # over the whole row, otherwise it raises an opaque shape error.
            if len(samples) < crop_length:
                raise ValueError(
                    "{} has {} samples; at least {} are required for the center crop".format(
                        wav_file_path, len(samples), crop_length))

            start = (len(samples) - crop_length) // 2
            target[i, j] = samples[start:start + crop_length]

        labels[i, j] = i

In [0]:
# Persist the English crops and labels as eng_dataset.npz (NumPy appends the
# extension). Each array is stored under a key naming its crop length.
eng_arrays = {
    "data_6561": data1,
    "data_13122": data2,
    "data_19683": data3,
    "labels": labels,
}
np.savez_compressed(
    "/content/drive/My Drive/Speaker Recognition/dataset/eng_dataset",
    **eng_arrays)

# **Load Dataset**

In [0]:
import numpy as np

# Re-open both saved archives and print the shape of every stored array as a
# quick integrity check.
kor_dataset = np.load("/content/drive/My Drive/Speaker Recognition/dataset/kor_dataset.npz")
eng_dataset = np.load("/content/drive/My Drive/Speaker Recognition/dataset/eng_dataset.npz")

# (crop length, column padding) for each data array; the padding keeps the
# printed shapes aligned.
shape_rows = [(3 ** 8, "\t\t"), (2 * (3 ** 8), "\t"), (3 ** 9, "\t")]

for dataset_name, dataset in (("kor_dataset", kor_dataset), ("eng_dataset", eng_dataset)):
    for crop_length, padding in shape_rows:
        array_key = "data_{}".format(crop_length)
        print("{}[\"{}\"].shape:{}{}".format(dataset_name, array_key, padding, dataset[array_key].shape))
    print("{}[\"labels\"].shape:\t\t{}".format(dataset_name, dataset["labels"].shape))

kor_dataset["data_6561"].shape:		(50, 100, 6561)
kor_dataset["data_13122"].shape:	(50, 100, 13122)
kor_dataset["data_19683"].shape:	(50, 100, 19683)
kor_dataset["labels"].shape:		(50, 100)
eng_dataset["data_6561"].shape:		(50, 100, 6561)
eng_dataset["data_13122"].shape:	(50, 100, 13122)
eng_dataset["data_19683"].shape:	(50, 100, 19683)
eng_dataset["labels"].shape:		(50, 100)


In [0]:
# download files
# try:
#     from google.colab import files
#     files.download("train_dataset.npz")
#     files.download("test_dataset.npz")
# except:
#     pass