In [1]:
import os
from random import randint, uniform
import re
import numpy as np
import wave
import contextlib

## Calculating recordings lengths

In [2]:

total = 0
corrupted=0
files=0
for directory in os.listdir("./Audio/"):
    if os.path.isdir("./Audio/"+directory):
        for file in os.listdir("./Audio/"+directory):
            if file.endswith(".wav"):
                files+=1
                fname = "./Audio/"+directory+"/"+file
                try:
                    with contextlib.closing(wave.open(fname,'r')) as f:
                        frames = f.getnframes()
                        rate = f.getframerate()
                        duration = frames / float(rate)
                        total+=duration
                except Exception as e:
                    corrupted+=1

print("We have a total of ",total," seconds",", ",total/(60*60)," hours")

We have a total of  7883.6113125  seconds ,  2.18989203125  hours


In [3]:
print('This is the whole files:',files)
print('The corrupted ones are:' , corrupted)

This is the whole files: 498
The corrupted ones are: 35


# Organizing data

### spliting records to train, val and test sets

In [4]:
!mkdir data
!mkdir data/records

In [5]:
# Extracting all the non-corrupted files
wav_files = []
all_linkers  = dict()
for directory in os.listdir("./Audio/"): # parent directory of the recordings, it should contain folders that contain wav,json and txt files
    if os.path.isdir("./Audio/"+directory):
        to_remove = []
        linker = []

        for file in os.listdir("./Audio/"+directory):
            if file.endswith(".txt"):
                linker_data = open("./Audio/"+directory+"/"+file).readlines()
                
            if file.endswith(".wav"):
                fname = "./Audio/"+directory+"/"+file
                try:
                    with contextlib.closing(wave.open(fname,'r')) as f:
                        frames = f.getnframes()
                        rate = f.getframerate()
                        duration = frames / float(rate)
                        wav_files.append(fname)
                except Exception as e:
                    to_remove.append(file)
        for file in to_remove:
            i=0
            while i<len(linker_data):
                if file in linker_data[i]:
                    linker_data.pop(i)
                i+=1
        linker.extend(linker_data)
        all_linkers[directory] =linker

In [6]:
all_linkers.keys()

dict_keys(['recordings_small_file_2', 'recordings_small_file_1', 'recordings_small_file_3'])

In [7]:
print("After removing the corrupted files, we have {} .wav files left. " . format(len(wav_files)))

After removing the corrupted files, we have 463 .wav files left. 


In [8]:
to_copy = " ".join(wav_files)
!cp -t data/records/ {to_copy}

In [9]:
for section,linker in all_linkers.items():
    for i,link in enumerate(linker):
        line = link.split(";")[0].split("(")[1].split(")")[0].strip()
        wav = link.split(";")[1].strip().split("/")[-1]
        linker[i] = wav+":"+line

In [10]:
out_linker = open("./data/linker.txt","w")
out_linker.write("\n".join(linker))
out_linker.close()

In [11]:
!mkdir ./data/records/train
!mkdir ./data/records/test
!mkdir ./data/records/val

In [12]:
import numpy as np
np.random.seed(0)
indices = np.random.permutation(len(wav_files))

test_idx = indices[:len(indices)//2]
validation_portion = int(len(indices)//2 * 0.2)
train_idx = indices[len(indices)//2:-validation_portion]
valid_idx = indices[-validation_portion:]

In [13]:
train_set_files = [wav_files[i].split("/")[-1] for i in train_idx]
valid_set_files = [wav_files[i].split("/")[-1] for i in valid_idx]
test_set_files = [wav_files[i].split("/")[-1] for i in test_idx]

In [14]:
to_copy_train = "./data/records/" + " ./data/records/".join(train_set_files)
to_copy_valid = "./data/records/" + " ./data/records/".join(valid_set_files)
to_copy_test = "./data/records/" + " ./data/records/".join(test_set_files)

In [15]:
!mv -t data/records/train/ {to_copy_train}
!mv -t data/records/val/ {to_copy_valid}
!mv -t data/records/test/ {to_copy_test}

### making of chars.txt file

In [18]:
#all_linkers.items()

In [20]:
import re
chars = {" ":1,"ε":0}
text_data = []
char_idx = 2
for section_id,linker in all_linkers.items():
    section_id = re.sub("_","",section_id)+".txt"
    text_file = open("./Transcripts/"+section_id,"r").readlines() # Modify this to the directory of your txt file that you recorded with
    
    for link in linker:
        file,idx = link.split(":")[0],int(link.split(":")[1].split(" ")[1])-1
        line = text_file[idx]
        line = line.split("##")[0].strip()
        #line = re.sub("[\[\]|٪%«»_ـ]","",line) # Clean unnecessary characters from the data, this is for arabic
        line = re.sub("[—\u200b]","",line) # Clean unnecessary characters from the data, this is for arabic
        text_data.append((line,file.split(".")[0]))
        char_set = set(line)
        for c in char_set:
            if c not in chars:
                chars[c]=char_idx
                char_idx+=1

In [21]:
raw_text = "\n".join([wav+":"+line for line,wav in text_data])
with open("data/raw_text_file.txt","w") as f:
    f.write(raw_text)

In [22]:
indices_text = []
for line,wav in text_data:
    line = list(line)
    indicies = []
    for c in line:
        indicies.append(str(chars[c]))
    indices_text.append(wav+" "+" ".join(indicies))

    

In [23]:
len(chars)

64

In [24]:
chars

{' ': 1,
 'ε': 0,
 '.': 2,
 'v': 3,
 'h': 4,
 'g': 5,
 'P': 6,
 'd': 7,
 'K': 8,
 'J': 9,
 'l': 10,
 'R': 11,
 'r': 12,
 'M': 13,
 'n': 14,
 'p': 15,
 'o': 16,
 'W': 17,
 'i': 18,
 'm': 19,
 'V': 20,
 'b': 21,
 'C': 22,
 'z': 23,
 'e': 24,
 'u': 25,
 'w': 26,
 'a': 27,
 'y': 28,
 'f': 29,
 'c': 30,
 'k': 31,
 'D': 32,
 's': 33,
 't': 34,
 'j': 35,
 'O': 36,
 'x': 37,
 'A': 38,
 'U': 39,
 'H': 40,
 '1': 41,
 '8': 42,
 '3': 43,
 '2': 44,
 'T': 45,
 'B': 46,
 'S': 47,
 '0': 48,
 '6': 49,
 '9': 50,
 'G': 51,
 'N': 52,
 'I': 53,
 'Z': 54,
 '4': 55,
 'E': 56,
 '7': 57,
 'L': 58,
 '5': 59,
 'F': 60,
 'q': 61,
 'Y': 62,
 'X': 63}

In [25]:
indicies_text = "\n".join(indices_text)
with open("data/chars.txt","w") as f:
    f.write(indicies_text)

In [26]:
with open("data/charset.txt","w") as js:
    js.write(str(chars))

In [27]:
with open("data/charset.txt") as js:
    charset = eval(js.read())

In [28]:
charset

{' ': 1,
 'ε': 0,
 '.': 2,
 'v': 3,
 'h': 4,
 'g': 5,
 'P': 6,
 'd': 7,
 'K': 8,
 'J': 9,
 'l': 10,
 'R': 11,
 'r': 12,
 'M': 13,
 'n': 14,
 'p': 15,
 'o': 16,
 'W': 17,
 'i': 18,
 'm': 19,
 'V': 20,
 'b': 21,
 'C': 22,
 'z': 23,
 'e': 24,
 'u': 25,
 'w': 26,
 'a': 27,
 'y': 28,
 'f': 29,
 'c': 30,
 'k': 31,
 'D': 32,
 's': 33,
 't': 34,
 'j': 35,
 'O': 36,
 'x': 37,
 'A': 38,
 'U': 39,
 'H': 40,
 '1': 41,
 '8': 42,
 '3': 43,
 '2': 44,
 'T': 45,
 'B': 46,
 'S': 47,
 '0': 48,
 '6': 49,
 '9': 50,
 'G': 51,
 'N': 52,
 'I': 53,
 'Z': 54,
 '4': 55,
 'E': 56,
 '7': 57,
 'L': 58,
 '5': 59,
 'F': 60,
 'q': 61,
 'Y': 62,
 'X': 63}