In [1]:
import os
from random import randint, uniform
import re
import numpy as np
import wave
import contextlib


## Calculating recordings lengths

In [2]:

total = 0
corrupted=0
files=0
for directory in os.listdir("./sections/"):
    if os.path.isdir("./sections/"+directory):
        for file in os.listdir("./sections/"+directory):
            if file.endswith(".wav"):
                files+=1
                fname = "./sections/"+directory+"/"+file
                try:
                    with contextlib.closing(wave.open(fname,'r')) as f:
                        frames = f.getnframes()
                        rate = f.getframerate()
                        duration = frames / float(rate)
                        total+=duration
                except Exception as e:
                    corrupted+=1

print("We have a total of ",total," seconds",", ",total/(60*60)," hours")

We have a total of  7948.633687500008  seconds ,  2.2079538020833356  hours


In [3]:
print('This is the whole files:',files)
print('The corrupted ones are:' , corrupted)

This is the whole files: 1035
The corrupted ones are: 90


# Organizing data

### spliting records to train, val and test sets

In [4]:
!mkdir data
!mkdir data/records

In [5]:
# Extracting all the non-corrupted files
wav_files = []
all_linkers  = dict()
for directory in os.listdir("./sections/"): # parent directory of the recordings, it should contain folders that contain wav,json and txt files
    if os.path.isdir("./sections/"+directory):
        to_remove = []
        linker = []

        for file in os.listdir("./sections/"+directory):
            if file.endswith(".txt"):
                linker_data = open("./sections/"+directory+"/"+file).readlines()
                
            if file.endswith(".wav"):
                fname = "./sections/"+directory+"/"+file
                try:
                    with contextlib.closing(wave.open(fname,'r')) as f:
                        frames = f.getnframes()
                        rate = f.getframerate()
                        duration = frames / float(rate)
                        wav_files.append(fname)
                except Exception as e:
                    to_remove.append(file)
        for file in to_remove:
            i=0
            while i<len(linker_data):
                if file in linker_data[i]:
                    linker_data.pop(i)
                i+=1
        linker.extend(linker_data)
        all_linkers[directory] =linker
        

            

In [6]:
all_linkers.keys()

dict_keys(['section_15', 'section_2', 'section_16', 'section_7', 'section_9', 'section_8', 'section_17', 'section_4', 'section_6', 'section_14', 'section_19', 'section_20', 'section_13', 'section_12', 'section_5', 'section_10', 'section_11', 'section_1', 'section_18', 'section_3'])

In [7]:
print("After removing the corrupted files, we have {} .wav files left. " . format(len(wav_files)))

After removing the corrupted files, we have 945 .wav files left. 


In [8]:
to_copy = " ".join(wav_files)
!cp -t data/records/ {to_copy}

In [9]:
for section,linker in all_linkers.items():
    for i,link in enumerate(linker):
        line = link.split(";")[0].split("(")[1].split(")")[0].strip()
        wav = link.split(";")[1].strip().split("/")[-1]
        linker[i] = wav+":"+line


In [10]:
out_linker = open("./data/linker.txt","w")
out_linker.write("\n".join(linker))
out_linker.close()

In [11]:
!mkdir ./data/records/train
!mkdir ./data/records/test
!mkdir ./data/records/val


In [12]:
import numpy as np
np.random.seed(0)
indices = np.random.permutation(len(wav_files))

test_idx = indices[:len(indices)//2]
validation_portion = int(len(indices)//2 * 0.2)
train_idx = indices[len(indices)//2:-validation_portion]
valid_idx = indices[-validation_portion:]

In [13]:
train_set_files = [wav_files[i].split("/")[-1] for i in train_idx]
valid_set_files = [wav_files[i].split("/")[-1] for i in valid_idx]
test_set_files = [wav_files[i].split("/")[-1] for i in test_idx]

In [14]:
to_copy_train = "./data/records/" + " ./data/records/".join(train_set_files)
to_copy_valid = "./data/records/" + " ./data/records/".join(valid_set_files)
to_copy_test = "./data/records/" + " ./data/records/".join(test_set_files)

In [15]:
!mv -t data/records/train/ {to_copy_train}
!mv -t data/records/val/ {to_copy_valid}
!mv -t data/records/test/ {to_copy_test}

### making of chars.txt file

In [16]:
import re
chars = {" ":1,"ε":0}
text_data = []
char_idx = 2
for section_id,linker in all_linkers.items():
    section_id = re.sub("_","",section_id)+".txt"
    text_file = open("./text_files/"+section_id,"r").readlines() # Modify this to the directory of your txt file that you recorded with
    
    for link in linker:
        file,idx = link.split(":")[0],int(link.split(":")[1].split(" ")[1])-1
        line = text_file[idx]
        line = line.split("##")[0].strip()
        #line = re.sub("[\[\]|٪%«»_ـ]","",line) # Clean unnecessary characters from the data, this is for arabic
        line = re.sub("[—\u200b]","",line) # Clean unnecessary characters from the data, this is for arabic
        text_data.append((line,file.split(".")[0]))
        char_set = set(line)
        for c in char_set:
            if c not in chars:
                chars[c]=char_idx
                char_idx+=1
            
    

In [17]:
raw_text = "\n".join([wav+":"+line for line,wav in text_data])
with open("data/raw_text_file.txt","w") as f:
    f.write(raw_text)


In [18]:
indices_text = []
for line,wav in text_data:
    line = list(line)
    indicies = []
    for c in line:
        indicies.append(str(chars[c]))
    indices_text.append(wav+" "+" ".join(indicies))

    

In [19]:
len(chars)

78

In [20]:
chars

{' ': 1,
 'ε': 0,
 'w': 2,
 'n': 3,
 'h': 4,
 'i': 5,
 'e': 6,
 'J': 7,
 'S': 8,
 'b': 9,
 'k': 10,
 'M': 11,
 'd': 12,
 'p': 13,
 't': 14,
 'y': 15,
 'c': 16,
 '.': 17,
 'g': 18,
 'l': 19,
 'r': 20,
 's': 21,
 'm': 22,
 'a': 23,
 'f': 24,
 'o': 25,
 'u': 26,
 ':': 27,
 'E': 28,
 '”': 29,
 ')': 30,
 '3': 31,
 ',': 32,
 '(': 33,
 '9': 34,
 '4': 35,
 '“': 36,
 'I': 37,
 'A': 38,
 'v': 39,
 'D': 40,
 'j': 41,
 'N': 42,
 '2': 43,
 'T': 44,
 'x': 45,
 'C': 46,
 'R': 47,
 'F': 48,
 'z': 49,
 '?': 50,
 'G': 51,
 'W': 52,
 'O': 53,
 'P': 54,
 'B': 55,
 'L': 56,
 'H': 57,
 'K': 58,
 ';': 59,
 '-': 60,
 '7': 61,
 '1': 62,
 '8': 63,
 'q': 64,
 '5': 65,
 '0': 66,
 '!': 67,
 'Z': 68,
 '6': 69,
 'Y': 70,
 '‘': 71,
 '’': 72,
 'U': 73,
 'Q': 74,
 'V': 75,
 '[': 76,
 ']': 77}

In [21]:
indicies_text = "\n".join(indices_text)
with open("data/chars.txt","w") as f:
    f.write(indicies_text)

In [22]:
with open("data/charset.txt","w") as js:
    js.write(str(chars))

In [23]:
with open("data/charset.txt") as js:
    charset = eval(js.read())

In [24]:
charset

{' ': 1,
 'ε': 0,
 'w': 2,
 'n': 3,
 'h': 4,
 'i': 5,
 'e': 6,
 'J': 7,
 'S': 8,
 'b': 9,
 'k': 10,
 'M': 11,
 'd': 12,
 'p': 13,
 't': 14,
 'y': 15,
 'c': 16,
 '.': 17,
 'g': 18,
 'l': 19,
 'r': 20,
 's': 21,
 'm': 22,
 'a': 23,
 'f': 24,
 'o': 25,
 'u': 26,
 ':': 27,
 'E': 28,
 '”': 29,
 ')': 30,
 '3': 31,
 ',': 32,
 '(': 33,
 '9': 34,
 '4': 35,
 '“': 36,
 'I': 37,
 'A': 38,
 'v': 39,
 'D': 40,
 'j': 41,
 'N': 42,
 '2': 43,
 'T': 44,
 'x': 45,
 'C': 46,
 'R': 47,
 'F': 48,
 'z': 49,
 '?': 50,
 'G': 51,
 'W': 52,
 'O': 53,
 'P': 54,
 'B': 55,
 'L': 56,
 'H': 57,
 'K': 58,
 ';': 59,
 '-': 60,
 '7': 61,
 '1': 62,
 '8': 63,
 'q': 64,
 '5': 65,
 '0': 66,
 '!': 67,
 'Z': 68,
 '6': 69,
 'Y': 70,
 '‘': 71,
 '’': 72,
 'U': 73,
 'Q': 74,
 'V': 75,
 '[': 76,
 ']': 77}