In [1]:
import os
from random import randint, uniform
import re
import numpy as np
import wave
import contextlib


## Calculating recordings lengths

In [2]:
!ls ./sessions/

200625-151026_fra_1de_elicit  200706-174013_fra_1de_elicit
200625-154904_fra_1de_elicit  200706-213645_fra_1de_elicit
200625-164013_fra_1de_elicit


In [3]:
import ipdb

In [4]:

total = 0
corrupted=0
files=0
for directory in os.listdir("./sessions"):
    if os.path.isdir("./sessions/"+directory):
        #ipdb.set_trace()
        for file in os.listdir("./sessions/"+directory):
            if file.endswith(".wav"):
                files+=1
                fname = "./sessions/"+directory+"/"+file
                try:
                    with contextlib.closing(wave.open(fname,'r')) as f:
                        frames = f.getnframes()
                        rate = f.getframerate()
                        duration = frames / float(rate)
                        total+=duration
                except Exception as e:
                    corrupted+=1

print("We have a total of ",total," seconds",", ",total/(60*60)," hours")

We have a total of  3527.6793750000015  seconds ,  0.9799109375000005  hours


In [5]:
files

407

In [6]:
print('This is the whole files:',files)
print('The corrupted ones are:' , corrupted)

This is the whole files: 407
The corrupted ones are: 0


# Organizing data

### spliting records to train, val and test sets

In [7]:
!mkdir data
!mkdir data/records

In [8]:
# Extracting all the non-corrupted files
wav_files = []
all_linkers  = dict()
for directory in os.listdir("./sessions/"): # parent directory of the recordings, it should contain folders that contain wav,json and txt files
    if os.path.isdir("./sessions/"+directory):
        to_remove = []
        linker = []

        for file in os.listdir("./sessions/"+directory):
            if file.endswith(".txt"):
                linker_data = open("./sessions/"+directory+"/"+file).readlines()
                
            if file.endswith(".wav"):
                fname = "./sessions/"+directory+"/"+file
                try:
                    with contextlib.closing(wave.open(fname,'r')) as f:
                        frames = f.getnframes()
                        rate = f.getframerate()
                        duration = frames / float(rate)
                        wav_files.append(fname)
                except Exception as e:
                    to_remove.append(file)
        for file in to_remove:
            i=0
            while i<len(linker_data):
                if file in linker_data[i]:
                    linker_data.pop(i)
                i+=1
        linker.extend(linker_data)
        all_linkers[directory] =linker
        

            

In [9]:
all_linkers.keys()

dict_keys(['200625-154904_fra_1de_elicit', '200625-164013_fra_1de_elicit', '200706-174013_fra_1de_elicit', '200625-151026_fra_1de_elicit', '200706-213645_fra_1de_elicit'])

In [10]:
print("After removing the corrupted files, we have {} .wav files left. " . format(len(wav_files)))

After removing the corrupted files, we have 407 .wav files left. 


In [11]:
to_copy = " ".join(wav_files)
!cp -t data/records/ {to_copy}

In [12]:
for section,linker in all_linkers.items():
    for i,link in enumerate(linker):
        #ipdb.set_trace()
        line = link.split(";")[0].split("(")[1].split(")")[0].strip()
        wav = link.split(";")[1].strip().split("/")[-1]
        linker[i] = wav+":"+line


In [15]:
out_linker = open("./data/linker.txt","w")
out_linker.write("\n".join(linker))
out_linker.close()

In [16]:
!mkdir ./data/records/train
!mkdir ./data/records/test
!mkdir ./data/records/val

mkdir: cannot create directory ‘./data/records/train’: File exists
mkdir: cannot create directory ‘./data/records/test’: File exists
mkdir: cannot create directory ‘./data/records/val’: File exists


In [17]:
import numpy as np
np.random.seed(0)
indices = np.random.permutation(len(wav_files))

test_idx = indices[:len(indices)//2]
validation_portion = int(len(indices)//2 * 0.2)
train_idx = indices[len(indices)//2:-validation_portion]
valid_idx = indices[-validation_portion:]

In [18]:
train_set_files = [wav_files[i].split("/")[-1] for i in train_idx]
valid_set_files = [wav_files[i].split("/")[-1] for i in valid_idx]
test_set_files = [wav_files[i].split("/")[-1] for i in test_idx]

In [19]:
to_copy_train = "./data/records/" + " ./data/records/".join(train_set_files)
to_copy_valid = "./data/records/" + " ./data/records/".join(valid_set_files)
to_copy_test = "./data/records/" + " ./data/records/".join(test_set_files)

In [20]:
!mv -t data/records/train/ {to_copy_train}
!mv -t data/records/val/ {to_copy_valid}
!mv -t data/records/test/ {to_copy_test}

### making of chars.txt file

In [21]:
import re
chars = {" ":1,"ε":0}
text_data = []
char_idx = 2
for section_id,linker in all_linkers.items():
    section_id += ".txt"
    #ipdb.set_trace()
    text_file = open("./Lig-Text/"+section_id,"r").readlines() # Modify this to the directory of your txt file that you recorded with
    
    for link in linker:
        file,idx = link.split(":")[0],int(link.split(":")[1].split(" ")[1])-1
        line = text_file[idx]
        line = line.split("##")[0].strip()
        #line = re.sub("[\[\]|٪%«»_ـ]","",line) # Clean unnecessary characters from the data, this is for arabic
        line = re.sub("[—\u200b]","",line) # Clean unnecessary characters from the data, this is for arabic
        text_data.append((line,file.split(".")[0]))
        char_set = set(line)
        for c in char_set:
            if c not in chars:
                chars[c]=char_idx
                char_idx+=1

In [22]:
raw_text = "\n".join([wav+":"+line for line,wav in text_data])
with open("data/raw_text_file.txt","w") as f:
    f.write(raw_text)


In [23]:
indices_text = []
for line,wav in text_data:
    line = list(line)
    indicies = []
    for c in line:
        indicies.append(str(chars[c]))
    indices_text.append(wav+" "+" ".join(indicies))

In [24]:
len(chars)

90

In [25]:
chars

{' ': 1,
 'ε': 0,
 'p': 2,
 'r': 3,
 'd': 4,
 'j': 5,
 'i': 6,
 's': 7,
 '.': 8,
 'y': 9,
 'n': 10,
 'e': 11,
 'h': 12,
 't': 13,
 'o': 14,
 'I': 15,
 'q': 16,
 'u': 17,
 'm': 18,
 'b': 19,
 'a': 20,
 'l': 21,
 'é': 22,
 'c': 23,
 'ê': 24,
 '(': 25,
 'è': 26,
 'v': 27,
 ',': 28,
 'f': 29,
 'D': 30,
 ')': 31,
 "'": 32,
 'L': 33,
 'A': 34,
 'ç': 35,
 'g': 36,
 'k': 37,
 'à': 38,
 'S': 39,
 '-': 40,
 '«': 41,
 '?': 42,
 '»': 43,
 'C': 44,
 'J': 45,
 'T': 46,
 'E': 47,
 'ô': 48,
 'M': 49,
 'û': 50,
 'x': 51,
 'K': 52,
 'À': 53,
 ':': 54,
 'z': 55,
 'ï': 56,
 'â': 57,
 'N': 58,
 'O': 59,
 'ù': 60,
 'Y': 61,
 '!': 62,
 '0': 63,
 '3': 64,
 '1': 65,
 '5': 66,
 'R': 67,
 '7': 68,
 ';': 69,
 'î': 70,
 'V': 71,
 'í': 72,
 'B': 73,
 '2': 74,
 'É': 75,
 'P': 76,
 'Q': 77,
 'w': 78,
 '4': 79,
 'U': 80,
 'H': 81,
 'X': 82,
 'F': 83,
 '/': 84,
 '6': 85,
 'á': 86,
 'Z': 87,
 'Ô': 88,
 'ë': 89}

In [26]:
indicies_text = "\n".join(indices_text)
with open("data/chars.txt","w") as f:
    f.write(indicies_text)

In [27]:
with open("data/charset.txt","w") as js:
    js.write(str(chars))

In [28]:
with open("data/charset.txt") as js:
    charset = eval(js.read())

In [29]:
charset;