In [1]:
import csv
from itertools import product
from os import remove
from os.path import isfile
import sqlite3
import subprocess

# sqlite database file that every cell below reads from / writes to
db_fname = '../../hmt.db'

# mount point shared by the audio corpus and the scratch directory
_media_root = '/media/andi/1E42EC061079D2FE/'
# audio directory, put stereo files in "orig/" subdirectory and create
# empty "split/" subdirectory for mono
audio_dir = _media_root + 'map_task_corpus_hebrew/'
# scratch directory for the per-chunk audio cuts and feature files
tmp_dir = _media_root + 'tmp/'

# repository data root: "meta/" holds the csv files,
# "trans/" the transcripts ("orig/" and "split/" subdirectories)
_data_root = '../../data/'
meta_dir = _data_root + 'meta/'
trans_dir = _data_root + 'trans/'

In [2]:
# run init script to create empty tables
# (the script is read completely before the database is touched, so a
# missing script file does not leave an empty database behind)
with open('../sql/init.sql') as sql_file:
    sql_script = sql_file.read()

# sqlite3's connection context manager only commits (or rolls back on an
# exception); it does not close the connection, so close it explicitly
conn = sqlite3.connect(db_fname)
try:
    with conn:
        conn.cursor().executescript(sql_script)
finally:
    conn.close()

In [3]:
# split audio files by channel; only needs to run once, takes a while
# (channel 1 of a stereo file is written to "<n><task>.B.wav",
#  channel 2 to "<n><task>.A.wav")
for ses_no, task in product(range(16), 'AB'):
    stereo_fname = '%sorig/FNL%03d%s.wav' % (audio_dir, ses_no, task)
    if not isfile(stereo_fname):
        # no recording for this session/task, nothing to split
        continue
    for channel, suffix in ((1, 'B'), (2, 'A')):
        mono_fname = '%ssplit/%d%s.%s.wav' % \
            (audio_dir, ses_no, task, suffix)
        # sox "remix <n>" keeps only input channel n
        subprocess.check_call(
            ['sox', stereo_fname, mono_fname, 'remix', str(channel)])

In [4]:
# speakers table from spk.csv
sql_stmt = \
    'INSERT INTO speakers ' \
    '(spk_id, gender, age, born_in, native_lang, years_edu) ' \
    'VALUES (?,?,?,?,?,?)'

# spk.csv spells the gender out; the table gets a one-letter code and
# NULL for any other value
gender_codes = {'female': 'f', 'male': 'm'}

# the connection's context manager commits on success and rolls back on
# an exception, but it does not close the connection -> close explicitly
conn = sqlite3.connect(db_fname)
try:
    with conn:
        c = conn.cursor()
        # newline='' is what the csv module asks for, so that line breaks
        # inside quoted fields reach the reader unmodified
        with open('%sspk.csv' % meta_dir, 'r', newline='') as spk_file:
            spk_reader = csv.reader(spk_file, delimiter=',', quotechar='"')
            # skip header (tolerating a completely empty file)
            next(spk_reader, None)
            for row in spk_reader:
                if not row:
                    # blank line (e.g. at the end of the file)
                    continue
                gender = gender_codes.get(row[1])
                c.execute(sql_stmt,
                          (row[0], gender, row[2], row[3], row[4], row[5]))
finally:
    conn.close()

In [5]:
# sessions table from ses_tsk.csv
sql_stmt = \
    'INSERT INTO sessions(ses_id, spk_id_a, spk_id_b, status) ' \
    'VALUES(?,?,?,?)'

# the connection's context manager commits on success and rolls back on
# an exception, but it does not close the connection -> close explicitly
conn = sqlite3.connect(db_fname)
try:
    with conn:
        c = conn.cursor()
        # newline='' is what the csv module asks for, so that line breaks
        # inside quoted fields reach the reader unmodified
        with open('%sses_tsk.csv' % meta_dir, 'r', newline='') as ses_file:
            ses_reader = csv.reader(ses_file, delimiter=',', quotechar='"')
            # skip header (tolerating a completely empty file)
            next(ses_reader, None)
            for row in ses_reader:
                if not row:
                    # blank line (e.g. at the end of the file)
                    continue
                if row[0][-1] == 'B':
                    # each session is listed twice, skip tasks 'B'
                    continue
                # characters 3..5 of the first column are used as the
                # session id; every session is inserted with status 0
                c.execute(sql_stmt, (row[0][3:6], row[2], row[3], 0))
finally:
    conn.close()

In [6]:
# tasks table from ses_tsk.csv
sql_stmt = \
    'INSERT INTO tasks(tsk_id, ses_id, map_index, task_index, a_or_b) ' \
    'VALUES(?,?,?,?,?)'

# the connection's context manager commits on success and rolls back on
# an exception, but it does not close the connection -> close explicitly
conn = sqlite3.connect(db_fname)
try:
    with conn:
        c = conn.cursor()
        # newline='' is what the csv module asks for, so that line breaks
        # inside quoted fields reach the reader unmodified
        with open('%sses_tsk.csv' % meta_dir, 'r', newline='') as tsk_file:
            tsk_reader = csv.reader(tsk_file, delimiter=',', quotechar='"')
            # skip header (tolerating a completely empty file)
            next(tsk_reader, None)
            # blank lines (e.g. at the end of the file) carry no task and
            # must not consume a task id
            task_rows = (row for row in tsk_reader if row)
            # task ids are assigned consecutively, starting at 1, in file
            # order
            for tsk_id, row in enumerate(task_rows, start=1):
                # first column ends in 'A' for the first task of a
                # session; everything else counts as the second task
                task_index = 1 if row[0][-1] == 'A' else 2
                a_or_b = 'A' if row[8] == 'master' else 'B'
                c.execute(sql_stmt,
                          (tsk_id, row[0][3:6], row[1], task_index, a_or_b))
finally:
    conn.close()

In [7]:
# parse praat textgrid transcripts into different format
# (code to further process that format already existed)
praat_script = '../misc/convert_transcripts.praat'
# output suffix and the value handed to the praat script as last argument
# (presumably the tier to export -- TODO confirm against the script)
role_args = (('d', '3'), ('f', '4'))

for a_or_b, i in product('AB', range(16)):
    in_fname = '%sorig/%d%s_merged.TextGrid' % (trans_dir, i, a_or_b)
    targets = [('%ssplit/%d%s_%s.txt' % (trans_dir, i, a_or_b, role), arg)
               for role, arg in role_args]

    # drop results of an earlier run for both outputs before converting
    for out_fname, _ in targets:
        if isfile(out_fname):
            remove(out_fname)

    for out_fname, arg in targets:
        subprocess.check_call(
            ['praat', '--run', praat_script, in_fname, out_fname, arg])

In [8]:
# aux function for transcript parsing; removes different markups
def preprocess(in_item):
    """Strip markup from one transcript item and normalize whitespace.

    Steps, in order: newlines are removed, '@' markers are dropped,
    every '<...>' tag is cut out, runs of whitespace are collapsed to a
    single space, and an item consisting only of an empty overlap
    ('[start_overlap][end_overlap]') becomes the empty string.

    A '<' without a later '>' is left in the text unchanged, and a '>'
    that appears before any '<' is ordinary text.

    Args:
        in_item: raw transcript text (str).

    Returns:
        The cleaned text (str), possibly empty.
    """
    out = in_item.replace('\n', '')
    # @ marks unintelligible text; not very common, simply ignore
    out = out.replace('@ ', '')
    out = out.replace('@', '')
    # remove markup (silence, noises); the closing '>' is searched AFTER
    # the opening '<' so that stray '>' characters or an unterminated tag
    # cannot make the loop run forever
    tag_start = out.find('<')
    while tag_start != -1:
        tag_end = out.find('>', tag_start + 1)
        if tag_end == -1:
            # unterminated tag, nothing sensible to cut
            break
        out = out[:tag_start] + out[tag_end + 1:]
        # text before tag_start is already free of '<'
        tag_start = out.find('<', tag_start)
    # 'condense' double spaces
    out = ' '.join(out.split())
    # remove empty overlaps
    if out == '[start_overlap][end_overlap]':
        out = ''
    return out

In [9]:
# chunks (without features) and turns computed from transcripts
#
# For every task the two per-speaker transcript files ("..._d.txt" and
# "..._f.txt") are merged into one time-sorted list of
# (start, end, text, speaker) lines. Consecutive non-empty lines of one
# speaker are joined into a chunk; a line with empty text (silence) of
# that speaker closes the chunk and writes it to the database. The first
# chunk of a turn also writes the turn row.

# all tasks, processed in id order
sql_stmt1 = 'SELECT tsk_id, ses_id, task_index FROM tasks ORDER BY tsk_id'

sql_stmt2 = \
    'INSERT INTO turns (tur_id, tsk_id, turn_index, speaker_role) ' \
    'VALUES(?,?,?,?)'

sql_stmt3 = \
    'INSERT INTO chunks (chu_id, tur_id, chunk_index, ' \
    'start_time, end_time, words) VALUES(?,?,?,?,?,?)'

with sqlite3.connect(db_fname) as conn:
    # c1 reads the tasks, c2 inserts turns, c3 inserts chunks
    c1 = conn.cursor()
    c2 = conn.cursor()
    c3 = conn.cursor()
    
    # running ids; defined outside the task loop, so they keep counting
    # across tasks. tur_ids holds the id of the current turn per speaker
    # (order: describer, follower)
    tur_ids = [0, 0]
    chu_id = 0
    
    c1.execute(sql_stmt1,)
    for tsk_id, ses_id, task_index in c1.fetchall():
        # read both transcription files
        # file name is "<ses_id><letter>_<d|f>.txt" where chr(64 + n) maps
        # task_index 1 -> 'A', 2 -> 'B'
        # each line is split on whitespace: first field start, second
        # field end, remainder the text
        # NOTE(review): a blank line in a transcript file raises
        # IndexError here (line.split()[0]) -- confirm files never
        # contain one
        lines = []
        for d_or_f in ['d', 'f']:
            fname = '%ssplit/%s%s_%s.txt' % \
                (trans_dir, ses_id, chr(64 + task_index), d_or_f)
            with open(fname) as file:
                lines += [[line.split()[0],
                           line.split()[1],
                           ' '.join(line.split()[2:]), 
                           d_or_f]
                          for line in file.readlines()]
        # convert timestamps to float and strip markup from the text
        lines = [(float(l[0]), float(l[1]), preprocess(l[2]), l[3]) 
                 for l in lines]
        # tuples sort by start, then end, then text, then speaker
        lines.sort()
        # ensure silence at the end so last chunks are processed
        # (max() raises ValueError if a speaker has no lines at all)
        max_f = max([l[1] for l in lines if l[3] == 'f'])
        max_d = max([l[1] for l in lines if l[3] == 'd'])
        # the sentinel lines are appended after sorting, so they are
        # handled last, in the order given here: the speaker who stopped
        # earlier is closed first
        if max_d < max_f:
            lines.append((max_d, max_f + 1, '', 'd'))
            lines.append((max_f, max_f + 1, '', 'f'))
        else:
            lines.append((max_f, max_d + 1, '', 'f'))
            lines.append((max_d, max_d + 1, '', 'd'))
        
        # arrays to track words of current chunk, start/end timestamps,  
        # and turn/chunk counts per speaker (order: describer, follower)
        # (all of these are reset for every task)
        words = ['', ''] 
        starts = [0.0, 0.0]
        ends = [0.0, 0.0]
        tur_cnts = [0, 0]
        chu_cnts = [0, 0]
        
        # combine individual lines to chunks and turns
        # (each line should actually be a full chunk)
        # NOTE(review): `end` is unpacked but never used; a chunk's end
        # time is the start of the silence line that follows it
        for start, end, text, d_or_f in lines:
            # index of current speaker in arrays (1-idx is other speaker)
            idx = 0 if d_or_f == 'd' else 1
            if text != '':
                if len(words[idx]) == 0:
                    # word after pause -> new chunk, maybe new turn
                    # it is a new turn if the other speaker's last
                    # completed chunk ended later than this speaker's, or
                    # the other speaker has the higher turn count, or
                    # this speaker has not had a turn in this task yet
                    if ends[1-idx] > ends[idx] \
                    or tur_cnts[1-idx] > tur_cnts[idx] \
                    or tur_cnts[idx] == 0:
                        # new turn, update index and count
                        tur_cnts[idx] = max(tur_cnts) + 1
                        tur_ids[idx] = max(tur_ids) + 1
                        chu_cnts[idx] = 1
                    else:
                        # continuation of old turn
                        chu_cnts[idx] += 1
                    starts[idx] = start
                # NOTE(review): because of the ' ' prefix the stored
                # words always begin with a space -- confirm this is
                # intended or handled downstream
                words[idx] += ' ' + text
            else:
                if len(words[idx]) != 0:
                    # silence after some words -> chunk complete
                    if chu_cnts[idx] == 1:
                        # first chunk in turn; insert turn first
                        c2.execute(
                            sql_stmt2, 
                            (tur_ids[idx], tsk_id, tur_cnts[idx], d_or_f))
                    chu_id += 1
                    # chunk ended when silence started
                    ends[idx] = start
                    
                    c3.execute(
                        sql_stmt3, 
                        (chu_id, tur_ids[idx], chu_cnts[idx], starts[idx],
                         start, words[idx]))
                    # reset so the next non-empty line opens a new chunk
                    words[idx] = ''
                else:
                    # continued silence, nothing to do
                    pass
    # single commit after all tasks have been processed
    conn.commit()

In [11]:
# extract features for all chunks (runs a long time)
#
# Per task: look up its chunks, cut each chunk out of the mono file of
# the speaker, run the praat feature script on the cut, and store the
# resulting values (plus a vowel-count based rate) on the chunk row.

# all tasks, in id order
sql_stmt1 = (
    'SELECT tsk_id, ses_id, task_index, a_or_b '
    'FROM   tasks '
    'ORDER BY tsk_id'
)

# chunks of one task together with the role of their speaker
sql_stmt2 = (
    'SELECT chu.chu_id, '
    '       chu.start_time, '
    '       chu.end_time, '
    '       chu.words, '
    '       tur.speaker_role '
    'FROM   chunks chu '
    'JOIN   turns tur '
    'ON     chu.tur_id == tur.tur_id '
    'WHERE  tur.tsk_id == ? '
    'ORDER BY chu.chunk_index'
)

# write the features of one chunk
sql_stmt3 = (
    'UPDATE chunks '
    'SET    pitch_min = ?,'
    '       pitch_max = ?,'
    '       pitch_mean = ?,'
    '       pitch_std = ?,'
    '       rate_syl = ?,'
    '       rate_vcd = ?,'
    '       intensity_min = ?,'
    '       intensity_max = ?,'
    '       intensity_mean = ?,'
    '       intensity_std = ?,'
    '       jitter = ?,'
    '       shimmer = ?,'
    '       nhr = ? '
    'WHERE  chu_id = ?'
)

# keys of the praat output in the order sql_stmt3 binds them; the
# computed rate goes between the two groups (rate_syl)
pitch_keys = ('f0_min', 'f0_max', 'f0_mean', 'f0_std')
other_keys = ('vcd2tot_frames', 'int_min', 'int_max', 'int_mean',
              'int_std', 'jitter', 'shimmer', 'nhr')

with sqlite3.connect(db_fname) as conn:
    c1, c2, c3 = conn.cursor(), conn.cursor(), conn.cursor()

    task_rows = c1.execute(sql_stmt1).fetchall()
    for tsk_id, ses_id, task_index, a_or_b in task_rows:
        print(ses_id, tsk_id, a_or_b)
        # task_index 1 -> 'A', 2 -> 'B'
        task_letter = chr(64 + task_index)
        fname = '%sorig/FNL%03d%s.wav' % \
            (audio_dir, int(ses_id), task_letter)
        if not isfile(fname):
            # no audio for this task (nothing is committed either)
            continue

        chunk_rows = c2.execute(sql_stmt2, (tsk_id,)).fetchall()
        for chu_id, start, end, words, role in chunk_rows:
            duration = end - start
            # skip chunks that are too short to process
            if duration < 0.086:
                continue

            # pick the mono file of the speaker: 'A' for the describer
            # when the task is marked 'A' and for the follower when it
            # is marked 'B', otherwise 'B'
            if (a_or_b == 'A' and role == 'd') \
                    or (a_or_b == 'B' and role == 'f'):
                file_a_or_b = 'A'
            else:
                file_a_or_b = 'B'
            in_fname = '%ssplit/%s%s.%s.wav' % \
                (audio_dir, ses_id, task_letter, file_a_or_b)
            tmp_stem = '%s%d_%d' % (tmp_dir, tsk_id, chu_id)
            cut_fname = tmp_stem + '.wav'
            out_fname = tmp_stem + '.txt'

            # sox "trim <start> =<end>": '=' makes the end an absolute
            # position instead of a length
            subprocess.check_call(
                ['sox', in_fname, cut_fname, 'trim',
                 str(start), '=' + str(end)])
            subprocess.check_call(
                ['praat', '--run', '../misc/extract_features.praat',
                 cut_fname, out_fname])

            # the script output is one "key,value" pair per line
            with open(out_fname, 'r') as out_file:
                feats = {
                    key: val
                    for key, val in (line.replace('\n', '').split(',')
                                     for line in out_file)
                }
            # temporary files are no longer needed once parsed
            remove(cut_fname)
            remove(out_fname)

            # count occurrences per vowel, add up, divide by duration
            lowered = words.lower()
            rate = sum(lowered.count(vowel) for vowel in 'aeiou') / duration

            params = [feats[key] for key in pitch_keys]
            params.append(rate)
            params.extend(feats[key] for key in other_keys)
            params.append(chu_id)
            c3.execute(sql_stmt3, params)
        # persist after every task so a later failure keeps earlier work
        conn.commit()

0 1 B
0 2 A
1 3 A
1 4 B
2 5 B
2 6 A
3 7 B
3 8 A
4 9 A
4 10 B
5 11 B
5 12 A
6 13 B
6 14 A
7 15 B
7 16 A
8 17 B
8 18 A
9 19 B
9 20 A
10 21 B
10 22 A
11 23 B
11 24 A
12 25 B
12 26 A
13 27 B
13 28 A
14 29 B
14 30 A
15 31 B
15 32 A


In [12]:
# run cleanup script, mostly for null values (instead of '--undefined--')
with open('../sql/cleanup.sql') as sql_file:
    sql_script = sql_file.read()

# sqlite3's connection context manager only commits (or rolls back on an
# exception); it does not close the connection, so close it explicitly
conn = sqlite3.connect(db_fname)
try:
    with conn:
        conn.cursor().executescript(sql_script)
finally:
    conn.close()