In [1]:
import multiprocessing
import time

import sys
sys.path.append('../python/')
import cfg
import db
import fio
import lex
import sb

## Columbia Games Corpus

In [None]:
# select the Columbia Games Corpus; the cells in this section pass this id
# to db.connect, cfg.get_corpus_path, fio.store_tokens and lex.store_lms_ngrams
corpus_id = cfg.CORPUS_ID_GC

In [None]:
# initialize the database, extract data from _bt table
# careful, this DELETES ALL DB TABLES
db.connect(corpus_id)
try:
    db.executescript(cfg.SQL_PATH, cfg.SQL_INIT_FNAME_GC)
    db.commit()
finally:
    # release the connection even if the init script fails, so a re-run of
    # this cell does not start with a connection left open by the failed run
    db.close()

In [None]:
# extract features for all chunks (both speakers, every session), then run
# the cleanup script and build the auxiliary chunk_pairs table

# min duration for 75Hz min pitch; chunks shorter than this are skipped
MIN_CHUNK_DURATION = 0.04

db.connect(corpus_id)
try:
    path = cfg.get_corpus_path(corpus_id)
    for ses_id in db.get_ses_ids():
        for a_or_b in ['A', 'B']:
            # one wav file per session and speaker
            fname = 's%02d.objects.1.%s.wav' % (ses_id, a_or_b)
            for chu_id, words, start, end in db.find_chunks(ses_id, a_or_b):
                if end - start >= MIN_CHUNK_DURATION:
                    features = fio.extract_features(
                        path, fname, ses_id, chu_id, words, start, end)
                    db.set_features(chu_id, features)
        # commit after each session rather than once at the very end
        db.commit()
    # run cleanup (set all features null for all chunks with any null)
    db.executescript(cfg.SQL_PATH, cfg.SQL_CU_FNAME)
    db.commit()
    # create auxiliary table chunk_pairs
    db.executescript(cfg.SQL_PATH, cfg.SQL_AT_FNAME)
    db.commit()
finally:
    # always release the connection, even if extraction fails part-way
    db.close()

In [None]:
# compute auxiliary files for lex measures
db.connect(corpus_id)
try:
    fio.store_tokens(corpus_id)
    # lms only for ses (games corpus tasks are too short)
    lex.store_lms_ngrams(corpus_id, 'ses')
finally:
    # close the connection even if one of the steps above raises
    db.close()

## Switchboard Corpus

In [2]:
# select the Switchboard Corpus; the cells in this section pass this id
# to db.connect, fio.store_tokens and lex.store_lms_ngrams
corpus_id = cfg.CORPUS_ID_SB

In [None]:
# initialize the database, extract meta-data from logs/transcriptions
# careful, this DELETES ALL DB TABLES
db.connect(corpus_id)
try:
    db.executescript(cfg.SQL_PATH, cfg.SQL_INIT_FNAME_SB)
    db.commit()

    # populate the tables in the same order as before
    # NOTE(review): later steps presumably depend on rows written by earlier
    # ones -- confirm before reordering
    sb.populate_speakers()
    sb.populate_topics()
    sb.populate_sessions()
    sb.populate_tasks()
    sb.populate_turns_and_chunks()
finally:
    # release the connection even if the script or a populate step fails
    db.close()

In [None]:
# run script to delete all meta-data for three sessions missing audio
db.connect(corpus_id)
try:
    db.executescript(cfg.SQL_PATH, cfg.SQL_DM_FNAME)
    db.commit()
finally:
    # close the connection even if the script fails
    db.close()

In [None]:
# extract features for all chunks
# (takes many hours, almost a day on my machine *with* multiprocessing)

# the pool is created before the database is touched, as in the original cell
pool = multiprocessing.Pool(7)
try:
    corpus_id = cfg.CORPUS_ID_SB
    db.connect(corpus_id)
    try:
        ses_ids = db.get_ses_ids()
    finally:
        # the parent only needs the connection to list the session ids
        db.close()

    # sessions are handed to the pool in batches of `step`; setting `start`
    # to a later offset skips the first `start` sessions (presumably useful
    # to resume an interrupted run)
    start = 0
    step = 70

    for i in range(start, len(ses_ids), step):
        # progress log: batch offset and wall-clock time
        print('%d %s\n' % (i, time.ctime()))
        # map blocks until the whole batch is done and re-raises worker errors;
        # its return value was never used, so it is not stored
        pool.map(sb.extract_features, ses_ids[i:i + step])
    print('done! %s\n' % time.ctime())
except BaseException:
    # on any failure (including a keyboard interrupt) stop the workers
    # immediately instead of leaving them running behind the notebook
    pool.terminate()
    raise
else:
    # normal completion: no more work will be submitted
    pool.close()
finally:
    # wait for the worker processes to exit so none are leaked
    pool.join()

In [None]:
# run cleanup (set all features null for all chunks with any null)
db.connect(corpus_id)
try:
    db.executescript(cfg.SQL_PATH, cfg.SQL_CU_FNAME)
    db.commit()
finally:
    # close the connection even if the cleanup script fails
    db.close()

In [3]:
# create auxiliary table chunk_pairs (takes hours)
db.connect(corpus_id)
try:
    db.executescript(cfg.SQL_PATH, cfg.SQL_AT_FNAME)
    db.commit()
finally:
    # this script runs for hours; make sure an error or interrupt part-way
    # through still releases the connection
    db.close()

In [None]:
# compute auxiliary files for lex measures
corpus_id = cfg.CORPUS_ID_SB
db.connect(corpus_id)
try:
    # tokens/lms only for ses (switchboard tasks and sessions are the same)
    fio.store_tokens(corpus_id, 'ses')
    lex.store_lms_ngrams(corpus_id, 'ses')
finally:
    # close the connection even if one of the steps above raises
    db.close()