In [1]:
import multiprocessing
import time

import sys
sys.path.append('../python/')
import cfg
import dc
import db
import fc

## Fisher Corpus

In [2]:
# select the Fisher Corpus; the cells below pass corpus_id to db.connect()
corpus_id = cfg.CORPUS_ID_FC

In [3]:
# initialize the database (careful, this DELETES ALL DB TABLES)
db.connect(corpus_id)
try:
    db.executescript(cfg.SQL_PATH, cfg.SQL_INIT_FNAME_FC)
    db.commit()
finally:
    # close the connection even when the script or the commit fails,
    # so a re-run of this cell does not start from a dangling connection
    db.close()

In [4]:
# extract meta-data from logs/transcripts
# (processes 11.7k transcripts, takes a few minutes)
db.connect(corpus_id)
try:
    fc.populate_speakers()
    fc.populate_topics()
    fc.populate_sessions_and_tasks()
    fc.populate_turns_and_chunks()
    # commit only after every populate step has succeeded
    db.commit()
finally:
    # close the connection even if one of the populate steps fails
    db.close()

1000 done
2000 done
3000 done
4000 done
5000 done
6000 done
7000 done
8000 done
9000 done
10000 done
11000 done
11699 done, finished!


In [5]:
# delete all data not relating to relevant subset of sessions
db.connect(corpus_id)
try:
    db.executescript(cfg.SQL_PATH, cfg.SQL_DI_FNAME)
    db.commit()
finally:
    # close the connection even when the script or the commit fails
    db.close()

In [6]:
# extract features for all chunks (takes about an hour on my machine)
# The pool is used as a context manager so its 7 worker processes are shut
# down when the cell finishes or fails, instead of lingering in the kernel.
with multiprocessing.Pool(7) as pool:
    db.connect(corpus_id)
    try:
        ses_ids = db.get_ses_ids()
    finally:
        # close the connection even if the query fails
        db.close()

    # sessions are handed to the pool in batches of `step`; indices below
    # `start` are skipped (presumably to resume an interrupted run)
    start = 0
    step = 21

    for i in range(start, len(ses_ids), step):
        # progress line: index of the first session in the batch + wall time
        print('%d %s\n' % (i, time.ctime()))
        # slicing clamps at the end of the sequence, so the last batch may
        # be shorter than `step`
        args = ses_ids[i:i + step]
        # blocks until the whole batch is processed; the return value is
        # not used, so it is not kept
        pool.map(fc.extract_features, args)
print('done! %s\n' % time.ctime())

0 Fri Jul  1 14:15:53 2022

21 Fri Jul  1 14:28:22 2022

42 Fri Jul  1 14:41:31 2022

63 Fri Jul  1 14:55:31 2022

84 Fri Jul  1 15:08:33 2022

done! Fri Jul  1 15:20:47 2022



In [7]:
# run cleanup (set all features null for all chunks with any null)
db.connect(corpus_id)
try:
    db.executescript(cfg.SQL_PATH, cfg.SQL_CU_FNAME)
    db.commit()
finally:
    # close the connection even when the script or the commit fails
    db.close()

In [8]:
# create auxiliary table chunk_pairs
db.connect(corpus_id)
try:
    db.executescript(cfg.SQL_PATH, cfg.SQL_AT_FNAME)
    db.commit()
finally:
    # close the connection even when the script or the commit fails
    db.close()

## Deception Corpus

In [9]:
# switch to the Deception Corpus; the cells below pass corpus_id to db.connect()
corpus_id = cfg.CORPUS_ID_DC

In [10]:
# initialize the database (careful, this DELETES ALL DB TABLES)
db.connect(corpus_id)
try:
    db.executescript(cfg.SQL_PATH, cfg.SQL_INIT_FNAME_DC)
    db.commit()
finally:
    # close the connection even when the script or the commit fails,
    # so a re-run of this cell does not start from a dangling connection
    db.close()

In [11]:
# read list of relevant sessions into dict
# (ses_dict is handed to the dc.populate_* calls in the following cell)
ses_dict = dc.get_ses_dict()

In [12]:
# populate basic tables from meta-data files (takes ~30 seconds)
db.connect(corpus_id)
try:
    dc.populate_speakers(ses_dict)
    dc.populate_sessions_and_tasks(ses_dict)
    dc.populate_turns_and_chunks(ses_dict)
    # derived columns are filled in after the base tables are populated
    db.set_turn_index_ses()
    db.set_duration()
    # commit only after every step has succeeded
    db.commit()
finally:
    # close the connection even if one of the steps fails
    db.close()

In [13]:
# extract features for all chunks (takes about five hours on my machine)
# The pool is used as a context manager so its 7 worker processes are shut
# down when the cell finishes or fails, instead of lingering in the kernel.
with multiprocessing.Pool(7) as pool:
    db.connect(corpus_id)
    try:
        ses_ids = db.get_ses_ids()
    finally:
        # close the connection even if the query fails
        db.close()

    # sessions are handed to the pool in batches of `step`; indices below
    # `start` are skipped (presumably to resume an interrupted run)
    start = 0
    step = 21

    for i in range(start, len(ses_ids), step):
        # progress line: index of the first session in the batch + wall time
        print('%d %s\n' % (i, time.ctime()))
        # slicing clamps at the end of the sequence, so the last batch may
        # be shorter than `step`
        args = ses_ids[i:i + step]
        # blocks until the whole batch is processed; the return value is
        # not used, so it is not kept
        pool.map(dc.extract_features, args)
print('done! %s\n' % time.ctime())

0 Wed Jun 29 18:31:48 2022

21 Wed Jun 29 19:12:32 2022

42 Wed Jun 29 19:48:52 2022

63 Wed Jun 29 20:27:00 2022

84 Wed Jun 29 21:11:08 2022

105 Wed Jun 29 21:52:36 2022

126 Wed Jun 29 22:38:47 2022

147 Wed Jun 29 23:21:18 2022

done! Wed Jun 29 23:32:20 2022



In [14]:
# run cleanup (set all features null for all chunks with any null)
db.connect(corpus_id)
try:
    db.executescript(cfg.SQL_PATH, cfg.SQL_CU_FNAME)
    db.commit()
finally:
    # close the connection even when the script or the commit fails
    db.close()

In [15]:
# make timestamps continuous per session (run this only once!)
db.connect(corpus_id)
try:
    db.executescript(cfg.SQL_PATH, cfg.SQL_FT_FNAME)
    db.commit()
finally:
    # close the connection even when the script or the commit fails
    db.close()

In [16]:
# create auxiliary table chunk_pairs
db.connect(corpus_id)
try:
    db.executescript(cfg.SQL_PATH, cfg.SQL_AT_FNAME)
    db.commit()
finally:
    # close the connection even when the script or the commit fails
    db.close()