In [1]:
# extract opensmile features and write them to db, 
# then combine them into triplets for neural network input 
# and dump to pickle file
#
#################
# NOTES/CONCEPT #
#################
# 
# 'triplets' of feature tensors:
#     index 0: turn-initial IPU of speaker A
#     index 1: turn-final IPU of next turn by speaker B
#     index 2: turn-initial IPU of next turn by speaker A
#     (A and B are independent of the speaker labels in Fisher; within each
#      conversation, each speaker takes either position for some triplets)
# alternative triplets: index 0 is the very first IPU from A (per session)
#
# three different subsets of Fisher 
# (no balancing by gender etc., percentages based on session count):
#     'trn': training, ~80%
#     'vld': validation, ~10%
#     'tst': test, ~10%

import multiprocessing
import time

import sys
sys.path.append('../python')
import cfg, fea

In [None]:
# features for games corpus (takes several minutes)
# turn-final chunks are determined first, then features are extracted
# in parallel
fea.set_last_in_turn(cfg.CORPUS_ID_GAMES)

# one argument tuple per id 1..12; the middle argument is left empty
# (presumably these are the session ids -- confirm against fea.extract)
args = [(cfg.CORPUS_ID_GAMES, '', i) for i in range(1, 13)]

# the context manager shuts the worker processes down as soon as the
# blocking starmap call has returned, so no idle workers are left behind
# in the kernel when a later cell creates its own pool
with multiprocessing.Pool(6) as pool:
    results = pool.starmap(fea.extract, args)

In [None]:
# determine turn-final chunks for the fisher corpus; kept in its own cell,
# separate from the feature extraction, so that it is not repeated when
# the extraction has to be restarted after an interruption
# (takes several minutes)
fea.set_last_in_turn(cfg.CORPUS_ID_FISHER)

In [None]:
# features for fisher corpus (takes DOZENS OF HOURS, can be restarted)
#
# The two file lists are concatenated; the lines are then processed in
# batches of `step`, with the 1-based position of each batch's first line
# appended to log.txt before the batch is started.
with open(cfg.FILELIST_FNAME_FC % 1) as lst_file:
    lst_lines = lst_file.readlines()
with open(cfg.FILELIST_FNAME_FC % 2) as lst_file:
    lst_lines.extend(lst_file.readlines())

# use this for a restart after an interruption 
# (set to last one that had been started, according to log;
#  that run was incomplete, all those sessions should be done again)
ses_id_start = 1
step = 700

# a value below 1 would turn into a negative list index and silently
# wrap around to the end of the file list
if ses_id_start < 1:
    raise ValueError('ses_id_start is 1-based and must be >= 1')

# the context manager releases the worker processes when the loop is done
# or when it is aborted by an exception
with multiprocessing.Pool(7) as pool:
    for i in range(ses_id_start-1, len(lst_lines), step):
        with open('log.txt', 'a') as log_file:
            log_file.write('%d %s\n' % (i+1, time.ctime()))
        # slicing clamps at the end of the list, so the final batch may be
        # shorter than `step` instead of raising an IndexError;
        # the first two whitespace-separated fields of each line are
        # passed on to fea.extract after the corpus id
        args = [[cfg.CORPUS_ID_FISHER] + line.split()[:2]
                for line in lst_lines[i:i+step]]
        results = pool.starmap(fea.extract, args)

In [2]:
# store triplets per task for games corpus (takes only seconds)
# (according to the output logged for this run, data is stored per task
#  and per session, once for the regular triplets and once more after
#  conversion to triplets with IPU 0)
fea.load_and_store_triplets(cfg.CORPUS_ID_GAMES)

loading data for all tasks/sessions...
...done! Mon Mar  8 14:17:21 2021

3227 triplets total

storing data per task...
...done! Mon Mar  8 14:17:22 2021
storing data per session...
...done! Mon Mar  8 14:17:22 2021

converting to triplets with IPU 0...
...done! Mon Mar  8 14:17:22 2021

storing data per task...
...done! Mon Mar  8 14:17:22 2021
storing data per session...
...done! Mon Mar  8 14:17:23 2021


In [3]:
# store triplets per session for fisher corpus (takes ~ 40mins)
# (according to the output logged for this run, data is stored per session,
#  per set and session, and per set, once for the regular triplets and
#  once more after conversion to triplets with IPU 0)
# NOTE(review): the meaning of the second argument (True) is not visible
# here; presumably it enables the per-set storage that the games call
# does not log -- confirm against fea.load_and_store_triplets
fea.load_and_store_triplets(cfg.CORPUS_ID_FISHER, True)

loading data for all tasks/sessions...
0% done Mon Mar  8 14:17:38 2021
10% done Mon Mar  8 14:19:58 2021
20% done Mon Mar  8 14:22:13 2021
30% done Mon Mar  8 14:24:48 2021
40% done Mon Mar  8 14:27:10 2021
50% done Mon Mar  8 14:29:22 2021
60% done Mon Mar  8 14:31:43 2021
70% done Mon Mar  8 14:34:07 2021
80% done Mon Mar  8 14:36:20 2021
90% done Mon Mar  8 14:38:36 2021
100% done Mon Mar  8 14:40:53 2021
...done! Mon Mar  8 14:40:55 2021

1351612 triplets total

storing data per session...
...done! Mon Mar  8 14:42:57 2021
storing data per set (trn, dev, tst) and session...
...done! Mon Mar  8 14:45:14 2021
storing data per set (trn, dev, tst)...
...done! Mon Mar  8 14:47:18 2021

converting to triplets with IPU 0...
...done! Mon Mar  8 14:48:14 2021

storing data per session...
...done! Mon Mar  8 14:50:14 2021
storing data per set (trn, dev, tst) and session...
...done! Mon Mar  8 14:52:14 2021
storing data per set (trn, dev, tst)...
...done! Mon Mar  8 14:54:17 2021
