# QA transcripts

In [2]:
from glob import glob

import pandas as pd

from util.path import Path
from constants import CONVS_STRANGERS, CONVS

In [3]:
%cd ..

/scratch/gpfs/zzada/fconv


In [4]:
def reduce_consecutive(df: pd.DataFrame, on: str) -> pd.DataFrame:
    newdf = df.copy()
    newdf['key'] = (df[on] != df[on].shift(1)).astype(int).cumsum()
    newdf = newdf.groupby('key').first()  # NOTE hardcoded function
    newdf.reset_index(drop=True, inplace=True)
    return newdf

# Timing files
validate timing files after `copy_timings.py`

In [3]:
timepath = Path(root="stimuli", datatype="timing", suffix="events", ext=".csv")
files = glob(timepath.starstr(["conv", "datatype"]))
len(files)

295

In [4]:
# create one df of all timing files
dfs = []
for filename in files:
    df = pd.read_csv(filename)
    filepath = Path.frompath(filename)
    df.insert(0, "conv", filepath["conv"])
    dfs.append(df)
df = pd.concat(dfs)
df.sort_values(["conv", "run", "trial"], inplace=True)
df.reset_index(drop=True, inplace=True)
timingdf = df

print(len(df))
df.head()

8921


Unnamed: 0,conv,run,trial,item,condition,role,time.time,run.time,comm.time,audio_position
0,103,1,1.0,1.0,R,trial_intro,1579303000.0,12.071489,,0
1,103,1,1.0,1.0,R,listener,1579303000.0,21.085684,0.01384,0
2,103,1,1.0,1.0,R,speaker,1579303000.0,32.16962,11.097776,352256
3,103,1,1.0,1.0,R,listener,1579303000.0,173.450472,152.378629,4861952
4,103,1,1.0,1.0,R,trial_end,1579303000.0,201.116802,180.044958,5746688


In [10]:
# check for repeated trials within one run (i.e. bad merging)
groups = []
for (conv, run, trial), group in df.groupby(['conv', 'run', 'trial']):
    role_counts = group.role.value_counts()
    if role_counts['trial_intro'] > 1:
        print('intro problem at', conv, run, trial)
        if conv in CONVS_STRANGERS:
            groups.append(group)
    if role_counts['trial_end'] > 1:
        print('end problem at', conv, run, trial)
        if conv in CONVS_STRANGERS:
            groups.append(group)

intro problem at 104 1 1.0
intro problem at 122 1 1.0
end problem at 122 1 1.0
intro problem at 122 1 2.0
end problem at 122 1 2.0
intro problem at 122 1 3.0
intro problem at 129 1 1.0
end problem at 129 1 1.0
intro problem at 129 1 2.0
intro problem at 138 1 1.0
intro problem at 143 3 1.0
end problem at 143 3 1.0
intro problem at 143 3 2.0
end problem at 143 3 2.0
intro problem at 143 3 3.0
end problem at 143 3 3.0
intro problem at 143 3 4.0
intro problem at 154 5 1.0
intro problem at 164 5 1.0
intro problem at 167 4 1.0
end problem at 167 4 1.0
intro problem at 167 4 2.0
end problem at 167 4 2.0
intro problem at 167 4 3.0


In [11]:
len(groups)

17

In [12]:
groups[0]

Unnamed: 0,conv,run,trial,item,condition,role,time.time,run.time,comm.time,audio_position
156,104,1,1.0,2.0,G,trial_intro,1581631000.0,12.041294,,0
157,104,1,1.0,2.0,G,listener,1581631000.0,21.041629,1.7e-05,0
158,104,1,1.0,2.0,G,trial_intro,1581631000.0,12.048135,,0
159,104,1,1.0,2.0,G,listener,1581631000.0,21.048367,1.4e-05,0
160,104,1,1.0,2.0,G,speaker,1581631000.0,46.475296,25.433309,811008
161,104,1,1.0,2.0,G,listener,1581631000.0,74.136829,53.088473,1683456
162,104,1,1.0,2.0,G,speaker,1581631000.0,103.974939,82.932952,2637824
163,104,1,1.0,2.0,G,listener,1581631000.0,137.081048,116.032693,3686400
164,104,1,1.0,2.0,G,speaker,1581631000.0,168.191276,147.149289,4681728
165,104,1,1.0,2.0,G,listener,1581632000.0,195.613203,174.564848,5550080


In [6]:
# any missing conversations?
set(CONVS) - set(df.conv.unique().tolist())

set()

In [8]:
# any conversations missing runs?
dfc = df[["conv", "run"]].drop_duplicates().reset_index(drop=True)
dft = dfc.groupby("conv").run.count()
dft[dft.values != 5]

Series([], Name: run, dtype: int64)

In [10]:
# any conversations missing transcripts?
dfc = df[["conv", "run", "trial"]].drop_duplicates().reset_index(drop=True)
dft = dfc.groupby("conv").trial.count()
dft[dft.values != 20]

Series([], Name: trial, dtype: int64)

# Utterance transcripts
validate transcripts after `copy_transcripts.py`

Errors for num_speakers != 2
```
[x] conv_114_run_5_set_3_trial_17_item_17_condition_G_first_B
[x] conv_128_run_1_set_1_trial_1_item_2_condition_G_first_A

[#] conv_146_run_5_set_3_trial_19_item_20_condition_G_first_A
[#] conv_147_run_1_set_1_trial_2_item_1_condition_G_first_A

[-] conv_103_run_2_set_1_trial_2_item_5.0_condition_G_first_B
[!] conv_103_run_3_set_1_trial_7_item_12.0_condition_G_first_B
[*] conv_105_run_1_set_1_trial_1_item_1_condition_G_first_B
[*] conv_106_run_3_set_2_trial_10_item_10_condition_G_first_A
[*] conv_113_run_2_set_1_trial_8_item_7_condition_G_first_B
[*] conv_127_run_2_set_1_trial_4_item_7_condition_G_first_A
[*] conv_128_run_2_set_1_trial_5_item_6_condition_G_first_A
[*] conv_128_run_5_set_3_trial_17_item_17_condition_G_first_B
[*] conv_130_run_3_set_2_trial_9_item_9_condition_G_first_A
[*] conv_130_run_4_set_2_trial_14_item_14_condition_G_first_B
[*] conv_134_run_1_set_1_trial_2_item_1_condition_G_first_A
[*] conv_143_run_4_set_3_trial_8_item_15_condition_G_first_B
[*] conv_147_run_5_set_3_trial_20_item_19_condition_G_first_B
[*] conv_148_run_1_set_1_trial_2_item_2_condition_G_first_A
[*] conv_148_run_2_set_1_trial_6_item_5_condition_G_first_A
[*] conv_153_run_5_set_3_trial_18_item_17_condition_G_first_A
[*] conv_172_run_3_set_2_trial_10_item_10_condition_G_first_A
[*] conv_172_run_5_set_3_trial_20_item_20_condition_G_first_A
[*] conv_173_run_2_set_1_trial_8_item_7_condition_G_first_B

x = one subject spoke the entire prompt
* = fixed in `fix_transcripts.sh`
! = fixed but timings look wrong
# = good as-is (self-correct)
- = replaced file with new version
```

In [5]:
transpath = Path(root="stimuli", datatype="transcript", suffix="utterance", ext=".csv")
files = glob(transpath.starstr(["conv", "datatype"]))
len(files)

590

In [6]:
dfs = []
for filename in files:
    df = pd.read_csv(filename)
    filepath = Path.frompath(filename)
    df.insert(0, "trial", filepath["trial"])
    df.insert(0, "run", filepath["run"])
    df.insert(0, "conv", filepath["conv"])
    dfs.append(df)
df = pd.concat(dfs)
df.sort_values(["conv", "run", "trial", "onset"], inplace=True)
df.reset_index(drop=True, inplace=True)

print(len(df))
df.head()

4381


Unnamed: 0,conv,run,trial,speaker,onset,text
0,103,1,2,103,42,"Um, uh I'd want to be famous for something, I ..."
1,103,1,2,103,43,"But honestly, I, like if you make a big discov..."
2,103,1,2,3,44,"Um I think I want to be famous also, probably ..."
3,103,1,2,3,46,Especially because then everything that you do...
4,103,1,2,103,71,"Yeah, definitely."


In [7]:
# what conversations are missing?
set(CONVS) - set(df.conv.unique().tolist())

set()

In [8]:
# which conversations are missing transcripts?
dfc = df[["conv", "run", "trial"]].drop_duplicates().reset_index(drop=True)
dft = dfc.groupby("conv").trial.count()
dft[dft.values != 10]

Series([], Name: trial, dtype: int64)

In [9]:
# any utterances with non-increasing onsets?
df.groupby(['conv', 'run', 'trial']).onset.filter(lambda x: not x.is_monotonic_increasing)

Series([], Name: onset, dtype: int64)

In [10]:
# which convs have only one speaker?
# these are expected:
# conv_114_run_5_set_3_trial_17_item_17_condition_G_first_B
# conv_128_run_1_set_1_trial_1_item_2_condition_G_first_A

ids = df.groupby(['conv', 'run', 'trial']).speaker.filter(lambda x: (x == x.iloc[0]).all())
df.iloc[ids.index][['conv', 'run', 'trial', 'speaker']].drop_duplicates()

Unnamed: 0,conv,run,trial,speaker
768,114,5,17,114
1368,128,1,1,28


In [20]:
df['inaudible'] = df.text.apply(lambda x: 'inaudible' in str(x))
df

Unnamed: 0,conv,run,trial,speaker,onset,text,inaudible
0,103,1,2,103,42,"Um, uh I'd want to be famous for something, I ...",False
1,103,1,2,103,43,"But honestly, I, like if you make a big discov...",False
2,103,1,2,3,44,"Um I think I want to be famous also, probably ...",False
3,103,1,2,3,46,Especially because then everything that you do...,False
4,103,1,2,103,71,"Yeah, definitely.",False
...,...,...,...,...,...,...,...
4376,175,5,20,75,74,"Okay, so that's what I thought, too. Like, he'...",False
4377,175,5,20,175,106,"But I feel like at some point, like, if, if he...",False
4378,175,5,20,75,123,"No, 'cause she was, like, telling me, she was ...",False
4379,175,5,20,175,154,"Yeah, I don't think that's okay. I mean, I thi...",False


In [21]:
df[df.inaudible]

Unnamed: 0,conv,run,trial,speaker,onset,text,inaudible


In [11]:
df['nwords'] = df.text.apply(lambda x: len(str(x).split()))
df

Unnamed: 0,conv,run,trial,speaker,onset,text,nwords
0,103,1,2,103,42,"Um, uh I'd want to be famous for something, I ...",25
1,103,1,2,103,43,"But honestly, I, like if you make a big discov...",32
2,103,1,2,3,44,"Um I think I want to be famous also, probably ...",40
3,103,1,2,3,46,Especially because then everything that you do...,57
4,103,1,2,103,71,"Yeah, definitely.",2
...,...,...,...,...,...,...,...
4376,175,5,20,75,74,"Okay, so that's what I thought, too. Like, he'...",121
4377,175,5,20,175,106,"But I feel like at some point, like, if, if he...",75
4378,175,5,20,75,123,"No, 'cause she was, like, telling me, she was ...",136
4379,175,5,20,175,154,"Yeah, I don't think that's okay. I mean, I thi...",70


In [12]:
df.nwords.sum()

286975

## Do they match with TimingLogs?
ignore this for now. we will assume transcripts are correct because they're based on timing logs

In [5]:
transdf = df.copy()
transdf.drop("text", axis=1, inplace=True)
transdf["trial4"] = 1 + (transdf.trial - 1) % 4
transdf

Unnamed: 0,conv,run,trial,speaker,onset,trial4
0,103,1,2,103,42,2
1,103,1,2,103,43,2
2,103,1,2,3,44,2
3,103,1,2,3,46,2
4,103,1,2,103,71,2
...,...,...,...,...,...,...
4384,175,5,20,75,74,4
4385,175,5,20,175,106,4
4386,175,5,20,75,123,4
4387,175,5,20,175,154,4


In [10]:
timingdf = timingdf[timingdf.role.isin(['speaker', 'listener'])]

In [22]:
records = []
for idx, group in transdf.groupby(['conv', 'run', 'trial4']):
    conv, run, trial = idx

    subdf = transdf[(transdf.conv == conv) & (transdf.run == run) & (transdf.trial4 == trial)]
    newdf = reduce_consecutive(subdf, 'speaker')

    records.append((conv, run, trial, len(group), len(newdf)))

In [26]:
matchdf = pd.DataFrame(records, columns=('conv', 'run', 'trial', 'trans_size', 'time_size'))
matchdf['matching'] = matchdf.trans_size == matchdf.time_size
matchdf

Unnamed: 0,conv,run,trial,trans_size,time_size,matching
0,103,1,2,11,6,False
1,103,1,4,6,6,True
2,103,2,2,14,13,False
3,103,2,4,10,7,False
4,103,3,2,7,6,False
...,...,...,...,...,...,...
585,175,3,4,9,8,False
586,175,4,2,14,13,False
587,175,4,3,12,12,True
588,175,5,2,12,12,True


In [30]:
conv = 103
run = 1
trial = 4

In [31]:
subdf = timingdf[(timingdf.conv == conv) & (timingdf.run == run) & (timingdf.trial == trial)].copy()
subdf['run.time'] -= subdf.iloc[0]['run.time']
subdf

Unnamed: 0,conv,run,trial,item,condition,role,time.time,run.time,comm.time,audio_position
21,103,1,4.0,4.0,G,listener,1579304000.0,0.0,2e-05,17203200
22,103,1,4.0,4.0,G,speaker,1579304000.0,50.591321,50.718468,18829312
23,103,1,4.0,4.0,G,listener,1579304000.0,93.299838,93.299857,20180992
24,103,1,4.0,4.0,G,speaker,1579304000.0,119.805892,119.93304,21037056
25,103,1,4.0,4.0,G,listener,1579304000.0,136.715433,136.715452,21557248
26,103,1,4.0,4.0,G,speaker,1579304000.0,167.255569,167.382717,22544384


In [32]:
subdf = transdf[(transdf.conv == conv) & (transdf.run == run) & (transdf.trial4 == trial)]
reduce_consecutive(subdf, 'speaker')

Unnamed: 0,conv,run,trial,speaker,onset,trial4
0,103,1,4,3,0,4
1,103,1,4,103,18,4
2,103,1,4,3,101,4
3,103,1,4,103,122,4
4,103,1,4,3,140,4
5,103,1,4,103,170,4


# Word-level transcripts

check out the processed transcripts

In [11]:
# new whisperx
transpath = Path(root="stimuli", datatype="transcript", suffix="aligned", ext=".csv")
files = glob(transpath.starstr(["conv", "datatype"]))
len(files)

579

In [4]:
transpath = Path(root="stimuli", datatype="transcript", suffix="word", ext=".csv")
files = glob(transpath.starstr(["conv", "datatype"]))
len(files)

590

In [12]:
dfs = []
for filename in files:
    df = pd.read_csv(filename)
    filepath = Path.frompath(filename)
    df.insert(0, "item", filepath["item"])
    df.insert(0, "set", filepath["set"])
    df.insert(0, "trial", filepath["trial"])
    df.insert(0, "run", filepath["run"])
    df.insert(0, "conv", filepath["conv"])
    dfs.append(df)
df = pd.concat(dfs)
df.sort_values(["conv", "run", "trial", "start"], inplace=True)
df.reset_index(drop=True, inplace=True)

print(len(df))
df.head()

282017


Unnamed: 0,conv,run,trial,set,item,speaker,sentence,word,start,end,score
0,103,1,4,1,4,3,1,"Um,",0.02,0.06,0.0
1,103,1,4,1,4,3,1,I,0.08,0.1,0.001
2,103,1,4,1,4,3,1,think,0.12,0.24,0.1
3,103,1,4,1,4,3,1,the,0.26,0.36,0.332
4,103,1,4,1,4,3,1,perfect,0.38,0.821,0.39


In [13]:
# what conversations are missing?
set(CONVS) - set(df.conv.unique().tolist())

set()

In [14]:
# Which trials are missing?
dfc = df[["conv", "run", "trial"]].drop_duplicates().reset_index(drop=True)
dft = dfc.groupby("conv").trial.count()
dft[dft.values != 10]

conv
103    7
109    9
150    9
155    9
165    9
166    7
172    9
Name: trial, dtype: int64

In [10]:
df.token[df.token.str.startswith('[')].value_counts()

AttributeError: 'DataFrame' object has no attribute 'token'

In [9]:
df[~df.is_punct]

Unnamed: 0,conv,run,trial,turn,utterance,speaker,onset,offset,sentence_id,is_punct,token,token_norm
0,103,1,2,0,0,103,42,43,0,False,Um,um
2,103,1,2,0,0,103,42,43,0,False,uh,uh
3,103,1,2,0,0,103,42,43,0,False,I'd,i'd
4,103,1,2,0,0,103,42,43,0,False,want,want
5,103,1,2,0,0,103,42,43,0,False,to,to
...,...,...,...,...,...,...,...,...,...,...,...,...
347995,175,5,20,6,6,75,174,180,1,False,though,though
347997,175,5,20,6,6,75,174,180,1,False,if,if
347998,175,5,20,6,6,75,174,180,1,False,I,i
347999,175,5,20,6,6,75,174,180,1,False,said,said


In [10]:
df[df.token.str.startswith('(')]

Unnamed: 0,conv,run,trial,turn,utterance,speaker,onset,offset,sentence_id,is_punct,token,token_norm
147206,132,4,15,3,3,132,69,180,2,True,(,
305690,166,3,10,19,19,166,106,109,0,True,(,
305694,166,3,10,20,20,66,109,111,0,True,(,
305885,166,3,10,31,31,166,162,169,1,True,(,
305889,166,3,10,32,32,66,169,173,0,True,(,
305958,166,3,12,5,5,166,18,24,1,True,(,
306439,166,3,12,40,40,66,173,180,0,True,(,
327688,172,3,10,9,9,172,120,129,0,True,(,
331665,173,2,5,2,2,173,75,101,2,True,(,


In [17]:
# df_sents = df.groupby(['conv', 'run', 'trial', 'speaker', 'utterance', 'sentence_id'], sort=False).token.apply(lambda x: ''.join(x))
df_sents = df.groupby(
    ["conv", "run", "trial", "speaker", "utterance", "sentence_id"], sort=False
).agg({"onset": "first", "offset": "last", "set": "first", "item": "first", "token": lambda x: "".join(x)})
df_sents.to_csv("sentences.csv")

In [18]:
df_sents

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,onset,offset,set,item,token
conv,run,trial,speaker,utterance,sentence_id,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
103,1,2,103,0,0,42,43,1,2,"Um, uh I'd want to be famous for something, I ..."
103,1,2,103,1,0,43,44,1,2,"But honestly, I, like if you make a big discov..."
103,1,2,103,1,1,43,44,1,2,"I don't know, that'd be pretty cool."
103,1,2,103,1,2,43,44,1,2,"Sure, I could do it."
103,1,2,3,2,0,44,46,1,2,"Um I think I want to be famous also, probably ..."
...,...,...,...,...,...,...,...,...,...,...
175,5,20,75,4,6,123,154,3,20,"Like, I don't know."
175,5,20,175,5,0,154,174,3,20,"Yeah, I don't think that's okay."
175,5,20,175,5,1,154,174,3,20,"I mean, I think, I don't know, maybe if I was ..."
175,5,20,75,6,0,174,180,3,20,Yeah.


## Aligned transcripts
first, check that MFA created TextGrids

In [36]:
transpath = Path(root="stimuli", datatype="aligned", ext=".TextGrid")
files = glob(transpath.starstr(["conv", "datatype"]))
len(files)

309

In [37]:
records = []
for file in files:
    path = Path.frompath(file)
    records.append([path["conv"], path["run"], path["trial"]])
df = pd.DataFrame(records, columns=["conv", "run", "trial"])
df.sort_values(["conv", "run", "trial"], inplace=True)
df.reset_index(drop=True, inplace=True)
df

Unnamed: 0,conv,run,trial
0,101,1,1
1,101,1,3
2,101,2,6
3,101,2,8
4,101,3,10
...,...,...,...
304,174,3,12
305,174,4,13
306,174,4,15
307,174,5,18


In [38]:
# what conversations are missing?
set(CONVS_STRANGERS) - set(df.conv.unique().tolist())

{119}

In [39]:
# Which trials are missing?
dft = df.groupby("conv").trial.count()
dft[dft.values != 10]

conv
143    11
171     8
Name: trial, dtype: int64

# Merged transcripts

In [51]:
transpath = Path(root="stimuli", datatype="aligned", ext=".csv")
files = glob(transpath.starstr(["conv", "datatype"]))
len(files)

332

In [54]:
dfs = []
for filename in files:
    if "analysis" in filename:
        continue
    df = pd.read_csv(filename, index_col=0)
    filepath = Path.frompath(filename)
    df.insert(0, "conv", filepath["conv"])
    dfs.append(df)
df = pd.concat(dfs)
df.sort_values(["conv", "run", "trial", "onset"], inplace=True)
df.reset_index(drop=True, inplace=True)

print(len(df))
df.head()

147285


Unnamed: 0,conv,run,trial,turn,utterance,speaker,sentence_id,token,token_norm,onset,offset
0,101,1,1,0,0,1,0.0,Hi,hi,4.0,4.34
1,101,1,1,0,0,1,1.0,Um,um,4.34,4.59
2,101,1,1,0,0,1,1.0,so,so,4.72,5.18
3,101,1,1,0,0,1,1.0,given,given,5.36,5.69
4,101,1,1,0,0,1,1.0,the,the,5.69,7.92


In [56]:
# what conversations are missing?
set(CONVS_STRANGERS) - set(df.conv.unique().tolist())

{119, 171}

In [57]:
# Which trials are missing?
dfc = df[["conv", "run", "trial"]].drop_duplicates().reset_index(drop=True)
dft = dfc.groupby("conv").trial.count()
dft[dft.values != 10]

conv
143    11
Name: trial, dtype: int64

In [61]:
dft

conv
101    10
104    10
105    10
106    10
107    10
108    10
111    10
112    10
114    10
116    10
117    10
120    10
122    10
123    10
126    10
128    10
129    10
131    10
132    10
133    10
137    10
138    10
142    10
143    11
153    10
156    10
157    10
158    10
163    10
174    10
Name: trial, dtype: int64

In [62]:
# Number of words per trial
df.groupby(["conv", "run", "trial"]).token.count().describe()

count    301.000000
mean     489.318937
std       59.304451
min      123.000000
25%      458.000000
50%      492.000000
75%      525.000000
max      665.000000
Name: token, dtype: float64

In [7]:
# Number of words per trial per speaker (how short turns are between speakers)
df.groupby(["conv", "run", "trial", "speaker"]).token.count().describe()

count    382.000000
mean     244.298429
std       62.107222
min       58.000000
25%      205.000000
50%      244.500000
75%      280.000000
max      469.000000
Name: token, dtype: float64

In [64]:
# Number of words per conversations
df.conv.value_counts(sort=False)

conv
101    4958
104    4848
105    5021
106    4734
107    5456
108    5181
111    4702
112    5036
114    5235
116    4204
117    5847
120    4907
122    4760
123    4457
126    5865
128    4477
129    5087
131    4570
132    4287
133    3985
137    5157
138    5565
142    4868
143    4501
153    5092
156    5121
157    5002
158    4591
163    4786
174    4985
Name: count, dtype: int64

In [63]:
# Runs per conversation
# Should be 5 runs per conversation
df[["conv", "run"]].drop_duplicates().reset_index(drop=True).groupby("conv").run.count()

conv
101    5
104    5
105    5
106    5
107    5
108    5
111    5
112    5
114    5
116    5
117    5
120    5
122    5
123    5
126    5
128    5
129    5
131    5
132    5
133    5
137    5
138    5
142    5
143    5
153    5
156    5
157    5
158    5
163    5
174    5
Name: run, dtype: int64