# Парсинг и предобработка корпуса

In [1]:
import pandas as pd
from string import punctuation as punct
from swda import Transcript, CorpusReader
from tqdm.auto import tqdm
import joblib
import os

# Show full cell contents in rendered DataFrames. `None` is the supported "no limit"
# value; negative widths such as -1 are deprecated and rejected by newer pandas.
pd.set_option('display.max_colwidth', None)

In [2]:
metadata = pd.read_csv("swda/swda-metadata.csv")

In [3]:
metadata.head()

Unnamed: 0,conversation_no,talk_day,length,topic_description,prompt,from_caller,from_caller_sex,from_caller_education,from_caller_birth_year,from_caller_dialect_area,to_caller,to_caller_sex,to_caller_education,to_caller_birth_year,to_caller_dialect_area
0,1001,910207,7,WEATHER CLIMATE,DISCUSS THE WEATHER. WHAT HAS IT BEEN LIKE IN YOUR AREA? HAS IT BEEN TYPICAL FOR THIS TIME OF YEAR? COMPARE IT WITH THE OTHER CALLER'S WEATHER.,1165,MALE,2,1937,NORTHERN,1065,MALE,3,1952,WESTERN
1,1002,910207,10,CARE OF THE ELDERLY,PLEASE DISCUSS CARE OF THE ELDERLY. FIND OUT HOW THE OTHER CALLER FEELS ABOUT SENDING AN ELDERLY FAMILY MEMBER TO A NURSING HOME. WHAT SHOULD ONE KNOW ABOUT THE NURSING HOME ENVIRONMENT WHEN MAKING THIS DECISION?,1195,MALE,0,1947,NORTHERN,1037,MALE,3,1947,WESTERN
2,1003,910207,7,VACATION SPOTS,PLEASE DISCUSS TYPES OF VACATIONS AND TRIPS YOU ENJOY. FIND OUT WHETHER THE OTHER CALLER CAN INTEREST YOU IN A VACATION SPOT YOU HAVEN'T VISITED.,1196,MALE,0,1940,NYC,1192,MALE,0,1957,NORTHERN
3,1005,910207,2,CARE OF THE ELDERLY,PLEASE DISCUSS CARE OF THE ELDERLY. FIND OUT HOW THE OTHER CALLER FEELS ABOUT SENDING AN ELDERLY FAMILY MEMBER TO A NURSING HOME. WHAT SHOULD ONE KNOW ABOUT THE NURSING HOME ENVIRONMENT WHEN MAKING THIS DECISION?,1165,MALE,2,1937,NORTHERN,1203,MALE,0,1961,NORTH MIDLAND
4,1006,910207,5,WEATHER CLIMATE,DISCUSS THE WEATHER. WHAT HAS IT BEEN LIKE IN YOUR AREA? HAS IT BEEN TYPICAL FOR THIS TIME OF YEAR? COMPARE IT WITH THE OTHER CALLER'S WEATHER.,1190,MALE,3,1941,NYC,1198,MALE,0,1950,NYC


In [4]:
metadata.conversation_no.nunique()

2866

In [5]:
metadata.prompt.nunique()

66

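## В метаданных 2866 записей о диалогах. Это не обязательно число диалогов в самом корпусе: ниже при чтении корпуса счётчик транскриптов доходит только до 1155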

In [7]:
metadata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2866 entries, 0 to 2865
Data columns (total 15 columns):
conversation_no             2866 non-null int64
talk_day                    2866 non-null int64
length                      2866 non-null int64
topic_description           2866 non-null object
prompt                      2866 non-null object
from_caller                 2866 non-null int64
from_caller_sex             2866 non-null object
from_caller_education       2866 non-null int64
from_caller_birth_year      2866 non-null int64
from_caller_dialect_area    2861 non-null object
to_caller                   2866 non-null int64
to_caller_sex               2866 non-null object
to_caller_education         2866 non-null int64
to_caller_birth_year        2866 non-null int64
to_caller_dialect_area      2862 non-null object
dtypes: int64(9), object(6)
memory usage: 335.9+ KB


In [8]:
print(len(metadata.topic_description.unique()))
metadata.topic_description.unique()

66


array(['WEATHER CLIMATE', 'CARE OF THE ELDERLY', 'VACATION SPOTS',
       'GUN CONTROL', 'AIR POLLUTION', 'MUSIC', 'UNIVERSAL PBLIC SERV',
       'CRIME', 'CREDIT CARD USE', 'AIDS', 'BOOKS AND LITERATURE',
       'CLOTHING AND DRESS', 'CAMPING', 'MOVIES', 'RIGHT TO PRIVACY',
       'TAXES', 'HOUSES', "WOMEN'S ROLES", 'PETS', 'FAMILY FINANCE',
       'TRIAL BY JURY', 'FAMILY LIFE', 'PUERTO RICAN STTEHD',
       'LATIN AMERICA', 'TV PROGRAMS', 'EXERCISE AND FITNESS',
       'HOBBIES AND CRAFTS', 'CHILD CARE', 'RECIPES/FOOD/COOKING',
       'NEWS MEDIA', 'JOB BENEFITS', 'CAPITAL PUNISHMENT', 'BUYING A CAR',
       'ELECTIONS AND VOTING', 'BASEBALL', 'FISHING', 'DRUG TESTING',
       'SOVIET UNION', 'RECYCLING', 'FOOTBALL', 'COMPUTERS',
       'RESTAURANTS', 'POLITICS', 'CHOOSING A COLLEGE', 'BASKETBALL',
       'PUBLIC EDUCATION', 'GARDENING', 'PAINTING', 'SVGS & LOAN BAILOUT',
       'VIETNAM WAR', 'FEDERAL BUDGET', 'MIDDLE EAST', 'IMMIGRATION',
       'UNIVERSAL HEALTH INS', 'AUTO REPAI

In [9]:
corpus = CorpusReader('swda')

Поля реплик: 
* caller	str	A, B, @A, @B, @@A, @@B
* caller_no	int	The caller Id.
* caller_sex	str	MALE or FEMALE
* caller_education	str	0, 1, 2, 3, 9
* caller_birth_year	int	4-digit year
* caller_dialect_area	str	MIXED, NEW ENGLAND, NORTH MIDLAND, NORTHERN, NYC, SOUTH MIDLAND, SOUTHERN, UNK, WESTERN
* transcript_index	int	line number relative to the whole transcript
* utterance_index	int	Utterance number (can span multiple TranscriptIndex numbers)
* subutterance_index	int	Utterances can be broken across lines. This gives the internal position.
* act_tag	list	strings; see below
* text	str	the text of the utterance
* pos	str	the part-of-speech tagged portion of the utterance
* trees	nltk.tree.Tree	the parse of Text; see below for discussion

In [10]:
def clever_string_join(words: list):
    """Join tokenized utterances back into readable strings.

    Args:
        words: a list of utterances, each an iterable of string tokens.

    Returns:
        A list with one string per utterance. A token is glued directly to the
        preceding text when it contains an apostrophe or is found inside
        ``string.punctuation``; every other token is preceded by one space.
        Leading and trailing whitespace is stripped from each result.
    """
    joined = []
    for tokens in words:
        pieces = []
        for token in tokens:
            # Clitics ("'ve", "n't") and punctuation attach without a space.
            attach = "'" in token or token in punct
            pieces.append(token if attach else " " + token)
        joined.append("".join(pieces).strip())
    return joined

In [13]:
def create_df(corpus):
    """Flatten a corpus into column lists with one entry per utterance.

    Args:
        corpus: an object exposing ``iter_transcripts()``; each transcript must
            provide ``utterances``, ``conversation_no``, ``topic_description``
            and ``prompt``.

    Returns:
        A dict mapping column name to a list of values, in this key order:
        ``dialogue``, ``caller``, ``utt_indx``, ``trans_indx``, ``orig_text``,
        ``tag``, ``clean_text``, ``topic``, ``prompt``. Despite the function
        name, the caller is expected to wrap it in ``pd.DataFrame``.
    """
    columns = {
        "dialogue": [],
        "caller": [],
        "utt_indx": [],
        "trans_indx": [],
        "orig_text": [],
        "tag": [],
        "clean_text": [],  # filled after the loop, once all tokens are collected
        "topic": [],
        "prompt": [],
    }
    token_lists = []

    for transcript in corpus.iter_transcripts():
        for utterance in transcript.utterances:
            columns["orig_text"].append(utterance.text)
            columns["caller"].append(utterance.caller)
            columns["tag"].append(utterance.damsl_act_tag())
            columns["utt_indx"].append(utterance.utterance_index)
            columns["trans_indx"].append(utterance.transcript_index)
            token_lists.append(utterance.pos_words())
            # Transcript-level fields are repeated on every utterance row.
            columns["dialogue"].append(transcript.conversation_no)
            columns["topic"].append(transcript.topic_description)
            columns["prompt"].append(transcript.prompt)

    # Re-assigning an existing key keeps its position, so the column order is stable.
    columns["clean_text"] = clever_string_join(token_lists)

    return columns

In [46]:
d = create_df(corpus) # takes a few minutes

transcript 1155


In [147]:
df = pd.DataFrame(d)
df.head(40)

Unnamed: 0,dialogue,caller,utt_indx,trans_indx,orig_text,tag,clean_text,topic,prompt
0,3774,A,1,0,{C So } I've been concerned about crime lately. /,sd,So I've been concerned about crime lately.,CRIME,DISCUSS CRIME IN AMERICAN CITIES TODAY. WHAT ARE YOUR CONCERNS AND THE CONCERNS OF THE OTHER CALLER? WHAT STEPS CAN BE TAKEN TO REDUCE CRIME?
1,3774,B,2,1,Uh-huh. /,b,Uh-huh.,CRIME,DISCUSS CRIME IN AMERICAN CITIES TODAY. WHAT ARE YOUR CONCERNS AND THE CONCERNS OF THE OTHER CALLER? WHAT STEPS CAN BE TAKEN TO REDUCE CRIME?
2,3774,A,3,2,"{F Uh, } it's really scary to listen to the news every night and --",sv,"Uh, it's really scary to listen to the news every night and --",CRIME,DISCUSS CRIME IN AMERICAN CITIES TODAY. WHAT ARE YOUR CONCERNS AND THE CONCERNS OF THE OTHER CALLER? WHAT STEPS CAN BE TAKEN TO REDUCE CRIME?
3,3774,B,4,3,Uh-huh. /,b,Uh-huh.,CRIME,DISCUSS CRIME IN AMERICAN CITIES TODAY. WHAT ARE YOUR CONCERNS AND THE CONCERNS OF THE OTHER CALLER? WHAT STEPS CAN BE TAKEN TO REDUCE CRIME?
4,3774,A,5,4,-- to hear about all the problems. /,+,-- to hear about all the problems.,CRIME,DISCUSS CRIME IN AMERICAN CITIES TODAY. WHAT ARE YOUR CONCERNS AND THE CONCERNS OF THE OTHER CALLER? WHAT STEPS CAN BE TAKEN TO REDUCE CRIME?
5,3774,A,5,5,I wondered if you were taking any special precautions in your neighborhood? /,qy^d,I wondered if you were taking any special precautions in your neighborhood?,CRIME,DISCUSS CRIME IN AMERICAN CITIES TODAY. WHAT ARE YOUR CONCERNS AND THE CONCERNS OF THE OTHER CALLER? WHAT STEPS CAN BE TAKEN TO REDUCE CRIME?
6,3774,B,6,6,"{D Well, } [ I, + I ] think we have a neighborhood watch <laughter>. /",na,"Well, I, I think we have a neighborhood watch.",CRIME,DISCUSS CRIME IN AMERICAN CITIES TODAY. WHAT ARE YOUR CONCERNS AND THE CONCERNS OF THE OTHER CALLER? WHAT STEPS CAN BE TAKEN TO REDUCE CRIME?
7,3774,A,7,7,Uh-huh. /,b,Uh-huh.,CRIME,DISCUSS CRIME IN AMERICAN CITIES TODAY. WHAT ARE YOUR CONCERNS AND THE CONCERNS OF THE OTHER CALLER? WHAT STEPS CAN BE TAKEN TO REDUCE CRIME?
8,3774,B,8,8,I think. /,sd,I think.,CRIME,DISCUSS CRIME IN AMERICAN CITIES TODAY. WHAT ARE YOUR CONCERNS AND THE CONCERNS OF THE OTHER CALLER? WHAT STEPS CAN BE TAKEN TO REDUCE CRIME?
9,3774,A,9,9,<Laughter>.,x,,CRIME,DISCUSS CRIME IN AMERICAN CITIES TODAY. WHAT ARE YOUR CONCERNS AND THE CONCERNS OF THE OTHER CALLER? WHAT STEPS CAN BE TAKEN TO REDUCE CRIME?


# Попробуем посмотреть распределение каждого тега

In [148]:
len(df)

221616

In [149]:
df["extra"] = 1

In [150]:
tag_stats = df.groupby(by="tag", as_index=False).extra.sum()
tag_stats

Unnamed: 0,tag,extra
0,%,15550
1,+,17824
2,^2,723
3,^g,92
4,^h,556
5,^q,983
6,aa,11133
7,aap_am,105
8,ad,746
9,ar,345


In [151]:
# Share of each tag among all utterances, rounded to two decimals.
# The counts are recomputed from `df` and divided by len(df) rather than a hard-coded
# row count, so the cell stays correct if the data changes and can be re-run safely
# (previously a second run would divide the already-normalized values again).
tag_stats = df.groupby(by="tag", as_index=False).extra.sum()
total_rows = len(df)
tag_stats["extra"] = tag_stats.extra.apply(lambda x: round(x / total_rows, 2))
tag_stats.sort_values(by="extra", ascending=False)

Unnamed: 0,tag,extra
38,sd,0.34
11,b,0.17
39,sv,0.12
1,+,0.08
0,%,0.07
6,aa,0.05
36,qy,0.02
13,ba,0.02
42,x,0.02
27,nn,0.01


In [152]:
# look at which sentences each tag contains
df[df.tag == "%"].clean_text

10        I'm not real,                  
13        so --                          
50        and,                           
58        so --                          
61        so,                            
69        that's --                      
71        -- what it is.                 
79        and they don't, uh,            
86        and, course, you know,         
87        I don't know,                  
116       Really is.                     
126       Um.                            
163       Um.                            
182       so,                            
194       and yet there's,               
199       so,                            
210       -- and, uh, so,                
222       Uh, but we,                    
229       So,                            
236       Uh, but, uh, anyway, but, okay.
256       so,                            
261       So it's,                       
269       Uh, but, uh,                   
277       Uh-,                    

## Теперь избавимся от фраз с тегом "b", которые прерывают текущего говорящего (т.е. когда предыдущая фраза оканчивается на "--")

In [153]:
df.head()

Unnamed: 0,dialogue,caller,utt_indx,trans_indx,orig_text,tag,clean_text,topic,prompt,extra
0,3774,A,1,0,{C So } I've been concerned about crime lately. /,sd,So I've been concerned about crime lately.,CRIME,DISCUSS CRIME IN AMERICAN CITIES TODAY. WHAT ARE YOUR CONCERNS AND THE CONCERNS OF THE OTHER CALLER? WHAT STEPS CAN BE TAKEN TO REDUCE CRIME?,1
1,3774,B,2,1,Uh-huh. /,b,Uh-huh.,CRIME,DISCUSS CRIME IN AMERICAN CITIES TODAY. WHAT ARE YOUR CONCERNS AND THE CONCERNS OF THE OTHER CALLER? WHAT STEPS CAN BE TAKEN TO REDUCE CRIME?,1
2,3774,A,3,2,"{F Uh, } it's really scary to listen to the news every night and --",sv,"Uh, it's really scary to listen to the news every night and --",CRIME,DISCUSS CRIME IN AMERICAN CITIES TODAY. WHAT ARE YOUR CONCERNS AND THE CONCERNS OF THE OTHER CALLER? WHAT STEPS CAN BE TAKEN TO REDUCE CRIME?,1
3,3774,B,4,3,Uh-huh. /,b,Uh-huh.,CRIME,DISCUSS CRIME IN AMERICAN CITIES TODAY. WHAT ARE YOUR CONCERNS AND THE CONCERNS OF THE OTHER CALLER? WHAT STEPS CAN BE TAKEN TO REDUCE CRIME?,1
4,3774,A,5,4,-- to hear about all the problems. /,+,-- to hear about all the problems.,CRIME,DISCUSS CRIME IN AMERICAN CITIES TODAY. WHAT ARE YOUR CONCERNS AND THE CONCERNS OF THE OTHER CALLER? WHAT STEPS CAN BE TAKEN TO REDUCE CRIME?,1


In [154]:
# Collect the "b" rows that cut into an unfinished utterance, i.e. the row directly
# above ends with "--". Assumes `df` has the default 0..n-1 index.
# Compared with a row-by-row loop over df.loc[i - 1, ...] this
#   * cannot fail on the first row (there is no row -1),
#   * never pairs the first row of one dialogue with the last row of the previous one,
#   * runs as a few vectorized passes instead of one .loc lookup per row.
prev_text = df["orig_text"].shift(1).fillna("")
prev_dialogue = df["dialogue"].shift(1)

interrupts = (
    df["tag"].eq("b")
    & prev_text.str.endswith("--")
    & prev_dialogue.eq(df["dialogue"])
)

to_delete = df.index[interrupts].tolist()
counter = len(to_delete)

In [155]:
counter

5284

In [156]:
to_delete[:20]

[3,
 24,
 27,
 30,
 38,
 40,
 43,
 81,
 90,
 92,
 98,
 110,
 130,
 142,
 158,
 177,
 190,
 197,
 207,
 215]

In [157]:
df.iloc[22:26, :]

Unnamed: 0,dialogue,caller,utt_indx,trans_indx,orig_text,tag,clean_text,topic,prompt,extra
22,3774,A,15,22,"{D Well } <breathing>, [ we moved in, + when we moved in, ] [ there, + there ] wasn't any outside lights /",sd,"Well, we moved in, when we moved in, there, there wasn't any outside lights",CRIME,DISCUSS CRIME IN AMERICAN CITIES TODAY. WHAT ARE YOUR CONCERNS AND THE CONCERNS OF THE OTHER CALLER? WHAT STEPS CAN BE TAKEN TO REDUCE CRIME?,1
23,3774,A,15,23,{C and --,sd,and --,CRIME,DISCUSS CRIME IN AMERICAN CITIES TODAY. WHAT ARE YOUR CONCERNS AND THE CONCERNS OF THE OTHER CALLER? WHAT STEPS CAN BE TAKEN TO REDUCE CRIME?,1
24,3774,B,16,24,Uh-huh. /,b,Uh-huh.,CRIME,DISCUSS CRIME IN AMERICAN CITIES TODAY. WHAT ARE YOUR CONCERNS AND THE CONCERNS OF THE OTHER CALLER? WHAT STEPS CAN BE TAKEN TO REDUCE CRIME?,1
25,3774,A,17,25,"-- so } we've been trying to install some, {F uh, } outside lights /",+,"-- so we've been trying to install some, uh, outside lights",CRIME,DISCUSS CRIME IN AMERICAN CITIES TODAY. WHAT ARE YOUR CONCERNS AND THE CONCERNS OF THE OTHER CALLER? WHAT STEPS CAN BE TAKEN TO REDUCE CRIME?,1


In [158]:
df.iloc[195:200, :]

Unnamed: 0,dialogue,caller,utt_indx,trans_indx,orig_text,tag,clean_text,topic,prompt,extra
195,3184,A,11,26,"where we can view the river from that distance, there's still homes within our area that are, {D you know, } pretty typical of what our home is, /",sd,"where we can view the river from that distance, there's still homes within our area that are, you know, pretty typical of what our home is,",HOUSES,FIND OUT ABOUT THE OTHER CALLER'S HOME. IS IT A TYPICAL HOME FOR THE AREA? HOW DOES IT COMPARE TO YOUR HOME?,1
196,3184,A,11,27,{C so } --,sd,so --,HOUSES,FIND OUT ABOUT THE OTHER CALLER'S HOME. IS IT A TYPICAL HOME FOR THE AREA? HOW DOES IT COMPARE TO YOUR HOME?,1
197,3184,B,12,28,Uh-huh /,b,Uh-huh,HOUSES,FIND OUT ABOUT THE OTHER CALLER'S HOME. IS IT A TYPICAL HOME FOR THE AREA? HOW DOES IT COMPARE TO YOUR HOME?,1
198,3184,A,13,29,"-- it's, {F uh, } pretty much that, {F uh, } type of, {F uh, } home /",+,"-- it's, uh, pretty much that, uh, type of, uh, home",HOUSES,FIND OUT ABOUT THE OTHER CALLER'S HOME. IS IT A TYPICAL HOME FOR THE AREA? HOW DOES IT COMPARE TO YOUR HOME?,1
199,3184,A,13,30,"{C so, } -/",%,"so,",HOUSES,FIND OUT ABOUT THE OTHER CALLER'S HOME. IS IT A TYPICAL HOME FOR THE AREA? HOW DOES IT COMPARE TO YOUR HOME?,1


In [159]:
# everything seems to be in order - safe to delete

In [160]:
# Remove the rows listed in `to_delete` and renumber the remaining rows from 0.
new_df = df.drop(index=to_delete).reset_index(drop=True)

In [162]:
new_df.head()

Unnamed: 0,dialogue,caller,utt_indx,trans_indx,orig_text,tag,clean_text,topic,prompt,extra
0,3774,A,1,0,{C So } I've been concerned about crime lately. /,sd,So I've been concerned about crime lately.,CRIME,DISCUSS CRIME IN AMERICAN CITIES TODAY. WHAT ARE YOUR CONCERNS AND THE CONCERNS OF THE OTHER CALLER? WHAT STEPS CAN BE TAKEN TO REDUCE CRIME?,1
1,3774,B,2,1,Uh-huh. /,b,Uh-huh.,CRIME,DISCUSS CRIME IN AMERICAN CITIES TODAY. WHAT ARE YOUR CONCERNS AND THE CONCERNS OF THE OTHER CALLER? WHAT STEPS CAN BE TAKEN TO REDUCE CRIME?,1
2,3774,A,3,2,"{F Uh, } it's really scary to listen to the news every night and --",sv,"Uh, it's really scary to listen to the news every night and --",CRIME,DISCUSS CRIME IN AMERICAN CITIES TODAY. WHAT ARE YOUR CONCERNS AND THE CONCERNS OF THE OTHER CALLER? WHAT STEPS CAN BE TAKEN TO REDUCE CRIME?,1
3,3774,A,5,4,-- to hear about all the problems. /,+,-- to hear about all the problems.,CRIME,DISCUSS CRIME IN AMERICAN CITIES TODAY. WHAT ARE YOUR CONCERNS AND THE CONCERNS OF THE OTHER CALLER? WHAT STEPS CAN BE TAKEN TO REDUCE CRIME?,1
4,3774,A,5,5,I wondered if you were taking any special precautions in your neighborhood? /,qy^d,I wondered if you were taking any special precautions in your neighborhood?,CRIME,DISCUSS CRIME IN AMERICAN CITIES TODAY. WHAT ARE YOUR CONCERNS AND THE CONCERNS OF THE OTHER CALLER? WHAT STEPS CAN BE TAKEN TO REDUCE CRIME?,1


## Теперь объединим высказывания с тегом "+" с предыдущим высказыванием, если оно принадлежит тому же спикеру и оканчивается на "--"

In [163]:
# Collect the "+" rows whose directly preceding row
#   * ends with "--" (unfinished utterance),
#   * has the same caller label, and
#   * belongs to the same dialogue (caller labels repeat across dialogues, so "A"
#     in one dialogue is not the same speaker as "A" in the next one).
# Assumes `new_df` has the default 0..n-1 index. Shifting the columns avoids looking
# up row i - 1 by label, which would fail if the first row were tagged "+".
prev_rows = new_df[["orig_text", "caller", "dialogue"]].shift(1)

is_continuation = (
    new_df["tag"].eq("+")
    & prev_rows["orig_text"].fillna("").str.endswith("--")
    & prev_rows["caller"].eq(new_df["caller"])
    & prev_rows["dialogue"].eq(new_df["dialogue"])
)

to_merge = new_df.index[is_continuation].tolist()
counter = len(to_merge)

In [164]:
counter

5050

In [165]:
to_merge[-5:]

[215595, 215620, 216220, 216258, 216277]

In [166]:
new_df.iloc[215593:215597]

Unnamed: 0,dialogue,caller,utt_indx,trans_indx,orig_text,tag,clean_text,topic,prompt,extra
215593,3018,A,15,26,"<inhaling>. {C And } [ I, + I ] have watched THIRTYSOMETHING some in the last couple seasons. /",sd,". And I, I have watched THIRTYSOMETHING some in the last couple seasons.",TV PROGRAMS,FIND OUT WHAT THE OTHER CALLER'S FAVORITE TV SHOWS ARE AND WHY. ARE YOUR INTERESTS SIMILAR OR DIFFERENT?,1
215594,3018,A,15,27,I was sort of sorry to see it go --,sd,I was sort of sorry to see it go --,TV PROGRAMS,FIND OUT WHAT THE OTHER CALLER'S FAVORITE TV SHOWS ARE AND WHY. ARE YOUR INTERESTS SIMILAR OR DIFFERENT?,1
215595,3018,A,17,29,-- too. /,+,-- too.,TV PROGRAMS,FIND OUT WHAT THE OTHER CALLER'S FAVORITE TV SHOWS ARE AND WHY. ARE YOUR INTERESTS SIMILAR OR DIFFERENT?,1
215596,3018,B,18,30,"[ It, + it ] was kind of interesting, {F uh, } some people complained about, {F uh, } {F uh, } the kind of whining or whatever on it. /",sd,"It, it was kind of interesting, uh, some people complained about, uh, uh, the kind of whining or whatever on it.",TV PROGRAMS,FIND OUT WHAT THE OTHER CALLER'S FAVORITE TV SHOWS ARE AND WHY. ARE YOUR INTERESTS SIMILAR OR DIFFERENT?,1


In [167]:
# Append the text of every continuation row to the row right above it.
# Rows are processed from last to first: in a chain (row k, then "+" rows k+1 and k+2)
# row k+1 must absorb k+2 before it is itself folded into k, otherwise the text of
# k+2 disappears when the "+" rows are dropped.
# NOTE: this cell mutates `new_df` in place, so run it only once per fresh `new_df`.
for indx in reversed(to_merge):
    head = new_df.loc[indx - 1, "clean_text"].rstrip()
    tail = new_df.loc[indx, "clean_text"].lstrip()
    # Remove exactly one "--" marker on each side. str.strip("--") would strip every
    # "-" character, eating legitimate hyphens, and would leave a double space.
    if head.endswith("--"):
        head = head[:-2].rstrip()
    if tail.startswith("--"):
        tail = tail[2:].lstrip()
    new_df.loc[indx - 1, "clean_text"] = (head + " " + tail).strip()

In [170]:
new_df.iloc[20:30]

Unnamed: 0,dialogue,caller,utt_indx,trans_indx,orig_text,tag,clean_text,topic,prompt,extra
20,3774,B,14,21,How about you? /,qo,How about you?,CRIME,DISCUSS CRIME IN AMERICAN CITIES TODAY. WHAT ARE YOUR CONCERNS AND THE CONCERNS OF THE OTHER CALLER? WHAT STEPS CAN BE TAKEN TO REDUCE CRIME?,1
21,3774,A,15,22,"{D Well } <breathing>, [ we moved in, + when we moved in, ] [ there, + there ] wasn't any outside lights /",sd,"Well, we moved in, when we moved in, there, there wasn't any outside lights",CRIME,DISCUSS CRIME IN AMERICAN CITIES TODAY. WHAT ARE YOUR CONCERNS AND THE CONCERNS OF THE OTHER CALLER? WHAT STEPS CAN BE TAKEN TO REDUCE CRIME?,1
22,3774,A,15,23,{C and --,sd,"and so we've been trying to install some, uh, outside lights",CRIME,DISCUSS CRIME IN AMERICAN CITIES TODAY. WHAT ARE YOUR CONCERNS AND THE CONCERNS OF THE OTHER CALLER? WHAT STEPS CAN BE TAKEN TO REDUCE CRIME?,1
23,3774,A,17,25,"-- so } we've been trying to install some, {F uh, } outside lights /",+,"-- so we've been trying to install some, uh, outside lights",CRIME,DISCUSS CRIME IN AMERICAN CITIES TODAY. WHAT ARE YOUR CONCERNS AND THE CONCERNS OF THE OTHER CALLER? WHAT STEPS CAN BE TAKEN TO REDUCE CRIME?,1
24,3774,A,17,26,{C and } --,sd,and we put up a fence in the backyard.,CRIME,DISCUSS CRIME IN AMERICAN CITIES TODAY. WHAT ARE YOUR CONCERNS AND THE CONCERNS OF THE OTHER CALLER? WHAT STEPS CAN BE TAKEN TO REDUCE CRIME?,1
25,3774,A,19,28,-- we put up a fence in the backyard. /,+,-- we put up a fence in the backyard.,CRIME,DISCUSS CRIME IN AMERICAN CITIES TODAY. WHAT ARE YOUR CONCERNS AND THE CONCERNS OF THE OTHER CALLER? WHAT STEPS CAN BE TAKEN TO REDUCE CRIME?,1
26,3774,A,19,29,"Mostly, {D you know, } not so much thinking that we would deter someone to break in, but that our children would be safe --",sd,"Mostly, you know, not so much thinking that we would deter someone to break in, but that our children would be safe playing in the yard.",CRIME,DISCUSS CRIME IN AMERICAN CITIES TODAY. WHAT ARE YOUR CONCERNS AND THE CONCERNS OF THE OTHER CALLER? WHAT STEPS CAN BE TAKEN TO REDUCE CRIME?,1
27,3774,A,21,31,-- playing in the yard.,+,-- playing in the yard.,CRIME,DISCUSS CRIME IN AMERICAN CITIES TODAY. WHAT ARE YOUR CONCERNS AND THE CONCERNS OF THE OTHER CALLER? WHAT STEPS CAN BE TAKEN TO REDUCE CRIME?,1
28,3774,B,22,32,Yeah. /,b,Yeah.,CRIME,DISCUSS CRIME IN AMERICAN CITIES TODAY. WHAT ARE YOUR CONCERNS AND THE CONCERNS OF THE OTHER CALLER? WHAT STEPS CAN BE TAKEN TO REDUCE CRIME?,1
29,3774,A,23,33,{D You know. } /,+,You know.,CRIME,DISCUSS CRIME IN AMERICAN CITIES TODAY. WHAT ARE YOUR CONCERNS AND THE CONCERNS OF THE OTHER CALLER? WHAT STEPS CAN BE TAKEN TO REDUCE CRIME?,1


In [172]:
# The "+" rows listed in `to_merge` have been folded into their predecessors,
# so they can be removed; the remaining rows are renumbered from 0.
new_df = new_df.drop(index=to_merge).reset_index(drop=True)

In [173]:
new_df.head()

Unnamed: 0,dialogue,caller,utt_indx,trans_indx,orig_text,tag,clean_text,topic,prompt,extra
0,3774,A,1,0,{C So } I've been concerned about crime lately. /,sd,So I've been concerned about crime lately.,CRIME,DISCUSS CRIME IN AMERICAN CITIES TODAY. WHAT ARE YOUR CONCERNS AND THE CONCERNS OF THE OTHER CALLER? WHAT STEPS CAN BE TAKEN TO REDUCE CRIME?,1
1,3774,B,2,1,Uh-huh. /,b,Uh-huh.,CRIME,DISCUSS CRIME IN AMERICAN CITIES TODAY. WHAT ARE YOUR CONCERNS AND THE CONCERNS OF THE OTHER CALLER? WHAT STEPS CAN BE TAKEN TO REDUCE CRIME?,1
2,3774,A,3,2,"{F Uh, } it's really scary to listen to the news every night and --",sv,"Uh, it's really scary to listen to the news every night and to hear about all the problems.",CRIME,DISCUSS CRIME IN AMERICAN CITIES TODAY. WHAT ARE YOUR CONCERNS AND THE CONCERNS OF THE OTHER CALLER? WHAT STEPS CAN BE TAKEN TO REDUCE CRIME?,1
3,3774,A,5,5,I wondered if you were taking any special precautions in your neighborhood? /,qy^d,I wondered if you were taking any special precautions in your neighborhood?,CRIME,DISCUSS CRIME IN AMERICAN CITIES TODAY. WHAT ARE YOUR CONCERNS AND THE CONCERNS OF THE OTHER CALLER? WHAT STEPS CAN BE TAKEN TO REDUCE CRIME?,1
4,3774,B,6,6,"{D Well, } [ I, + I ] think we have a neighborhood watch <laughter>. /",na,"Well, I, I think we have a neighborhood watch.",CRIME,DISCUSS CRIME IN AMERICAN CITIES TODAY. WHAT ARE YOUR CONCERNS AND THE CONCERNS OF THE OTHER CALLER? WHAT STEPS CAN BE TAKEN TO REDUCE CRIME?,1


## Теперь посмотрим статистику еще раз

In [177]:
len(new_df)

211282

In [178]:
# Share of each tag after the clean-up steps; top 7 tags only.
# Dividing by len(new_df) instead of a hard-coded row count keeps the shares correct
# whenever the preceding filtering steps change.
tag_stats = new_df.groupby(by="tag", as_index=False).extra.sum()
total_rows = len(new_df)
tag_stats["extra"] = tag_stats.extra.apply(lambda x: round(x / total_rows, 2))
tag_stats.sort_values(by="extra", ascending=False)[:7]

Unnamed: 0,tag,extra
38,sd,0.36
11,b,0.16
39,sv,0.13
0,%,0.07
1,+,0.06
6,aa,0.05
36,qy,0.02


In [None]:
# the share of "b" dropped slightly, and these tags no longer get in the way of merging utterances
# "+" rows still remain; let's look at which situations they come from

In [179]:
new_df.iloc[:50, :]
# there are cases where several "+" rows follow one another, separated only by "x" rows,
# so we remove the "x" rows first

Unnamed: 0,dialogue,caller,utt_indx,trans_indx,orig_text,tag,clean_text,topic,prompt,extra
0,3774,A,1,0,{C So } I've been concerned about crime lately. /,sd,So I've been concerned about crime lately.,CRIME,DISCUSS CRIME IN AMERICAN CITIES TODAY. WHAT ARE YOUR CONCERNS AND THE CONCERNS OF THE OTHER CALLER? WHAT STEPS CAN BE TAKEN TO REDUCE CRIME?,1
1,3774,B,2,1,Uh-huh. /,b,Uh-huh.,CRIME,DISCUSS CRIME IN AMERICAN CITIES TODAY. WHAT ARE YOUR CONCERNS AND THE CONCERNS OF THE OTHER CALLER? WHAT STEPS CAN BE TAKEN TO REDUCE CRIME?,1
2,3774,A,3,2,"{F Uh, } it's really scary to listen to the news every night and --",sv,"Uh, it's really scary to listen to the news every night and to hear about all the problems.",CRIME,DISCUSS CRIME IN AMERICAN CITIES TODAY. WHAT ARE YOUR CONCERNS AND THE CONCERNS OF THE OTHER CALLER? WHAT STEPS CAN BE TAKEN TO REDUCE CRIME?,1
3,3774,A,5,5,I wondered if you were taking any special precautions in your neighborhood? /,qy^d,I wondered if you were taking any special precautions in your neighborhood?,CRIME,DISCUSS CRIME IN AMERICAN CITIES TODAY. WHAT ARE YOUR CONCERNS AND THE CONCERNS OF THE OTHER CALLER? WHAT STEPS CAN BE TAKEN TO REDUCE CRIME?,1
4,3774,B,6,6,"{D Well, } [ I, + I ] think we have a neighborhood watch <laughter>. /",na,"Well, I, I think we have a neighborhood watch.",CRIME,DISCUSS CRIME IN AMERICAN CITIES TODAY. WHAT ARE YOUR CONCERNS AND THE CONCERNS OF THE OTHER CALLER? WHAT STEPS CAN BE TAKEN TO REDUCE CRIME?,1
5,3774,A,7,7,Uh-huh. /,b,Uh-huh.,CRIME,DISCUSS CRIME IN AMERICAN CITIES TODAY. WHAT ARE YOUR CONCERNS AND THE CONCERNS OF THE OTHER CALLER? WHAT STEPS CAN BE TAKEN TO REDUCE CRIME?,1
6,3774,B,8,8,I think. /,sd,I think.,CRIME,DISCUSS CRIME IN AMERICAN CITIES TODAY. WHAT ARE YOUR CONCERNS AND THE CONCERNS OF THE OTHER CALLER? WHAT STEPS CAN BE TAKEN TO REDUCE CRIME?,1
7,3774,A,9,9,<Laughter>.,x,,CRIME,DISCUSS CRIME IN AMERICAN CITIES TODAY. WHAT ARE YOUR CONCERNS AND THE CONCERNS OF THE OTHER CALLER? WHAT STEPS CAN BE TAKEN TO REDUCE CRIME?,1
8,3774,B,10,10,"I'm not real, - /",%,"I'm not real,",CRIME,DISCUSS CRIME IN AMERICAN CITIES TODAY. WHAT ARE YOUR CONCERNS AND THE CONCERNS OF THE OTHER CALLER? WHAT STEPS CAN BE TAKEN TO REDUCE CRIME?,1
9,3774,B,10,11,we don't get real involved. /,sd,we don't get real involved.,CRIME,DISCUSS CRIME IN AMERICAN CITIES TODAY. WHAT ARE YOUR CONCERNS AND THE CONCERNS OF THE OTHER CALLER? WHAT STEPS CAN BE TAKEN TO REDUCE CRIME?,1


In [181]:
# Index labels of every row tagged "x", plus how many there are.
# (In the preview these rows are non-verbal events such as <Laughter>.)
to_delete = new_df.index[new_df["tag"] == "x"].tolist()
counter = len(to_delete)

In [182]:
counter

3630

In [184]:
to_delete[:5]

[7, 13, 31, 33, 58]

In [186]:
new_df[new_df.tag == "x"][:5]

Unnamed: 0,dialogue,caller,utt_indx,trans_indx,orig_text,tag,clean_text,topic,prompt,extra
7,3774,A,9,9,<Laughter>.,x,,CRIME,DISCUSS CRIME IN AMERICAN CITIES TODAY. WHAT ARE YOUR CONCERNS AND THE CONCERNS OF THE OTHER CALLER? WHAT STEPS CAN BE TAKEN TO REDUCE CRIME?,1
13,3774,A,11,15,<laughter>.,x,.,CRIME,DISCUSS CRIME IN AMERICAN CITIES TODAY. WHAT ARE YOUR CONCERNS AND THE CONCERNS OF THE OTHER CALLER? WHAT STEPS CAN BE TAKEN TO REDUCE CRIME?,1
31,3774,B,34,46,<Laughter>.,x,,CRIME,DISCUSS CRIME IN AMERICAN CITIES TODAY. WHAT ARE YOUR CONCERNS AND THE CONCERNS OF THE OTHER CALLER? WHAT STEPS CAN BE TAKEN TO REDUCE CRIME?,1
33,3774,B,36,48,<Laughter>.,x,,CRIME,DISCUSS CRIME IN AMERICAN CITIES TODAY. WHAT ARE YOUR CONCERNS AND THE CONCERNS OF THE OTHER CALLER? WHAT STEPS CAN BE TAKEN TO REDUCE CRIME?,1
58,3774,A,55,78,<Throat_clearing>.,x,,CRIME,DISCUSS CRIME IN AMERICAN CITIES TODAY. WHAT ARE YOUR CONCERNS AND THE CONCERNS OF THE OTHER CALLER? WHAT STEPS CAN BE TAKEN TO REDUCE CRIME?,1


In [187]:
# Remove the rows listed in `to_delete` and renumber the remaining rows from 0.
new_df = new_df.drop(index=to_delete).reset_index(drop=True)

## Теперь надо придумать, что делать с другими "плюсами"
## Во избежание ошибок пока просто проставим им предыдущий тег того же спикера

In [196]:
new_df_copy = new_df.copy()

In [198]:
def replace_plus(df: pd.DataFrame, i: int, j: int) -> None:
    """Give row ``i`` the tag of the closest earlier row by the same caller.

    The search starts at row ``j - 1`` and moves upwards one row at a time.
    The first row whose "caller" equals the caller of row ``i`` supplies the
    new value for ``df.loc[i, "tag"]``; ``df`` is modified in place.

    If no earlier row by that caller exists, the tag of row ``i`` is left
    unchanged (the previous recursive version walked past the first row and
    raised ``KeyError``, and could hit the recursion limit on long gaps).

    Parameters
    ----------
    df : frame with "caller" and "tag" columns. Rows are looked up by label
        ``j - 1, j - 2, ...``, so the index is expected to hold consecutive
        ascending integers (e.g. right after ``reset_index(drop=True)``).
    i : label of the row whose tag is replaced.
    j : label the backward search starts from (callers pass ``j == i``).

    NOTE(review): the search does not stop at dialogue boundaries, so a "+"
    row at the very start of a dialogue could pick up a tag from the previous
    dialogue — confirm whether that can occur in the data.
    """
    speaker = df.loc[i, "caller"]
    first_label = df.index[0]
    prev = j - 1
    # Iterative scan instead of recursion: bounded by the top of the frame.
    while prev >= first_label:
        if df.loc[prev, "caller"] == speaker:
            df.loc[i, "tag"] = df.loc[prev, "tag"]
            return
        prev -= 1
    

# Walk over every row and rewrite each "+" (continuation) tag with the tag of
# the same caller's previous row.
for row_label in tqdm(new_df_copy.index):
    if new_df_copy.loc[row_label, "tag"] != "+":
        continue
    replace_plus(new_df_copy, row_label, row_label)

HBox(children=(IntProgress(value=0, max=207652), HTML(value='')))




In [207]:
# looks like everything worked

# new_df.iloc[0:50, :]

In [206]:
# new_df_copy.iloc[0:50, :]

In [235]:
# Inspect the first rows of the frame after the "+" tags were replaced.
new_df_copy.head(14)

Unnamed: 0,dialogue,caller,utt_indx,trans_indx,orig_text,tag,clean_text,topic,prompt,extra
0,3774,A,1,0,{C So } I've been concerned about crime lately. /,sd,So I've been concerned about crime lately.,CRIME,DISCUSS CRIME IN AMERICAN CITIES TODAY. WHAT ARE YOUR CONCERNS AND THE CONCERNS OF THE OTHER CALLER? WHAT STEPS CAN BE TAKEN TO REDUCE CRIME?,1
1,3774,B,2,1,Uh-huh. /,b,Uh-huh.,CRIME,DISCUSS CRIME IN AMERICAN CITIES TODAY. WHAT ARE YOUR CONCERNS AND THE CONCERNS OF THE OTHER CALLER? WHAT STEPS CAN BE TAKEN TO REDUCE CRIME?,1
2,3774,A,3,2,"{F Uh, } it's really scary to listen to the news every night and --",sv,"Uh, it's really scary to listen to the news every night and to hear about all the problems.",CRIME,DISCUSS CRIME IN AMERICAN CITIES TODAY. WHAT ARE YOUR CONCERNS AND THE CONCERNS OF THE OTHER CALLER? WHAT STEPS CAN BE TAKEN TO REDUCE CRIME?,1
3,3774,A,5,5,I wondered if you were taking any special precautions in your neighborhood? /,qy^d,I wondered if you were taking any special precautions in your neighborhood?,CRIME,DISCUSS CRIME IN AMERICAN CITIES TODAY. WHAT ARE YOUR CONCERNS AND THE CONCERNS OF THE OTHER CALLER? WHAT STEPS CAN BE TAKEN TO REDUCE CRIME?,1
4,3774,B,6,6,"{D Well, } [ I, + I ] think we have a neighborhood watch <laughter>. /",na,"Well, I, I think we have a neighborhood watch.",CRIME,DISCUSS CRIME IN AMERICAN CITIES TODAY. WHAT ARE YOUR CONCERNS AND THE CONCERNS OF THE OTHER CALLER? WHAT STEPS CAN BE TAKEN TO REDUCE CRIME?,1
5,3774,A,7,7,Uh-huh. /,b,Uh-huh.,CRIME,DISCUSS CRIME IN AMERICAN CITIES TODAY. WHAT ARE YOUR CONCERNS AND THE CONCERNS OF THE OTHER CALLER? WHAT STEPS CAN BE TAKEN TO REDUCE CRIME?,1
6,3774,B,8,8,I think. /,sd,I think.,CRIME,DISCUSS CRIME IN AMERICAN CITIES TODAY. WHAT ARE YOUR CONCERNS AND THE CONCERNS OF THE OTHER CALLER? WHAT STEPS CAN BE TAKEN TO REDUCE CRIME?,1
7,3774,B,10,10,"I'm not real, - /",%,"I'm not real,",CRIME,DISCUSS CRIME IN AMERICAN CITIES TODAY. WHAT ARE YOUR CONCERNS AND THE CONCERNS OF THE OTHER CALLER? WHAT STEPS CAN BE TAKEN TO REDUCE CRIME?,1
8,3774,B,10,11,we don't get real involved. /,sd,we don't get real involved.,CRIME,DISCUSS CRIME IN AMERICAN CITIES TODAY. WHAT ARE YOUR CONCERNS AND THE CONCERNS OF THE OTHER CALLER? WHAT STEPS CAN BE TAKEN TO REDUCE CRIME?,1
9,3774,B,10,12,"We're never home, /",sd,"We're never home,",CRIME,DISCUSS CRIME IN AMERICAN CITIES TODAY. WHAT ARE YOUR CONCERNS AND THE CONCERNS OF THE OTHER CALLER? WHAT STEPS CAN BE TAKEN TO REDUCE CRIME?,1


In [None]:
# TODO: add a rule: if a "b" sits between identical units, ignore the "b" and merge all the identical tags

## Теперь надо решить, что делать с "%" - этот тег означает, что сложно понять, что это за речевой акт 
Наверное, можно оставить его как есть, но потом не учитывать

## Попробуем объединить данные из датасета в целые реплики и теги по спикерам

In [225]:
# sample_df = new_df_copy.iloc[0:200, :]

In [275]:
# One group of rows per dialogue id.
grouped = new_df_copy.groupby(by="dialogue")

In [276]:
# Split every dialogue into speaker turns.
# d[dialogue_id] is a list of turns; each turn is a list of (clean_text, tag)
# pairs for consecutive rows spoken by the same caller.
d = {}

for name, group in tqdm(grouped):
    dialogue_data = []  # finished turns of the current dialogue
    speaker_data = []   # (text, tag) pairs of the turn being built
    prev_caller = None
    rows = zip(group["caller"], group["clean_text"], group["tag"])
    for caller, clean_text, tag in rows:
        # A change of caller closes the turn collected so far.
        if speaker_data and caller != prev_caller:
            dialogue_data.append(speaker_data)
            speaker_data = []
        # strip("--") removes any leading/trailing "-" characters from the text.
        speaker_data.append((clean_text.strip("--"), tag))
        prev_caller = caller
    # Flush the final turn: previously a turn was only stored when the *next*
    # caller started speaking, so the last turn of every dialogue was lost.
    if speaker_data:
        dialogue_data.append(speaker_data)
    d[name] = dialogue_data

HBox(children=(IntProgress(value=0, max=1155), HTML(value='')))




In [277]:
# Number of dialogues collected in d.
len(d)

1155

In [278]:
# First seven turns of dialogue 3774, each a list of (text, tag) pairs.
d[3774][:7]

[[("So I've been concerned about crime lately.", 'sd')],
 [('Uh-huh.', 'b')],
 [("Uh, it's really scary to listen to the news every night and  to hear about all the problems.",
   'sv'),
  ('I wondered if you were taking any special precautions in your neighborhood?',
   'qy^d')],
 [('Well, I, I think we have a neighborhood watch.', 'na')],
 [('Uh-huh.', 'b')],
 [('I think.', 'sd'),
  ("I'm not real,", '%'),
  ("we don't get real involved.", 'sd'),
  ("We're never home,", 'sd'),
  ('so ', '%')],
 [('Uh-huh', 'b')]]

Вроде бы все ок

## Теперь объединим подряд идущие высказывания одного спикера с одинаковыми тегами и, соответственно, их тексты (тег % пока обрабатывается как обычный тег).

In [279]:
# Reshape every turn from a list of (text, tag) pairs into two parallel lists:
# [[text, ...], [tag, ...]]. The values of d are replaced in place.
for dialogue_id in d:
    d[dialogue_id] = [
        [[text for text, _ in pairs], [tag for _, tag in pairs]]
        for pairs in d[dialogue_id]
    ]

In [280]:
# The same seven turns after unzipping: [texts, tags] per turn.
d[3774][:7]

[[["So I've been concerned about crime lately."], ['sd']],
 [['Uh-huh.'], ['b']],
 [["Uh, it's really scary to listen to the news every night and  to hear about all the problems.",
   'I wondered if you were taking any special precautions in your neighborhood?'],
  ['sv', 'qy^d']],
 [['Well, I, I think we have a neighborhood watch.'], ['na']],
 [['Uh-huh.'], ['b']],
 [['I think.',
   "I'm not real,",
   "we don't get real involved.",
   "We're never home,",
   'so '],
  ['sd', '%', 'sd', 'sd', '%']],
 [['Uh-huh'], ['b']]]

In [304]:
# Merge neighbouring utterances inside one turn that carry the same tag: their
# texts are joined with a single space and the tag is kept once. The merged
# dialogues are collected in a new dict, new_d.
new_d = {}

for dialogue_id, turns in d.items():
    merged_turns = []

    for turn in turns:
        texts = turn[0]
        n_texts = len(texts)

        if n_texts == 1:
            # A single utterance has nothing to merge with: reuse the turn.
            merged_turns.append(turn)
        elif n_texts > 1:
            tags = turn[1]
            merged_texts, merged_tags = [], []
            for pos, tag in enumerate(tags):
                if merged_tags and tag == merged_tags[-1]:
                    # Same tag as the previous utterance: extend its text.
                    merged_texts[-1] += " " + texts[pos]
                else:
                    merged_texts.append(texts[pos])
                    merged_tags.append(tag)
            if merged_texts:
                merged_turns.append([merged_texts, merged_tags])

    new_d[dialogue_id] = merged_turns

In [305]:
# Turns of dialogue 3774 before merging, for comparison with new_d.
d[3774][:7]

[[["So I've been concerned about crime lately."], ['sd']],
 [['Uh-huh.'], ['b']],
 [["Uh, it's really scary to listen to the news every night and  to hear about all the problems.",
   'I wondered if you were taking any special precautions in your neighborhood?'],
  ['sv', 'qy^d']],
 [['Well, I, I think we have a neighborhood watch.'], ['na']],
 [['Uh-huh.'], ['b']],
 [['I think.',
   "I'm not real,",
   "we don't get real involved.",
   "We're never home,",
   'so '],
  ['sd', '%', 'sd', 'sd', '%']],
 [['Uh-huh'], ['b']]]

In [306]:
# this is the result after merging
new_d[3774][:7]

[[["So I've been concerned about crime lately."], ['sd']],
 [['Uh-huh.'], ['b']],
 [["Uh, it's really scary to listen to the news every night and  to hear about all the problems.",
   'I wondered if you were taking any special precautions in your neighborhood?'],
  ['sv', 'qy^d']],
 [['Well, I, I think we have a neighborhood watch.'], ['na']],
 [['Uh-huh.'], ['b']],
 [['I think.',
   "I'm not real,",
   "we don't get real involved. We're never home,",
   'so '],
  ['sd', '%', 'sd', '%']],
 [['Uh-huh'], ['b']]]

In [None]:
# TODO: still need to decide how to handle the "%" tag

## Пока сохраню new_df_copy и текущую версию словаря диалогов

In [308]:
# Persist the intermediate results with joblib: the frame with "+" tags
# replaced, and the per-dialogue dict of merged turns.
joblib.dump(new_df_copy, "corpus-df.pkl")
joblib.dump(new_d, "corpus-dict.pkl")

['corpus-dict.pkl']

In [222]:
# Print every distinct tag in `df`, sorted.
# NOTE(review): `df` is defined outside this part of the notebook; the output
# still lists "+" and "x", so it is presumably the frame from before those
# tags were handled — confirm.
print(sorted(df.tag.unique()))

['%', '+', '^2', '^g', '^h', '^q', 'aa', 'aap_am', 'ad', 'ar', 'arp_nd', 'b', 'b^m', 'ba', 'bd', 'bf', 'bh', 'bk', 'br', 'fa', 'fc', 'fo_o_fw_"_by_bc', 'fp', 'ft', 'h', 'na', 'ng', 'nn', 'no', 'ny', 'oo_co_cc', 'qh', 'qo', 'qrr', 'qw', 'qw^d', 'qy', 'qy^d', 'sd', 'sv', 't1', 't3', 'x']


Все теги:
  q      Question  
  s      Statement 
  b      Backchannel/Backwards-Looking
  f      Forward-Looking
  a      Agreements
  %  indeterminate, interrupted, or contains just a floor holder (see manual)
  ^u  [on anything] unrelated response (first utt is NOT response to previous q)
  *  comment  (followed by "*[[comment...]]" after transcription to explain)
  +  continued from previous by same speaker
  @,o@,+@  incorrect transcription (can add comment to specify problem further)
  ^2 collaborative completion
  ^c  about-communication
  ^d  declarative question (question asked like a structural statement)
  ^e  [on statements] elaborated reply to y/n question
  ^g  tag question (question asked like a structural statement with a question tag at end)
  ^h  hold (often but not always after a question) ('let me think'; question in response to a question)
  ^m  mimic other
  ^q  quotation
  ^r  repeat self
  ^t  about-task
  aap Accept-part    
  ad Action-directive  "Go ahead", "We could go back to television shows"
  aa Accept         "ok" , "i agree"
  am Maybe                         
  ar Reject "no", 
  arp Reject-part 
  b default agreement or continuer (uh-huh, right, yeah)
  b^m  Repeat-phrase  
  ba assessment/appreciation ("I can imagine")
  bc Correct-misspeaking  
  bd Downplaying-response-to-sympathy/compliments ("That's all right","that happens")
  bf reFormulate/summarize; paraphrase/summary of other's utterance (as opposed to a mimic)
  bh rhetorical question continuer ("Oh really?")
  bk ACKNOWLEDGE-ANSWER    "Oh, okay"
  br Signal-non-understanding (request for repeat)
  br^m Signal-non-understanding via mimic
  br^c non-understanding due to problems with phone line  
  by sYmpathetic comment ("I'm sorry to hear about that")
  cc Commit                          
  co Offer                           
  fa Apology "Apologies" (this is not the "I'm sorry" of sympathy which is "by")
  fc Conventional-closing            
  fe Exclamation "Ouch"
  fo Other-forward-function         
  fp Conventional-opening            
  ft Thanks "Thank you"
  fw Welcome "You're welcome"
  fx Explicit-performative  ("you're filed" )      
  na a descriptive/narrative statement which acts as an affirmative answer to a question 
  nd aNswer Dispreferred (Well...)
  ng a descriptive/narrative statement which acts as a negative answer to a question 
  nn  no or variations (only)
  no a response to a question that is neither affirmative nor negative (often "I don't know")
  ny  yes or variations (only)
  o other
  oo Open-option  "We could have lamb or chicken"
  qh  rhetorical question
  qo  open ended question
  qr  alternative ('or') question 
  qrr an or-question clause tacked onto a yes-no question
  qw  wh-question 
  qy  yes/no question
  sd  descriptive and/or narrative (listener has no basis to dispute)
  sv  viewpoint, from personal opinions to proposed general facts  (listener could have basis to dispute)
  t1  self-talk
  t3  3rd-party-talk
  x   nonspeech 