## Avoidance study: converting datasets to ConvoKit-style corpora and augmenting them with dependency arcs

In [1]:
import pandas as pd
import numpy as np
import os
from convokit import Corpus, Utterance, Speaker

In [2]:
# todo: test 'path'
def make_corpus(avoided=False, type='', path=''):
    """Build a convokit ``Corpus`` of question/answer pairs from an annotated CSV.

    The CSV that is read depends on the arguments:

    * ``avoided`` falsy                  -> ``Avoidance_annotated.csv``
    * ``avoided`` truthy, ``type`` empty -> ``Avoidance_annotated_avoided.csv``
    * ``avoided`` truthy, ``type`` set   -> ``Avoidance_annotated_avoided_{type}.csv``

    Args:
        avoided: When truthy, read one of the ``..._avoided`` files instead of
            the base annotated file.
        type: Optional filename suffix; only consulted when ``avoided`` is
            truthy. (The name shadows the builtin but is kept so existing
            keyword callers keep working.)
        path: Directory containing the CSV files. It is combined with the
            filename via ``os.path.join``, so a trailing separator is
            optional; the default ``''`` resolves relative to the current
            working directory.

    Returns:
        A ``Corpus`` holding two utterances per CSV row: the question
        (id ``id_q``, text ``text_q``) and its answer (id ``id_a``, text
        ``text_a``, replying to the question). Each conversation's metadata
        is updated with the row's annotation columns.
    """
    if not avoided:
        filename = "Avoidance_annotated.csv"
    elif not type:
        filename = "Avoidance_annotated_avoided.csv"
    else:
        filename = f"Avoidance_annotated_avoided_{type}.csv"

    # os.path.join instead of string concatenation: `path` no longer needs to
    # end with a separator, and '' still yields the bare filename.
    df = pd.read_csv(os.path.join(path, filename))
    # Treat empty / whitespace-only cells as missing values.
    df = df.replace(r'^\s*$', np.nan, regex=True)

    # Row columns copied verbatim into the conversation-level metadata.
    meta_columns = [
        'dataset', 'meta.pair_idx',
        'if_q_1', 'avoid_rate_1', 'avoid_type_1',
        'if_q_2', 'avoid_rate_2', 'avoid_type_2',
        'if_q_3', 'avoid_rate_3', 'avoid_type_3',
        'avoid_rate_avg', 'avoid_type_avg',
    ]

    utt_list = []
    meta_conversations = {}
    for _, row in df.iterrows():
        question_id = row['id_q']
        # Keyed by the question id, which is also used as the conversation root.
        meta_conversations[question_id] = {col: row[col] for col in meta_columns}
        question_utt = Utterance(id=question_id, root=question_id, text=row['text_q'],
                                 speaker=Speaker(name="question"))
        answer_utt = Utterance(id=row['id_a'], root=question_id, reply_to=question_id,
                               text=row['text_a'], speaker=Speaker(name="answerer"))
        utt_list += [question_utt, answer_utt]

    corpus = Corpus(utterances=utt_list)
    # Attach the per-row annotations to the matching conversation.
    for conv in corpus.iter_conversations():
        conv.meta.update(meta_conversations[conv.get_id()])
    return corpus

With `make_corpus` defined, we can build corpora containing the data from our annotated CSV files. Next step: preprocessing.

In [3]:
# Build one corpus per annotated CSV variant; `path` is left at its default,
# so the files are looked up relative to the current working directory.
corpus_all = make_corpus(avoided=False)
corpus_avoided = make_corpus(avoided=True, type='')
corpus_avoided_fight = make_corpus(type='fight', avoided=True)
corpus_avoided_flight = make_corpus(type='flight', avoided=True)
# Draw one random utterance from the 'fight' corpus (e.g. for a quick look at the data).
utt = corpus_avoided_fight.random_utterance()

A single pipeline: parse the text, then extract the dependency arcs.

In [4]:
from convokit.convokitPipeline import ConvokitPipeline
from convokit.text_processing import TextProcessor
from convokit.text_processing import TextParser
from convokit.text_processing import TextToArcs

def preprocess_text(text):
    """Identity transform: hand back ``text`` unchanged.

    Used in this notebook as the ``proc_fn`` of a ``TextProcessor`` so that
    each utterance's text is copied into its metadata.
    """
    return text

def get_corpus_arcs(corpus, verbosity=100):
    """Run the text -> parse -> arcs processing chain over ``corpus``.

    Steps:
      1. A ``TextProcessor`` applies ``preprocess_text`` and stores the result
         in the ``'utt_text'`` field.
      2. ``TextParser`` reads ``'utt_text'`` and writes ``'parsed'``.
      3. ``TextToArcs`` reads ``'parsed'`` and writes ``'arcs'``.

    Args:
        corpus: The convokit corpus to process.
        verbosity: Forwarded to ``TextParser`` and ``TextToArcs``.

    Returns:
        Whatever the final pipeline ``transform`` call returns (the processed
        corpus).
    """
    text_copier = TextProcessor(proc_fn=preprocess_text, output_field='utt_text')

    pipeline_steps = [
        ('parse_text', TextParser('parsed', input_field='utt_text', verbosity=verbosity)),
        ('get_arcs', TextToArcs('arcs', input_field='parsed', verbosity=verbosity)),
    ]
    arc_pipeline = ConvokitPipeline(pipeline_steps)

    # NOTE: a noun-censored variant (CensorNouns writing 'parsed_censored',
    # followed by TextToArcs writing 'arcs_censored' with root_only=False)
    # is not enabled here.
    return arc_pipeline.transform(text_copier.transform(corpus))

In [5]:
# Augment each corpus with parses/arcs and dump it under the given name,
# using the current working directory as the base path.
for _corpus, _dump_name in [
    (corpus_all, 'full-avoidance-corpus'),
    (corpus_avoided, 'avoidance-corpus'),
    (corpus_avoided_fight, 'fight-corpus'),
    (corpus_avoided_flight, 'flight-corpus'),
]:
    get_corpus_arcs(_corpus).dump(_dump_name, base_path=os.getcwd())

100/844 utterances processed
200/844 utterances processed
300/844 utterances processed
400/844 utterances processed
500/844 utterances processed
600/844 utterances processed
700/844 utterances processed
800/844 utterances processed
844/844 utterances processed
100/844 utterances processed
200/844 utterances processed
300/844 utterances processed
400/844 utterances processed
500/844 utterances processed
600/844 utterances processed
700/844 utterances processed
800/844 utterances processed
844/844 utterances processed
100/346 utterances processed
200/346 utterances processed
300/346 utterances processed
346/346 utterances processed
100/346 utterances processed
200/346 utterances processed
300/346 utterances processed
346/346 utterances processed
038/038 utterances processed
038/038 utterances processed
100/262 utterances processed
200/262 utterances processed
262/262 utterances processed
100/262 utterances processed
200/262 utterances processed
262/262 utterances processed


In [2]:
# Load the previously dumped corpora back from disk.
# Paths are built with os.path.join rather than hard-coded '\\' separators so
# the cell also works outside Windows.
# NOTE(review): the dump cell uses os.getcwd() as base path, while this cell
# reads from the 'Annotated' subdirectory - confirm the dumps are placed there.

annotated_dir = os.path.join(os.getcwd(), 'Annotated')
corpus_all = Corpus(os.path.join(annotated_dir, 'full-avoidance-corpus'))
corpus_avoidance = Corpus(os.path.join(annotated_dir, 'avoidance-corpus'))
corpus_fight = Corpus(os.path.join(annotated_dir, 'fight-corpus'))
corpus_flight = Corpus(os.path.join(annotated_dir, 'flight-corpus'))