## Avoidance study: datasets to convokit-style corpora, augmentation with dependency arcs

In [1]:
import pandas as pd
import numpy as np
import os
from convokit import Corpus, Utterance, Speaker

In [14]:
def add_scores(df):
    """Placeholder scorer: hand the frame back exactly as received.

    A later cell of this notebook redefines ``add_scores`` with the real
    scoring logic; this stub only makes the name exist beforehand.
    """
    return df

In [16]:
# TODO: test 'path'
# Folder with the annotated CSV files. make_corpus() prepends this string to
# the file name as-is, so the trailing separator is part of the value.
ANN_PATH = 'Annotated\\'

def make_corpus(full=False, avoided=False, type='', add_scores=False, path=''):
    """Build a convokit Corpus of question-answer pairs from an annotated CSV.

    Each CSV row becomes one conversation made of two utterances: the question
    (id ``id_q``) and the answer (id ``id_a``) replying to it. The annotation
    columns of the row are stored as conversation metadata.

    Parameters
    ----------
    full : bool
        Read ``Avoidance_annotated.csv``; the remaining selectors are ignored.
    avoided : bool
        When ``full`` is false, choose the ``avoided`` file(s) (True) or the
        ``notavoided`` file (False).
    type : str
        Optional suffix of the ``avoided`` file, e.g. ``'fight'``; only used
        when ``avoided`` is true.
    add_scores : bool
        Run the module-level ``add_scores`` function on the data and store the
        resulting scores as metadata.
    path : str
        Prefix prepended verbatim to the file name (include the trailing
        path separator).

    Returns
    -------
    Corpus
        Corpus with one conversation per CSV row.
    """
    if full:
        file_name = "Avoidance_annotated.csv"
    elif not avoided:
        file_name = "Avoidance_annotated_notavoided.csv"
    elif not type:
        file_name = "Avoidance_annotated_avoided.csv"
    else:
        file_name = f"Avoidance_annotated_avoided_{type}.csv"
    df = pd.read_csv(path + file_name)

    # Empty or whitespace-only cells are treated as missing values.
    df = df.replace(r'^\s*$', np.nan, regex=True)
    if add_scores:
        # The boolean parameter `add_scores` shadows the module-level function
        # of the same name inside this body, so the function has to be looked
        # up explicitly; calling the flag itself would raise a TypeError.
        df = globals()['add_scores'](df)

    # Columns copied one-to-one into the conversation metadata.
    conversation_columns = (
        'dataset', 'meta.pair_idx',
        'if_q_1', 'avoid_rate_1', 'avoid_type_1',
        'if_q_2', 'avoid_rate_2', 'avoid_type_2',
        'if_q_3', 'avoid_rate_3', 'avoid_type_3',
        'avoid_rate_avg', 'avoid_type_avg',
    )

    utt_list = []
    meta_conversations = {}
    for _, row in df.iterrows():
        conversation_meta = {col: row[col] for col in conversation_columns}
        answer_kwargs = {}
        if add_scores:
            # Pair-level similarity scores belong to the conversation.
            conversation_meta['levenstein_dist'] = row['lev_dist']
            conversation_meta['levenstein_dist_first2'] = row['lev_dist_first2']
            # Negation/sentiment scores are computed from the answer text, so
            # they are attached to the answer utterance (they previously ended
            # up on the question utterance).
            answer_kwargs['meta'] = {
                'negation_score': row['ans_negations'],
                'sentiment_score': row['ans_sentiment'],
                'negation_score_first2': row['ans_negations_first2'],
                'sentiment_score_first2': row['ans_sentiment_first2'],
            }
        meta_conversations[row['id_q']] = conversation_meta

        question_utt = Utterance(id=row['id_q'], root=row['id_q'], text=row['text_q'],
                                 speaker=Speaker(name="question"))
        answer_utt = Utterance(id=row['id_a'], root=row['id_q'], reply_to=row['id_q'],
                               text=row['text_a'], speaker=Speaker(name="answerer"),
                               **answer_kwargs)
        utt_list += [question_utt, answer_utt]

    corpus = Corpus(utterances=utt_list)
    # Conversation ids equal the question ids used as `root` above.
    for conv in corpus.iter_conversations():
        conv.meta.update(meta_conversations[conv.get_id()])
    return corpus

The `make_corpus` function above builds a corpus from one of our annotated CSV files; the next cell uses it to create one corpus per data subset. Next step: preprocessing.

In [4]:
# One corpus per annotated subset, all read from ANN_PATH (no linguistic scores).
corpus_all = make_corpus(path=ANN_PATH, full=True)
corpus_avoided = make_corpus(path=ANN_PATH, avoided=True)
corpus_not_avoided = make_corpus(path=ANN_PATH, avoided=False)
corpus_avoided_fight = make_corpus(path=ANN_PATH, avoided=True, type='fight')
corpus_avoided_flight = make_corpus(path=ANN_PATH, avoided=True, type='flight')

The corpora can be saved as they are, containing only the annotation data: uncomment the following lines and run the cell.

In [None]:
# corpus_all.dump('full-avoidance-corpus', base_path=os.getcwd()+'\\'+ANN_PATH)
# corpus_avoided.dump('avoidance-corpus', base_path=os.getcwd()+'\\'+ANN_PATH)
# corpus_not_avoided.dump('non-avoidance-corpus', base_path=os.getcwd()+'\\'+ANN_PATH)
# corpus_avoided_fight.dump('fight-corpus', base_path=os.getcwd()+'\\'+ANN_PATH)
# corpus_avoided_flight.dump('flight-corpus', base_path=os.getcwd()+'\\'+ANN_PATH)

### Adding linguistic scores
Adding negation, sentiment (for answers) and similarity (for q-a pairs) scores as metadata.

In [20]:
import en_core_web_sm
import spacy
import nltk
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk import tokenize
from nltk.metrics import edit_distance

def _get_sentiment_analyzer():
    """Return a shared VADER analyzer, creating it on first use."""
    analyzer = getattr(_get_sentiment_analyzer, '_instance', None)
    if analyzer is None:
        # Building the analyzer once avoids repeating the setup for every row.
        analyzer = SentimentIntensityAnalyzer()
        _get_sentiment_analyzer._instance = analyzer
    return analyzer


def _mean_compound(s, max_sentences=None):
    """Average the VADER compound score over the leading sentences of ``s``.

    ``max_sentences=None`` uses every sentence. Missing (non-string) or blank
    text yields 0.0 instead of raising.
    """
    if not isinstance(s, str) or not s.strip():
        return 0.0
    sentences = tokenize.sent_tokenize(s)[:max_sentences]
    if not sentences:
        # Guards the division below.
        return 0.0
    analyzer = _get_sentiment_analyzer()
    scores = [analyzer.polarity_scores(sentence)['compound'] for sentence in sentences]
    return sum(scores) / len(scores)


def sentiment_score(s):
    """Mean VADER compound score over all sentences of ``s``.

    Returns 0.0 for missing or blank text.
    """
    return _mean_compound(s)


def sentiment_score2(s):
    """Mean VADER compound score over the first two sentences of ``s``.

    Returns 0.0 for missing or blank text.
    """
    return _mean_compound(s, max_sentences=2)

def _get_nlp():
    """Return the spaCy pipeline, loading ``en_core_web_sm`` on first use.

    A module-level ``nlp`` is reused when one already exists; otherwise the
    model is loaded here and published under that name, so these helpers do
    not depend on another cell having defined ``nlp`` first.
    """
    model = globals().get('nlp')
    if model is None:
        model = en_core_web_sm.load()
        globals()['nlp'] = model
    return model


def _count_negations(tokens):
    """Count tokens whose dependency label is ``neg``."""
    return sum(1 for tok in tokens if tok.dep_ == 'neg')


def neg_score(s):
    """Number of negation dependencies in the whole text ``s``.

    Returns 0 for missing (non-string) or blank text.
    """
    if not isinstance(s, str) or not s.strip():
        return 0
    return _count_negations(_get_nlp()(s))


def neg_score2(s):
    """Number of negation dependencies in the first two sentences of ``s``.

    Returns 0 for missing (non-string) or blank text.
    """
    if not isinstance(s, str) or not s.strip():
        return 0
    doc = _get_nlp()(s)
    # Count directly on the first two sentence spans of the parsed document.
    # This avoids the removed `Span.string` attribute and the earlier
    # re-parse of two sentences glued together without a separating space.
    leading_sentences = list(doc.sents)[:2]
    return sum(_count_negations(sentence) for sentence in leading_sentences)

def levenstein_dist(row):
    """Edit distance between the question and the answer text of ``row``."""
    question, answer = row['text_q'], row['text_a']
    return edit_distance(question, answer)

def levenstein_dist2(row):
    """Edit distance between the first two sentences of question and answer.

    The leading sentences of each text are joined back into one string before
    comparison. Passing the sentence lists themselves to ``edit_distance``
    would compare whole sentences as single symbols, giving a value between
    0 and 2 instead of a character-level distance.
    """
    question_head = ' '.join(tokenize.sent_tokenize(row['text_q'])[:2])
    answer_head = ' '.join(tokenize.sent_tokenize(row['text_a'])[:2])
    return edit_distance(question_head, answer_head)

def add_scores(df):
    """Add negation, sentiment and edit-distance columns to ``df``.

    Reads the ``text_q`` and ``text_a`` columns and writes ``ans_negations``,
    ``ans_negations_first2``, ``ans_sentiment``, ``ans_sentiment_first2``,
    ``lev_dist`` and ``lev_dist_first2``. The frame is modified in place and
    also returned.
    """
    # neg_score / neg_score2 read the spaCy pipeline from the module-level
    # name `nlp`; binding it as a local here left them with a NameError.
    global nlp
    if globals().get('nlp') is None:
        # Load the model only once, even when several corpora are scored.
        nlp = en_core_web_sm.load()
    # Negation score for answers (full text and first two sentences)
    df['ans_negations'] = df['text_a'].apply(neg_score)
    df['ans_negations_first2'] = df['text_a'].apply(neg_score2)
    # Sentiment score for answers (full text and first two sentences)
    df['ans_sentiment'] = df['text_a'].apply(sentiment_score)
    df['ans_sentiment_first2'] = df['text_a'].apply(sentiment_score2)
    # Levenshtein distance between question and answer (full text and first two sentences)
    df['lev_dist'] = df.apply(levenstein_dist, axis=1)
    df['lev_dist_first2'] = df.apply(levenstein_dist2, axis=1)
    return df

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Yana\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [None]:
# corpus_all = make_corpus(full=True, add_scores=True, path=ANN_PATH)
# corpus_avoided = make_corpus(avoided=True, add_scores=True, path=ANN_PATH)
# corpus_not_avoided = make_corpus(avoided=False, add_scores=True, path=ANN_PATH)
# corpus_avoided_fight = make_corpus(avoided=True, type='fight', add_scores=True, path=ANN_PATH)
# corpus_avoided_flight = make_corpus(avoided=True, type='flight', add_scores=True, path=ANN_PATH)

# corpus_all.dump('full-avoidance-corpus', base_path=os.getcwd()+'\\'+ANN_PATH)
# corpus_avoided.dump('avoidance-corpus', base_path=os.getcwd()+'\\'+ANN_PATH)
# corpus_not_avoided.dump('non-avoidance-corpus', base_path=os.getcwd()+'\\'+ANN_PATH)
# corpus_avoided_fight.dump('fight-corpus', base_path=os.getcwd()+'\\'+ANN_PATH)
# corpus_avoided_flight.dump('flight-corpus', base_path=os.getcwd()+'\\'+ANN_PATH)

### Convokit's politeness score

In [None]:
from convokit import PolitenessStrategies, TextParser

# Parse every utterance first, then annotate the parsed corpus with
# politeness strategies (markers included).
parser = TextParser(verbosity=1000)
ps = PolitenessStrategies()

corpus_all = parser.transform(corpus_all)
corpus_all = ps.transform(corpus_all, markers=True)

In [None]:
# `download` was never imported in this notebook, so the call below raised
# a NameError.
from convokit import download

# Download the Wikipedia portion of the annotated politeness data, then run
# it through the same parser and politeness transformers (`parser`, `ps`)
# that were applied to corpus_all.
wiki_corpus = Corpus(download("wikipedia-politeness-corpus"))
wiki_corpus = parser.transform(wiki_corpus)
wiki_corpus = ps.transform(wiki_corpus, markers=True)

In [None]:
from sklearn import svm
from scipy.sparse import csr_matrix
from sklearn.metrics import classification_report
from convokit import Classifier

# Train only on utterances whose "Binary" label is non-zero
# (presumably 0 marks neutral items -- confirm against the corpus docs).
labelled_utterances = [
    utt for utt in wiki_corpus.iter_utterances()
    if utt.meta["Binary"] != 0
]
train_corpus = Corpus(utterances=labelled_utterances)

# The positive class is "Binary" == 1; features are the politeness strategies.
clf = Classifier(
    obj_type="utterance",
    pred_feats=["politeness_strategies"],
    labeller=lambda utt: utt.meta['Binary'] == 1,
)
clf.fit(train_corpus)

# Apply the trained classifier to our own corpus and preview the summary.
test_pred = clf.transform(corpus_all)
df_pred = clf.summarize(test_pred)
df_pred.head()

### Convokit's arcs

In [13]:
from convokit.convokitPipeline import ConvokitPipeline
from convokit.text_processing import TextProcessor
from convokit.text_processing import TextParser
from convokit.text_processing import TextToArcs

def preprocess_text(text):
    """Identity transform: return ``text`` unchanged.

    Used as the ``proc_fn`` of a TextProcessor so that the utterance text is
    copied into a metadata field.
    """
    return text

def get_corpus_arcs(corpus, verbosity=100):
    """Annotate ``corpus`` with parses and dependency arcs.

    The utterance text is first copied into the ``utt_text`` field, which is
    then parsed into ``parsed``; the arcs derived from the parse are stored
    in ``arcs``. ``verbosity`` is forwarded to the parsing and arc steps.
    Returns the transformed corpus.
    """
    text_copier = TextProcessor(proc_fn=preprocess_text, output_field='utt_text')
    corpus_with_text = text_copier.transform(corpus)

    stages = [
        ('parse_text', TextParser('parsed', input_field='utt_text', verbosity=verbosity)),
        ('get_arcs', TextToArcs('arcs', input_field='parsed', verbosity=verbosity)),
    ]
    return ConvokitPipeline(stages).transform(corpus_with_text)

In [None]:
# Add dependency arcs to every corpus and save it under ANN_PATH.
# The original lines used the undefined name `path` (it only exists as a
# parameter inside make_corpus), which raised a NameError.
dump_dir = os.getcwd() + '\\' + ANN_PATH
corpora_to_dump = [
    ('full-avoidance-corpus', corpus_all),
    ('avoidance-corpus', corpus_avoided),
    ('non-avoidance-corpus', corpus_not_avoided),
    ('fight-corpus', corpus_avoided_fight),
    ('flight-corpus', corpus_avoided_flight),
]
for corpus_name, corpus_to_dump in corpora_to_dump:
    get_corpus_arcs(corpus_to_dump).dump(corpus_name, base_path=dump_dir)

To load the saved corpora from disk, pass the corpus directory to `Corpus`, as in the example below:

In [None]:
# Reload each saved corpus from its directory under the current working directory.
working_dir = os.getcwd()
corpus_all = Corpus(filename=working_dir + '\\Annotated\\full-avoidance-corpus')
corpus_avoidance = Corpus(filename=working_dir + '\\Annotated\\avoidance-corpus')
corpus_non_avoidance = Corpus(filename=working_dir + '\\Annotated\\non-avoidance-corpus')
corpus_fight = Corpus(filename=working_dir + '\\Annotated\\fight-corpus')
corpus_flight = Corpus(filename=working_dir + '\\Annotated\\flight-corpus')