### Convokit's Arcs

In [7]:
import pandas as pd
import os
from convokit import Corpus 

# Corpora are stored in a 'Corpora' folder located in the parent of the current
# working directory. os.path.join keeps the path portable across operating
# systems; the trailing '' component leaves a trailing separator so that
# `CORP_PATH + '<corpus name>'` keeps working.
CORP_PATH = os.path.join(os.path.dirname(os.getcwd()), 'Corpora', '')

# Only the avoidance corpus is loaded here; uncomment a line to load another one.
# corpus_all = Corpus(CORP_PATH+'full-avoidance-corpus')
corpus_avoidance = Corpus(CORP_PATH+'avoidance-corpus')
# corpus_non_avoidance = Corpus(CORP_PATH+'non-avoidance-corpus')
# corpus_fight = Corpus(CORP_PATH+'fight-corpus')
# corpus_flight = Corpus(CORP_PATH+'flight-corpus')

In [8]:
from convokit.text_processing import TextProcessor
from nltk.corpus import stopwords
# Hand-written stopword list, used in place of NLTK's English list (the
# commented-out assignment below is kept for reference).
# _is_stopword() compares lower-cased token text against these entries.
# Words such as 'not', 'no', 'but' and 'if' are not in the list, so they are
# left uncensored.
# STOPWORDS = stopwords.words("english")
STOPWORDS = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 
                 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 
                 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 
                 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 
                 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'or', 
                 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'between', 'into', 'through', 
                 'during', 'before', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 
                 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 
                 'other', 'some', 'such', 'only', 'own', 'same', 'so', 'than', 's', 'will', 'just', 'should', 'll', 'm', 'o', 
                 're', 'y', 'ma', 'hon']

class RemoveStopWords(TextProcessor):
    """
    Transformer that censors stopwords in dependency-parsed utterances.

    Thin wrapper that registers `censor_stopwords` as the processing
    function of a TextProcessor, so each token whose text is a stopword is
    replaced by a placeholder in the output field.

    :param output_field: name of attribute to output to.
    :param input_field: name of field to use as input. defaults to 'parsed', which stores dependency parses as returned by the TextParser transformer; otherwise expects similarly-formatted input.
    :param input_filter: a boolean function of signature `input_filter(utterance, aux_input)`, passed through to TextProcessor. stopwords will only be censored for utterances where `input_filter` returns `True`. Defaults to None, in which case TextProcessor's own default applies (per the ConvoKit docs, every utterance is processed).
    :param verbosity: frequency of status messages.
    """

    def __init__(self, output_field, input_field='parsed', input_filter=None, verbosity=0):
        # censor_stopwords is looked up when an instance is created, so it
        # may be defined later in the file than this class.
        super().__init__(
            censor_stopwords,
            output_field=output_field,
            input_field=input_field,
            input_filter=input_filter,
            verbosity=verbosity,
        )

def _is_stopword(tok):
    """
    Tell whether a parsed token is a stopword.

    :param tok: token dict whose 'tok' entry holds the token text.
    :return: True if the lower-cased token text is listed in STOPWORDS.
    """
    lowered = tok['tok'].lower()
    return lowered in STOPWORDS

def _convert_stopword(tok, sent):
    """
    Produce the text to store for a token after stopword censoring.

    :param tok: token dict whose 'tok' entry holds the token text.
    :param sent: the sentence the token belongs to; not used at present.
    :return: the placeholder 'STOPWORD' if the token is a stopword,
        otherwise the lower-cased token text.
    """
    return 'STOPWORD' if _is_stopword(tok) else tok['tok'].lower()

def censor_stopwords(text_entry):
    """
    Stand-alone function that censors stopwords in parsed text.

    New sentence and token dicts are built rather than editing the input.
    Each output token carries the input token's 'dep', 'dn' and 'tag'
    entries, its 'up' entry when it has one, and a 'tok' entry holding the
    censored text. Each output sentence keeps the input sentence's 'rt'.

    :param text_entry: parsed text, an iterable of sentence dicts with
        'rt' and 'toks' entries.
    :return: list of sentence dicts with stopwords censored out.
    """

    def _censored_copy(token, sentence):
        # Key order matches the input layout: dep, dn, tag, (up), tok.
        copy = {field: token[field] for field in ('dep', 'dn', 'tag')}
        if 'up' in token:
            copy['up'] = token['up']
        copy['tok'] = _convert_stopword(token, sentence)
        return copy

    return [
        {
            'rt': sentence['rt'],
            'toks': [_censored_copy(token, sentence) for token in sentence['toks']],
        }
        for sentence in text_entry
    ]

In [9]:
from convokit import TextParser
from convokit.text_processing import TextToArcs

# Step 1: dependency-parse every utterance of the avoidance corpus.
# The next step reads the result from the 'parsed' field.
parser = TextParser(verbosity=1000)
corpus = parser.transform(corpus_avoidance)

# Step 2: write a copy of each parse, with stopwords censored, to 'parsed_cleaned'.
remove_stopwords = RemoveStopWords(
    output_field='parsed_cleaned',
    input_field='parsed',
    verbosity=100,
)
corpus = remove_stopwords.transform(corpus)

# Step 3: extract arcs from the censored parses into 'arcs_cleaned'.
get_cleaned_arcs = TextToArcs(
    'arcs_cleaned',
    input_field='parsed_cleaned',
    verbosity=100,
)
corpus = get_cleaned_arcs.transform(corpus)

346/346 utterances processed
100/346 utterances processed
200/346 utterances processed
300/346 utterances processed
346/346 utterances processed
100/346 utterances processed
200/346 utterances processed
300/346 utterances processed
346/346 utterances processed


In [10]:
from collections import Counter

# Gather every arc of every reply (utterances with a truthy reply_to),
# skipping arcs that mention the stopword placeholder or contain '_*'.
global_arc_list = []
for utt in corpus.iter_utterances():
    if not utt.reply_to:
        continue
    for arc_group in utt.retrieve_meta('arcs_cleaned'):
        # Each entry is a space-separated string of arcs.
        global_arc_list.extend(
            piece
            for piece in arc_group.split(' ')
            if not ('stopword' in piece or '_*' in piece)
        )

# Show the 100 most frequent remaining arcs.
freq = Counter(global_arc_list)
freq.most_common(100)

[('if>*', 11),
 ('but>*', 10),
 ("gentleman_'s", 10),
 ("friend_'s", 10),
 ('ireland_northern', 9),
 ('authorities_local', 8),
 ('however>*', 6),
 ('nations_united', 6),
 ('right_absolutely', 6),
 ('kingdom_united', 5),
 ('year_last', 5),
 ("'s_not", 5),
 ('deal_great', 5),
 ('union_european', 5),
 ('no>*', 4),
 ('friend_right', 4),
 ('make_can', 4),
 ('party_labour', 4),
 ('gave_ago', 4),
 ('much_very', 4),
 ('point_important', 4),
 ("government_'s", 4),
 ('assure_can', 4),
 ('service_health', 4),
 ('service_national', 4),
 ('states_united', 4),
 ('party_conservative', 4),
 ('minister_prime', 3),
 ('people_young', 3),
 ('ago_years', 3),
 ('system_voucher', 3),
 ('make_statement', 3),
 ('after>*', 3),
 ('yes>*', 3),
 ('thank_friend', 3),
 ('like>*', 3),
 ('yeah>*', 3),
 ('people_many', 3),
 ('ago_moments', 3),
 ('reply_gave', 3),
 ('year_next', 3),
 ('council_city', 3),
 ('government_labour', 3),
 ('one>*', 3),
 ('made_clear', 3),
 ('week_last', 3),
 ('destruction_mass', 3),
 ('hussein