Setup

In [1]:
import benepar
import spacy
import pandas as pd
import decimal
import re
from lxml import etree

import apted
from apted import APTED
from apted.helpers import Tree
from itertools import combinations
from math import comb

import time
import json
from datetime import datetime
import os
import numpy as np

benepar.download('benepar_en3')

nlp = spacy.load('en_core_web_md')
if spacy.__version__.startswith('2'):
    nlp.add_pipe(benepar.BeneparComponent("benepar_en3"))
else:
    nlp.add_pipe('benepar', config={'model': 'benepar_en3'})

lemmatizer = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package benepar_en3 to
[nltk_data]     C:\Users\cedch\AppData\Roaming\nltk_data...
[nltk_data]   Package benepar_en3 is already up-to-date!


Helper Methods

In [31]:
# SEXP TO XML
def clean_xml(xml):
    xml = re.sub('<(/?)[^a-zA-Z/][^>]*>', '<\g<1>UNK>', xml) # invalid tokens labeled 'UNK'
    return xml.replace(' ', '')

def sexp_to_xml(sexp):
    def apply_inner_re(s):
        return re.sub('\(([^ ]*) ([^\)\(]*)\)', '<\g<1>> \g<2> </\g<1>>', s)

    xml = apply_inner_re(sexp)
    while xml.startswith('('):
        xml = apply_inner_re(xml)

    with open('special_chars.txt') as f:
        special_chars = dict([line.split() for line in f])

    def key_to_re(s):
        s = re.sub('(.*)([\\\.\+\*\?\^\$\(\)\[\]\{\}\|])(.*)', '\g<1>\\\\\g<2>\g<3>', s)
        return '<(/?)' + s + '>'

    for k, v in special_chars.items():
        xml = re.sub(key_to_re(k), f'<\g<1>{v}>', xml)

    return clean_xml(xml)

# TREE EDIT DISTANCE
def apted_format(parse_str):
    parse_str = re.sub('\(([^ ]+) [^ \(\)]+?\)', '(\g<1>)', parse_str)
    parse_str = parse_str.replace(' ', '')
    parse_str = parse_str.replace('(', '{')
    parse_str = parse_str.replace(')', '}')
    return parse_str

# PARATACTIC CHILDREN STRICT 
def find_parataxis_strict(e):
    global parataxis_clause_tags
    print(e.getchildren())
    children = [c.tag for c in e.getchildren() if not(c.tag.startswith('PUNCT-'))] # excludes punct
    sum = 0
    in_group = False
    print(children)
    
    for i in range(len(children) - 1):
        if children[i] in parataxis_clause_tags and children[i + 1] in parataxis_clause_tags:
            sum += 1
            if not(in_group):
                sum += 1
                in_group = True
        else:
            in_group = False
    return sum if sum != 0 else 1

# LEMMATIZATION
def lemmatize(word: str):
    return lemmatizer(word)[0].lemma_

# AOA
def aoa_of(word: str):
    global aoa_df, aoa_mode
    
    search = aoa_df[aoa_df['Word'] == word]['Rating.Mean']
    if len(search) == 1:
        return float(search)
    
    lemma_search = aoa_df[aoa_df['Word'] == lemmatize(word)]['Rating.Mean']
    if len(lemma_search) == 1:
        return float(lemma_search)
    
    lemmas = aoa_df[aoa_df['Lemma'] == lemmatize(word)]['Rating.Mean']
    if len(lemmas) == 0:
        return -1
    elif len(lemmas) == 1:
        return float(lemmas)
    elif aoa_mode == 'avg':
        return sum(lemmas) / len(lemmas)
    elif aoa_mode == 'max':
        return max(lemmas)
    
    return -1

#Average
def list_avg(l):
    return sum(l) / len(l) if len(l) > 0 else 0


Analysis Functions

In [24]:
clause_tags = ['S', 'SBARQ', 'SINV'] # Not included: 'SQ', 'SBAR'
parataxis_clause_tags = ['S', 'SBARQ', 'SINV', 'SQ', 'SBAR']
clause_re = re.compile('(' + '/|'.join(clause_tags) + ')')
pronoun_tags = ['PRP', 'PRPS']

aoa_df = pd.read_csv('reference/aoa/aoa_lemmas.csv')
with open('reference/word_frequency/subtlexus_lower.json', encoding='utf-8') as f:
    wf_dict = json.load(f)

def benepar_analysis(sent):
    global num_clauses, num_sbar, num_unk, depth_sum, max_depth, max_clause_depth, clause_depth_sum
    global pronoun_sum, num_leaf_nps, num_nps, np_leaf_sum, clause_length_sum
    global paratactic_sum, root_parataxis_strict, root_parataxis_loose
    paratactic_sum_local = 0

    xml = sexp_to_xml(sent._.parse_string)
    root = etree.fromstring(xml) 
    tree = etree.ElementTree(root)

    root_parataxis_loose += max(1, sum(int(bool(c.tag in parataxis_clause_tags)) for c in root.getchildren()))
    root_parataxis_strict += find_parataxis_strict(root)

    num_sbar += sum(int(e.tag == 'SBAR') for e in root.iter())
    pronoun_sum += sum(int(e.tag in pronoun_tags) for e in root.iter())
    num_unk += sum(int(e.tag == 'UNK') for e in root.iter())
    for e in root.iter():
        tag = e.tag
        if tag in clause_tags: 
            num_clauses += 1
            clause_length_sum += sum(int(not(d.tag.startswith('PUNCT-') and bool(d.text))) for d in e.iterdescendants())
            
            paratactic_sum_local += sum(int(bool(c.tag in parataxis_clause_tags)) for c in e.getchildren())
        elif e.tag == 'NP':
            num_nps += 1
            is_leaf_np = True
            for c in e.iterdescendants():
                if c.text: 
                    if not(c.tag.startswith('PUNCT-') or c.tag == 'DT'): # ignore determiners and punctuation
                        np_leaf_sum += 1
                else:
                    is_leaf_np = False
            if is_leaf_np:
                num_leaf_nps += 1
        
        if e.text:
            path = tree.getpath(e)

            depth = len(re.findall('/', path))
            depth_sum += depth # Number of times '/' appears, excluding first
            max_depth = max(max_depth, depth)

            clause_depth = len(clause_re.findall(path))
            clause_depth_sum += clause_depth
            max_clause_depth = max(max_clause_depth, clause_depth)
    paratactic_sum += max(1, paratactic_sum_local)

def spacy_analysis(sent):
    global dep_dist_sum, num_words, words_before_root_sum, uniq_words, num_words_no_nums, num_stop_words
    global max_dep_dist
    global aoa_list, aoa_stopless_list, aoa_uniq_list, aoa_stopless_uniq_list
    global wf_list, wf_stopless_list, wf_uniq_list, wf_stopless_uniq_list

    num_stop_words += sum(int(token.is_stop) for token in sent)

    for token in sent:
        if not(token.is_punct or token.is_space):
            num_words += 1
            dep_dist = abs(token.head.i - token.i)
            dep_dist_sum += dep_dist
            max_dep_dist = max(dep_dist, max_dep_dist)
            
            if (wf := wf_dict.get(token.lower_)) is not None:
                wf_list.append(wf)
                if not token.is_stop:
                    wf_stopless_list.append(wf)

            if (aoa := aoa_of(token.lower_)) != -1:
                aoa_list.append(aoa)
                if not token.is_stop:
                    aoa_stopless_list.append(aoa)

            if token.i < sent.root.i: words_before_root_sum += 1

            if not(token.like_num):
                if not(token.lower_ in uniq_words):
                    uniq_words.append(token.lower_)

                    if wf is not None:
                        wf_uniq_list.append(wf)
                        if not token.is_stop:
                            wf_stopless_uniq_list.append(wf)
                    
                    if aoa != -1:
                        aoa_uniq_list.append(aoa)
                        if not token.is_stop:
                            aoa_stopless_uniq_list.append(aoa)

                num_words_no_nums += 1
    
def ted_analysis(sent1, sent2):
    global ted_sum

    tree1 = Tree.from_text(apted_format(sent1._.parse_string))
    tree2 = Tree.from_text(apted_format(sent2._.parse_string))

    apted = APTED(tree1, tree2, )
    ted = apted.compute_edit_distance()
    ted_sum += ted

In [42]:
p_sum = 0
p_suml = 0
d = nlp("This oath I am now about to take, and in your presence: That if it shall be found during my administration of the Government I have in any instance violated willingly or knowingly the injunctions thereof, I may (besides incurring constitutional punishment) be subject to the upbraidings of all who are now witnesses of the present solemn ceremony.")
d = nlp("this that he will go: that i am clear")
# d=nlp("to go: that i am clear")
# returns p_sum 2, p_suml 4
s = list(d.sents)[0]
print(s)
xml = sexp_to_xml(s._.parse_string)
print(s._.parse_string)
print(xml)
root = etree.fromstring(xml) 
tree = etree.ElementTree(root)
print(root.getchildren())
print(root)

p_sum += find_parataxis_strict(root)
for e in root.iter():
    tag = e.tag
    if tag in clause_tags:
        p_suml += sum(int(bool(c.tag in parataxis_clause_tags)) for c in e.getchildren())


print(p_sum, p_suml)

this that he will go: that i am clear
(FRAG (NP (NP (DT this)) (SBAR (WHNP (WDT that)) (S (NP (PRP he)) (VP (MD will) (VP (VB go)))))) (: :) (SBAR (IN that) (S (NP (PRP i)) (VP (VBP am) (ADJP (JJ clear))))))
<FRAG><NP><NP><DT>this</DT></NP><SBAR><WHNP><WDT>that</WDT></WHNP><S><NP><PRP>he</PRP></NP><VP><MD>will</MD><VP><VB>go</VB></VP></VP></S></SBAR></NP><PUNCT-COLON>:</PUNCT-COLON><SBAR><IN>that</IN><S><NP><PRP>i</PRP></NP><VP><VBP>am</VBP><ADJP><JJ>clear</JJ></ADJP></VP></S></SBAR></FRAG>
[<Element NP at 0x2b1daae16c0>, <Element PUNCT-COLON at 0x2b1da7e5000>, <Element SBAR at 0x2b1da7e4780>]
<Element FRAG at 0x2b1da7ab300>
[<Element NP at 0x2b1c45ab200>, <Element PUNCT-COLON at 0x2b1da7e47c0>, <Element SBAR at 0x2b1da7e5000>]
['NP', 'SBAR']
1 0


(NP (DT this) (SBAR (IN that) (S (NP (PRP he)) (VP (MD will) (VP (VB go))))) (: :) (SBAR (IN that) (S (NP (PRP i)) (VP (VBP am) (ADJP (JJ clear))))))
(NP (NP (DT this)) (SBAR (WHNP (WDT that)) (S (NP (PRP he)) (VP (MD will) (VP (VB go)))))) (: :) (SBAR (IN that) (S (NP (PRP i)) (VP (VBP am) (ADJP (JJ clear)))))

Main

In [17]:
results = pd.DataFrame([])

# TED Mode
#  - 'combinations': Averages the TED of every pair of sentences in a doc
#  - 'adjacent': Averages the TED of a sentence and the next sentence in a doc
ted_mode = 'adjacent'
# AoA Mode: If word is not in AoA list and multiple lemma matches exist
#  - 'max': Chooses max of lemma matches
#  - 'avg': Averages lemma matches
aoa_mode = 'max'
log = ""

for i, file in enumerate(os.scandir('data/text_jsons/')):
    if i == 2: break
    
    file_time = time.perf_counter()
    file_name = re.sub('\.json$', '', file.name)
    with open(file, encoding='utf-8') as f:
        metadata = json.load(f)
    text = metadata['text']
    print(text)
    text = text.replace('\n', ' ').strip()
    text = re.sub('\s{2,}', ' ', text)
    
    try:
        doc = nlp(text)
    except ValueError:
        error_text = f"ValueError in '{file_name}'. Likely exists too long sentence. Skipping."
        log += error_text + '\n'
        continue
    except:
        error_text = f"Some other error occured in '{file_name}'. Skipping."
        log += error_text + '\n'
        continue

    sents = list(doc.sents)

    # Doc-level
    num_tokens = len(doc)
    num_sents = len(sents)
    ted_sum = 0

    # Benepar
    num_clauses = 0
    num_sbar = 0
    num_unk = 0
    depth_sum = 0
    max_depth = 0
    clause_depth_sum = 0
    max_clause_depth = 0
    clause_length_sum = 0
    pronoun_sum = 0
    num_leaf_nps = 0 # all children are leaves
    num_nps = 0 # all NPs
    np_leaf_sum = 0 # number of leaf descendents a NP has (i.e., number of modifying words)
    # includes coord conj, goes through ever node, only counts children, min 1 per sent
    paratactic_sum = 0 
    root_parataxis_strict = 0 # only looks at punctuation-separated children of root, 1 if none
    root_parataxis_loose = 0 # includes coordinating conj, only root, 1 if none

    # spaCy
    dep_dist_sum = 0
    max_dep_dist = 0
    num_words = 0
    num_words_no_nums = 0
    uniq_words = []
    words_before_root_sum = 0 # Root as in word whose head is self
    num_stop_words = 0
    aoa_list = []
    aoa_uniq_list = []
    aoa_stopless_list = []
    aoa_stopless_uniq_list = []
    wf_list = []
    wf_uniq_list = []
    wf_stopless_list = []
    wf_stopless_uniq_list = []

    for sent in sents:
        benepar_time = time.perf_counter()
        benepar_analysis(sent)
        benepar_time = time.perf_counter() - benepar_time

        spacy_time = time.perf_counter()
        spacy_analysis(sent)
        spacy_time = time.perf_counter() - spacy_time

    # TREE EDIT DISTANCE
    ted_time = time.perf_counter()
    if ted_mode == 'adjacent':    
        for i in range(num_sents - 1):
            ted_analysis(sents[i], sents[i + 1])
        ted_avg = ted_sum / (num_sents - 1)
    elif ted_mode == 'combinations':
        for sent1, sent2 in combinations(sents, 2):
            ted_analysis(sent1, sent2)
        ted_avg = ted_sum / comb(num_sents, 2)
    else:
        print('Invalid ted_mode:', ted_mode)
        ted_avg = -1
    ted_time = time.perf_counter() - ted_time

    summary = {
        # File-level
        'date' : metadata['date'],
        'pres_name' : metadata['pres_name'],
        'byline' : metadata['byline'],
        'title' : metadata['title'],

        # Performance time
        'benepar_analysis_time' : benepar_time,
        'spacy_analysis_time' : spacy_time,
        'tree_edit_distance_time' : ted_time,
        'total_file_analysis_time' : time.perf_counter() - file_time,

        # Doc-level
        'num_tokens' : num_tokens,
        'num_sentences' : num_sents, 
        # 'avg_ted_adj' : ted_avg_adj,
        # 'avg_ted_comb' : ted_avg_comb,
        f'avg_tree_edit_dist_{ted_mode}' : ted_avg,

        # Benepar
        'avg_node_depth' : depth_sum / num_tokens, 
        'max_node_depth' : max_depth, # Equivalent to tree height
        'avg_node_clause_depth' : clause_depth_sum / num_tokens,
        'max_node_clause_depth' : max_clause_depth,
        'avg_clause_length' : clause_length_sum / num_clauses,
        'clauses_per_sent' : num_clauses / num_sents, 
        'sbars_per_sent' : num_sbar / num_sents,
        'pronouns_per_sent' : pronoun_sum / num_sents,
        'pronouns_per_clause' : pronoun_sum / num_clauses,
        'pronoun_prop_of_leaf_nps' : pronoun_sum / num_leaf_nps,
        'avg_num_np_modifiers' : np_leaf_sum / num_nps,
        'loose_parataxis_per_sent' : paratactic_sum / num_sents,
        'root_parataxis_per_sent_strict' : root_parataxis_strict / num_sents,
        'root_parataxis_per_sent_loose' : root_parataxis_loose / num_sents,
        'num_unk' : num_unk,

        # spaCy
        'num_words' : num_words,
        'avg_dependency_distance' : dep_dist_sum / num_words,
        'max_dependency_distance' : max_dep_dist,
        'avg_sentence_length_by_tok' : num_tokens / num_sents, 
        'avg_sentence_length_by_word' : num_words / num_sents,
        'avg_words_before_root' : words_before_root_sum / num_sents,
        'num_uniq_words' : len(uniq_words), 
        'proportion_uniq' : len(uniq_words) / num_words_no_nums,
        'stop_words_per_clause' : num_stop_words / num_clauses,
        'stop_words_per_sentence' : num_stop_words / num_sents,
        f'avg_aoa_{aoa_mode}' : list_avg(aoa_list),
        f'avg_aoa_uniq_{aoa_mode}' : list_avg(aoa_uniq_list),
        f'avg_stopless_aoa_{aoa_mode}' : list_avg(aoa_stopless_list),
        f'avg_stopless_aoa_uniq_{aoa_mode}' : list_avg(aoa_stopless_uniq_list),
        'avg_word_freq' : list_avg(wf_list),
        'avg_word_freq_uniq' : list_avg(wf_uniq_list),
        'avg_word_freq_stopless' : list_avg(wf_stopless_list),
        'avg_word_freq_stopless_uniq' : list_avg(wf_stopless_uniq_list),
    }      
    
    results[file_name] = summary
    
    print(paratactic_sum, root_parataxis_loose, root_parataxis_strict)

print("Error Log:")
print(log)
pd.set_option('display.precision', 2)
print(results)
results.to_csv(f"results/{datetime.now().strftime('%m-%d-%Y_%H-%M')}.csv")


Fellow-Citizens of the Senate and of the House of Representatives:

Among the vicissitudes incident to life no event could have filled me with greater anxieties than that of which the notification was transmitted by your order, and received on the 14th day of the present month. On the one hand, I was summoned by my country, whose voice I can never hear but with veneration and love, from a retreat which I had chosen with the fondest predilection, and, in my flattering hopes, with an immutable decision, as the asylum of my declining years—a retreat which was rendered every day more necessary as well as more dear to me by the addition of habit to inclination, and of frequent interruptions in my health to the gradual waste committed on it by time. On the other hand, the magnitude and difficulty of the trust to which the voice of my country called me, being sufficient to awaken in the wisest and most experienced of her citizens a distrustful scrutiny into his qualifications, could not but o



32 27 23
Fellow Citizens:

I AM again called upon by the voice of my country to execute the functions of its Chief Magistrate. When the occasion proper for it shall arrive, I shall endeavor to express the high sense I entertain of this distinguished honor, and of the confidence which has been reposed in me by the people of united America.

Previous to the execution of any official act of the President the Constitution requires an oath of office. This oath I am now about to take, and in your presence: That if it shall be found during my administration of the Government I have in any instance violated willingly or knowingly the injunctions thereof, I may (besides incurring constitutional punishment) be subject to the upbraidings of all who are now witnesses of the present solemn ceremony.




4 4 4
Error Log:

                                              1789_Washington_Inaugural_Address  \
date                                                                 1789-04-30   
pres_name                                                     George Washington   
byline                          1st President of the United States: 1789 ‐ 1797   
title                                                         Inaugural Address   
benepar_analysis_time                                                      0.01   
spacy_analysis_time                                                        0.31   
tree_edit_distance_time                                                    9.08   
total_file_analysis_time                                                  16.75   
num_tokens                                                                 1546   
num_sentences                                                                23   
avg_tree_edit_dist_adjacent                                          

PCA

In [170]:

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.preprocessing import normalize
from sklearn.decomposition import FactorAnalysis

# print(results['num_tokens':])
np.set_printoptions(precision=100)

results_data = results['num_tokens':].drop(['num_tokens', 'num_words', 
    'num_sentences', 'num_unk', 'num_uniq_words', 'avg_word_freq_uniq'])
# results_data = results['num_tokens':]
results_std = StandardScaler().fit_transform(results_data).T
# results_std = normalize(results_data).T

# print([np.max(i)-np.min(i) for i in results_std])

# print(results_std.shape)
# fa = FactorAnalysis(n_components=3)
# f = fa.fit(results_std).components_.T
# # print(np.sum(f.T[0]))
# # print(f)

# # print(fa.fit_transform(results_std.T))

# faf = pd.DataFrame(f, index=list(results_data.index), columns=['1', '2', '3'])
# print(faf.nlargest(10, ['1']))



# PCA
pca = PCA(n_components=3).fit(results_std)
print(pca.explained_variance_ratio_)
print(sum(pca.explained_variance_ratio_))
# print(pca.components_.shape)
# print(pca.components_.T)

comps = pd.DataFrame(pca.components_.T, index=list(results_data.index), columns=['1', '2', '3'])
m = comps.abs().nlargest(20, ['1']).index
print(comps.loc[m])

print('num_tokens' in comps.index)

# print('-'*100)

# cov = np.cov(results_std.T)

# evals, evecs = np.linalg.eig(cov)

# print(evecs)
# print(evecs.shape)

# explained_variances = []
# for i in range(len(evals)):
#     explained_variances.append(evals[i] / np.sum(evals))
 
# print(np.sum(explained_variances), '\n', explained_variances)



[0.798829799226471   0.13868336305891654 0.03236415372895239]
0.96987731601434
                                1         2         3
avg_word_freq_stopless       0.84  1.72e-01 -4.53e-01
avg_word_freq_stopless_uniq  0.43  4.56e-02  8.78e-01
avg_tree_edit_dist_adjacent -0.18  2.06e-01  6.46e-02
max_dependency_distance     -0.17  8.83e-01 -2.74e-02
avg_sentence_length_by_tok  -0.10  1.34e-02 -1.67e-03
avg_sentence_length_by_word -0.10  3.90e-03 -2.90e-03
avg_clause_length           -0.08  7.60e-03 -2.08e-02
stop_words_per_sentence     -0.06 -2.67e-02 -4.49e-03
max_node_depth              -0.06  1.76e-01  6.01e-02
avg_words_before_root       -0.05 -3.63e-02 -1.58e-02
avg_node_depth              -0.03 -5.57e-02 -1.47e-02
stop_words_per_clause       -0.03 -7.92e-02 -2.89e-02
avg_num_np_modifiers        -0.03 -6.82e-02 -1.99e-02
clauses_per_sent            -0.03 -7.66e-02 -2.25e-02
avg_word_freq               -0.03 -2.72e-02  2.35e-03
avg_stopless_aoa_max        -0.03 -8.39e-02 -2.90e-02
pro

- vocab measures
    - measures of polysemy
- weight ted differently --> should adding/removing be weighted less because it simply indicates a different length sentence?
    - how to remove determiners?
- make scraper more robust
- unit testing for parataxis measures

- word not in AoA list 
    - opt 1: average lemmas
    - opt 2: take max

- cosine similarity
    - Something at the end???

- PCA and RFE
- maybe paratactic should go back to the regular clause tags because we don't want S:SBAR cuz they're related


Questions
- Workaround for the spacy max length? <code>ValueError: Sentence of length 965 (in sub-word tokens) exceeds the maximum supported length of 512</code>
- How to compare syntax trees better?
    - Should I do TED weighting?
- Visually displaying / analyzing the data
- Code not running on harris...

- View: https://scholarworks.gsu.edu/cgi/viewcontent.cgi?article=1035&context=alesl_diss
- View: http://cohmetrix.memphis.edu/cohmetrixhome/documentation_indices.html#Complexity
- MORE RESEARCH NEEDED: compare tree similarity (SEARCH TREE EDIT DISTANCE) of sentences in doc, pq-gram distance
(CITING APTED: https://pypi.org/project/apted/#description)

Download Libraries

In [None]:
! pip install benepar
! pip install spacy
! pip install apted
! pip install bs4
! python -m spacy download en_core_web_md

In [49]:
a=[("13.5",100)]
b=[("14.5",100), ("15.5", 100)]
c=[("15.5",100), ("16.5", 100)]
b={"14.5":100, "15.5": 100}
c={"15.5":100, "16.5": 100}
input=[b, c]

from collections import Counter
print(Counter(c))
print(sum(
    (Counter(x) for x in input),
    Counter()))

Counter({'15.5': 100, '16.5': 100})
Counter({'15.5': 200, '14.5': 100, '16.5': 100})
