Downloads

In [None]:
! pip install benepar
! pip install spacy
! pip install apted
! pip install bs4

Setup (Imports and Install Configurations)

In [1]:
import benepar
import spacy
import pandas as pd
import decimal
import re
from lxml import etree

import apted
from apted import APTED
from apted.helpers import Tree
from itertools import combinations
from math import comb

import time
import json
from datetime import datetime
import os

# Fetch the 'benepar_en3' parsing model; the captured output below shows this
# reports "already up-to-date" when the package is present locally.
benepar.download('benepar_en3')

# Load the spaCy English pipeline and append the benepar component to it,
# configured to use the model downloaded above. Later cells read each
# sentence's parse through `sent._.parse_string`.
nlp = spacy.load('en_core_web_md')
nlp.add_pipe('benepar', config={'model': 'benepar_en3'})

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package benepar_en3 to
[nltk_data]     C:\Users\cedch\AppData\Roaming\nltk_data...
[nltk_data]   Package benepar_en3 is already up-to-date!


<benepar.integrations.spacy_plugin.BeneparComponent at 0x17d88366fb0>

Helper Methods

In [91]:
# SEXP TO XML
def clean_xml(xml):
    """Relabel tags that cannot be XML element names and strip all spaces.

    Any opening or closing tag whose name does not start with an ASCII letter
    (for example ``<,>`` / ``</,>``) is renamed to ``UNK``. Afterwards every
    space character is removed, so leaf text sits directly inside its tag.

    Args:
        xml: Tag-per-node XML text, e.g. ``'<NP> <DT> the </DT> </NP>'``.

    Returns:
        The cleaned XML string, e.g. ``'<NP><DT>the</DT></NP>'``.
    """
    # Raw strings: '\g' is not a valid escape in a normal string literal and
    # triggers a deprecation/syntax warning on recent Python versions.
    relabelled = re.sub(r'<(/?)[^a-zA-Z/][^>]*>', r'<\g<1>UNK>', xml)  # invalid tokens labeled 'UNK'
    return relabelled.replace(' ', '')

def sexp_to_xml(sexp):
    """Convert a bracketed parse string into an XML string.

    ``(NP (DT the) (NN dog))`` becomes ``<NP><DT>the</DT><NN>dog</NN></NP>``.
    Tag names listed in ``special_chars.txt`` (one whitespace-separated
    ``original replacement`` pair per line; blank lines are ignored) are then
    renamed, and the result is passed through ``clean_xml``.

    Args:
        sexp: Parse string in ``(TAG child child ...)`` form.

    Returns:
        The XML text produced by ``clean_xml``.

    Raises:
        ValueError: If the string still starts with ``(`` but a conversion
            pass changes nothing (e.g. unbalanced parentheses). Without this
            check the conversion loop would never terminate.
    """
    def apply_inner_re(s):
        # Rewrites every "(TAG content)" whose content contains no parentheses,
        # i.e. the innermost groups that are still in bracket form.
        return re.sub(r'\(([^ ]*) ([^\)\(]*)\)', r'<\g<1>> \g<2> </\g<1>>', s)

    # Each pass converts one more nesting level; repeat until the outermost
    # group has been converted.
    xml = apply_inner_re(sexp)
    while xml.startswith('('):
        converted = apply_inner_re(xml)
        if converted == xml:
            raise ValueError(f'Could not convert parse string to XML: {sexp!r}')
        xml = converted

    with open('special_chars.txt') as f:
        special_chars = dict(line.split() for line in f if line.strip())

    for k, v in special_chars.items():
        # re.escape quotes every regex metacharacter in the key, so keys such
        # as '$' or '.' are matched literally.
        xml = re.sub('<(/?)' + re.escape(k) + '>', rf'<\g<1>{v}>', xml)

    return clean_xml(xml)

# TREE EDIT DISTANCE
def apted_format(parse_str):
    """Turn a bracketed parse string into the brace notation read by ``Tree.from_text``.

    Leaf words are dropped so that only node labels remain, all spaces are
    removed, and round brackets become curly braces:
    ``(S (NP (DT the)))`` -> ``{S{NP{DT}}}``.

    Args:
        parse_str: Parse string in ``(TAG child child ...)`` form.

    Returns:
        The label-only tree in ``{TAG{CHILD}...}`` form.
    """
    # Raw strings avoid the invalid '\(' / '\g' escapes of a normal literal.
    labels_only = re.sub(r'\(([^ ]+) [^ \(\)]+?\)', r'(\g<1>)', parse_str)
    return labels_only.replace(' ', '').replace('(', '{').replace(')', '}')

# PARATACTIC CHILDREN STRICT 
def find_parataxis_strict(e):
    """Count the children of ``e`` that belong to a run of adjacent clauses.

    Children whose tag starts with ``PUNCT-`` are dropped first, so clauses
    separated only by punctuation count as adjacent. Every run of ``k >= 2``
    consecutive children whose tags are in the module-level ``clause_tags``
    adds ``k`` to the result; any other child in between ends the run.

    Args:
        e: Element whose direct children are inspected.

    Returns:
        The number of children that sit in such a run (0 if there is none).
    """
    # Iterating the element yields its direct children (getchildren() is
    # deprecated in lxml).
    children = [c.tag for c in e if not c.tag.startswith('PUNCT-')]  # excludes punct
    count = 0
    in_group = False

    for current, following in zip(children, children[1:]):
        if current in clause_tags and following in clause_tags:
            # The first pair of a run accounts for both of its members,
            # every further pair adds one more member.
            count += 1 if in_group else 2
            in_group = True
        else:
            in_group = False
    return count

Analysis Functions

In [92]:
clause_tags = ['S', 'SBARQ', 'SINV'] # Not included: 'SQ', 'SBAR'
# Matches one clause-tagged ancestor step of an element path such as
# '/S/S[2]/VP/VBZ'. The optional '[n]' covers the positional index that lxml's
# getpath() adds when siblings share a tag, and the '(?=/)' lookahead requires
# a following step, so only ancestors are counted, never the final path step.
clause_re = re.compile('/(?:' + '|'.join(re.escape(t) for t in clause_tags) + r')(?:\[\d+\])?(?=/)')
pronoun_tags = ['PRP', 'PRPS']

def constituency_analysis(sent):
    """Add one sentence's constituency-tree statistics to the module-level counters.

    The sentence's parse string (``sent._.parse_string``) is converted to XML
    with ``sexp_to_xml`` and walked once. Nothing is returned; the globals
    declared below are updated in place and must be initialised by the caller.

    Updated counters:
        num_sbar, num_unk, pronoun_sum: nodes tagged 'SBAR', 'UNK', or with a
            tag in ``pronoun_tags``.
        paratactic_sum_strict: sum of ``find_parataxis_strict`` over all nodes.
        num_clauses: nodes whose tag is in ``clause_tags``.
        clause_length_sum: non-punctuation leaves below each clause node.
        paratactic_sum_loose: clause-tagged direct children of clause nodes.
        num_nps, num_leaf_nps, np_leaf_sum: NP nodes, NP nodes whose
            descendants are all leaves, and leaves below NP nodes that are
            neither punctuation nor 'DT'.
        depth_sum, max_depth, clause_depth_sum, max_clause_depth: per-leaf
            depth and number of clause ancestors, summed and maximised.

    Args:
        sent: A parsed sentence exposing ``_.parse_string``.
    """
    global num_clauses, num_sbar, num_unk, depth_sum, max_depth, max_clause_depth, clause_depth_sum
    global pronoun_sum, num_leaf_nps, num_nps, np_leaf_sum, clause_length_sum
    global paratactic_sum_loose, paratactic_sum_strict

    xml = sexp_to_xml(sent._.parse_string)
    root = etree.fromstring(xml)
    tree = etree.ElementTree(root)

    # A node is treated as a leaf (a word) when it carries text; the XML has
    # all spaces stripped, so nodes that only contain child elements have none.
    for e in root.iter():
        tag = e.tag
        num_sbar += int(tag == 'SBAR')
        pronoun_sum += int(tag in pronoun_tags)
        num_unk += int(tag == 'UNK')
        paratactic_sum_strict += find_parataxis_strict(e)

        if tag in clause_tags:
            num_clauses += 1
            # Clause length counts the words below the clause: leaves whose
            # tag is not punctuation. (Internal nodes must not be counted.)
            clause_length_sum += sum(
                1 for d in e.iterdescendants()
                if d.text and not d.tag.startswith('PUNCT-')
            )
            paratactic_sum_loose += sum(1 for c in e if c.tag in clause_tags)
        elif tag == 'NP':
            num_nps += 1
            is_leaf_np = True
            for c in e.iterdescendants():
                if c.text:
                    if not (c.tag.startswith('PUNCT-') or c.tag == 'DT'):  # ignore determiners and punctuation
                        np_leaf_sum += 1
                else:
                    # A descendant without text is an internal node.
                    is_leaf_np = False
            if is_leaf_np:
                num_leaf_nps += 1

        if e.text:
            path = tree.getpath(e)

            # One '/' per path step, so the root element has depth 1.
            depth = path.count('/')
            depth_sum += depth
            max_depth = max(max_depth, depth)

            clause_depth = len(clause_re.findall(path))
            clause_depth_sum += clause_depth
            max_clause_depth = max(max_clause_depth, clause_depth)

def dependency_analysis(sent):
    """Add one sentence's token-level statistics to the module-level counters.

    Stop words are counted over every token. All other counters only consider
    tokens that are neither punctuation nor whitespace ("words"): the number
    of words, the summed distance between each word and its head, and the
    number of words positioned before the sentence root. Words that do not
    look like numbers are additionally counted and their lower-cased forms
    collected, without duplicates, in ``uniq_words``.

    Args:
        sent: Iterable of tokens that also exposes the root token as ``root``.
    """
    global dep_dist_sum, num_words, words_before_root_sum, uniq_words, num_words_no_nums, num_stop_words

    for token in sent:
        if token.is_stop:
            num_stop_words += 1

        # Punctuation and whitespace tokens are not words.
        if token.is_punct or token.is_space:
            continue

        num_words += 1
        dep_dist_sum += abs(token.head.i - token.i)

        if token.i < sent.root.i:
            words_before_root_sum += 1

        # Number-like tokens are left out of the vocabulary counts.
        if token.like_num:
            continue

        lowered = token.lower_
        if lowered not in uniq_words:
            uniq_words.append(lowered)
        num_words_no_nums += 1

def ted_analysis(sent1, sent2):
    """Add the tree edit distance between two sentences' parses to ``ted_sum``.

    Each sentence's parse string is reduced to a label-only tree with
    ``apted_format`` and the two trees are compared with APTED. Nothing is
    returned; the module-level ``ted_sum`` is increased by the distance.

    Args:
        sent1: First parsed sentence (exposes ``_.parse_string``).
        sent2: Second parsed sentence (exposes ``_.parse_string``).
    """
    global ted_sum

    first_tree, second_tree = (
        Tree.from_text(apted_format(s._.parse_string)) for s in (sent1, sent2)
    )
    ted_sum += APTED(first_tree, second_tree).compute_edit_distance()

In [97]:
def _safe_div(numerator, denominator):
    """Return ``numerator / denominator``, or NaN when the denominator is zero."""
    return numerator / denominator if denominator else float('nan')


# file name -> metric dict; converted to one DataFrame column per file at the end
summaries = {}

# Modes: 'combinations', 'adjacent'
ted_mode = 'combinations'
log = ""

# Sorted by name so the column order of the results is the same on every run.
for file in sorted(os.scandir('text_jsons/'), key=lambda entry: entry.name):
    if not file.is_file():
        continue

    file_time = time.perf_counter()
    file_name = re.sub(r'\.json$', '', file.name)
    with open(file, encoding='utf-8') as f:
        metadata = json.load(f)
    text = metadata['text']
    text = text.replace('\n', ' ').strip()
    text = re.sub(r'\s{2,}', ' ', text)

    try:
        doc = nlp(text)
    except ValueError:
        error_text = f"ValueError in '{file_name}'. Likely exists too long sentence. Skipping."
        print(error_text)
        log += error_text + '\n'
        continue
    except Exception:
        # `Exception` rather than a bare except, so interrupting the kernel
        # still stops the run instead of being logged as a skipped file.
        error_text = f"Some other error occured in '{file_name}'. Skipping."
        print(error_text)
        log += error_text + '\n'
        continue

    sents = list(doc.sents)

    # The counters below are module-level names that the analysis functions
    # update through `global`; they are reset for every file.

    # Doc-level
    num_tokens = len(doc)
    num_sents = len(sents)
    ted_sum = 0

    # Constituency
    num_clauses = 0
    num_sbar = 0
    num_unk = 0
    depth_sum = 0
    max_depth = 0
    clause_depth_sum = 0
    max_clause_depth = 0
    clause_length_sum = 0
    pronoun_sum = 0
    num_leaf_nps = 0 # all children are leaves
    num_nps = 0 # all NPs
    np_leaf_sum = 0 # number of leaf descendants a NP has (i.e., number of modifying words)
    paratactic_sum_strict = 0 # excludes coordinating conjunctions
    paratactic_sum_loose = 0 # includes

    # Dependency
    dep_dist_sum = 0
    num_words = 0
    num_words_no_nums = 0
    uniq_words = []
    words_before_root_sum = 0 # Root as in word whose head is self
    num_stop_words = 0

    # Timings are accumulated over all sentences of the file.
    constit_time = 0.0
    dep_time = 0.0
    for sent in sents:
        start = time.perf_counter()
        constituency_analysis(sent)
        constit_time += time.perf_counter() - start

        start = time.perf_counter()
        dependency_analysis(sent)
        dep_time += time.perf_counter() - start

    # TREE EDIT DISTANCE
    ted_time = time.perf_counter()
    if ted_mode == 'adjacent':
        for sent1, sent2 in zip(sents, sents[1:]):
            ted_analysis(sent1, sent2)
        # NaN when the document has fewer than two sentences (no pairs).
        ted_avg = _safe_div(ted_sum, max(num_sents - 1, 0))
    elif ted_mode == 'combinations':
        for sent1, sent2 in combinations(sents, 2):
            ted_analysis(sent1, sent2)
        ted_avg = _safe_div(ted_sum, comb(num_sents, 2))
    else:
        print('Invalid ted_mode:', ted_mode)
        ted_avg = -1
    ted_time = time.perf_counter() - ted_time

    # Ratios use _safe_div so that a document without clauses, NPs, words or
    # sentences yields NaN for that metric instead of aborting the whole run.
    summary = {
        # File-level
        'date' : metadata['date'],
        'pres_name' : metadata['pres_name'],
        'byline' : metadata['byline'],
        'title' : metadata['title'],

        # Doc-level
        'num_tokens' : num_tokens,
        'num_sentences' : num_sents,
        f'avg_tree_edit_dist_{ted_mode}' : ted_avg,

        # Constituency
        'avg_node_depth' : _safe_div(depth_sum, num_tokens),
        'max_node_depth' : max_depth, # Equivalent to tree height
        'avg_node_clause_depth' : _safe_div(clause_depth_sum, num_tokens),
        'max_node_clause_depth' : max_clause_depth,
        'avg_clause_length' : _safe_div(clause_length_sum, num_clauses),
        'clauses_per_sent' : _safe_div(num_clauses, num_sents),
        'sbars_per_sent' : _safe_div(num_sbar, num_sents),
        'pronouns_per_sent' : _safe_div(pronoun_sum, num_sents),
        'pronouns_per_clause' : _safe_div(pronoun_sum, num_clauses),
        'pronoun_prop_of_leaf_nps' : _safe_div(pronoun_sum, num_leaf_nps),
        'avg_num_np_modifiers' : _safe_div(np_leaf_sum, num_nps),
        'parataxis_per_sent_strict' : _safe_div(paratactic_sum_strict, num_sents),
        'parataxis_per_sent_loose' : _safe_div(paratactic_sum_loose, num_sents),
        'num_unk' : num_unk,

        # Dependency
        'num_words' : num_words,
        'avg_dependency_distance' : _safe_div(dep_dist_sum, num_words),
        'avg_sentence_length_by_tok' : _safe_div(num_tokens, num_sents),
        'avg_sentence_length_by_word' : _safe_div(num_words, num_sents),
        'avg_words_before_root' : _safe_div(words_before_root_sum, num_sents),
        'num_uniq_words' : len(uniq_words),
        'proportion_uniq' : _safe_div(len(uniq_words), num_words_no_nums),
        'stop_words_per_clause' : _safe_div(num_stop_words, num_clauses),
        'stop_words_per_sentence' : _safe_div(num_stop_words, num_sents),

        # Performance time
        'constituency_analysis_time' : constit_time,
        'dependency_analysis_time' : dep_time,
        'tree_edit_distance_time' : ted_time,
        'total_file_analysis_time' : time.perf_counter() - file_time
    }

    summaries[file_name] = summary

# Build the frame once: one column per file, one row per metric.
results = pd.DataFrame(summaries)

print("Error Log:")
print(log)
pd.set_option('display.precision', 2)
print(results)
os.makedirs('results', exist_ok=True)
results.to_csv(f"results/{datetime.now().strftime('%m-%d-%Y_%H-%M')}.csv")




ValueError in '1797_Adams_Inaugural_Address'. Likely exists too long sentence. Skipping.




Error Log:
ValueError in '1797_Adams_Inaugural_Address'. Likely exists too long sentence. Skipping.

                                               1789_Washington_Inaugural_Address  \
date                                                                  1789-04-30   
pres_name                                                      George Washington   
byline                           1st President of the United States: 1789 ‐ 1797   
title                                                          Inaugural Address   
num_tokens                                                                  1546   
num_sentences                                                                 23   
avg_tree_edit_dist_combinations                                           143.17   
avg_node_depth                                                             12.68   
max_node_depth                                                                50   
avg_node_clause_depth                                      

- vocab measures
    - measures of polysemy
    - age of acquisition
    - word frequency
- weight ted differently --> should adding/removing be weighted less because it simply indicates a different length sentence?
    - how to remove determiners?
- preprocess out stop words
- make scraper more robust
- unit testing for parataxis measures


Workaround for the parser's maximum sentence length (the error is raised while running the spaCy pipeline)? <code>ValueError: Sentence of length 965 (in sub-word tokens) exceeds the maximum supported length of 512</code>

- View: https://scholarworks.gsu.edu/cgi/viewcontent.cgi?article=1035&context=alesl_diss
- View: http://cohmetrix.memphis.edu/cohmetrixhome/documentation_indices.html#Complexity
- MORE RESEARCH NEEDED: compare tree similarity (SEARCH TREE EDIT DISTANCE) of sentences in doc, pq-gram distance
(CITING APTED: https://pypi.org/project/apted/#description)

Scraper

In [None]:
from bs4 import BeautifulSoup
import requests

# Directory the scraped JSON files are written to, and whether a file that
# already exists under the same name is replaced.
folder_path = 'spoken_addresses/'
overwrite = True

# Site root that is prepended to the relative links found on the pages.
link_head = 'https://www.presidency.ucsb.edu'

# First listing page of the crawl. Alternative start page (inaugural addresses only):
# nav_link = 'https://www.presidency.ucsb.edu/documents/app-categories/spoken-addresses-and-remarks/presidential/inaugural-addresses'

# All spoken addresses:
nav_link = 'https://www.presidency.ucsb.edu/documents/app-categories/presidential/spoken-addresses-and-remarks?items_per_page=60'

# Create the output directory on first use.
output_dir_missing = not os.path.exists(folder_path)
if output_dir_missing:
    os.makedirs(folder_path)

def request_persistant(link, retry_delay=1, timeout=30):
    """GET ``link``, retrying until a response is received.

    Only request failures (``requests.RequestException``, which includes
    timeouts and connection errors) trigger a retry. Anything else, notably a
    keyboard interrupt, propagates so the crawl can be stopped. The HTTP
    status of the response is not checked.

    Args:
        link: URL to fetch.
        retry_delay: Seconds to wait before the next attempt.
        timeout: Seconds to wait for the server before the attempt counts as
            failed; without a timeout a stalled connection would block forever.

    Returns:
        The response object returned by ``requests.get``.
    """
    # A loop instead of recursion: a long outage must not exhaust the stack.
    while True:
        try:
            return requests.get(link, timeout=timeout)
        except requests.RequestException:
            print('Request error. Retrying.')
            time.sleep(retry_delay)

def save_file(data):
    """Write one scraped document to ``folder_path`` as an indented JSON file.

    The file is named ``{year}_{last name}_{first words of the title}.json``
    with whitespace replaced by underscores and characters that are not
    allowed in file names removed. When the module-level ``overwrite`` is
    false and the name is taken, a counter is inserted before the extension
    (``name_1.json``, ``name_2.json``, ...) so the file keeps its ``.json``
    suffix; otherwise an existing file is replaced.

    Args:
        data: Dict with at least the string keys 'pres_name', 'date' and
            'title'; the whole dict is serialised.
    """
    last_name = re.sub(r'.* (.*)', r'\g<1>', data['pres_name'])
    year = re.sub(r'(\d+)-\d+-\d+.*', r'\g<1>', data['date'])
    # Up to five leading words of the title; a title that does not start with
    # a word character is left unchanged by this substitution.
    title_clipped = re.sub(r'^((?:\w+\s?){1,5}).*', r'\g<1>', data['title']).strip()

    base_name = re.sub(r'\s', '_', f'{year}_{last_name}_{title_clipped}')
    base_name = re.sub(r'[\\/:*?"<>|]', '', base_name)  # not valid in file names
    stem = folder_path + base_name
    save_name = stem + '.json'

    if not overwrite:
        counter = 0
        while os.path.exists(save_name):
            counter += 1
            save_name = f'{stem}_{counter}.json'

    with open(save_name, mode='w', encoding='utf-8') as f:
        json.dump(data, f, indent=4)

def _stripped_text(element):
    """Return the element's text without surrounding whitespace, or '' if it was not found."""
    return element.text.strip() if element is not None else ''


# Walk the listing pages one by one, saving every linked document, until a
# listing page has no 'next' link.
has_next_nav = True
while has_next_nav:
    nav = request_persistant(nav_link)
    nav_soup = BeautifulSoup(nav.text, 'lxml')

    # Searches for <a> within 'field-title' class, pulls 'href' and adds website head
    page_links = []
    for container in nav_soup.find_all(class_='field-title'):
        anchor = container.find('a')
        if anchor is not None and anchor.get('href'):
            page_links.append(link_head + anchor['href'])

    for link in page_links:
        page = request_persistant(link)
        page_soup = BeautifulSoup(page.text, 'lxml')

        # Every field falls back to '' when its element is missing on the page.
        pres_name = _stripped_text(page_soup.find(class_='diet-title'))
        byline = _stripped_text(page_soup.find(class_='diet-by-line president'))
        title = _stripped_text(page_soup.find(class_='field-ds-doc-title'))

        date_element = page_soup.find(class_='date-display-single')
        date_raw = date_element.get('content') if date_element is not None else None
        date = re.sub(r'(\d+-\d+-\d+).*', r'\g<1>', date_raw).strip() if date_raw is not None else ''

        garbage_collected = []
        paragraph_containers = page_soup.find(class_='field-docs-content')
        paragraphs_raw = paragraph_containers.find_all('p') if paragraph_containers is not None else []
        text_list = []
        for paragraph in paragraphs_raw:
            if len(paragraph.contents) > 1:
                # Join the pieces with spaces so text on either side of an
                # inline tag does not run together.
                contents = [c.text for c in paragraph.contents]
                content = ' '.join(contents)
            else:
                content = paragraph.text

            # Move every bracketed note (e.g. "[Laughter]") out of the text
            # and keep it in 'garbage_collected'.
            bracketed = re.findall(r'\[[^\]]*\]', content)
            if bracketed:
                garbage_collected.extend(bracketed)
                content = re.sub(r'\[[^\]]*\]', ' ', content)

            content = re.sub(r'\s{2,}', ' ', content)
            text_list.append(content)
        text = '\n\n'.join(text_list)

        data = {
            'pres_name' : pres_name,
            'byline' : byline,
            'title' : title,
            'date' : date,
            'text' : text,
            'garbage_collected' : garbage_collected
        }

        save_file(data)

    next_container = nav_soup.find(class_='next')
    next_anchor = next_container.find('a') if next_container else None
    if next_anchor is not None and next_anchor.get('href'):
        nav_link = link_head + next_anchor['href']
    else:
        has_next_nav = False
