In [1]:
import json
import os
from glob import glob
from tqdm import tqdm
from collections import defaultdict
import pickle
import argparse
import random
from transformers import BertTokenizer

2022-02-18 22:57:26.420014: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-02-18 22:57:26.420075: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [2]:
# Input and output currently share the same preprocessed-data directory.
indir = outdir = os.path.join('data', 'preprocessed')

# Files read by the cells below: the COFEA JSON-lines file, the list of terms
# of interest, and the pickled target-word index.
data_file = os.path.join(indir, 'cofea.jsonlist')
special_terms_file = os.path.join('data', 'interest_terms.txt')
target_index_file = os.path.join(indir, 'target_word_index.dict')

# Source / year filters and sample size.
# NOTE(review): these three are not referenced by the cells shown here.
filter_source = [
    'Evans Early American Imprints',
    'HeinOnline',
    'National Archives Founders Online',
]
filter_year = [1750, 1810]
sample_size = 5000

# Seed the stdlib RNG and load the tokenizer used to normalise the terms.
random.seed(42)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [3]:
# Read the COFEA file: one JSON document per line (each line is parsed with
# json.loads in later cells). Lines are kept as-is and in file order because
# later cells identify a document by its position in this list.
# The encoding is explicit so decoding does not depend on the platform default,
# matching how the terms file is opened.
with open(data_file, encoding='utf-8') as f:
    cofea_data = f.readlines()

In [4]:
def _merge_wordpieces(pieces):
    """Glue '##'-prefixed continuation pieces onto the piece before them.

    The '##' marker is kept in the merged text (e.g. 'export##ation'), and the
    resulting words are joined with single spaces.
    """
    merged = []
    for piece in pieces:
        # A continuation piece can only attach if something precedes it.
        if merged and piece.startswith('##'):
            merged[-1] += piece
        else:
            merged.append(piece)
    return ' '.join(merged)


# Index of every target term, loaded from the pickled file.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open(target_index_file, 'rb') as f:
    target_index = pickle.load(f)

# Terms of interest, one per line.
with open(special_terms_file, 'r', encoding='utf-8') as f:
    special_terms = f.read().splitlines()

# The index was built and saved on tokenized text, so tokenize the terms too.
special_terms = [tokenizer.tokenize(x) for x in special_terms]

# Re-join the wordpieces into words and de-duplicate.
special_terms_cleaned = set(map(_merge_wordpieces, special_terms))

In [5]:
# Split the Evans documents into those dated before 1787 and those dated 1787
# or later. Filtering the documents up front is faster than checking each
# document while counting.
pre_constitution_docs = set()
post_constitution_docs = set()
for line_no, raw_line in enumerate(cofea_data):
    record = json.loads(raw_line)
    if record['source'] != 'Evans Early American Imprints':
        continue
    if record['year'] < 1787:
        pre_constitution_docs.add(line_no)
    else:
        post_constitution_docs.add(line_no)

# Per-term occurrence counts on each side of the split. A term only gets an
# entry once it has been counted at least once.
pre_constitution_frequency = defaultdict(int)
post_constitution_frequency = defaultdict(int)

for term in tqdm(special_terms_cleaned):
    for _f_ind, doc_id, _ind in target_index[term]:
        # Occurrences in documents outside both Evans sets are ignored.
        if doc_id in pre_constitution_docs:
            pre_constitution_frequency[term] += 1
        elif doc_id in post_constitution_docs:
            post_constitution_frequency[term] += 1

100%|█████████████████████████████████████████████████████████████████| 72/72 [00:00<00:00, 354.06it/s]


In [6]:
# Occurrence counts per term in Evans documents dated before 1787.
pre_constitution_frequency

defaultdict(int,
            {'export##ation': 413,
             'cruel and unusual': 2,
             'good behavior': 26,
             'recess': 187,
             'imp##ost##s': 114,
             'united states': 1449,
             'confederation': 179,
             'pi##rac##ies': 61,
             'in law': 786,
             'pardon##s': 210,
             'controversies': 592,
             'for##feit##ure': 566,
             'confronted': 37,
             'due process': 23,
             'compact': 663,
             'public use': 26,
             'north carolina': 94,
             'declare war': 34,
             'executive power': 183,
             'south carolina': 101,
             'imminent danger': 155,
             'pro tempo##re': 32,
             'indictment': 651,
             'bills of credit': 880,
             'privileges': 4002,
             'suppress insurrection##s': 2,
             'criminal prosecution##s': 14,
             'indian tribes': 51,
             'ambassador

In [7]:
# Occurrence counts per term in Evans documents dated 1787 or later.
post_constitution_frequency

defaultdict(int,
            {'export##ation': 465,
             'cruel and unusual': 9,
             'appellate jurisdiction': 49,
             'good behavior': 28,
             'recess': 413,
             'imp##ost##s': 191,
             'united states': 13484,
             'confederation': 595,
             'pi##rac##ies': 87,
             'direct taxes': 29,
             'in law': 343,
             'pardon##s': 174,
             'controversies': 299,
             'for##feit##ure': 205,
             'confronted': 67,
             'due process': 13,
             'compact': 746,
             'public use': 53,
             'north carolina': 260,
             'declare war': 107,
             'executive power': 229,
             'south carolina': 378,
             'imminent danger': 195,
             'pro tempo##re': 31,
             'indictment': 818,
             'bills of credit': 148,
             'privileges': 2307,
             'capita##tion': 34,
             'suppress insurrectio

In [8]:
# Terms of interest that were never counted on one side of the split.
# A membership test on a defaultdict does not create an entry, so only terms
# that were actually incremented count as present.
missing_pre = [
    candidate
    for candidate in special_terms_cleaned
    if candidate not in pre_constitution_frequency
]
missing_post = [
    candidate
    for candidate in special_terms_cleaned
    if candidate not in post_constitution_frequency
]

In [9]:
# Terms with no counted occurrence in the pre-1787 Evans documents.
missing_pre

['appellate jurisdiction',
 'direct taxes',
 'capita##tion',
 'confrontation',
 'ha##be##as corpus',
 'maritime jurisdiction',
 'rep##el invasions',
 'receive ambassadors',
 'here##in granted',
 'over##t act']

In [10]:
# Terms with no counted occurrence in the Evans documents dated 1787 or later.
missing_post

['ha##be##as corpus', 'rep##el invasions', 'here##in granted', 'over##t act']

In [12]:
# Count, for every term of interest, how many of its indexed occurrences fall
# in each decade, across all of COFEA (no source filter).

# decade_docs: decade -> set of document positions in cofea_data.
# doc_decade:  document position -> decade, so each occurrence is resolved
#              with one lookup instead of scanning every decade's set.
decade_docs = defaultdict(set)
doc_decade = {}
for x, doc in enumerate(cofea_data):
    decade = json.loads(doc)['decade']
    decade_docs[decade].add(x)
    doc_decade[x] = decade
decades = set(decade_docs)

# decade_frequency: term -> {decade: occurrence count}. Every term gets an
# entry, which stays empty if none of its occurrences map to a known document.
decade_frequency = {}

for word in tqdm(special_terms_cleaned):
    counts = decade_frequency[word] = {}
    for f_ind, doc_ind, ind in target_index[word]:
        # Skip occurrences whose document is not in cofea_data.
        if doc_ind in doc_decade:
            decade = doc_decade[doc_ind]
            counts[decade] = counts.get(decade, 0) + 1
                

    
     

100%|██████████████████████████████████████████████████████████████████| 72/72 [00:01<00:00, 65.89it/s]


In [13]:
# Per-term mapping of decade -> occurrence count across all COFEA documents.
decade_frequency

{'export##ation': {1720: 24,
  1770: 871,
  1780: 504,
  1790: 1714,
  1800: 229,
  1710: 3,
  1760: 167,
  1830: 242,
  1690: 28,
  1810: 78,
  1750: 20,
  1740: 53,
  1680: 4,
  1730: 7,
  1700: 5,
  1820: 11,
  1660: 4,
  1670: 1},
 'cruel and unusual': {1790: 15, 1780: 11, 1770: 2, 1760: 1},
 'appellate jurisdiction': {1780: 111, 1800: 9, 1830: 8, 1790: 34, 1770: 1},
 'good behavior': {1770: 45,
  1760: 18,
  1780: 82,
  1820: 12,
  1800: 49,
  1790: 40,
  1830: 15,
  1750: 2,
  1810: 10,
  1690: 1,
  1640: 1},
 'recess': {1790: 831,
  1770: 430,
  1780: 272,
  1830: 42,
  1820: 66,
  1810: 55,
  1760: 51,
  1800: 223,
  1750: 10,
  1730: 3,
  1720: 5,
  1690: 2,
  1680: 2,
  1740: 23,
  1710: 5},
 'imp##ost##s': {1690: 23,
  1780: 416,
  1790: 411,
  1820: 23,
  1770: 153,
  1740: 3,
  1810: 13,
  1750: 5,
  1670: 15,
  1830: 20,
  1800: 31,
  1760: 17,
  1680: 7,
  1660: 1,
  1700: 2,
  1730: 1},
 'united states': {1790: 60554,
  1830: 13904,
  1800: 14699,
  1770: 17649,
  1780: