In [8]:
import numpy as np
from os.path import join,isfile
from os import listdir
from tqdm import tqdm
import json
import pickle
from collections import defaultdict
from os.path import join,isfile
from transformers import BertTokenizer
import random

In [9]:
# Input locations: everything except the term list lives under data/preprocessed.
indir = join('data','preprocessed')
data_file = join(indir, 'cofea.jsonlist')
tokenized_file = join(indir,'cofea_tokenized.jsonlist')
sample_index_file = join(indir,'sample_target_index_2.dict')
kmeans_file = join(indir,'cofea_hist_bert_vecotrs_masked_masked_kmeans_labels.dict')
# did not prune the names from the original list
constitution_terms_file = join('data','constitution_words.txt')

# Tokenizer used to re-tokenize plain-text terms so they match the stored keys.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Seed the global `random` generator so the example sampling is repeatable.
random.seed(42)

In [3]:
# SECURITY NOTE: pickle.load can execute arbitrary code while unpickling, so
# only load files that were produced by this project.

# get the sampled file index
with open(sample_index_file,'rb') as f:
    sample_index = pickle.load(f)

# get the kmeans labels
with open(kmeans_file,'rb') as f:
    kmeans_labels = pickle.load(f)

# cofea file data for dates, source, etc
# Each line is one JSON document. Lines are parsed while the file is read, so
# the raw text and the parsed objects are never both held in memory, and the
# encoding is explicit instead of depending on the platform default.
with open(data_file, encoding='utf-8') as f:
    cofea_data = [json.loads(doc) for doc in f]

# get the tokenized documents (one JSON document per line, parsed the same way)
with open(tokenized_file, encoding='utf-8') as f:
    tokenized_data = [json.loads(doc) for doc in f]

# get the constitution words tokenized and rejoined
# load constitution
with open(constitution_terms_file, 'r',encoding = 'utf-8') as f:
    constitution_terms = f.read().splitlines()
# they were indexed and saved in their tokenized form
constitution_terms = [tokenizer.tokenize(x) for x in constitution_terms]

constitution_terms_cleaned = set()
for term_pieces in constitution_terms:
    # rejoin into concatenated words: a piece starting with '##' continues the
    # previous word (the '##' marker itself is kept in the joined string)
    rejoined_pieces = []
    for piece in term_pieces:
        if rejoined_pieces and piece.startswith('##'):
            rejoined_pieces[-1] += piece
        else:
            rejoined_pieces.append(piece)
    constitution_terms_cleaned.add(' '.join(rejoined_pieces))

KeyboardInterrupt: 

In [None]:
# Collect, for every sampled target word, context snippets grouped two ways:
# by the source of the document and (when labels exist) by k-means cluster.
# Each group is then down-sampled to at most `sample_size` snippets.
sample_size = 10
# minimum threshold
minimum_size = 55 # we do not calculate the silhouette score for terms less than 55
# number of tokens kept before the target position; the slice ends
# `context_window` tokens after it (end index exclusive)
context_window = 50

# Dedicated seeded generator: re-running this cell on its own reproduces the
# same samples instead of depending on how far the global `random` state has
# already advanced.
sampling_rng = random.Random(42)

source_examples = {}
kmeans_examples = {}
for target_word in tqdm(sample_index):
    occurrences = sample_index[target_word]
    by_source = source_examples[target_word] = defaultdict(list)
    # we may not have kmeans labels
    has_kmeans = target_word in kmeans_labels
    if has_kmeans:
        by_cluster = kmeans_examples[target_word] = defaultdict(list)

    # words at or below the threshold keep their (empty) entries
    if len(occurrences) <= minimum_size:
        continue

    # the first element of each index entry is not needed here; it is bound to
    # a named throwaway so IPython's `_` (last output) is not overwritten
    for occurrence_i, (_ignored, doc_index, token_id) in enumerate(occurrences):
        # pull the source data
        doc = cofea_data[doc_index]
        tokens = tokenized_data[doc_index]['tokens']
        # text span; slicing already clamps the upper bound to len(tokens)
        text = tokens[max(token_id - context_window, 0):token_id + context_window]
        # source label
        source = doc['source']
        decade = doc['decade']
        by_source[source].append((text,decade))

        # pull the kmeans label; labels are looked up by the occurrence's
        # position in the sample index
        if has_kmeans:
            klabel = kmeans_labels[target_word][occurrence_i]
            by_cluster[klabel].append((text,source,decade))

    # sample for each source cluster
    for source, candidates in by_source.items():
        if len(candidates) > sample_size:
            by_source[source] = sampling_rng.sample(candidates, sample_size)

    # sample for each kmeans cluster
    if has_kmeans:
        for klabel, candidates in by_cluster.items():
            if len(candidates) > sample_size:
                by_cluster[klabel] = sampling_rng.sample(candidates, sample_size)



In [5]:
# Persist both example dictionaries as pickles next to the preprocessed data.
kmeans_examples_path = join(indir,'histbert_masked_kmeans_examples.dict')
with open(kmeans_examples_path,'wb') as f:
    pickle.dump(kmeans_examples, f)

source_examples_path = join(indir,'histbert_masked_source_examples.dict')
with open(source_examples_path,'wb') as f:
    pickle.dump(source_examples, f)

In [10]:
# Reload the pickled example dictionaries so the sampling step can be skipped
# when it is not being rerun.
kmeans_examples_path = join(indir,'histbert_masked_kmeans_examples.dict')
with open(kmeans_examples_path,'rb') as f:
    kmeans_examples = pickle.load(f)

source_examples_path = join(indir,'histbert_masked_source_examples.dict')
with open(source_examples_path,'rb') as f:
    source_examples = pickle.load(f)

In [11]:
def rejoin_tokens(token_list,source,decade):
    """Format one example as a header plus readable text.

    Pieces that start with '##' are glued onto the preceding word, the words
    are joined with single spaces, and every remaining '##' marker is removed.

    token_list: sequence of string pieces
    source: string shown after 'example source: '
    decade: any value; rendered with str()
    Returns the three-line string 'example source: ...', 'decade: ...', text.
    """
    words = []
    for piece in token_list:
        # a continuation piece needs a previous word to attach to
        if words and piece.startswith('##'):
            words[-1] += piece
        else:
            words.append(piece)
    header = 'example source: '+source+'\n'+'decade: ' + str(decade) + '\n'
    readable_text = ' '.join(words).replace('##','')
    return header + readable_text

In [12]:
def print_examples(word,examples):
    """Print every stored example for `word`, grouped by cluster.

    word: target word or phrase in plain text; it is re-tokenized with
        tokenize_and_rejoin and space-joined to build the lookup key.
    examples: mapping key -> cluster -> list of examples, where an example is
        either (tokens, source, decade) or (tokens, decade).

    Prints to stdout and returns None. Raises KeyError when the key built
    from `word` is not present in `examples`.
    """
    print("Target word: "+word + '\n')
    word = ' '.join(tokenize_and_rejoin(word))
    for cluster in examples[word]:
        print('Cluster: '+str(cluster) + '\n')
        for example in examples[word][cluster]:
            if len(example) == 3:
                tokens,source,decade = example
            else:
                # two-element examples carry no source; pass the token list
                # itself (not the whole tuple) on to rejoin_tokens
                tokens,decade = example
                source = ''
            print(rejoin_tokens(tokens,source,decade)+'\n')
        print('\n')

In [13]:
def tokenize_and_rejoin(word):
    """Tokenize `word` with the notebook's tokenizer and merge sub-word pieces.

    A piece that starts with '##' is appended to the previous entry with the
    '##' marker kept, so the result has one string per whitespace-level word
    (e.g. 'habeas corpus' -> ['ha##be##as', 'corpus']).
    Returns the list of merged strings.
    """
    merged = []
    for piece in tokenizer.tokenize(word):
        # only attach when there is already a word to attach to
        if merged and piece.startswith('##'):
            merged[-1] += piece
        else:
            merged.append(piece)
    return merged

In [22]:
tokenize_and_rejoin('habeas corpus')

['ha##be##as', 'corpus']

In [23]:
print_examples('habeas corpus',kmeans_examples)
"""
Habeas corpus has two distinct clusters, but the usage appears to be the same
"""

Target word: habeas corpus

Cluster: 1

example source: National Archives Founders Online
decade: 1780
to justice . before this could take effect he was apprehended and confined in prison but soon after admitted to bail . as his going at large gave offence he was delivered up by his bail and again confined . but having applied for a writ of habeas corpus he was brought before justice bryan and on his return found means to escape from the officer who had him in custody . however he was again taken in a few days and is now confined and to take his trial on the 24 of this

example source: Evans Early American Imprints
decade: 1790
brought into parliament similar to the irish convention bill , that the people should meet and assert their rights : afterwards the spirit of the motion was retained , but extending to certain other events , in which the convention should likewise meet in case of the suspension of the habeas corpus act , in case of an invasion , and in case of landing foreign tr

'\nHabeas corpus has two discinct clusters, but the usage appears to bethe same\n'

In [24]:
# print_examples writes to stdout itself and has no return value, so it is
# called bare; wrapping it in print() would add a stray "None" line.
print_examples('vacancies',kmeans_examples)

Target word: vacancies

Cluster: 0

example source: National Archives Founders Online
decade: 1790
war department 9 may 1799 . sir . i have been honoured with your letters of the 19th & 27th , and this morning with that of the 29th of april ulto . the inclosed list of names to fill up the vacancies in the 16th regiment of infantry , and the vacancy of major in the 12th regiment is respectfully submitted . the recommendations which governed in forming the list for the former regiment are also inclosed . with respect to the latter , general hamilton

example source: National Archives Founders Online
decade: 1780
order among the troops . and the less is the mass of an undisciplined body , the easier it is to him who has the direction of it , to make it act , & restore it to order , in case of confusion . the vacancies of subalterns in several regiments , appear to be one of the motives , which might the most strongly ingage us to an incorporation . but i believe that the nomination might 

In [8]:
print_examples('united',kmeans_examples)

Target word: united

Cluster: 0

example source: National Archives Founders Online
decade: 1790
the liberty to mention to you that mr woolcott the present auditor would be in every respect worthy of your consideration as his successor in office . now that the event has happened , a concern as anxious as it is natural , for the success of the department united with a sentiment of justice towards mr woolcott leads me to a repetition of that idea — this gentleman ’ s conduct in the station he now fills , has been that of an excellent officer . it has not only been good but distinguished . it has

example source: National Archives Founders Online
decade: 1790
improvement of the commerce and navigation of the same , has had the same under consideration , and thereupon makes the following report , the countries with which the united states have their chief commercial intercourse , are spain , portugal , france , great britain , the united netherlands , denmark , and sweden , and their americ

In [9]:
print_examples('vice',kmeans_examples)

Target word: vice

Cluster: 0

example source: Evans Early American Imprints
decade: 1790
imagine , that the pain of self - denial is confined to virtue . he who follows the world , as much as he who follows christ , must " take up his cross ; " and to him assuredly , it will prove a more oppressive burden . vice allows all our passions to range uncontrolled ; and where each claims to be superior , it is impossible to gratify all . the predominant desire can only be indulged at the expense of its rival . no mortifications which

example source: Evans Early American Imprints
decade: 1790
misfortunes ; but they are , at the same time , the source of all our pleasures ; therefore , the study of our lives ought to be , not to dissemble an absence of passion ; but to repel those , which lead to vice , by those which direct to virtue . your visit was a most grateful circumstance ; there is a pleasure in seeing you , even here , though intermingled with grief , which is a great alleviation to

In [25]:
print_examples('corpus',kmeans_examples)
# first cluster is only habeas corpus
# second cluster has 3 mentions of habeas corpus
# mainly corpus juris is common

Target word: corpus

Cluster: 0

example source: National Archives Founders Online
decade: 1800
. we direct the officer to enquire into the fact of infancy , & if he believes him under age he discharges him . if he believes him of full age , we advise the parent etc . that he may take out a habeas corpus & have the fact tried before an impartial judge . if enlisted with the consent of the parent etc . it must be by indentures as prescribed by law for an apprentice or servant , this being the only mode of obligation in which the

example source: National Archives Founders Online
decade: 1800
i was in europe when the constitution was planned & established , and never saw it till after it was established . on receiving it i wrote urging the want of provision for the freedom of religion , freedom of the press , trial by jury , habeas corpus , the substitution of militia for a standing army , and an express reservation to the states of all rights not specifically granted to the union . he a

In [26]:
print_examples('emolument',kmeans_examples)
# first cluster: private emolument 6 cluster is not 
# second cluster emoluments provided by government 

Target word: emolument

Cluster: 0

example source: Evans Early American Imprints
decade: 1800
another ' s hand — ' i must go — so and so — such business calls me . ' another replied — ' i should have been at such a place . ' every one appeared to be running after some object that occupied his mind , and brought emolument to himself . ' and where shall i go ! ' sighed i . " in king street i met a mob , dragging a poor ragged wretch to justice , who had been detected in picking a pocket . i mingled with the

example source: National Archives Founders Online
decade: 1800
proposal , and promised to bear all his expences . he came accordingly with his wife and one child an amiable and beautiful daughter of about twelve years of age and took the oversight of my servants and family affairs to my great relief and their considerable emolument . it was a regular and a virtuous family as far as ever i observed or heard . this madam is the mr dumas , whom you have transmitted to posterity , in so

In [28]:
print_examples('born',kmeans_examples)

Target word: born

Cluster: 0

example source: Evans Early American Imprints
decade: 1750
, yea , in the former part of his life , were wicked . it appears , that his eldest son , cain , was a very wicked man , who slew his righteous brother abel . and adam lived an hundred and thirty years before seth was born : and by that time , we may suppose , his posterity began to be considerably numerous : when he was born , his mother called his name seth ; for god , said she , hath appointed me another seed , in stead of abel

example source: Evans Early American Imprints
decade: 1790
. lewis xvi . was literally afraid of hell , the horns and hoofs of the devil , and excommunication , and with all this it was impossible he should be any thing but a poor creature of a king . if he had been born two hundred years earlier , and had had a reasonable wife , he would have made no more noise in the world than other princes of his line , who have passed across the stage without doing either much good

In [7]:
print_examples('corpus',kmeans_examples)

Target word: corpus

Cluster: 0

example source: National Archives Founders Online
decade: 1800
. we direct the officer to enquire into the fact of infancy , & if he believes him under age he discharges him . if he believes him of full age , we advise the parent etc . that he may take out a habeas corpus & have the fact tried before an impartial judge . if enlisted with the consent of the parent etc . it must be by indentures as prescribed by law for an apprentice or servant , this being the only mode of obligation in which the

example source: National Archives Founders Online
decade: 1800
i was in europe when the constitution was planned & established , and never saw it till after it was established . on receiving it i wrote urging the want of provision for the freedom of religion , freedom of the press , trial by jury , habeas corpus , the substitution of militia for a standing army , and an express reservation to the states of all rights not specifically granted to the union . he a

In [15]:
print_examples('impeachments',kmeans_examples)

Target word: impeachments

Cluster: 0

example source: Evans Early American Imprints
decade: 1770
the council . section the twenty - second . every officer of state , whether judicial or [UNK] , shall be liable to be impeached by the [UNK] assembly , either when in office , or after his resignation , or removal for mal - administration : all impeachments shall be before the president or vice - president and council , who shall hear and determine the same . section the twenty - third . the judges of the supreme court of judicature shall have fixed salaries , be commissioned for seven

example source: Evans Early American Imprints
decade: 1790
bank members , than to permit it to plunder the states of their several quotas . 8 . it was evidently designed , that the senate as judges of impeachments , should be constitutionally preserved in a state of impartiality . impeachments originate in the house of representatives , and the crimes to be restrained by this process , will mostly be compr

In [18]:
print_examples('reprisal',kmeans_examples)

Target word: reprisal

Cluster: 1

example source: National Archives Founders Online
decade: 1770
by surprise , started hostilities , and blustered toward brest , as soon as they saw the french fleet leave brest to overtake them , they retired and sought safety in the english ports . i hope that , in turn , the french fleet will make a reprisal and find a few british men - of - war to capture , besides what they could get out of the fleets coming back from the two indies . i was very happy to see mention , in your account [ liste ] of the arrival

example source: Evans Early American Imprints
decade: 1790
to the executive powers ] have been already noticed — the participation of the senate in the appointment of officers , and the making of treaties . a third remains to be mentioned — the right of the legislature to declare war , and grant letters of marque and reprisal . " again — " it deserves to be remarked , that as the participation of the senate in the making treaties , and the po