In [1]:
import numpy as np
from os.path import join,isfile
from os import listdir
from tqdm import tqdm
import json
import pickle
from collections import defaultdict
from os.path import join,isfile
from transformers import BertTokenizer
import random

2022-04-18 11:55:00.407486: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-04-18 11:55:00.407574: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [2]:
# Configuration for the notebook: input locations, tokenizer and random seed.
# All preprocessed inputs (and the example files written later) live under `indir`.
indir = join('data','preprocessed')
constitution_terms_file = join('data','constitution_words.txt') # did not prune the names from the original list
# Pickle loaded below as `sample_index`; each entry is unpacked as (_, doc_index, token_id).
sample_index_file = join(indir,'sample_target_index_2.dict')
# JSON-lines file; each document exposes a 'tokens' list that the context windows are cut from.
tokenized_file = join(indir,'cofea_tokenized.jsonlist')
# JSON-lines file; the 'source' and 'decade' fields of each document are read below.
data_file = join(indir, 'cofea.jsonlist')
# Pickle loaded below as `kmeans_labels`.
# NOTE(review): 'vecotrs' looks like a typo but presumably matches the file name on disk — confirm before renaming.
kmeans_file = join(indir,'cofea_hist_bert_vecotrs_masked_masked_kmeans_labels.dict')
# Used below to tokenize the constitution terms.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# Seeds the `random` module so the random.sample() calls below are repeatable.
random.seed(42)

In [3]:
# Load every input the rest of the notebook needs.
#
# NOTE(security): pickle.load can execute arbitrary code while unpickling.
# Only load .dict files that were produced by this project.

# get the sampled file index
with open(sample_index_file,'rb') as f:
    sample_index = pickle.load(f)

# get the kmeans labels
with open(kmeans_file,'rb') as f:
    kmeans_labels = pickle.load(f)

# cofea file data for dates, source, etc
# The JSON-lines files are decoded explicitly as UTF-8 (like the constitution
# file below) so the result does not depend on the platform's default locale,
# and each line is parsed as it is read so the raw text is never held in
# memory alongside the parsed documents.
with open(data_file, encoding='utf-8') as f:
    cofea_data = [json.loads(doc) for doc in f]

# get the tokenized documents
with open(tokenized_file, encoding='utf-8') as f:
    tokenized_data = [json.loads(doc) for doc in f]

# get the constitution words tokenized and rejoined
# load constitution
with open(constitution_terms_file, 'r', encoding='utf-8') as f:
    constitution_terms = f.read().splitlines()
# they were indexed and saved in their tokenized form
constitution_terms = [tokenizer.tokenize(x) for x in constitution_terms]

constitution_terms_cleaned = set()
for pieces in constitution_terms:
    # rejoin into concatenated words; the '##' markers are kept on purpose so
    # the result has the same form as the target-word keys
    # (e.g. 'ha##be##as corpus')
    rejoined_pieces = []
    for piece in pieces:
        if rejoined_pieces and piece.startswith('##'):
            # continuation piece: glue it onto the previous word
            rejoined_pieces[-1] += piece
        else:
            # first piece, or the start of a new word
            rejoined_pieces.append(piece)
    constitution_terms_cleaned.add(' '.join(rejoined_pieces))

In [4]:
# Collect context windows for every sampled occurrence of each target word,
# grouped once by document source and once by k-means label, then keep at
# most `sample_size` randomly chosen examples per group.
sample_size = 10
# minimum threshold
minimum_size = 55 # we do not calculate the silhouette score for terms less than 55

source_examples = {}
kmeans_examples = {}
for target_word in tqdm(sample_index):
    occurrences = sample_index[target_word]

    by_source = defaultdict(list)
    source_examples[target_word] = by_source

    # we may not have kmeans labels for this word
    has_kmeans = target_word in kmeans_labels.keys()
    if has_kmeans:
        by_cluster = defaultdict(list)
        kmeans_examples[target_word] = by_cluster

    # words at or below the threshold keep their (empty) entries only
    if not len(occurrences) > minimum_size:
        continue

    for position, (_, doc_index, token_id) in enumerate(occurrences):
        # source metadata of the document the occurrence came from
        doc = cofea_data[doc_index]
        tokens = tokenized_data[doc_index]['tokens']

        # text span: up to 50 tokens on either side of the occurrence
        window_start = max(token_id-50,0)
        window_end = min(token_id+50,len(tokens))
        text = tokens[window_start:window_end]

        source = doc['source']
        decade = doc['decade']
        by_source[source].append((text,decade))

        # the k-means label is looked up by the occurrence's position in the sample
        if has_kmeans:
            klabel = kmeans_labels[target_word][position]
            by_cluster[klabel].append((text,source,decade))

    # sample for each source cluster
    for source, collected in by_source.items():
        if len(collected) > sample_size:
            by_source[source] = random.sample(collected, sample_size)

    # sample for each kmeans cluster
    if has_kmeans:
        for klabel, collected in by_cluster.items():
            if len(collected) > sample_size:
                by_cluster[klabel] = random.sample(collected, sample_size)



100%|█████████████████████████████████████████| 893/893 [02:05<00:00,  7.11it/s]


In [5]:
# Persist both example collections as pickles next to the other preprocessed files.
for examples_filename, examples_to_save in (
        ('histbert_masked_kmeans_examples.dict', kmeans_examples),
        ('histbert_masked_source_examples.dict', source_examples)):
    with open(join(indir, examples_filename), 'wb') as f:
        pickle.dump(examples_to_save, f)

In [6]:
def rejoin_tokens(token_list,source,decade):
    """Format one example as a printable block of text.

    Pieces that start with '##' are glued onto the preceding piece, the
    resulting words are joined with single spaces, and every remaining '##'
    marker is stripped from the text.

    Args:
        token_list: sequence of token strings.
        source: string shown after 'example source: '.
        decade: value shown after 'decade: ' (converted with str()).

    Returns:
        A three-line string: the source line, the decade line, and the
        rejoined text.
    """
    words = []
    for piece in token_list:
        if words and piece.startswith('##'):
            # continuation piece: extend the word currently being built
            words[-1] += piece
        else:
            # the very first piece, or the start of a new word
            words.append(piece)
    text = ' '.join(words).replace('##','')
    header = 'example source: ' + source + '\n' + 'decade: ' + str(decade) + '\n'
    return header + text

In [7]:
def print_examples(word,examples):
    """Print every stored example for ``word``, grouped by cluster.

    Args:
        word: target word, used as the key into ``examples``.
        examples: mapping of target word -> cluster key -> list of example
            tuples. An example is either ``(tokens, source, decade)`` or
            ``(tokens, decade)``; the two-element form is printed with an
            empty source.

    Returns:
        None. Everything is written to stdout.
    """
    print("Target word: "+word + '\n')
    for cluster in examples[word]:
        print('Cluster: '+str(cluster) + '\n')
        for example in examples[word][cluster]:
            if len(example) == 3:
                tokens,source,decade = example
            else:
                # two-element examples carry no source of their own
                tokens,decade = example
                source = ''
            # always pass the token list itself (the original passed the
            # whole example tuple in the two-element case, which failed)
            print(rejoin_tokens(tokens,source,decade)+'\n')
        print('\n')

In [73]:
print_examples('ha##be##as corpus',kmeans_examples)
"""
Habeas corpus has two distinct clusters, but the usage appears to be the same
"""

Target word: ha##be##as corpus

Cluster: 1

example source: National Archives Founders Online
decade: 1780
the general court setting them down in the same order as they stand in the course of the proceedings . all depositions taken in any suit so sent to be tried at any of the assizes shall be transmitted together with the record . all writs of habeas corpus which shall be sued out during the session of assize shall be returnable before the judges of the circuit in which the prisoner is detained . and the said courts of assize shall have full power to hear & determine all treasons

example source: Evans Early American Imprints
decade: 1790
to been . the following is the statement of the case with the accompanying affidavits . federal district court , for the district of south carolina , 25th july , 1799 . present his honor judge bee . the question before the court was grounded on a habeas corpus , to bring up jonathan robbins , who was committed to gaol in february last , on suspicion 

In [74]:
# print_examples() writes to stdout itself and returns None, so it is called
# directly; wrapping it in print() would only append a stray "None".
print_examples('va##can##cies',kmeans_examples)

Target word: va##can##cies

Cluster: 0

example source: National Archives Founders Online
decade: 1770
expect to know the result of their deliberations in a day or two — it will be right to comply with the order of the board of war & ordinance — and a list may be transmitted of the persons you have judged most proper to fill the vacancies . i am sir — with my best wishes for your recovery your most obbt servt g . w .

example source: National Archives Founders Online
decade: 1810
. milligan , or any other person whom you will name , to come an immediately . indeed it would be well worth while to add to his duty that of covering the books with a little paper ( the good bindings at least ) and filling the vacancies of the presses with paper parings , to be brought from washington . this would add little more to the time , as he could carry on both operations at once . accept the assurances of my constant & affectionate friendship & respect .

example source: National Archives Founders On

In [8]:
# Show the sampled examples of 'united' for each k-means cluster.
print_examples('united',kmeans_examples)

Target word: united

Cluster: 0

example source: National Archives Founders Online
decade: 1790
the liberty to mention to you that mr woolcott the present auditor would be in every respect worthy of your consideration as his successor in office . now that the event has happened , a concern as anxious as it is natural , for the success of the department united with a sentiment of justice towards mr woolcott leads me to a repetition of that idea — this gentleman ’ s conduct in the station he now fills , has been that of an excellent officer . it has not only been good but distinguished . it has

example source: National Archives Founders Online
decade: 1790
improvement of the commerce and navigation of the same , has had the same under consideration , and thereupon makes the following report , the countries with which the united states have their chief commercial intercourse , are spain , portugal , france , great britain , the united netherlands , denmark , and sweden , and their americ

In [9]:
# Show the sampled examples of 'vice' for each k-means cluster.
print_examples('vice',kmeans_examples)

Target word: vice

Cluster: 0

example source: Evans Early American Imprints
decade: 1790
imagine , that the pain of self - denial is confined to virtue . he who follows the world , as much as he who follows christ , must " take up his cross ; " and to him assuredly , it will prove a more oppressive burden . vice allows all our passions to range uncontrolled ; and where each claims to be superior , it is impossible to gratify all . the predominant desire can only be indulged at the expense of its rival . no mortifications which

example source: Evans Early American Imprints
decade: 1790
misfortunes ; but they are , at the same time , the source of all our pleasures ; therefore , the study of our lives ought to be , not to dissemble an absence of passion ; but to repel those , which lead to vice , by those which direct to virtue . your visit was a most grateful circumstance ; there is a pleasure in seeing you , even here , though intermingled with grief , which is a great alleviation to

In [84]:
print_examples('corpus',kmeans_examples)
# first cluster is only habeas corpus
# second cluster has 3 mentions of habeas corpus
# mainly corpus juris is common

Target word: corpus

Cluster: 0

example source: Evans Early American Imprints
decade: 1790
, in this [UNK] , no citizen can be deprived of his liberty , without an avowed and sufficient cause , unless in case of rebellion or invasion , the legislature think the public safety requires it , and suspend the privilege of the writ of habeas corpus . but here the constitution leaves aliens , as in other countries , to the protection of the general principles of the law of [UNK] , or of the particular provisions of treaties made between the [UNK] states , and the government whose subjects or citizens the aliens [UNK]

example source: Evans Early American Imprints
decade: 1780
prosecuting pretended offences , and arbitrary punishments upon arbitrary convictions have ever appeared to me to be the great engines of judicial despotism ; and these have all relation to criminal proceedings . the trial by jury in criminal cases , aided by the habeas corpus act , seems therefore to be alone concerned

In [10]:
print_examples('em##ol##umen##t',kmeans_examples)
# first cluster: private emolument 6 cluster is not 
# second cluster emoluments provided by government 

Target word: em##ol##umen##t

Cluster: 0

example source: Evans Early American Imprints
decade: 1800
another ' s hand — ' i must go — so and so — such business calls me . ' another replied — ' i should have been at such a place . ' every one appeared to be running after some object that occupied his mind , and brought emolument to himself . ' and where shall i go ! ' sighed i . " in king street i met a mob , dragging a poor ragged wretch to justice , who had been detected in picking a pocket . i mingled with the

example source: National Archives Founders Online
decade: 1800
proposal , and promised to bear all his expences . he came accordingly with his wife and one child an amiable and beautiful daughter of about twelve years of age and took the oversight of my servants and family affairs to my great relief and their considerable emolument . it was a regular and a virtuous family as far as ever i observed or heard . this madam is the mr dumas , whom you have transmitted to posterity ,