In [1]:
import json
import os
from glob import glob
from tqdm import tqdm
from collections import defaultdict
import pickle
import argparse
import random

In [2]:
# --- Paths ------------------------------------------------------------------
# Inputs are read from, and the sampled index is written to, data/preprocessed.
indir = os.path.join("data", "preprocessed")
outdir = os.path.join("data", "preprocessed")
data_file = os.path.join(indir, "cofea.jsonlist")
target_index_file = os.path.join(indir, "target_word_index.dict")
outfile = os.path.join(outdir, "sample_target_index.dict")

# --- Filters ----------------------------------------------------------------
# Only records whose 'source' is one of these are eligible for sampling.
filter_source = [
    "Evans Early American Imprints",
    "HeinOnline",
    "National Archives Founders Online",
]
# Inclusive [low, high] bounds compared against each record's 'decade' field.
filter_year = [1750, 1810]

# --- Sampling ---------------------------------------------------------------
# Upper bound on the number of occurrences kept per target word.
sample_size = 5000
# Fixed seed so that random.sample draws the same occurrences on every run.
random.seed(42)

In [3]:
# Read the corpus as one raw JSON string per line; each line is parsed later
# with json.loads. JSON text is UTF-8, so decode explicitly instead of relying
# on the platform's locale default encoding.
with open(data_file, encoding='utf-8') as f:
    cofea_data = f.readlines()

In [4]:
# Load the pickled target-word index; it is iterated word by word when sampling.
# NOTE(review): pickle.load can execute arbitrary code -- only use a trusted file.
with open(target_index_file, mode='rb') as index_fh:
    target_index = pickle.load(index_fh)

In [5]:
# Line numbers (positions in cofea_data) of every record that comes from one of
# the wanted sources and whose 'decade' lies inside the inclusive year window.
elligible_docs = {
    line_no
    for line_no, record in enumerate(map(json.loads, cofea_data))
    if record['source'] in filter_source
    and filter_year[0] <= record['decade'] <= filter_year[1]
}

In [6]:
# For every target word, keep at most sample_size of its eligible occurrences.
sampled_target = {}
for word in tqdm(target_index):
    # Keep occurrences that sit in an eligible document at a position > 2.
    # NOTE(review): the earlier comment said "not the first two words", but
    # `ind > 2` also drops position 2 if positions are 0-based -- confirm intent.
    candidates = [
        (f_ind, doc_ind, ind)
        for f_ind, doc_ind, ind in target_index[word]
        if doc_ind in elligible_docs and ind > 2
    ]

    # Down-sample only when there are more candidates than the cap; otherwise
    # the full candidate list is stored as-is.
    sampled_target[word] = (
        candidates
        if len(candidates) <= sample_size
        else random.sample(candidates, sample_size)
    )

100%|███████████████████████████████████████| 1055/1055 [00:15<00:00, 67.29it/s]


In [7]:
# Persist the per-word sampled occurrences as a pickle file.
with open(outfile, mode='wb') as out_fh:
    pickle.dump(sampled_target, out_fh)

In [8]:
# Sanity check of the sampled examples: load the first tokenized file
# (alphabetical order) as raw JSON lines, one document per line.
tokenized_pattern = os.path.join(indir, '*_tokenized.jsonlist')
tokenized_files = sorted(glob(tokenized_pattern))
if not tokenized_files:
    # Fail with the pattern in the message instead of a bare IndexError on [0].
    raise FileNotFoundError('no tokenized files match ' + tokenized_pattern)
file = tokenized_files[0]
# JSON text is UTF-8; decode explicitly rather than using the locale default.
with open(file, encoding='utf-8') as f:
    docs = f.readlines()

In [9]:
# Print every sampled 'bear arms' occurrence as "<doc_id>: <index>", followed by
# the tokens at index-1, index and index+1 of that document.
# NOTE(review): the first tuple element is ignored and docs holds only one
# tokenized file -- confirm every sampled doc_id refers to that file.
for _, doc_id, index in sampled_target['bear arms']:
    doc = json.loads(docs[doc_id])
    tokens = doc['tokens']
    print(f'{doc_id}: {index}')
    print(' '.join((tokens[index - 1], tokens[index], tokens[index + 1])))

78758: 2143
to bear arms
65555: 333
to bear arms
173206: 101906
to bear arms
174496: 23218
to bear arms
174512: 21505
to bear arms
171447: 10856
to bear arms
132285: 518
to bear arms
174424: 305594
to bear arms
172263: 341414
to bear arms
174570: 40132
to bear arms
169469: 12911
can bear arms
172043: 776
and bear arms
172849: 123746
to bear arms
172319: 43331
to bear arms
174660: 141432
to bear arms
172982: 9538
, bear arms
30279: 202
to bear arms
172849: 106578
to bear arms
171721: 69301
to bear arms
133990: 1588
to bear arms
119841: 129
not bear arms
172849: 16611
and bear arms
174405: 4359
to bear arms
34344: 879
not bear arms
23651: 248
to bear arms
170313: 47642
shall bear arms
174522: 80892
cannot bear arms
62057: 1209
to bear arms
135945: 201
to bear arms
26737: 3493
to bear arms
174424: 251408
to bear arms
172263: 390301
to bear arms
170398: 56254
we bear arms
172105: 1532
to bear arms
171586: 6584
or bear arms
170398: 68259
or bear arms
174681: 92279
to bear arms
174424: 25012

174486: 717761
to bear arms
36942: 190
to bear arms
174405: 1694
to bear arms
171556: 22331
and bear arms
171654: 22726
to bear arms
174424: 243801
to bear arms
174635: 1370361
to bear arms
170860: 98841
to bear arms
55185: 1086
not bear arms
174635: 145881
to bear arms
174486: 272005
to bear arms
149559: 14
to bear arms
95231: 863
to bear arms
169901: 519259
to bear arms
27970: 7529
to bear arms
158218: 1986
not bear arms
170913: 5470
to bear arms
172341: 33999
to bear arms
170403: 4990
not bear arms
169901: 519389
not bear arms
153071: 1255
to bear arms
57783: 689
to bear arms
169563: 4613
##lves bear arms
30078: 255
##s bear arms
135882: 183
to bear arms
174472: 337689
to bear arms
144525: 351
to bear arms
169420: 17728
to bear arms
133507: 598
to bear arms
168927: 197
to bear arms
40106: 7545
to bear arms
172628: 2645
to bear arms
172960: 348457
to bear arms
172849: 91466
to bear arms
168836: 7779
to bear arms
174484: 479200
and bear arms
174473: 2195
to bear arms
172985: 2946
to b