In [1]:
import json
import os
from glob import glob
from tqdm import tqdm
from collections import defaultdict
import pickle
import argparse
import random

In [2]:
# --- Configuration -----------------------------------------------------------
# Inputs are read from, and the sampled index is written to, data/preprocessed.
indir = outdir = os.path.join('data', 'preprocessed')

# Files read by the cells below.
data_file = os.path.join(indir, 'cofea.jsonlist')
target_index_file = os.path.join(indir, 'target_word_index.dict')

# File the sampled index is pickled to.
outfile = os.path.join(outdir, 'sample_target_index.dict')

# Document filters: allowed values of a document's 'source' field and the
# inclusive [low, high] bounds applied to its 'decade' field.
filter_source = [
    'Evans Early American Imprints',
    'HeinOnline',
    'National Archives Founders Online',
]
filter_year = [1750, 1810]

# Maximum number of occurrences kept per target word.
sample_size = 5000

# Seed the global random generator so that sampling is repeatable when the
# notebook is run top to bottom.
random.seed(42)

In [3]:
# Read the corpus file as raw lines; each line is parsed with json.loads later.
# The encoding is stated explicitly because open() otherwise falls back to the
# platform's locale encoding, which is not UTF-8 everywhere (JSON text is UTF-8
# by specification).
# NOTE(review): assumes the preprocessing step wrote this file as UTF-8 — confirm.
with open(data_file, encoding='utf-8') as f:
    cofea_data = f.readlines()

In [4]:
# Load the pickled index of target words. The sampling cell iterates
# target_index[word] as (f_ind, doc_ind, ind) triples.
# NOTE(review): unpickling can execute arbitrary code; only load index files
# produced by this project's own preprocessing.
with open(target_index_file, 'rb') as index_handle:
    target_index = pickle.load(index_handle)

In [5]:
# Collect the positions (line numbers within cofea_data) of every document
# whose 'source' is one of filter_source and whose 'decade' lies inside the
# inclusive range filter_year[0]..filter_year[1].
# A set is built directly so that later membership tests are cheap.
# (The spelling of the variable name is kept because later cells refer to it.)
elligible_docs = {
    position
    for position, record in enumerate(map(json.loads, cofea_data))
    if record['source'] in filter_source
    and filter_year[0] <= record['decade'] <= filter_year[1]
}

In [6]:
# For each target word keep the occurrences that
#   * fall in a document listed in elligible_docs, and
#   * have a token position strictly greater than 2,
# then cap the kept occurrences at sample_size by drawing a random subset.
# Words with sample_size or fewer candidates keep all of them, in index order.
# NOTE(review): `ind > 2` also discards position 2; whether that matches
# "skip the first two words" depends on positions being 0- or 1-based — confirm.
sampled_target = {}
for word in tqdm(target_index):
    candidates = [
        (f_ind, doc_ind, ind)
        for f_ind, doc_ind, ind in target_index[word]
        if doc_ind in elligible_docs and ind > 2
    ]
    sampled_target[word] = (
        random.sample(candidates, sample_size)
        if len(candidates) > sample_size
        else candidates
    )

100%|██████████████████████████████████████████████████████████| 1055/1055 [00:18<00:00, 56.01it/s]


In [19]:
# Persist the per-word sampled occurrences as a pickle at `outfile`.
with open(outfile, 'wb') as out_handle:
    pickle.dump(sampled_target, out_handle)

In [8]:
# checking the examples
# Load the raw lines of the alphabetically first '*_tokenized.jsonlist' file in
# indir so that sampled occurrences can be inspected by eye.
tokenized_files = sorted(glob(os.path.join(indir, '*_tokenized.jsonlist')))
if not tokenized_files:
    # Fail with a message that names the problem instead of a bare IndexError.
    raise FileNotFoundError(
        "no '*_tokenized.jsonlist' file found in " + indir
    )
file = tokenized_files[0]
# Explicit encoding: open() would otherwise use the platform's locale default.
# NOTE(review): assumes the tokenized file was written as UTF-8 — confirm.
with open(file, encoding='utf-8') as f:
    docs = f.readlines()

In [17]:
# Print every sampled occurrence of 'bear arms' as "doc_id: token_index",
# followed by the token at that position with one token of context either side.
# NOTE(review): `docs` holds a single tokenized file and the first element of
# each triple is ignored here — confirm doc_id always refers to that file.
for _, doc_id, index in sampled_target['bear arms']:
    doc = json.loads(docs[doc_id])
    print(str(doc_id) + ': ' + str(index))
    tokens = doc['tokens']
    # A clamped slice is used instead of tokens[index - 1] / tokens[index + 1]:
    # at index 0 the former would wrap around to the last token of the document,
    # and at the final position the latter would raise IndexError. For interior
    # positions the printed text is unchanged.
    print(' '.join(tokens[max(index - 1, 0):index + 2]))

174424: 193504
and bear arms
174490: 331073
to bear arms
172849: 30486
to bear arms
172341: 158344
would bear arms
174442: 9494
to bear arms
171056: 6029
to bear arms
173914: 1399
to bear arms
174481: 125228
lo bear arms
173272: 16179
to bear arms
172986: 129057
to bear arms
170359: 5417
to bear arms
174486: 250476
to bear arms
170883: 6963
to bear arms
171627: 3463
to bear arms
174484: 459258
to bear arms
174450: 7790
and bear arms
171586: 5933
or bear arms
169901: 487540
to bear arms
171453: 1086
to bear arms
131532: 113
to bear arms
174570: 31105
to bear arms
172165: 20139
to bear arms
172341: 82322
and bear arms
174468: 40646
and bear arms
168836: 6918
to bear arms
174607: 65659
account bear arms
128319: 1512
than bear arms
174681: 84722
to bear arms
40109: 687
not bear arms
171652: 2993
to bear arms
170394: 19207
and bear arms
170376: 41695
not bear arms
169905: 234085
se##lves bear arms
174522: 73968
cannot bear arms
169469: 10241
who bear arms
55185: 1010
not bear arms
170313: 1

174424: 201015
to bear arms
171654: 21199
to bear arms
130627: 654
to bear arms
170156: 14559
to bear arms
170972: 9758
to bear arms
172341: 29988
to bear arms
170398: 62207
to bear arms
171290: 358335
to bear arms
57498: 292
to bear arms
172162: 7220
to bear arms
23623: 1272
to bear arms
170882: 7069
to bear arms
174641: 92426
shall bear arms
59381: 372
to bear arms
173206: 92667
to bear arms
172043: 730
and bear arms
174424: 193691
and bear arms
171721: 57434
able bear arms
76556: 202
to bear arms
56700: 42
to bear arms
171914: 114373
to bear arms
169843: 10387
to bear arms
172263: 138726
and bear arms
132525: 165
not bear arms
171721: 64389
to bear arms
174641: 212500
to bear arms
62057: 1145
to bear arms
81542: 1886
to bear arms
131275: 1165
to bear arms
174528: 62068
to bear arms
170821: 1168
to bear arms
172263: 317096
to bear arms
174592: 21522
to bear arms
174405: 3787
to bear arms
174023: 814
and bear arms
174663: 28919
to bear arms
174660: 21522
to bear arms
170913: 5088
to b