In [1]:
import json
import os
from glob import glob
from tqdm import tqdm
from collections import defaultdict
import pickle
import argparse
import random

In [2]:
# --- Paths -----------------------------------------------------------------
# Inputs are read from, and the result is written back to, the same directory.
indir = outdir = os.path.join('data', 'preprocessed')

# Corpus file: read line by line, each line parsed as one JSON document.
data_file = os.path.join(indir, 'cofea.jsonlist')
# Pickled mapping of target word -> (f_ind, doc_ind, ind) occurrence triples.
target_index_file = os.path.join(indir, 'target_word_index.dict')
# Destination for the pickled, filtered and down-sampled index.
outfile = os.path.join(outdir, 'sample_target_index.dict')

# --- Document filters ------------------------------------------------------
# Only documents whose 'source' is one of these are eligible for sampling.
filter_source = [
    'Evans Early American Imprints',
    'HeinOnline',
    'National Archives Founders Online',
]
# Inclusive [low, high] bounds, compared against each document's 'decade' field.
filter_year = [1750, 1810]

# --- Sampling --------------------------------------------------------------
# Maximum number of occurrences kept per target word.
sample_size = 5000
# Seed the global RNG so that a fresh top-to-bottom run draws the same sample.
random.seed(42)

In [3]:
# Read the whole corpus into memory as a list of raw text lines; each line is
# decoded from JSON later, when the eligible documents are selected.
with open(data_file) as corpus_fh:
    cofea_data = list(corpus_fh)

In [4]:
# Load the pre-built occurrence index: iterating it yields target words, and
# each word maps to an iterable of (f_ind, doc_ind, ind) triples.
# NOTE: unpickling can execute arbitrary code -- only load index files that
# were produced by a trusted step of this pipeline.
with open(target_index_file, 'rb') as index_fh:
    target_index = pickle.load(index_fh)

In [5]:
# Build the set of eligible documents, identified by their 0-based line
# position in cofea_data. A document qualifies when its 'source' is listed in
# filter_source and its 'decade' falls inside filter_year (both ends inclusive).
elligible_docs = set()
for line_no, raw_line in enumerate(cofea_data):
    record = json.loads(raw_line)
    # Check the source first; the date field is only read for matching sources.
    if record['source'] not in filter_source:
        continue
    if filter_year[0] <= record['decade'] <= filter_year[1]:
        elligible_docs.add(line_no)

In [6]:
# For every target word, keep the occurrences that fall in an eligible document
# and then cap the list at sample_size by sampling without replacement.
# The draw uses the global `random` state, so the result depends on the RNG
# having been seeded (and not otherwise consumed) before this cell runs.
sampled_target = {}
for word in tqdm(target_index):
    # NOTE(review): `ind > 2` drops token positions 0, 1 and 2. The original
    # comment described this as skipping "the first two words" -- confirm
    # whether `ind >= 2` was intended.
    candidates = [
        (f_ind, doc_ind, ind)
        for f_ind, doc_ind, ind in target_index[word]
        if doc_ind in elligible_docs and ind > 2
    ]

    # Words with at most sample_size candidates keep all of them, in index order.
    sampled_target[word] = (
        random.sample(candidates, sample_size)
        if len(candidates) > sample_size
        else candidates
    )

100%|██████████████████████████████████████████████████████████████| 1088/1088 [00:15<00:00, 70.57it/s]


In [7]:
# Persist the sampled index (word -> list of occurrence triples) as a pickle.
with open(outfile, 'wb') as out_fh:
    pickle.dump(sampled_target, out_fh)

In [8]:
# Sanity check, step 1: load the raw lines of a tokenized corpus file so the
# sampled token positions can be inspected by eye.
# Only the first '*_tokenized.jsonlist' file (in sorted path order) is read.
tokenized_paths = glob(os.path.join(indir, '*_tokenized.jsonlist'))
tokenized_paths.sort()
file = tokenized_paths[0]
with open(file) as f:
    docs = list(f)

In [9]:
# Sanity check, step 2: for every sampled 'bear arms' occurrence print
# "<doc_id>: <token index>" and then the token at that index together with
# its immediate left and right neighbours.
# NOTE(review): the first element of each triple is ignored and every doc_id
# is looked up in `docs`, which holds a single tokenized file -- presumably
# that element is a file index; confirm all occurrences refer to that file.
for _, doc_id, index in sampled_target['bear arms']:
    doc = json.loads(docs[doc_id])
    print(f'{doc_id}: {index}')
    tokens = doc['tokens']
    print(' '.join([tokens[index - 1], tokens[index], tokens[index + 1]]))

3447: 3082
to bear arms
4606: 16449
ourselves bear arms
4673: 68373
to bear arms
4806: 106324
and bear arms
88341: 2029
to bear arms
3476: 3114
to bear arms
68853: 1209
to bear arms
102033: 863
to bear arms
3301: 2367
to bear arms
64083: 1557
to bear arms
168875: 14
to bear arms
5132: 2090
may bear arms
2872: 44334
to bear arms
37073: 202
to bear arms
4673: 32020
to bear arms
3782: 61088
to bear arms
2706: 7392
to bear arms
63495: 43
to bear arms
2645: 1243
to bear arms
4673: 118199
to bear arms
45204: 68
not bear arms
62900: 949
and bear arms
152625: 577
to bear arms
5472: 2037
to bear arms
5438: 156
to bear arms
2684: 90541
to bear arms
70851: 89
000 bear arms
4165: 91374
and bear arms
3984: 309993
to bear arms
2879: 16448
to bear arms
64706: 210
to bear arms
43737: 190
to bear arms
115275: 99
to bear arms
37107: 962
to bear arms
2796: 13947
to bear arms
109619: 2883
and bear arms
4333: 122169
could bear arms
2747: 157146
to bear arms
15449: 581
, bear arms
148666: 713
to bear arms
5

In [10]:
# Number of 'bear arms' occurrences kept after filtering and sampling
# (never more than sample_size).
len(sampled_target['bear arms'])

229