In [1]:
import json
import os
from glob import glob
from tqdm import tqdm
from collections import defaultdict
import pickle
import argparse
import random

In [2]:
# Input and output share the same preprocessed-data folder.
indir = outdir = os.path.join('data', 'preprocessed')

# Files read by this notebook.
data_file = os.path.join(indir, 'cofea.jsonlist')
target_index_file = os.path.join(indir, 'target_word_index.dict')

# File written by this notebook.
outfile = os.path.join(outdir, 'sample_target_index.dict')

# Only documents whose 'source' field is one of these are kept.
filter_source = [
    'Evans Early American Imprints',
    'William S. Hein & Co., HeinOnline',
    'National Archives Founders Online',
]

# Inclusive [lower, upper] bounds compared against each document's 'decade' field.
filter_year = [1750, 1810]

# Maximum number of occurrences kept per target word.
sample_size = 5000

In [3]:
# Read the corpus as raw lines; each line is parsed as one JSON document later.
# The encoding is pinned to UTF-8 so decoding does not depend on the platform's
# default locale encoding.
with open(data_file, encoding='utf-8') as f:
    cofea_data = f.readlines()

In [4]:
# Load the pickled index that maps each target word to its occurrences.
# NOTE: unpickling can execute arbitrary code, so only load index files
# that come from a trusted source.
with open(target_index_file, mode='rb') as index_fh:
    target_index = pickle.load(index_fh)

In [5]:
# Collect the positions (line numbers within cofea_data) of every document
# that comes from one of the wanted sources and whose 'decade' falls inside
# the inclusive filter_year window.
elligible_docs = set()
for line_no, raw_line in enumerate(cofea_data):
    record = json.loads(raw_line)
    if record['source'] not in filter_source:
        continue
    if filter_year[0] <= record['decade'] <= filter_year[1]:
        elligible_docs.add(line_no)

In [6]:
# For every target word, keep only the occurrences located in an eligible
# document, then cap the list at sample_size entries by random sampling.
sampled_target = {}
for word in tqdm(target_index):
    # Each occurrence is a (file index, document index, token index) triple.
    # Occurrences at token positions 0-2 are dropped by the `ind > 2` test.
    # NOTE(review): the original comment said "not the first two words";
    # confirm whether the cutoff of 2 is the intended one.
    candidates = [
        (f_ind, doc_ind, ind)
        for f_ind, doc_ind, ind in target_index[word]
        if doc_ind in elligible_docs and ind > 2
    ]

    # Small enough already: keep everything, in original order.
    if len(candidates) <= sample_size:
        sampled_target[word] = candidates
        continue

    # Re-seeding right before each draw makes a word's sample depend only on
    # its own candidate list, not on which words were processed before it.
    random.seed(42)
    sampled_target[word] = random.sample(candidates, sample_size)

100%|█████████████████████████████████████████| 893/893 [00:13<00:00, 66.92it/s]


In [7]:
# Persist the per-word samples as a pickle at `outfile`.
# The output location is configured separately from the input folder, so make
# sure its directory exists instead of failing when it has not been created.
os.makedirs(os.path.dirname(outfile) or '.', exist_ok=True)
with open(outfile, 'wb') as f:
    pickle.dump(sampled_target, file=f)

In [None]:
# Checking the examples: load the raw lines of a tokenized corpus file so the
# sampled positions can be inspected by eye.
# Only the alphabetically first '*_tokenized.jsonlist' file in indir is read.
tokenized_files = sorted(glob(os.path.join(indir, '*_tokenized.jsonlist')))
if not tokenized_files:
    # Fail with an explicit message rather than an opaque IndexError.
    raise FileNotFoundError('no *_tokenized.jsonlist file found in ' + indir)
file = tokenized_files[0]
# Pin the encoding so decoding does not depend on the platform default.
with open(file, encoding='utf-8') as f:
    docs = f.readlines()

In [None]:
# Print every sampled 'bear arms' occurrence with one token of context on
# each side, to eyeball that the stored positions point at the right tokens.
# NOTE(review): the file index of each occurrence is ignored and `docs` holds a
# single tokenized file; confirm doc_id is meant to index into that file.
for _, doc_id, index in sampled_target['bear arms']:
    doc = json.loads(docs[doc_id])
    tokens = doc['tokens']
    print(f'{doc_id}: {index}')
    # Slice rather than index: a hit on the last token no longer raises
    # IndexError, and clamping the start at 0 keeps index - 1 from wrapping
    # around to the end of the document.
    print(' '.join(tokens[max(index - 1, 0):index + 2]))

In [None]:
# Number of occurrences kept for 'bear arms' (at most sample_size).
len(sampled_target['bear arms'])