In [1]:
import json
import os
from glob import glob
from tqdm import tqdm
from collections import defaultdict
from transformers import BertTokenizer
import pickle
import argparse

In [2]:
# Input and output currently share the same preprocessed-data directory.
indir = outdir = os.path.join('data', 'preprocessed')

# Tokenized corpus files found in the input directory, in sorted order.
files = glob(os.path.join(indir, '*_tokenized.jsonlist'))
files.sort()

# Inputs: the pickled corpus index and the list of target words/phrases.
index_file = os.path.join(indir, 'cofea_full_index.dict')
word_file = os.path.join('data', 'new_constitution_words.txt')
# Optional second word list (currently disabled).
#second_word_file = os.path.join('data','bigrams.txt')

# Output: where the index restricted to the target words is written.
target_file = os.path.join(outdir, 'target_word_index.dict')

# Tokenizer used to split the target words into word pieces.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [3]:
def _rejoin_wordpieces(pieces):
    """Merge '##'-prefixed continuation pieces back onto the preceding word.

    A piece that starts with '##' is concatenated (marker included) onto the
    previous entry; any other piece starts a new entry.  The first piece
    always starts a new entry.  For example ['ha', '##be', '##as', 'corpus']
    becomes ['ha##be##as', 'corpus'].

    Args:
        pieces: list of token strings for one target word or phrase.

    Returns:
        A new list with one string per whole word.
    """
    words = []
    for piece in pieces:
        if words and piece.startswith('##'):
            words[-1] += piece
        else:
            words.append(piece)
    return words


# tokenize key words and phrases (one entry per line of the word file)
with open(word_file, 'r', encoding='utf-8') as f:
    target_words = f.read().splitlines()
# tokenize key words and phrases adding an additional file
#with open(second_word_file, 'r',encoding = 'utf-8') as f:
#    target_words += f.read().splitlines()

# De-duplicate, then sort: iterating a plain set of strings yields an order
# that can differ between interpreter runs (string hash randomization), which
# would make the order of everything derived from this list non-reproducible.
target_words = sorted(set(target_words))
target_words = [tokenizer.tokenize(x) for x in target_words]

# One list of whole words per target entry (single words give a 1-item list).
target_words_cleaned = [_rejoin_wordpieces(pieces) for pieces in target_words]

In [4]:
# Load the pickled corpus index from disk.
# NOTE(review): unpickling can execute arbitrary code -- only load an index
# file that comes from a trusted source.
with open(index_file, mode='rb') as index_handle:
    index = pickle.load(index_handle)

In [45]:
def _phrase_occurrences(index, words):
    """Find every position where the words of a phrase occur consecutively.

    Args:
        index: mapping from a word to a collection of
            (f_id, doc_id, doc_index) tuples giving where it occurs.
        words: list of two or more words; a word made of several word pieces
            is written with its '##' markers, e.g. 'ha##be##as'.

    Returns:
        A set of (f_id, doc_id, doc_index) tuples, one per occurrence of the
        whole phrase, positioned at its first word.  Empty when any word of
        the phrase is absent from the index.
    """
    # .get() instead of [] so a word missing from the index means "no match"
    # rather than a TypeError/KeyError (and never inserts keys into a
    # defaultdict-style index as a side effect).
    first_hits = index.get(words[0])
    if not first_hits:
        return set()

    following_hits = [index.get(word) for word in words[1:]]
    if any(not hits for hits in following_hits):
        return set()

    # Offset of each following word from the start of the phrase.
    # we indexed words by their individual token so ha##be##as is ha ##be ##as,
    # i.e. a word advances the position by its number of '##'-separated pieces.
    offsets = []
    offset = 0
    for previous_word in words[:-1]:
        offset += len(previous_word.split('##'))
        offsets.append(offset)

    return {
        (f_id, doc_id, doc_index)
        for f_id, doc_id, doc_index in first_hits
        if all(
            (f_id, doc_id, doc_index + step) in hits
            for step, hits in zip(offsets, following_hits)
        )
    }


# Restrict the full index to the target words and phrases.
target_index = {}
for word in tqdm(target_words_cleaned):
    if len(word) == 1:
        # just one word: reuse its index entry (empty set if it never occurs)
        target_index[word[0]] = index.get(word[0], set())
    elif len(word) > 1:
        # we have a phrase: keyed by its words joined with single spaces
        target_index[' '.join(word)] = _phrase_occurrences(index, word)


100%|████████████████████████████████████████████| 4/4 [00:00<00:00, 114.37it/s]


In [None]:
# Persist the target-word index to disk as a pickle.
with open(target_file, mode='wb') as out_handle:
    pickle.dump(target_index, out_handle)