# In [None]:
from collections import namedtuple
from glob import glob
from itertools import chain
from nltk import ngrams, FreqDist, word_tokenize, pos_tag
from nltk.stem import WordNetLemmatizer
import os.path
from pprint import pprint

from lib.language import get_wordnet_pos, clean_tokens, tokenize_file
from lib.legiscan import summarize_metadata_file
from lib.util import load_json, write_json


# Record tying a bill id to the path of its contents file and the path of
# its metadata file.
BillFiles = namedtuple("BillFiles", "bill_id contents_path meta_path")


lem = WordNetLemmatizer()

# Words listed here are kept out of the stopword set even when one of the
# stopword lists contains them.
whitelist = load_json('../configuration/custom_whitelist.json')

# Union of the legal and custom stopword lists, minus whitelisted words.
stopwords = {
    candidate
    for candidate in chain(
        load_json('../artifacts/legal_stopwords.json'),
        load_json('../configuration/custom_stopwords.json'),
    )
    if candidate not in whitelist
}

def generate_document_tokens(path, output_path):
    """Tokenize every file matched by ``path`` and save the result as JSON.

    Args:
        path: Glob pattern selecting the bill contents files.
        output_path: Destination passed to ``write_json`` for the result.

    Returns:
        A list of ``(bill_file, summary, tokens)`` tuples, one for each
        matched file whose metadata file exists. ``bill_file`` is a
        ``BillFiles`` record, ``summary`` is whatever
        ``summarize_metadata_file`` returns for the metadata file, and
        ``tokens`` is the list produced from ``tokenize_file``.
    """
    candidates = []
    for contents_path in glob(path):
        # The bill id is the file name stripped of directory and extension.
        bill_id, _extension = os.path.splitext(os.path.basename(contents_path))
        meta_path = f'../tmp/legiscan/bill_meta_{bill_id}.json'
        candidates.append(BillFiles(bill_id, contents_path, meta_path))

    document_tokens = []
    for bill_file in candidates:
        # Files without a matching metadata file are left out entirely.
        if not os.path.exists(bill_file.meta_path):
            continue
        summary = summarize_metadata_file(bill_file.meta_path)
        tokens = list(tokenize_file(bill_file.contents_path))
        document_tokens.append((bill_file, summary, tokens))

    write_json(document_tokens, output_path)
    return document_tokens

def explode_ngrams(doc_tokens, ngram_length):
    """Swap each document's token sequence for its list of n-grams.

    Args:
        doc_tokens: Iterable of ``(bill_file, summary, tokens)`` triples.
        ngram_length: Size of each n-gram, passed through to ``ngrams``.

    Returns:
        A list of ``(bill_file, summary, grams)`` tuples in input order,
        where ``grams`` is the materialized list of n-grams of ``tokens``.
    """
    exploded = []
    for bill_file, summary, tokens in doc_tokens:
        grams = list(ngrams(tokens, ngram_length))
        exploded.append((bill_file, summary, grams))
    return exploded

# Notebook-style cache: when this code is re-run in a session where
# ``document_tokens`` is already defined, reuse it instead of tokenizing
# every bill again.
try:
    document_tokens = document_tokens
except NameError:
    # Only "name not defined yet" means the tokens must be generated. Any
    # other exception (e.g. KeyboardInterrupt) must propagate rather than
    # silently kick off a full re-tokenization.
    document_tokens = generate_document_tokens('../bills/*', '../tmp/document_tokens.json')

GRAM_LENGTH = 48

# Maps each n-gram to a list holding one ``(bill_file, summary)`` entry per
# occurrence of that n-gram across all documents.
corpus = {}
exploded = explode_ngrams(document_tokens, GRAM_LENGTH)
for bill_file, summary, grams in exploded:
    for gram in grams:
        # Append in place: rebuilding the list for every occurrence copies
        # it each time, which is quadratic in the occurrence count.
        corpus.setdefault(gram, []).append((bill_file, summary))

# Write only the n-grams with more than 20 occurrences, each mapped to one
# "<state> <bill_id>: <sponsor>, <sponsor>, ..." string per occurrence.
# NOTE(review): this counts occurrences, not distinct bills, so a gram
# repeated inside a single bill contributes several entries -- confirm
# that this is the intended threshold.
write_json({
    ' '.join(gram): [
        meta['state'] + ' ' + meta['bill_id'] + ': ' + ', '.join(meta['sponsors'])
        for _, meta in occurrences
    ]
    for gram, occurrences
    in corpus.items()
    if len(occurrences) > 20
}, f'../tmp/grams-{str(GRAM_LENGTH).zfill(2)}.json')