In [1]:
from collections import namedtuple
from glob import glob
from itertools import chain
from nltk import ngrams, FreqDist, word_tokenize, pos_tag
from nltk.stem import WordNetLemmatizer
import os.path
from pprint import pprint

from lib.language import get_wordnet_pos, clean_tokens, tokenize_file
from lib.legiscan import summarize_metadata_file
from lib.util import load_json, write_json


# Record tying a bill id to the path of its contents file and the path of its metadata file.
BillFiles = namedtuple("BillFiles", ["bill_id", "contents_path", "meta_path"])


# Module-level WordNet lemmatizer instance.
lem = WordNetLemmatizer()

# Words that are never treated as stopwords, even when a stopword list names them.
whitelist = load_json('../configuration/custom_whitelist.json')

# Every word from the legal and custom stopword lists, minus whitelisted words.
stopwords = {
    candidate
    for candidate in chain(
        load_json('../artifacts/legal_stopwords.json'),
        load_json('../configuration/custom_stopwords.json'),
    )
    if candidate not in whitelist
}

def generate_document_tokens(path, output_path):
    """Tokenize every bill matched by ``path`` that has a metadata file.

    For each file matched by the glob pattern, the bill id is the file's
    base name without its extension, and the metadata file is looked up at
    ``../tmp/legiscan/bill_meta_<bill_id>.json``. Files whose metadata file
    does not exist are skipped.

    Args:
        path: Glob pattern selecting the bill content files.
        output_path: Destination passed to ``write_json`` for the result.

    Returns:
        A list of ``(BillFiles, summary, tokens)`` tuples, where ``summary``
        is the result of ``summarize_metadata_file`` on the metadata path and
        ``tokens`` is the list produced by ``tokenize_file`` on the contents
        path. The same list is written to ``output_path``.
    """
    bill_files = []
    # glob() yields matches in arbitrary, filesystem-dependent order; sorting
    # makes the written file and everything derived from it reproducible.
    for file in sorted(glob(path)):
        bill_id = os.path.splitext(os.path.basename(file))[0]
        bill_files.append(
            BillFiles(
                bill_id,
                file,
                f'../tmp/legiscan/bill_meta_{bill_id}.json',
            )
        )

    document_tokens = [
        (
            bill_file,
            summarize_metadata_file(bill_file.meta_path),
            list(tokenize_file(bill_file.contents_path)),
        )
        for bill_file in bill_files
        # Bills without a metadata file are left out.
        if os.path.exists(bill_file.meta_path)
    ]

    write_json(document_tokens, output_path)
    return document_tokens

def explode_ngrams(doc_tokens, ngram_length):
    """Replace each document's token list with its list of n-grams.

    Args:
        doc_tokens: Iterable of ``(bill_file, summary, tokens)`` triples.
        ngram_length: Length passed to ``ngrams`` for every document.

    Returns:
        A list of ``(bill_file, summary, grams)`` triples in input order,
        where ``grams`` is ``list(ngrams(tokens, ngram_length))``.
    """
    exploded_docs = []
    for bill_file, summary, tokens in doc_tokens:
        grams = list(ngrams(tokens, ngram_length))
        exploded_docs.append((bill_file, summary, grams))
    return exploded_docs

# Reuse the tokens already held by the kernel from an earlier run of this
# cell; tokenize the bills only when the name is not defined yet. Only
# NameError is caught so that no other failure is silently swallowed.
try:
    document_tokens = document_tokens
except NameError:
    document_tokens = generate_document_tokens('../bills/*', '../tmp/document_tokens.json')

GRAM_LENGTH = 10
# A phrase is written out only when it has more than this many occurrences.
MIN_OCCURRENCES = 20

# Map each n-gram to one (bill_file, summary) entry per occurrence.
corpus = {}
exploded = explode_ngrams(document_tokens, GRAM_LENGTH)
for bill_file, summary, grams in exploded:
    for gram in grams:
        # Append in place: rebuilding the list on every occurrence is quadratic.
        corpus.setdefault(gram, []).append((bill_file, summary))

# Write the frequent phrases, each with a "<state> <bill_id>: <sponsors>"
# label for every occurrence.
write_json({
    ' '.join(gram): [
        summary['state'] + ' ' + summary['bill_id'] + ': ' + ', '.join(summary['sponsors'])
        for _, summary in occurrences
    ]
    for gram, occurrences in corpus.items()
    if len(occurrences) > MIN_OCCURRENCES
}, f'../tmp/grams-{str(GRAM_LENGTH).zfill(2)}.json')

In [41]:
import pandas as pd

pd.set_option('display.max_colwidth', 100)

# Fixed seed so the same rows are sampled on every run of this cell.
SAMPLE_SEED = 0
SAMPLE_SIZE = 10

df = pd.read_json('../tmp/prob_sorted_ngrams.json')
df.columns = ['phrase', 'occurrences', 'improbability']
# Never request more rows than exist: DataFrame.sample raises ValueError
# when n exceeds the number of rows and replace is False.
sliced = df.sample(n=min(SAMPLE_SIZE, len(df)), random_state=SAMPLE_SEED)
# Last expression of the cell: show the sample, most improbable phrase first.
sliced.sort_values(by='improbability', ascending=False)


  return values.astype(dtype, copy=copy)


Unnamed: 0,phrase,occurrences,improbability
24,stop luteinizing hormone secretion and therefore testosterone secretion or synthetic,21,4.712104e+25
25,male patient such a augmentation mammoplasty facial feminization surgery liposuction,25,3.855693e+25
117,individual do not have normal sex chromosome structure sex steroid,29,1.158572e+20
151,physiological or anatomical characteristic that resemble a sex different from,61,1.394117e+19
166,external biological sex characteristic that be irresolvably ambiguous such a,55,5.772981e+18
268,production of estrogen and progesterone when use to delay or,28,7.494855e+16
592,of parent to direct the upbringing education health care and,33,3990044000000.0
647,for related purpose be it enact by the legislature of,25,456824900000.0
656,for the purpose of attempt to alter the appearance of,34,301021000000.0
719,the general assembly of the state of missouri a follow,43,7807533000.0
