# Introduction

In order to train on the words in a legislative bill we want to make use of an existing Natural Language Processing (NLP) model in order to come up with a data structure that will encode contextual information.

To this end we use the BERT model, which is one of the modern NLP models that takes into account context.

For this project, we want to take the text from the bill and predict which party sponsored it.

## First steps

Load the BERT model using PyTorch.

Why PyTorch? It is a high-level library for constructing machine learning models.

In [None]:
import os.path

import pandas as pd
import transformers
from transformers import pipeline
import pandas as pd
import torch
import numpy as np
import json

In [None]:
from transformers import BertModel, BertTokenizer

# The tokenizer and the encoder are loaded from the same pretrained
# checkpoint, so the name is spelled out once and shared by both calls.
_CHECKPOINT = "bert-base-cased"
tokenizer = BertTokenizer.from_pretrained(_CHECKPOINT)
model = BertModel.from_pretrained(_CHECKPOINT)

Now that we have the model, we want to run inference on the text and get the output token.

In [None]:
def generate_token(summary_json):
    """
    Run the BERT model on the ``text`` entry of a summary and flatten the result.

    :param summary_json: the parsed data from a summary file; it must contain
        a ``'text'`` entry holding the string to encode
    :return: a 1-D ``torch.Tensor`` holding every element of the model's last
        hidden state (its length depends on how many tokens the text yields)
    """
    text = summary_json['text']
    # BERT cannot process more tokens than its maximum sequence length, so
    # longer texts are truncated instead of failing inside the model.
    encoded_input = tokenizer(text, return_tensors='pt', truncation=True)
    print("encoded input")
    print(encoded_input)
    print("model output")
    # Inference only: there is no need to record an autograd graph.
    with torch.no_grad():
        output = model(**encoded_input)
    # Flatten the hidden state directly in torch; this replaces the former
    # numpy round trip that relied on the deprecated np.cumproduct to work
    # out the element count.
    return output.last_hidden_state.detach().reshape(-1)

In [None]:
# All data lives in a "data" directory that sits beside the current working
# directory: "extracted" is read by the tokenising step and "tokenized" is
# where that step writes its pickles.
parent_path = os.path.split(os.getcwd())[0]
_data_root = os.path.join(parent_path, "data")
search_path = os.path.join(_data_root, "extracted")
token_path = os.path.join(_data_root, "tokenized")

In [None]:
import pickle

# Encode every extracted summary with BERT and store the result as a gzip
# pickle in token_path. The output directory is created up front so the
# first save cannot fail because the directory is missing.
os.makedirs(token_path, exist_ok=True)
for root, dirs, files in os.walk(search_path):
    for f in files:
        stem, extension = os.path.splitext(f)
        # Only JSON summaries can be parsed; skip anything else found in the
        # tree instead of crashing inside json.load.
        if extension.lower() != ".json":
            continue
        with open(os.path.join(root, f), 'r', encoding='utf-8') as s_file:
            print(f'reading {f}')
            summary = json.load(s_file)
        # The input file is closed before the (slow) model run and the write.
        encoding = generate_token(summary)
        print(f'{len(encoding)} of type {type(encoding)}')
        summary["input_full"] = encoding
        df = pd.DataFrame(summary)
        # Swap only the extension; str.replace would also rewrite a ".json"
        # that appears in the middle of the file name.
        output_path = os.path.join(token_path, stem + ".pkl")
        print(f'saving {output_path}')
        df.to_pickle(output_path, compression="gzip", protocol=pickle.DEFAULT_PROTOCOL)


These tensors all have different sizes, so running a cosine similarity will not work.
I will try to compress these down to a standard size.
Let's try this: https://newbedev.com/downsample-a-1d-numpy-array

Let's reload the DataFrames and save a resampled version of each encoding.

In [None]:
import numpy as np
from scipy.signal import resample
from scipy.interpolate import interp1d

def ResampleLinear1D(original, targetLen):
    """
    Resample a 1-D sequence to ``targetLen`` points by linear interpolation.

    The output samples are taken at evenly spaced positions running from the
    first to the last element of ``original`` (both included).

    :param original: 1-D sequence of numbers (anything ``np.array`` can
        convert to floats)
    :param targetLen: number of samples in the result
    :return: ``numpy.ndarray`` of floats with ``targetLen`` elements
    """
    # The np.float / np.int aliases were removed in NumPy 1.24; the builtin
    # types are their exact equivalents.
    original = np.array(original, dtype=float)
    last = len(original) - 1

    # Fractional source position of every output sample.
    positions = np.linspace(0, last, num=targetLen, dtype=float)
    lower = positions.astype(int)  # truncation rounds down here
    # Clamp the upper neighbour to the last element instead of wrapping to
    # index 0: its weight is 0 there, but 0 * inf (or nan) from a non-finite
    # first element would otherwise turn the final sample into nan.
    upper = np.minimum(lower + 1, last)
    weight = positions - lower  # distance past the lower neighbour, in [0, 1)

    return original[lower] * (1.0 - weight) + original[upper] * weight

In [None]:
# Resample every stored encoding to a fixed length and save it beside the
# original pickle with a "-shrunk" suffix.
SHRUNK_SUFFIX = "-shrunk.pkl"
SHRUNK_LENGTH = 2048
for root, dirs, files in os.walk(token_path):
    for f in files:
        # Outputs are written into the directory being walked, so a second
        # run would otherwise pick up the "-shrunk" files (which have no
        # "input_full" column) as well as anything that is not a pickle.
        if f.endswith(SHRUNK_SUFFIX) or not f.endswith(".pkl"):
            continue
        pkl_file_path = os.path.join(root, f)
        df = pd.read_pickle(pkl_file_path, compression="gzip")
        print(df)
        shrunk = ResampleLinear1D(df["input_full"], SHRUNK_LENGTH)
        print(shrunk)
        # e.g. "status": 2, "party": 1 -- presumably the label values carried
        # over from the summary JSON; TODO confirm against the data files.
        df2 = pd.DataFrame({"status": df["status"][0:SHRUNK_LENGTH],
                            "party": df["party"][0:SHRUNK_LENGTH],
                            "input_shrunk": shrunk})
        print(df2.shape)
        # Rewrite only the extension; str.replace on the whole path would
        # also touch a ".pkl" occurring in a directory name.
        shrunk_path = os.path.join(root, f[:-len(".pkl")] + SHRUNK_SUFFIX)
        df2.to_pickle(shrunk_path, compression="gzip", protocol=pickle.DEFAULT_PROTOCOL)