# Instant Natural Language Learner

## Global Variables and Configuration

In [1]:
# Path to the jsonnet configuration template that set_params() loads for training.
CONFIG_PATH = '../configs/template.jsonnet'

# Temporary buffer files that receive the text submitted through the notebook
# text widgets (written by read_train() / read_validation()).
TRAIN_BUFFER_PATH = "../data/train_buffer.tmp"
VALIDATION_BUFFER_PATH = "../data/validate_buffer.tmp"

# Flags flipped to True by read_train() / read_validation() once the matching
# buffer has been written; checked before training and before prediction.
TRAIN_DONE = False
VALIDATION_DONE = False

# HTML fragments used by generate_markdown() to highlight tagged tokens:
# tokens tagged '*' (cities) and tokens tagged '!' (drugs).
CITY_SPAN_OPEN = "<span style=\"background-color: #ff5050\">"
DRUG_SPAN_OPEN = "<span style=\"background-color: #3399ff\">"
SPAN_CLOSE = "</span>"

# Special markdown printing function. 
def printmd(string: str) -> None:
    """Render ``string`` as Markdown through IPython's ``display``."""
    rendered = Markdown(string)
    display(rendered)

# Imports and Function Definitions

In [None]:
import os
import json
import time
import shutil
import tempfile

from typing import List, Tuple

import numpy as np

from allennlp.common.params import Params
from allennlp.commands.train import train_model
from allennlp.models import Model
from allennlp.data import Vocabulary

from IPython.display import Markdown, display
from ipywidgets import Layout, widgets

from model import InstEntityTagger
from predictor import InstPredictor
from dataset_reader import InstDatasetReader

# Show a highlighted banner once every import above has succeeded.
printmd("### <span style=\"background-color: #54f542\">Imports complete.</span>")

def train_prompt() -> None:
    """Display a text box for the training text.

    Submitting the box hands the widget to ``read_train``.
    """
    box_layout = Layout(width='70%')
    entry_box = widgets.Text(layout=box_layout)
    printmd("## Training text:")
    display(entry_box)
    entry_box.on_submit(read_train)

def validation_prompt() -> None:
    """Display a text box for the validation text.

    Submitting the box hands the widget to ``read_validation``.
    """
    box_layout = Layout(width='70%')
    entry_box = widgets.Text(layout=box_layout)
    printmd("## Validation text:")
    display(entry_box)
    entry_box.on_submit(read_validation)

def read_train(sender: widgets.Text) -> None:
    """Submit handler that stores the training text in the train buffer.

    Writes ``sender.value`` to ``TRAIN_BUFFER_PATH`` (overwriting any previous
    content), echoes the path and the text as markdown, then sets
    ``TRAIN_DONE`` to True.
    """
    global TRAIN_DONE
    # The relative data directory must already exist; it is not created here.
    assert os.path.exists("../data/")
    submitted_text = sender.value
    with open(TRAIN_BUFFER_PATH, 'w') as buffer_file:
        buffer_file.write(submitted_text)
    for label, content in (
        ("**Writing to:** ", TRAIN_BUFFER_PATH),
        ("**Wrote:** ", submitted_text),
    ):
        printmd(label + content)
    TRAIN_DONE = True

def read_validation(sender: widgets.Text) -> None:
    """Submit handler that stores the validation text in the validation buffer.

    Writes ``sender.value`` to ``VALIDATION_BUFFER_PATH`` (overwriting any
    previous content), echoes the path and the text as markdown, then sets
    ``VALIDATION_DONE`` to True.
    """
    global VALIDATION_DONE
    # The relative data directory must already exist; it is not created here.
    assert os.path.exists("../data/")
    submitted_text = sender.value
    with open(VALIDATION_BUFFER_PATH, 'w') as buffer_file:
        buffer_file.write(submitted_text)
    for label, content in (
        ("**Writing to:** ", VALIDATION_BUFFER_PATH),
        ("**Wrote:** ", submitted_text),
    ):
        printmd(label + content)
    VALIDATION_DONE = True
    
def set_params(config_path: str, train_buffer_path: str) -> Params:
    """Load the config file and point its training data at the buffer.

    Args:
        config_path: Configuration file handed to ``Params.from_file``.
        train_buffer_path: Value stored under the ``train_data_path`` key.

    Returns:
        The loaded ``Params`` with ``train_data_path`` overridden.
    """
    loaded = Params.from_file(config_path)
    loaded["train_data_path"] = train_buffer_path
    return loaded

def check_train_read(variable: bool) -> None:
    """Stop with a visible notice unless training data has been submitted.

    Args:
        variable: Truthy once the training text has been captured (the
            notebook passes ``TRAIN_DONE``).

    Raises:
        AssertionError: If ``variable`` is falsy. Raised explicitly instead of
            through an ``assert`` statement so the check still fires when
            Python runs with ``-O``.
    """
    if variable:
        return
    # ``<Enter>`` is spelled with HTML entities: written literally, the
    # markdown renderer treats it as an HTML tag and it disappears from the
    # notice. Adjacent literals replace the backslash continuation, which
    # embedded the next line's indentation in the message.
    printmd("### <span style=\"background-color: #ff5050\">Enter training data above, "
            "press &lt;Enter&gt; to submit, then try again.</span>")
    raise AssertionError("Training data has not been submitted.")
    
def check_validation_read(variable: bool) -> None:
    """Stop with a visible notice unless validation data has been submitted.

    Args:
        variable: Truthy once the validation text has been captured (the
            notebook passes ``VALIDATION_DONE``).

    Raises:
        AssertionError: If ``variable`` is falsy. Raised explicitly instead of
            through an ``assert`` statement so the check still fires when
            Python runs with ``-O``.
    """
    if variable:
        return
    # ``<Enter>`` is spelled with HTML entities: written literally, the
    # markdown renderer treats it as an HTML tag and it disappears from the
    # notice. Adjacent literals replace the backslash continuation, which
    # embedded the next line's indentation in the message.
    printmd("### <span style=\"background-color: #ff5050\">Enter validation data above, "
            "press &lt;Enter&gt; to submit, then try again.</span>")
    raise AssertionError("Validation data has not been submitted.")

def train() -> Tuple[Model, Params, str]:
    """Train a model on the buffered training text.

    Requires ``TRAIN_DONE`` to be set (see ``check_train_read``). Training
    artefacts go to a temporary serialization directory that is removed
    afterwards, whether or not training succeeds.

    Returns:
        A ``(model, params, extension_pretrained_file)`` tuple: the result of
        ``train_model``, the ``Params`` object that was passed to it, and the
        ``vocabulary.pretrained_files.tokens`` entry of the configuration.

    Raises:
        AssertionError: If no training data has been submitted yet.
    """
    check_train_read(TRAIN_DONE)

    # Load the configuration with the training path pointed at the buffer.
    params = set_params(CONFIG_PATH, TRAIN_BUFFER_PATH)

    # Look up the pretrained embedding file on a duplicate of the params
    # (presumably so these reads do not disturb the object handed to
    # train_model -- confirm against the Params API).
    lookup_params = params.duplicate()
    vocab_params = lookup_params.get(key="vocabulary")
    pretrained_files_params = vocab_params.get(key="pretrained_files")
    extension_pretrained_file = pretrained_files_params.get(key="tokens")

    serialization_dir = tempfile.mkdtemp()
    try:
        model = train_model(params, serialization_dir)
    finally:
        # Clean up even when training fails so temp directories do not pile up.
        shutil.rmtree(serialization_dir)

    return model, params, extension_pretrained_file

def main(model: Model, params: Params, extension_pretrained_file: str) -> List[List[str]]:
    """Tag the buffered validation text and return ``[token, tag]`` pairs.

    The model's vocabulary and token embedder are first extended with the
    instances read from ``VALIDATION_BUFFER_PATH``; the whole file is then
    predicted as a single string. Each ``tag + token`` line is also appended
    to ``log.log`` in the working directory.

    Args:
        model: Trained model; its vocabulary and token embedder are modified
            in place.
        params: Forwarded to ``model.vocab.extend_from_instances``.
        extension_pretrained_file: Embedding source registered for
            ``word_embeddings.token_embedder_tokens`` when extending the
            embedder.

    Returns:
        One ``[token_text, tag]`` list per token read from the validation
        buffer.
    """
    test_path = VALIDATION_BUFFER_PATH

    # Read the validation instances so their tokens can extend the vocabulary.
    reader = InstDatasetReader()
    test_dataset = reader.read(test_path)

    # Extend vocabulary.
    embedding_sources_mapping = {"word_embeddings.token_embedder_tokens": extension_pretrained_file}
    model.vocab.extend_from_instances(params, test_dataset)
    model.extend_embedder_vocab(embedding_sources_mapping)

    # Make predictions
    predictor = InstPredictor(model, dataset_reader=InstDatasetReader())
    with open(test_path, "r") as text_file:
        lines = text_file.readlines()
    all_text = " ".join(lines) # Makes it all 1 batch.
    output_dict = predictor.predict(all_text)
    tags = output_dict['tags']
    dataset_reader = InstDatasetReader()

    PRINT_STDOUT = False
    out = []

    with open("log.log", 'a') as log:
        for instance in dataset_reader._read(test_path):
            tokenlist = list(instance['sentence'])
            # NOTE(review): `i` restarts at 0 for every instance while `tags`
            # comes from one prediction over the whole file, so the pairing
            # only lines up if the reader yields a single instance -- confirm.
            for i, token in enumerate(tokenlist):
                log.write(tags[i] + str(token) + "\n")
                if PRINT_STDOUT:
                    print(tags[i] + str(token))
                out.append([str(token), tags[i]])

    # Allennlp seems to only support extending the vocabulary once.
    # This is a hack to modify the `old` number of embeddings at each iteration.
    extended_num_embeddings = len(model.vocab.get_index_to_token_vocabulary(namespace='tokens'))
    model.word_embeddings.token_embedder_tokens.num_embeddings = extended_num_embeddings

    return out

def generate_markdown(out: List[List[str]]) -> List[str]:
    """Turn ``[token, tag]`` pairs into markdown/HTML fragments.

    Tokens tagged ``'*'`` (cities) or ``'!'`` (drugs) contribute two
    fragments: the matching opening span, then the token followed by the
    closing span. Every other token is passed through unchanged.

    Args:
        out: Sequence of ``[token, tag]`` pairs.

    Returns:
        The fragments in input order, ready to be joined into one string.
    """
    fragments: List[str] = []
    for entry in out:
        token, tag = entry[0], entry[1]
        if tag == '*':
            fragments.extend((CITY_SPAN_OPEN, token + SPAN_CLOSE))
        elif tag == '!':
            fragments.extend((DRUG_SPAN_OPEN, token + SPAN_CLOSE))
        else:
            fragments.append(token)
    return fragments

# Text Samples for Training

I lived in *Munich last summer. *Germany has a relaxing, slow summer lifestyle. One night, I got food poisoning and couldn't find !Tylenol to make the pain go away, they insisted I take !aspirin instead. 

# Read in Training Example

In [None]:
# Show the training text box; pressing Enter in it writes the text to the train buffer.
train_prompt()

# Train Model

In [6]:
# train() returns three values; all of them are passed on to main() for prediction.
model, params, extension_pretrained_file = train()

0it [00:00, ?it/s]
1it [00:00, 47.92it/s]

0it [00:00, ?it/s]
1it [00:00, 4206.92it/s]

  0%|          | 0/25000 [00:00<?, ?it/s]
100%|##########| 25000/25000 [00:00<00:00, 68803.52it/s]

  0%|          | 0/25000 [00:00<?, ?it/s]
100%|##########| 25000/25000 [00:00<00:00, 73039.28it/s]

  0%|          | 0/1 [00:00<?, ?it/s]
accuracy: 0.8333, precision 0: 0.9697, precision 1: 0.3333, precision 2: 0.3333, recall 0: 0.8421, recall 1: 1.0000, recall 2: 0.5000, fscore 0: 0.9014, fscore 1: 0.5000, fscore 2: 0.4000, loss: 0.9373 ||: 100%|##########| 1/1 [00:00<00:00,  8.31it/s]



# Text Samples for Prediction

When I lived in Paris last year, France was experiencing a recession. 
The night life was too fun, I developed an addiction to Adderall and cocaine.

It was a cold, wintery day in Cambridge, England. Michael's grandparents had moved down to Tampa Bay, Florida. 
The skiers were on their way to Denver, Colorado for Spring break. 
I went to my first concert in Las Vegas, Nevada. I hate Brussels sprouts. 
I think Somerville is one of my favorite cities to live in. 
Computer programming code is developed in Seattle, Washington.
<br/>
<br/>


# Enter Validation Example

In [7]:
# Reset the flag so check_validation_read() fails until new text is submitted.
# (No `global` statement is needed here: this cell already runs at module level.)
VALIDATION_DONE = False
validation_prompt()

## Validation text:

Text(value='', layout=Layout(width='70%'))

**Writing to:** ../data/validate_buffer.tmp

**Wrote:** When I lived in Paris last year, France was experiencing a recession. The night life was too fun, I developed an addiction to Adderall and cocaine.

# Predict

In [8]:
# Refuse to predict until validation text has been submitted.
check_validation_read(VALIDATION_DONE)
# Tag the validation text; `out` holds [token, tag] pairs.
out = main(model, params, extension_pretrained_file)
# Wrap tagged tokens in highlight spans and render the result as one markdown string.
md_list = generate_markdown(out)
md = ' '.join(md_list)
printmd(md)

1it [00:00, 57.08it/s]
100%|██████████| 1/1 [00:00<00:00, 4076.10it/s]
100%|██████████| 25000/25000 [00:00<00:00, 69053.64it/s]


When I lived in <span style="background-color: #ff5050"> Paris</span> last year , <span style="background-color: #ff5050"> France</span> was experiencing a recession . The night life was too fun , I developed an addiction <span style="background-color: #3399ff"> to</span> <span style="background-color: #ff5050"> Adderall</span> and cocaine .