In [1]:
import json
import os
import shutil
import tempfile
import time
from typing import List, Tuple

import numpy as np
from allennlp.commands.train import train_model
from allennlp.common.params import Params
from allennlp.data import Vocabulary
from allennlp.models import Model
from IPython.display import Markdown, display
from ipywidgets import Layout, widgets

from dataset_reader import InstDatasetReader
from model import InstEntityTagger
from predictor import InstPredictor

# from notebook_utils import printmd, train_prompt, validation_prompt

print("Imports complete.")

Imports complete.


In [2]:
# Global variables.
# File that read_train() overwrites with the submitted training text;
# train() passes this path to set_params() as the training data path.
TRAIN_BUFFER_PATH = "../data/train_buffer.tmp"
# File that read_validation() overwrites with the submitted validation text;
# main() reads it back for prediction.
VALIDATION_BUFFER_PATH = "../data/validate_buffer.tmp"
# File that read_validation() appends every submitted validation text to.
RUNNING_BUFFER_PATH = "../data/running_buffer.tmp"
# Set to True by read_train() once training text has been written; train() asserts on it.
TRAIN_DONE = False
# Set to True by read_validation() once validation text has been written.
VALIDATION_DONE = False
# Configuration file that set_params() loads with Params.from_file().
CONFIG_PATH = '../configs/template.jsonnet'

def printmd(string: str) -> None:
    """Show ``string`` in the cell output, wrapped in a ``Markdown`` display object."""
    rendered = Markdown(string)
    display(rendered)

def train_prompt() -> None:
    """Display a labelled text box and register ``read_train`` as its submit callback."""
    box_layout = Layout(width='70%')
    training_box = widgets.Text(layout=box_layout)
    printmd("**Training text:**")
    display(training_box)
    # read_train receives the widget itself and reads the text from its `value`.
    training_box.on_submit(read_train)

def validation_prompt() -> None:
    """Display a labelled text box and register ``read_validation`` as its submit callback."""
    box_layout = Layout(width='70%')
    validation_box = widgets.Text(layout=box_layout)
    printmd("**Validation text:**")
    display(validation_box)
    # read_validation receives the widget itself and reads the text from its `value`.
    validation_box.on_submit(read_validation)

def read_train(sender: widgets.Text) -> None:
    """Write the submitted training text to ``TRAIN_BUFFER_PATH`` and mark training input ready.

    Registered by ``train_prompt`` as the text box's submit callback.

    Args:
        sender: The text widget that was submitted; its ``value`` is the text to store.

    Raises:
        AssertionError: If the directory that should hold the buffer file does not exist.
    """
    global TRAIN_DONE
    # Derive the directory from the path constant so the check always matches
    # the file that is about to be written. A bare filename has an empty
    # dirname, which means the current working directory.
    buffer_dir = os.path.dirname(TRAIN_BUFFER_PATH) or "."
    assert os.path.isdir(buffer_dir), "Missing buffer directory: " + buffer_dir
    with open(TRAIN_BUFFER_PATH, 'w') as train_file:
        train_file.write(sender.value)
    printmd("**Writing to:** " + TRAIN_BUFFER_PATH)
    printmd("**Wrote:** " + sender.value)
    # train() asserts on this flag before it starts training.
    TRAIN_DONE = True

def read_validation(sender: widgets.Text) -> None:
    """Write the submitted validation text to the validation and running buffers.

    Registered by ``validation_prompt`` as the text box's submit callback.
    ``VALIDATION_BUFFER_PATH`` is overwritten with the text, while
    ``RUNNING_BUFFER_PATH`` has the text appended to it.

    Args:
        sender: The text widget that was submitted; its ``value`` is the text to store.

    Raises:
        AssertionError: If a directory that should hold one of the buffer files
            does not exist.
    """
    global VALIDATION_DONE
    # Derive each directory from its path constant so the check always matches
    # the files about to be written. Both are checked up front so a missing
    # directory is reported before anything is written.
    for buffer_path in (VALIDATION_BUFFER_PATH, RUNNING_BUFFER_PATH):
        buffer_dir = os.path.dirname(buffer_path) or "."
        assert os.path.isdir(buffer_dir), "Missing buffer directory: " + buffer_dir
    with open(VALIDATION_BUFFER_PATH, 'w') as valid_file:
        valid_file.write(sender.value)
    # NOTE(review): successive submissions are appended with no separator between
    # them -- confirm that whatever consumes the running buffer expects this.
    with open(RUNNING_BUFFER_PATH, 'a') as running_buffer:
        running_buffer.write(sender.value)
    printmd("**Writing to:** " + VALIDATION_BUFFER_PATH)
    printmd("**Wrote:** " + sender.value)
    VALIDATION_DONE = True
    
def set_params(config_path: str, train_buffer_path: str) -> Params:
    """Load the configuration at ``config_path`` and point its training data at a buffer file.

    Args:
        config_path: Path of the configuration file passed to ``Params.from_file``.
        train_buffer_path: Value stored under the ``"train_data_path"`` key.

    Returns:
        The loaded ``Params`` with ``"train_data_path"`` overridden.
    """
    loaded = Params.from_file(config_path)
    loaded["train_data_path"] = train_buffer_path
    return loaded

"""
I lived in *Munich last summer. *Germany has a relaxing, slow summer lifestyle. 
One night, I got food poisoning and couldn't find !Tylenol to make the 
pain go away, they insisted I take !aspirin instead.
"""

"""
When I lived in Paris last year, France was experiencing a recession. 
The night life was too fun, I developed an addiction to Adderall and cocaine.
"""

"""
It was a cold, wintery day in Cambridge, England. Michael's grandparents had moved down to Tampa Bay, Florida. 
The skiiers were on their way to Denver, Colorado for Spring break. 
I went to my first concert in Las Vegas, Nevada. I hate Brussel sprouts. 
I think Somerville is one my favorite cities to live it. 
Computer programming code is developed in Seattle, Washington.
"""

if __name__ == "__main__":
    # Overwrite the running buffer with a single space before any validation
    # text gets appended to it by read_validation().
    with open(RUNNING_BUFFER_PATH, 'w') as running_buffer:
        running_buffer.write(" ")
    # Show the training-text box; submitting it invokes read_train().
    train_prompt()

**Training text:**

Text(value='', layout=Layout(width='70%'))

**Writing to:** ../data/train_buffer.tmp

**Wrote:** I lived in *Munich last summer. *Germany has a relaxing, slow summer lifestyle.  One night, I got food poisoning and couldn't find !Tylenol to make the  pain go away, they insisted I take !aspirin instead.

In [3]:
def train() -> Tuple[Model, Params, str]:
    """Train a model on the text stored in the training buffer.

    Returns:
        A 3-tuple ``(model, params, extension_pretrained_file)``: the result of
        ``train_model``, the ``Params`` object that was passed to it, and the
        value found under ``vocabulary -> pretrained_files -> tokens`` in the
        configuration.

    Raises:
        AssertionError: If no training text has been submitted yet
            (``TRAIN_DONE`` is still false).
    """
    assert TRAIN_DONE

    # Set parameters.
    params = set_params(CONFIG_PATH, TRAIN_BUFFER_PATH)

    # Grab pretrained file. The lookups run on a duplicate, presumably so that
    # they cannot disturb the `params` object handed to train_model below.
    parms = params.duplicate()
    vocab_params = parms.get(key="vocabulary")
    pretrained_files_params = vocab_params.get(key="pretrained_files")
    extension_pretrained_file = pretrained_files_params.get(key="tokens")

    # The serialization directory is only scratch space; remove it even when
    # training fails so that aborted runs do not leave temp directories behind.
    serialization_dir = tempfile.mkdtemp()
    try:
        model = train_model(params, serialization_dir)
    finally:
        shutil.rmtree(serialization_dir)

    return model, params, extension_pretrained_file

if __name__ == "__main__":
    # Keep all three results as notebook globals; the prediction cell passes them to main().
    model, params, extension_pretrained_file = train()

0it [00:00, ?it/s]
1it [00:00, 312.75it/s]

0it [00:00, ?it/s]
1it [00:00, 3650.40it/s]

  0%|          | 0/500000 [00:00<?, ?it/s]
100%|##########| 500000/500000 [00:06<00:00, 77540.84it/s]

  0%|          | 0/500000 [00:00<?, ?it/s]
100%|##########| 500000/500000 [00:06<00:00, 79264.75it/s]

  0%|          | 0/1 [00:00<?, ?it/s]
accuracy: 0.8333, loss: 0.9315 ||: 100%|##########| 1/1 [00:00<00:00, 12.24it/s]



===MODEL DEBUG===
Number of embeddings: 37
vocab: Vocabulary with namespaces:
 	Non Padded Namespaces: {'*labels', '*tags'}
 	Namespace: tokens, Size: 37 
 	Namespace: labels, Size: 3 

===MODEL DEBUG===
===MODEL DEBUG===
index to token vocab: 37
Sentence: {'tokens': tensor([[ 2,  6,  7,  8,  9,  5,  3, 10, 11, 12, 13,  4, 14,  5, 15,  3, 16, 17,
          4,  2, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,  4, 32,
         33,  2, 34, 35, 36,  3]])}
Shape of embeddings: torch.Size([1, 42, 300])
===MODEL DEBUG===


In [4]:
def main(model: Model, params: Params, extension_pretrained_file: str,
         print_stdout: bool = False, log_path: str = "log.log") -> List[List[str]]:
    """Tag the text in the validation buffer and return ``[token, tag]`` pairs.

    The model's vocabulary and token embedder are first extended with the
    instances read from ``VALIDATION_BUFFER_PATH``. The whole file is then
    passed to the predictor as a single string, and every token is paired with
    the tag at the same position.

    Args:
        model: Trained model; its vocabulary and token embedder are modified in place.
        params: Passed through to ``model.vocab.extend_from_instances``.
        extension_pretrained_file: Embedding source used when extending the
            token embedder.
        print_stdout: When true, also print each ``tag + token`` line to stdout.
        log_path: File that each ``tag + token`` line is appended to.

    Returns:
        One ``[token_text, tag]`` list per token, in reading order.
    """
    test_path = VALIDATION_BUFFER_PATH

    # Get test vocab.
    reader = InstDatasetReader()
    test_dataset = reader.read(test_path) # Change to temp file.
    print("Test_dataset:", str(test_dataset))

    # Extend vocabulary.
    embedding_sources_mapping = {"word_embeddings.token_embedder_tokens": extension_pretrained_file}
    model.vocab.extend_from_instances(params, test_dataset)
    model.extend_embedder_vocab(embedding_sources_mapping)

    # Make predictions
    predictor = InstPredictor(model, dataset_reader=InstDatasetReader())
    with open(test_path, "r") as text_file:
        lines = text_file.readlines()
    all_text = " ".join(lines) # Makes it all 1 batch.
    output_dict = predictor.predict(all_text)
    tags = output_dict['tags']
    dataset_reader = InstDatasetReader()

    out = []

    with open(log_path, 'a') as log:
        for instance in dataset_reader._read(test_path):
            tokenlist = list(instance['sentence'])
            # NOTE(review): `i` restarts at 0 for every instance while `tags` covers
            # the whole text as one batch; if the reader ever yields more than one
            # instance the tags would misalign -- confirm against InstDatasetReader.
            for i, token in enumerate(tokenlist):
                tag = tags[i]
                token_text = str(token)
                log.write(tag + token_text + "\n")
                if print_stdout:
                    print(tag + token_text)
                out.append([token_text, tag])

    # Allennlp seems to only support extending the vocabulary once.
    # This is a hack to modify the `old` number of embeddings at each iteration.
    extended_num_embeddings = len(model.vocab.get_index_to_token_vocabulary(namespace='tokens'))
    model.word_embeddings.token_embedder_tokens.num_embeddings = extended_num_embeddings

    print("DONE.")
    return out

In [5]:
# Blue: #3399ff
# Red: #ff5050

CITY_SPAN_OPEN = '<span style="background-color: #ff5050">'
DRUG_SPAN_OPEN = '<span style="background-color: #3399ff">'
SPAN_CLOSE = "</span>"

def generate_markdown(out: List[List[str]]) -> List[str]:
    """Turn ``[token, tag]`` pairs into HTML/Markdown fragments with highlighted entities.

    A token tagged ``'*'`` (city) or ``'!'`` (drug) contributes two fragments:
    the matching opening ``<span>`` tag, then the token followed by the closing
    tag. Any other token contributes just its text.

    Args:
        out: Sequence of ``[token, tag]`` pairs.

    Returns:
        The fragments in input order, ready to be joined by the caller.
    """
    fragments = []
    for pair in out:
        tag = pair[1]
        if tag == '*':  # Cities
            opener = CITY_SPAN_OPEN
        elif tag == '!':  # Drugs
            opener = DRUG_SPAN_OPEN
        else:
            opener = None

        if opener is None:
            fragments.append(pair[0])
        else:
            fragments.extend((opener, pair[0] + SPAN_CLOSE))
    return fragments
 
"""
When I lived in Paris last year, France was experiencing a recession. 
The night life was too fun, I developed an addiction to Adderall and cocaine.
"""
"""
It was a cold, wintery day in Cambridge, England. Michael's grandparents had moved down to Tampa Bay, Florida. 
The skiiers were on their way to Denver, Colorado for Spring break. 
I went to my first concert in Las Vegas, Nevada. I hate Brussel sprouts. 
I think Somerville is one my favorite cities to live it. 
Computer programming code is developed in Seattle, Washington.
"""

"\nIt was a cold, wintery day in Cambridge, England. Michael's grandparents had moved down to Tampa Bay, Florida. \nThe skiiers were on their way to Denver, Colorado for Spring break. \nI went to my first concert in Las Vegas, Nevada. I hate Brussel sprouts. \nI think Somerville is one my favorite cities to live it. \nComputer programming code is developed in Seattle, Washington.\n"

In [6]:
# Clear the flag so that the `assert VALIDATION_DONE` guarding prediction only
# passes after new validation text has been submitted. (No `global` statement
# is needed: this cell already runs at module scope.)
VALIDATION_DONE = False
validation_prompt()

**Validation text:**

Text(value='', layout=Layout(width='70%'))

**Writing to:** ../data/validate_buffer.tmp

**Wrote:** When I lived in Paris last year, France was experiencing a recession.  The night life was too fun, I developed an addiction to Adderall and cocaine.

**Writing to:** ../data/validate_buffer.tmp

**Wrote:** It was a cold, wintery day in Cambridge, England. Michael's grandparents had moved down to Tampa Bay, Florida.  The skiiers were on their way to Denver, Colorado for Spring break.  I went to my first concert in Las Vegas, Nevada. I hate Brussel sprouts.  I think Somerville is one my favorite cities to live it.  Computer programming code is developed in Seattle, Washington.

**Writing to:** ../data/validate_buffer.tmp

**Wrote:** When I lived in Paris last year, France was experiencing a recession.  The night life was too fun, I developed an addiction to Adderall and cocaine.

**Writing to:** ../data/validate_buffer.tmp

**Wrote:** When I lived in Paris last year, France was experiencing a recession.  The night life was too fun, I developed an addiction to Adderall and cocaine.

In [12]:
# Refuse to predict until read_validation() has stored submitted text.
assert VALIDATION_DONE
# Tag the validation text with the trained model, then render the tokens as
# Markdown with the tagged entities highlighted.
out = main(model, params, extension_pretrained_file)
md_list = generate_markdown(out)
md = ' '.join(md_list)
printmd(md)

1it [00:00, 286.38it/s]
100%|██████████| 1/1 [00:00<00:00, 4681.14it/s]

Test_dataset: [<allennlp.data.instance.Instance object at 0x7fd60c564a58>]
===MODEL DEBUG===
index to token vocab: 101
Sentence: {'tokens': tensor([[38,  2,  6,  7, 39,  9, 40,  4, 41, 37, 42, 12, 43,  3, 44, 17, 45, 37,
         46, 47,  4,  2, 48, 49, 50, 26, 51, 21, 52,  3]])}
Shape of embeddings: torch.Size([1, 30, 300])
===MODEL DEBUG===
DONE.





When I lived in <span style="background-color: #ff5050"> Paris</span> last year , <span style="background-color: #ff5050"> France</span> was experiencing a recession . The night life was too fun , I developed an addiction to <span style="background-color: #3399ff"> Adderall</span> and <span style="background-color: #3399ff"> cocaine</span> .