In [12]:
import os
import json
import time
import shutil
import numpy as np
import tempfile

from typing import List

from allennlp.common.params import Params
from allennlp.commands.train import train_model
from allennlp.models import Model
from allennlp.data import Vocabulary

from IPython.display import Markdown, display
from ipywidgets import Layout, widgets

from model import InstEntityTagger
from predictor import InstPredictor
from dataset_reader import InstDatasetReader

# from notebook_utils import printmd, train_prompt, validation_prompt

print("Imports complete.")

Imports complete.


In [15]:
# Global variables.
TRAIN_BUFFER_PATH = "../data/train_buffer.tmp"
VALIDATION_BUFFER_PATH = "../data/validate_buffer.tmp"
RUNNING_BUFFER_PATH = "../data/running_buffer.tmp"
TRAIN_DONE = False
VALIDATION_DONE = False
CONFIG_PATH = '../configs/template.jsonnet'

def printmd(string: str) -> None:
    display(Markdown(string))

def train_prompt() -> None:
    train_text = widgets.Text(layout=Layout(width='70%'))
    printmd("**Training text:**")
    display(train_text)
    train_text.on_submit(read_train)

def validation_prompt() -> None:
    valid_text = widgets.Text(layout=Layout(width='70%'))
    printmd("**Validation text:**")
    display(valid_text)
    valid_text.on_submit(read_validation)

def read_train(sender: widgets.Text) -> None:
    global TRAIN_DONE
    assert os.path.exists("../data/") # Avoid absolute paths?
    # assert not os.path.isfile(TRAIN_BUFFER_PATH)
    with open(TRAIN_BUFFER_PATH, 'w') as train_file:
        train_file.write(sender.value)
    printmd("**Writing to:** " + TRAIN_BUFFER_PATH)
    printmd("**Wrote:** " + sender.value)
    TRAIN_DONE = True

def read_validation(sender: widgets.Text) -> None:
    global VALIDATION_DONE
    assert os.path.exists("../data/") # Avoid absolute paths?
    # assert not os.path.isfile(TRAIN_BUFFER_PATH)
    with open(VALIDATION_BUFFER_PATH, 'w') as valid_file:
        valid_file.write(sender.value)
    with open(RUNNING_BUFFER_PATH, 'a') as running_buffer:
        running_buffer.write(sender.value)
    printmd("**Writing to:** " + VALIDATION_BUFFER_PATH)
    printmd("**Wrote:** " + sender.value)
    VALIDATION_DONE = True
    
def set_params(config_path: str, train_buffer_path: str) -> Params:
    # Modifying parameter values
    params = Params.from_file(config_path)
    params.__setitem__("train_data_path", train_buffer_path)
    print(json.dumps(params.as_dict(), indent=4))
    return params

"""
I lived in *Munich last summer. *Germany has a relaxing, slow summer lifestyle. 
One night, I got food poisoning and couldn't find !Tylenol to make the 
pain go away, they insisted I take !aspirin instead.
"""

"""
When I lived in Paris last year, France was experiencing a recession. 
The night life was too fun, I developed an addiction to Adderall and cocaine.
"""

"""
It was a cold, wintery day in Cambridge, England. Michael's grandparents had moved down to Tampa Bay, Florida. 
The skiiers were on their way to Denver, Colorado for Spring break. 
I went to my first concert in Las Vegas, Nevada. I hate Brussel sprouts. 
I think Somerville is one my favorite cities to live it. 
Computer programming code is developed in Seattle, Washington.
"""

if __name__ == "__main__":
    with open(RUNNING_BUFFER_PATH, 'w') as running_buffer:
        running_buffer.write(" ")
    train_prompt()

**Training text:**

Text(value='', layout=Layout(width='70%'))

**Writing to:** ../data/train_buffer.tmp

**Wrote:** I lived in *Munich last summer. *Germany has a relaxing, slow summer lifestyle.  One night, I got food poisoning and couldn't find !Tylenol to make the  pain go away, they insisted I take !aspirin instead.

In [16]:
def train() -> Model:
    assert TRAIN_DONE
    
    # Set parameters. 
    params = set_params(CONFIG_PATH, TRAIN_BUFFER_PATH)
    
    parms = params.duplicate()
    serialization_dir = tempfile.mkdtemp()
    model = train_model(params, serialization_dir)
    shutil.rmtree(serialization_dir)
    return model, parms
if __name__ == "__main__":
    model, parms = train()

0it [00:00, ?it/s]
1it [00:00, 153.55it/s]

0it [00:00, ?it/s]
1it [00:00, 2847.46it/s]

  0%|          | 0/500000 [00:00<?, ?it/s]


{
    "dataset_reader": {
        "type": "inst_dataset_reader"
    },
    "train_data_path": "../data/train_buffer.tmp",
    "trainer": {
        "num_epochs": 1,
        "optimizer": {
            "type": "adam"
        }
    },
    "model": {
        "encoder": {
            "simrel": {
                "input_dim": 300,
                "num_classes": 3
            },
            "type": "CosineEncoder"
        },
        "type": "inst_entity_tagger",
        "word_embeddings": {
            "token_embedders": {
                "tokens": {
                    "embedding_dim": 300,
                    "pretrained_file": "~/packages/data/instantnll/GoogleNews-vectors-negative300_SUBSET.txt",
                    "type": "embedding"
                }
            },
            "type": "basic"
        }
    },
    "iterator": {
        "batch_size": 1,
        "type": "basic"
    },
    "vocabulary": {
        "pretrained_files": {
            "tokens": "~/packages/data/instantnll/GoogleN

100%|##########| 500000/500000 [00:07<00:00, 67172.34it/s]

  0%|          | 0/500000 [00:00<?, ?it/s]
100%|##########| 500000/500000 [00:06<00:00, 76631.34it/s]

  0%|          | 0/1 [00:00<?, ?it/s]
accuracy: 0.8333, loss: 0.9315 ||: 100%|##########| 1/1 [00:00<00:00, 19.70it/s]



===allennlp.modules.token_embedders.embedding.py===
Extending vocab. 
===allennlp.modules.token_embedders.embedding.py===


In [31]:
def main(model: Model, parms: Params) -> List[List[str]]:
    
    test_path = VALIDATION_BUFFER_PATH
    train_path = TRAIN_BUFFER_PATH
    run_path = RUNNING_BUFFER_PATH

    # Grab pretrained file.
    vocab_params = parms.get(key="vocabulary")
    pretrained_files_params = vocab_params.get(key="pretrained_files")
    extension_pretrained_file = pretrained_files_params.get(key="tokens")
    
    # Get test vocab.
    reader = InstDatasetReader()
    test_dataset = reader.read(run_path) # Change to temp file.
    train_dataset = reader.read(train_path) # Change to temp file.
    extended_vocab = Vocabulary.from_instances(train_dataset + test_dataset)
    print("Test_dataset:", str(test_dataset))
    
    print(extended_vocab)

    # Extend vocabulary.
    token_embedders = model.word_embeddings._token_embedders # pylint: disable=protected-access
    embedding = token_embedders['tokens']
    namespace = 'tokens'
    print(embedding.num_embeddings)
    embedding.extend_vocab(extended_vocab, namespace, extension_pretrained_file)
    token_embedders['tokens'] = embedding
    model.word_embeddings._token_embedders = token_embedders # pylint: disable=protected-access
    print(embedding.num_embeddings)

    # Make predictions
    predictor = InstPredictor(model, dataset_reader=InstDatasetReader())
    with open(test_path, "r") as text_file:
        lines = text_file.readlines()
    all_text = " ".join(lines) # Makes it all 1 batch.
    print(all_text)
    output_dict = predictor.predict(all_text)
    tags = output_dict['tags']
    print(tags)
    dataset_reader = InstDatasetReader()

    PRINT_STDOUT = False
    out = []
    
    with open("log.log", 'a') as log:
        for instance in dataset_reader._read(test_path):
            tokenlist = list(instance['sentence'])
            for i, token in enumerate(tokenlist):
                log.write(tags[i] + str(token) + "\n")
                if PRINT_STDOUT:
                    print(tags[i] + str(token))
                out.append([str(token), tags[i]])
    
    print("DONE.")
    return out

In [23]:
# Blue: #3399ff
# Red: #ff5050

CITY_SPAN_OPEN = "<span style=\"background-color: #ff5050\">"
DRUG_SPAN_OPEN = "<span style=\"background-color: #3399ff\">"
SPAN_CLOSE = "</span>"

def generate_markdown(out: List[List[str]]) -> List[str]:
    md_list = []
    for pair in out:
        if pair[1] == '*': # Cities
            md_list.append(CITY_SPAN_OPEN)
            md_list.append(pair[0] + SPAN_CLOSE)
        elif pair[1] == '!': # Drugs
            md_list.append(DRUG_SPAN_OPEN)
            md_list.append(pair[0] + SPAN_CLOSE)
        else:
            md_list.append(pair[0])
    return md_list
 
"""
When I lived in Paris last year, France was experiencing a recession. 
The night life was too fun, I developed an addiction to Adderall and cocaine.
"""
"""
It was a cold, wintery day in Cambridge, England. Michael's grandparents had moved down to Tampa Bay, Florida. 
The skiiers were on their way to Denver, Colorado for Spring break. 
I went to my first concert in Las Vegas, Nevada. I hate Brussel sprouts. 
I think Somerville is one my favorite cities to live it. 
Computer programming code is developed in Seattle, Washington.
"""

"\nIt was a cold, wintery day in Cambridge, England. Michael's grandparents had moved down to Tampa Bay, Florida. \nThe skiiers were on their way to Denver, Colorado for Spring break. \nI went to my first concert in Las Vegas, Nevada. I hate Brussel sprouts. \nI think Somerville is one my favorite cities to live it. \nComputer programming code is developed in Seattle, Washington.\n"

In [20]:
global VALIDATION_DONE
VALIDATION_DONE = False
validation_prompt()

**Validation text:**

Text(value='', layout=Layout(width='70%'))

**Writing to:** ../data/validate_buffer.tmp

**Wrote:** When I lived in Paris last year, France was experiencing a recession.  The night life was too fun, I developed an addiction to Adderall and cocaine.

In [32]:
assert VALIDATION_DONE
# parms = set_params(CONFIG_PATH, TRAIN_BUFFER_PATH, VALIDATION_BUFFER_PATH)
out = main(model, parms)
print(out)
md_list = generate_markdown(out)
md = ' '.join(md_list)
printmd(md)

1it [00:00, 688.04it/s]
1it [00:00, 474.63it/s]
100%|██████████| 2/2 [00:00<00:00, 4002.20it/s]
  0%|          | 0/500000 [00:00<?, ?it/s]

Test_dataset: [<allennlp.data.instance.Instance object at 0x7fa028f3da90>]
Vocabulary with namespaces:
 	Non Padded Namespaces: {'*tags', '*labels'}
 	Namespace: tokens, Size: 53 
 	Namespace: labels, Size: 3 

37
===allennlp.modules.token_embedders.embedding.py===
Extending vocab. 
===allennlp.modules.token_embedders.embedding.py===


100%|██████████| 500000/500000 [00:06<00:00, 76435.27it/s]

37
When I lived in Paris last year, France was experiencing a recession.  The night life was too fun, I developed an addiction to Adderall and cocaine.





TypeError: 'NoneType' object is not subscriptable