[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/brendanxwhitaker/instantnll/blob/dev/instantnll/instant_colab.ipynb)

<a href="https://colab.research.google.com/github/brendanxwhitaker/instantnll/blob/dev/instantnll/instant_colab.ipynb">direct link</a>

# Instant Natural Language Learner

## Global Variables and Configuration

In [0]:
# Configuration template. 
CONFIG_PATH = '../configs/template_elmo_colab.jsonnet'

# Temporary data buffers. 
TRAIN_BUFFER_PATH = "../data/train_buffer.tmp"
VALIDATION_BUFFER_PATH = "../data/validate_buffer.tmp"

# Check that data has been read. 
TRAIN_DONE = False
VALIDATION_DONE = False

# Class highlighting. 
try:
  import colorama
except:
  !pip install colorama
from colorama import Fore, Style
CITY_SPAN_OPEN = Fore.RED
DRUG_SPAN_OPEN = Fore.BLUE
SPAN_CLOSE = Fore.BLACK

# Special markdown printing function. 
def printmd(string: str) -> None:
    """Print ``string`` to standard output followed by a newline.

    Markdown markers in ``string`` are emitted verbatim; no rendering is
    performed here.
    """
    print(string)

# Imports and Function Definitions

In [35]:
import os
# Start from the Colab workspace root; the relative `instantnll` directory
# check further down in this cell is resolved against this directory.
os.chdir("/content")
import json
import time
import shutil
import numpy as np
import tempfile

from typing import List

try:
    from allennlp.common.params import Params
    from allennlp.commands.train import train_model
    from allennlp.models import Model
    from allennlp.data import Vocabulary
except:
    !pip install allennlp
    from allennlp.common.params import Params
    from allennlp.commands.train import train_model
    from allennlp.models import Model
    from allennlp.data import Vocabulary

from IPython.display import Markdown, display
from ipywidgets import Layout, widgets

try:
    from model import InstEntityTagger
    from predictor import InstPredictor
    from dataset_reader import InstDatasetReader
except:
    if not os.path.isdir("instantnll"):
      !git clone --single-branch --branch dev https://github.com/brendanxwhitaker/instantnll.git
    os.chdir("/content/instantnll/instantnll/")
    from model import InstEntityTagger
    from predictor import InstPredictor
    from dataset_reader import InstDatasetReader
    
try:
  import wget
except:
  !pip install wget
  import wget

printmd("Imports complete.")

os.chdir("/content/instantnll/data/")
if not os.path.isfile("/content/instantnll/data/GoogleNews-vectors-negative300_SUBSET_100000.txt"):
  os.chdir("/content/instantnll/data/")
  !wget --no-check-certificate -O "GoogleNews-vectors-negative300_SUBSET_100000.txt" "https://onedrive.live.com/download?cid=5BB216E1C6C27D43&resid=5BB216E1C6C27D43%2118409&authkey=AAeOmL7Rxsf_6Hk"
  os.chdir("/content/instantnll/instantnll")
  if not os.path.isfile("/content/instantnll/data/GoogleNews-vectors-negative300_SUBSET_100000.txt"):
    raise Exception("Couldn't download file.")

def read_train(train_text: str) -> None:
    """Store ``train_text`` in the training buffer file and mark it as read.

    Asserts that the relative ``../data/`` directory exists and that the text
    is neither empty nor whitespace-only, overwrites ``TRAIN_BUFFER_PATH``
    with the text, echoes the path and the text, and sets the module-level
    ``TRAIN_DONE`` flag to ``True``.
    """
    global TRAIN_DONE

    # Avoid absolute paths?
    assert os.path.exists("../data/")
    assert train_text != ""
    assert not train_text.isspace()

    with open(TRAIN_BUFFER_PATH, 'w') as buffer_file:
        buffer_file.write(train_text)

    destination_note = "**Writing to:** " + TRAIN_BUFFER_PATH
    content_note = "**Wrote:** " + train_text
    printmd(destination_note)
    printmd(content_note)

    TRAIN_DONE = True

def read_validation(valid_text: str) -> None:
    """Store ``valid_text`` in the validation buffer file and mark it as read.

    Asserts that the relative ``../data/`` directory exists and that the text
    is neither empty nor whitespace-only, overwrites
    ``VALIDATION_BUFFER_PATH`` with the text, echoes the path and the text,
    and sets the module-level ``VALIDATION_DONE`` flag to ``True``.
    """
    global VALIDATION_DONE

    # Avoid absolute paths?
    assert os.path.exists("../data/")
    assert valid_text != ""
    assert not valid_text.isspace()

    with open(VALIDATION_BUFFER_PATH, 'w') as buffer_file:
        buffer_file.write(valid_text)

    destination_note = "**Writing to:** " + VALIDATION_BUFFER_PATH
    content_note = "**Wrote:** " + valid_text
    printmd(destination_note)
    printmd(content_note)

    VALIDATION_DONE = True
    
def set_params(config_path: str, train_buffer_path: str) -> Params:
    """Load the configuration at ``config_path`` and point it at the training buffer.

    Returns the loaded ``Params`` with its ``"train_data_path"`` entry set to
    ``train_buffer_path``.
    """
    loaded = Params.from_file(config_path)
    loaded["train_data_path"] = train_buffer_path
    return loaded

def check_train_read(variable: bool) -> None:
    """Ensure training data has been read before continuing.

    Args:
        variable: Truthy once the training text has been submitted.

    Raises:
        AssertionError: If ``variable`` is falsy. A hint telling the user to
            enter the training data is printed first.
    """
    if not variable:
        # Adjacent literals instead of a backslash continuation, so the
        # source indentation does not end up inside the printed message.
        printmd("Enter training data above, "
                "press <Enter> to submit, then try again.")
        # Raised explicitly so the check still fires when asserts are
        # stripped (python -O).
        raise AssertionError("Training data has not been read.")
    
def check_validation_read(variable: bool) -> None:
    """Ensure validation data has been read before continuing.

    Args:
        variable: Truthy once the validation text has been submitted.

    Raises:
        AssertionError: If ``variable`` is falsy. A hint telling the user to
            enter the validation data is printed first.
    """
    if not variable:
        # Adjacent literals instead of a backslash continuation, so the
        # source indentation does not end up inside the printed message.
        printmd("Enter validation data above, "
                "press <Enter> to submit, then try again.")
        # Raised explicitly so the check still fires when asserts are
        # stripped (python -O).
        raise AssertionError("Validation data has not been read.")

def train() -> tuple:
    """Train a model on the buffered training text.

    Requires that the training text has been read (``TRAIN_DONE``). Training
    artifacts are written to a temporary serialization directory that is
    removed afterwards, including when training raises.

    Returns:
        A 4-tuple ``(model, params, params_copy, extension_pretrained_file)``:
        the value returned by ``train_model``, the ``Params`` object that was
        passed to it, a duplicate of those params taken before training, and
        the ``vocabulary.pretrained_files.tokens`` entry read from that
        duplicate.

    Raises:
        AssertionError: If no training data has been read yet.
    """
    check_train_read(TRAIN_DONE)

    # Set parameters.
    params = set_params(CONFIG_PATH, TRAIN_BUFFER_PATH)

    # Grab the pretrained file from a duplicate taken before `params` is
    # handed to the trainer, so the lookups do not touch the original.
    params_copy = params.duplicate()
    vocab_params = params_copy.get(key="vocabulary")
    pretrained_files_params = vocab_params.get(key="pretrained_files")
    extension_pretrained_file = pretrained_files_params.get(key="tokens")

    serialization_dir = tempfile.mkdtemp()
    try:
        model = train_model(params, serialization_dir)
    finally:
        # Clean up the scratch directory even if training fails.
        shutil.rmtree(serialization_dir)

    return model, params, params_copy, extension_pretrained_file

def main(model: Model, params: Params, edible_params: Params, extension_pretrained_file: str) -> List[List[str]]:
    """Tag the buffered validation text with ``model``.

    Args:
        model: The trained model; its vocabulary and token embedder are
            extended in place.
        params: Passed to ``model.vocab.extend_from_instances``.
        edible_params: Params whose ``'dataset_reader'`` section is read and
            modified (its ``'type'`` key is removed) to build the reader.
        extension_pretrained_file: Pretrained embedding file used when
            extending the token embedder.

    Returns:
        A list of ``[token, tag]`` pairs for the text in
        ``VALIDATION_BUFFER_PATH``. Each pair is also appended to ``log.log``
        in the working directory as a ``tag + token`` line.
    """
    test_path = VALIDATION_BUFFER_PATH

    # Construct the reader and predictor.
    reader_params = edible_params.get('dataset_reader')
    # Drop the 'type' key if present; a default avoids the blanket
    # `except: pass` that would also hide unrelated failures.
    reader_params.pop('type', None)
    reader = InstDatasetReader.from_params(params=reader_params)
    predictor = InstPredictor(model, dataset_reader=reader)

    # Get test vocab.
    test_dataset = reader.read(test_path) # Change to temp file.

    # Extend vocabulary.
    embedding_sources_mapping = {"word_embeddings.token_embedder_tokens": extension_pretrained_file}
    model.vocab.extend_from_instances(params, test_dataset)
    model.extend_embedder_vocab(embedding_sources_mapping)

    # Make predictions
    with open(test_path, "r") as text_file:
        lines = text_file.readlines()
    all_text = " ".join(lines) # Makes it all 1 batch.
    output_dict = predictor.predict(all_text)
    tags = output_dict['tags']
    # A second, default-constructed reader is used to re-tokenize the file.
    dataset_reader = InstDatasetReader()

    PRINT_STDOUT = False
    out = []

    with open("log.log", 'a') as log:
        for instance in dataset_reader._read(test_path):
            tokenlist = list(instance['sentence'])
            # NOTE(review): `i` restarts at 0 for every instance while `tags`
            # comes from one prediction over the whole joined text; this
            # lines up only if the reader yields a single instance -- confirm.
            for i, token in enumerate(tokenlist):
                tag = tags[i]
                word = str(token)
                log.write(tag + word + "\n")
                if PRINT_STDOUT:
                    print(tag + word)
                out.append([word, tag])

    # Allennlp seems to only support extending the vocabulary once.
    # This is a hack to modify the `old` number of embeddings at each iteration.
    extended_num_embeddings = len(model.vocab.get_index_to_token_vocabulary(namespace='tokens'))
    model.word_embeddings.token_embedder_tokens.num_embeddings = extended_num_embeddings

    return out

def generate_markdown(out: List[List[str]]) -> List[str]:
    """Turn ``[token, tag]`` pairs into a flat list of printable pieces.

    A token tagged ``'*'`` (city) or ``'!'`` (drug) contributes two entries:
    the matching span opener, then the token with ``SPAN_CLOSE`` appended.
    Any other token contributes itself unchanged.
    """
    pieces = []
    for pair in out:
        tag = pair[1]
        if tag == '*':  # Cities
            opener = CITY_SPAN_OPEN
        elif tag == '!':  # Drugs
            opener = DRUG_SPAN_OPEN
        else:
            pieces.append(pair[0])
            continue
        pieces.append(opener)
        pieces.append(pair[0] + SPAN_CLOSE)
    return pieces

Imports complete.


# Text Samples for Training

I lived in *Munich last summer. *Germany has a relaxing, slow summer lifestyle. One night, I got food poisoning and couldn't find !Tylenol to make the pain go away, they insisted I take !aspirin instead. 

# Read in Training Example

In [36]:
#@title Training Example

TrainingText = "I lived in *Munich last summer. *Germany has a relaxing, slow summer lifestyle. One night, I got food poisoning and couldn't find !Tylenol to make the pain go away, they insisted I take !aspirin instead." #@param {type:"string"}
# Write the form text to the training buffer and mark training data as read.
read_train(TrainingText)

**Writing to:** ../data/train_buffer.tmp
**Wrote:** I lived in *Munich last summer. *Germany has a relaxing, slow summer lifestyle. One night, I got food poisoning and couldn't find !Tylenol to make the pain go away, they insisted I take !aspirin instead.


# Train Model

In [37]:
# Train on the buffered example; the extra return values are reused by the
# prediction cell.
model, params, params_copy, extension_pretrained_file = train()

0it [00:00, ?it/s]
1it [00:00, 920.41it/s]

0it [00:00, ?it/s]
1it [00:00, 3515.76it/s]

  0%|          | 0/100000 [00:00<?, ?it/s]
100%|##########| 100000/100000 [00:00<00:00, 277641.42it/s]

  0%|          | 0/100000 [00:00<?, ?it/s]
100%|##########| 100000/100000 [00:00<00:00, 276802.05it/s]

  0%|          | 0/1 [00:00<?, ?it/s]
accuracy: 0.8333, precision 0: 1.0000, precision 1: 0.3333, precision 2: 0.4000, recall 0: 0.8158, recall 1: 1.0000, recall 2: 1.0000, fscore 0: 0.8986, fscore 1: 0.5000, fscore 2: 0.5714, loss: 0.9345 ||: 100%|##########| 1/1 [00:00<00:00, 27.18it/s]



# Text Samples for Prediction

When I lived in Paris last year, France was experiencing a recession. 
The night life was too fun, I developed an addiction to Adderall and cocaine.

It was a cold, wintery day in Cambridge, England. Michael's grandparents had moved down to Tampa Bay, Florida. 
The skiers were on their way to Denver, Colorado for Spring break. 
I went to my first concert in Las Vegas, Nevada. I hate Brussel sprouts. 
I think Somerville is one of my favorite cities to live in. 
Computer programming code is developed in Seattle, Washington.
<br/>
<br/>


# Enter Validation Example

In [40]:
# Reset the flag before reading new validation text. (`global` at module
# level has no effect; the assignment below already targets the module name.)
global VALIDATION_DONE
VALIDATION_DONE = False
#@title Validation Example

ValidationText = "When I lived in Paris last year, France was experiencing a recession. The night life was too fun, I developed an addiction to Adderall and cocaine." #@param {type:"string"}
# Write the form text to the validation buffer and mark validation data as read.
read_validation(ValidationText)

**Writing to:** ../data/validate_buffer.tmp
**Wrote:** When I lived in Paris last year, France was experiencing a recession. The night life was too fun, I developed an addiction to Adderall and cocaine.


# Predict

In [43]:
# Refuse to predict until validation text has been submitted.
check_validation_read(VALIDATION_DONE)
# Work on a fresh duplicate: main() removes keys from the params it is given.
edible_params = params_copy.duplicate()
out = main(model, params, edible_params, extension_pretrained_file)
# Join the highlighted pieces into one space-separated line and print it.
md_list = generate_markdown(out)
md = ' '.join(md_list)
print("\n")
printmd(md)

1it [00:00, 748.18it/s]
100%|██████████| 1/1 [00:00<00:00, 4284.27it/s]



When I lived in [31m Paris[30m last year , [31m France[30m was experiencing a recession . The night life was too fun , I developed an addiction to [34m Adderall[30m and [34m cocaine[30m .



