# Imports

In [1]:
# Imports and process-wide configuration for the notebook.
import sys
import os
import json
import pprint
import random

# Keras reads KERAS_BACKEND when it is first imported. The frameit imports below
# pull Keras in (this cell prints "Using TensorFlow backend."), so the variable
# has to be set before them to have any effect.
os.environ["KERAS_BACKEND"] = "tensorflow"

import pandas as pd

from frameit.corpus import Corpus
from frameit.utils import *
from frameit.drop_gold_from_train import dropRepeats
from frameit.EvalAFrame import evalFrame

# Pretty-printer for inspecting nested structures interactively.
pp = pprint.PrettyPrinter(indent=4)

Loading the en model

    [93mInfo about spaCy[0m

    spaCy version      2.0.11         
    Location           /home/ubuntu/miniconda3/envs/dev_framers/lib/python3.6/site-packages/spacy
    Platform           Linux-4.4.0-1049-aws-x86_64-with-debian-stretch-sid
    Python version     3.6.5          
    Models             en_core_web_lg, en, fr



Using TensorFlow backend.


# Loading the corpus

In [None]:
# Path to the corpus CSV. The dropRepeats cell that follows reassigns corpus_file.
corpus_file = "./resources/ty_data/questions/qa_questions.csv"

In [2]:
# If you have a gold set of positive and negative examples in an XML file, you can drop those examples
# from the training data with the following code. If your positive and negative gold examples are in the
# same file, you can pass that file to both parameters: positive examples in the negative file and
# negative examples in the positive file will simply be ignored.
# NOTE(review): corpus_file is both an input and an output here, so re-running this cell feeds the
# previous result back in instead of the original path.
# NOTE(review): the meaning of the trailing 100 is not visible in this notebook; check dropRepeats.
positive_example_file = 'gold_positive.xml'
negative_example_file = 'gold_negative.xml'
corpus_file, gold_file = dropRepeats(corpus_file, positive_example_file, negative_example_file, 100)

In [3]:
# Corpus data should have one sentence per line in a column titled "text"; any other columns are ignored.
# When loading a new corpus for the first time, set build_index=True to create the indices needed to
# process the data. Otherwise keep build_index=False to skip that step and significantly speed up runtime.
corpus = Corpus(corpus_file, build_index=False)

init Corpus
Parsing the Semafor data... 
Parsing the DeepSRL data... 
Creating Utterances...


100%|██████████| 31740/31740 [00:00<00:00, 150240.16it/s]


Loading indices...
Loading lemma indices...


# Constructing a positive set for training

## A starting point for the positive set

In [4]:
# Placeholder strings: replace them with strings you expect to appear in positive sentences
# for the intent you want to extract.
positive_strings = ['example', 'strings', 'that would be in the', 'positive', 'sentences', 'for', 'the intent',
                   'that you want', 'to extract']
# Alternative example list:
# positive_strings = ['open', 'close', 'when', 'hours', 'late', 'early']
positive_utterances = build_positive_set(corpus, positive_strings)
# Note: for exact matches of the strings, use the above call to build_positive_set().
# To also include matches of all tenses and plural/singular forms of all words in the strings,
# use add_lemmas_to_set() and pass the set built so far as existing_set.
lemma_strings = ['run', 'dance']
positive_utterances = add_lemmas_to_set(corpus, lemma_strings, existing_set=positive_utterances)

There are 5187 relevant messages in the corpus
There are 5241 relevant messages in the corpus


## Optional: expand using hypernyms

In [5]:
# A hypernym h of a word w is a more generic term that includes w as part of its semantic field.
# For example, "bird" is a hypernym of "pigeon", "eagle", "falcon", etc. "Animal" is a hypernym of "bird".

# Expanding with hypernyms may not always be appropriate. You may also want to use a different set of
# terms than the full list of positive_strings defined earlier.
# NOTE(review): positive_utterances is read and reassigned here, so re-running this cell expands the
# already-expanded result rather than the original positive set.

positive_utterances = expand_with_hypernym(positive_utterances, positive_strings, corpus)

strings
that would be in the
positive
sentences
for
the intent
that you want
to extract
Number of strings for which no hypernyms were found  8
case
example
time
tasting
There are 6003 relevant messages in the corpus


## Sample sentences to check positive set accuracy

In [6]:
# Print a random sample of up to 20 utterances so the positive set can be checked by eye.
# list(...) keeps random.sample working when positive_utterances is a set (sampling directly
# from a set raises TypeError on Python 3.11+); min(...) avoids the ValueError that
# random.sample raises when fewer than 20 utterances are available.
sample_size = min(20, len(positive_utterances))
for utterance in random.sample(list(positive_utterances), sample_size):
    print(utterance.text)
    print()

We are getting brunch in the area so we are staying parked in the garage for about another hours or so, is that okay?

tell me the user name for wifi?

Is there a charge for WiFi?

Could we organize for a taxi to take us to the airport at 12:45?

Do you have any vases for flowers I can borrow?

Is there any chance for a late check-out of 2pm tomorrow?

Can we arrange for an early check in?

may i request for early check in tomorrow?

Can I get a reservation for 2 at Marsh House tonight 7pm please?

Do you have any for purchase?

Was there anything special needed for checkout?

Can I pre-authorize a credit card for incidentals?

be able to assist in getting a rental car or zip car type of thing for this afternoon?

Is it possible to order espn for the apartment so we can watch the us open?

Is there anyway we can switch for today?

Is there anyway I can have a room close to guests staying for the wedding?

Do you ask for security deposits or is that included in the nightly cost?

Do you

## Trimming the positive set

In [7]:
# Remove bad examples from the positive set. trim_examples also returns a second value, kept here as
# negative_set, which can optionally be used as a negative set.
# The strings below are placeholders: replace them with strings that occur in the positive set but
# correspond to examples that are not positive.
remove_list = ['strings', 'that occur', 'in the positive set', 'that correspond', 'to examples',
               'that are not positive']
positive_utterances, negative_set = trim_examples(positive_utterances, remove_list)

There are 6003 relevant messages in the corpus


# Save the data for training

## Specify hyperparameters for training

If you like, you can customize the hyperparameters for the training function. Otherwise, the function will be run with the default values.

In [8]:
# Training hyperparameters; they are handed to save_frame_training_info_to_file in the following cell.
scale_to = 700      # NOTE(review): meaning not visible in this notebook; check the training function
epochs = 40         # number of training epochs
batch_size = 1400
reg_param = 0.02    # presumably a regularisation strength; confirm against the training function

In [9]:
# Give the frame a name and save its training information to a file: the corpus path, the positive
# and negative sets, and the hyperparameters defined in the previous cell.
frame_filename = 'frame_training_info.json'
frame_name = "Your Frame Name"
save_frame_training_info_to_file(frame_name, corpus_file, positive_utterances, negative_set,
                                scale_to, epochs, batch_size, reg_param, frame_filename)

Saved info with filename frame_training_info.json.


In [10]:
# Train the frame from the training info saved above. The first argument, "test_frame.json", is the
# same filename that a later cell loads with Frame.load.
# "attr1.json" and "attr2.json" are passed as machine-learning and lambda attribute files respectively.
from frameit.train_from_exploration import train_frame_wrapper
train_frame_wrapper("test_frame.json", frame_file=frame_filename, ml_attr_files=["attr1.json"], 
                    lambda_attr_files=["attr2.json"])

init Corpus
Parsing the Semafor data... 
Parsing the DeepSRL data... 
Creating Utterances...


100%|██████████| 31740/31740 [00:00<00:00, 92274.45it/s]


Loading indices...
Loading lemma indices...
Importing machine learning attributes
Training  Attribute 1
Importing lambda_rule attributes
Training  Proper Noun Attribute
Rebuilding frame
Instructions for updating:
`NHWC` for data_format is deprecated, use `NWC` instead
Train on 10806 samples, validate on 1200 samples
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40
Saving frame to file
Done!


In [11]:
# For testing: load the trained frame, register it with an SRL instance and parse a sample sentence.
from frameit import SRL, Frame
srl = SRL()
frame = Frame.load('test_frame.json')  # same filename passed to train_frame_wrapper in the training cell
srl.addFrame(frame)
srl.parse('Where is the pool')

Loading the en model

    [93mInfo about spaCy[0m

    spaCy version      2.0.11         
    Location           /home/ubuntu/miniconda3/envs/dev_framers/lib/python3.6/site-packages/spacy
    Platform           Linux-4.4.0-1049-aws-x86_64-with-debian-stretch-sid
    Python version     3.6.5          
    Models             en_core_web_lg, en, fr



[]

In [12]:
# Same query as in the previous cell, run again on the existing srl instance.
srl.parse('Where is the pool')

[]

In [None]:
# Evaluate against the gold examples. gold_file is the second value returned by dropRepeats earlier
# in this notebook; the name used here before, gold_filename, is never defined and would raise NameError.
# NOTE(review): frame_filename is the training-info file ('frame_training_info.json'), while the trained
# frame was saved to 'test_frame.json'. Confirm which of the two evalFrame expects.
evalFrame(frame_filename, gold_file)