# Imports

In [1]:
#imports
import sys
import pandas as pd
import random
import os
import json
import pprint

from frameit.corpus import Corpus
from frameit.utils import *
from frameit.drop_gold_from_train import dropRepeats
from frameit.EvalAFrame import evalFrame

pp = pprint.PrettyPrinter(indent=4)
os.environ["KERAS_BACKEND"] = "tensorflow"

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


# Loading the corpus

In [6]:
# corpus_file = "your_corpus.csv"
corpus_file = "../../../conciergebot-api/data/ty_data/english_corpus/qa_questions_en.csv"

In [None]:
#If you have a gold set of positive and negative examples in an XML file, you can drop those examples from
#the training data with the following code. If your positive and negative gold examples are in the same file, you can
#pass that file to both parameters–positive examples in the negative file and negative examples in the positive file
#will simply be ignored
positive_example_file = 'gold_positive.xml'
negative_example_file = 'gold_negative.xml'
corpus_file, gold_file = dropRepeats(corpus_file, positive_example_file, negative_example_file, 100)

In [7]:
#Corpus data should have one sentence per line in a column titled "text". Any other columns will be ignored
#When loading a new corpus for the first time, set build_index to True to create indices necessary to process the data.
#Otherwise, this step can be safely skipped to significantly speed up runtime by setting build_index to False
corpus = Corpus(corpus_file, build_index=False)

init Corpus
Parsing the Semafor data... 
Parsing the DeepSRL data... 
Creating Utterances...


  0%|          | 0/31740 [00:00<?, ?it/s]

Loading the en model

    [93mInfo about spaCy[0m

    spaCy version      2.0.12         
    Location           /Users/jengelwork/miniconda3/lib/python3.6/site-packages/spacy
    Platform           Darwin-17.7.0-x86_64-i386-64bit
    Python version     3.6.1          
    Models             en_core_web_md, en_core_web_lg, fr, en, fr_core_news_sm


    [93mInfo about spaCy[0m

    spaCy version      2.0.12         
    Location           /Users/jengelwork/miniconda3/lib/python3.6/site-packages/spacy
    Platform           Darwin-17.7.0-x86_64-i386-64bit
    Python version     3.6.1          
    Models             en_core_web_md, en_core_web_lg, fr, en, fr_core_news_sm



100%|██████████| 31740/31740 [00:11<00:00, 2747.95it/s]


Loading indices...
Loading lemma indices...


# Constructing a positive set for training

## A starting point for the positive set

In [8]:
positive_strings = ['example', 'strings', 'that would be in the', 'positive', 'sentences', 'for', 'the intent',
                   'that you want', 'to extract']
# positive_strings = ['open', 'close', 'when', 'hours', 'late', 'early']
positive_utterances = build_positive_set(corpus, positive_strings)
#Note: for exact matches of the strings, use the above function call to build_positive_set(). 
#To also include matches of all tenses and plural/singular forms of all words in the string, add_lemmas_to_set()
lemma_strings = ['run', 'dance']
positive_utterances = add_lemmas_to_set(corpus, lemma_strings, existing_set=positive_utterances)

There are 4440 relevant messages in the corpus
There are 4500 relevant messages in the corpus


## Optional: expand using hypernyms

In [9]:
#A hypernym h of a word w is a more generic term that includes w as part of its semantic field. 
#For example, "bird" is a hypernym of "pigeon", "eagle", "falcon", etc. "Animal" is a hypernym of "bird".

#Expanding with hypernyms may not always be appropriate. You may also want to use a different set of terms than 
#the full list of positive_strings defined earlier

positive_utterances = expand_with_hypernym(positive_utterances, positive_strings, corpus)

when
late
early
Number of strings for which no hypernyms were found  3
go
last
open
close
There are 4891 relevant messages in the corpus


## Sample sentences to check positive set accuracy

In [10]:
for a in random.sample(positive_utterances, 20):
    print(a.text)
    print()

Will there still be valet space and open front desk?

Can we check in early tomorrow?

Is it possible to check in early?

Could I move when I return around 6:30pm?

confirm I am scheduled for late checkout?

do early check in at 1 pm for those rooms?

Can we leave our car in the garage and pick it up in a few hours?

verify when a crib will be able to be brought up?

Is it possible for me to check out late on this coming Sunday?

Is there any way we can get a late checkout?

I have a reservation today..is it possible to check in early?

Can I get a late check out, please?

Is there a chance we can have a late checkout tomorrow?

Do you think we would be able to have early check in?

is late check out available for tomorrow?

can I do a late checkout?

do I do anything special to the apartment or leave the fob and go?

is it possible to get a late checkout tomorrow?

Is there any availability on early check ins?

this is Mr. Hunter in room 1250. tell me the wifi login information and wh

## Trimming the positive set

In [11]:
#To remove bad examples from the positive set. Also creates a negative set that can optionally be used
remove_list = ['strings', 'that occur', 'in the positive set', 'that correspond', 'to examples',
               'that are not positive']
positive_utterances, negative_set = trim_examples(positive_utterances, remove_list)

There are 4891 relevant messages in the corpus


# Save the data for training

## Specify hyperparameters for training

If you like, you can customize hyperparameters for the training function. Otherwise, the function will be run with default values

In [12]:
scale_to=700
epochs=40
batch_size=1400
reg_param=0.02

In [13]:
# Give the frame a name and save it to a file
frame_filename = 'frame_training_info.json'
frame_name = "Your Frame Name"
save_frame_training_info_to_file(frame_name, corpus_file, positive_utterances, negative_set,
                                scale_to, epochs, batch_size, reg_param, frame_filename)

Saved info with filename frame_training_info.json.


Stop here and use *Generic lambda_rule attribute exploration.ipynb* and/or *Generic machine-learning attribute exploration.ipynb* if you would like to train attributes for entity-extraction to be used with this frame. When you've collected the necessary data for attributes that you want to train, run the following cell to train a frame, adjusting passed parameters as necessary to incorporate your attribute data

In [14]:
from frameit.train_from_exploration import train_frame_wrapper
train_frame_wrapper("test_frame.json", frame_file=frame_filename, ml_attr_files=["attr1.json"], 
                    lambda_attr_files=["attr2.json"])

init Corpus
Parsing the Semafor data... 
Parsing the DeepSRL data... 
Creating Utterances...


100%|██████████| 31740/31740 [00:00<00:00, 33162.30it/s] 


Loading indices...
Loading lemma indices...
Importing machine learning attributes
Training  Attribute 1
Importing lambda_rule attributes
Training  Proper Noun Attribute
Rebuilding frame
Train on 8804 samples, validate on 978 samples
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40
Saving frame to file
Done!


In [15]:
#for testing
from frameit import SRL, Frame
srl = SRL()
frame = Frame.load('test_frame.json')
srl.addFrame(frame)
srl.parse('Where is the pool')

Loading the en model

    [93mInfo about spaCy[0m

    spaCy version      2.0.12         
    Location           /Users/jengelwork/miniconda3/lib/python3.6/site-packages/spacy
    Platform           Darwin-17.7.0-x86_64-i386-64bit
    Python version     3.6.1          
    Models             en_core_web_md, en_core_web_lg, fr, en, fr_core_news_sm


    [93mInfo about spaCy[0m

    spaCy version      2.0.12         
    Location           /Users/jengelwork/miniconda3/lib/python3.6/site-packages/spacy
    Platform           Darwin-17.7.0-x86_64-i386-64bit
    Python version     3.6.1          
    Models             en_core_web_md, en_core_web_lg, fr, en, fr_core_news_sm



[]

In [16]:
srl.parse('Where is the pool')

[]

In [17]:
evalFrame(frame_filename, gold_filename)

NameError: name 'gold_filename' is not defined