# Imports

In [None]:
#imports
import sys
import pandas as pd
import random
import os
import json
import pprint

# Select the Keras backend before the frameit imports below. Keras reads
# KERAS_BACKEND when it is first imported, so setting it after an import that
# loads Keras would have no effect. Setting it here is safe either way.
os.environ["KERAS_BACKEND"] = "tensorflow"

from frameit.corpus import Corpus
from frameit.utils import *
from frameit.drop_gold_from_train import dropGold
from frameit.EvalAFrame import evalFrame

# Shared pretty-printer configured with a 4-space indent.
pp = pprint.PrettyPrinter(indent=4)

# Introduction

These notebooks are set up to train a frame that can detect sentences about meals. Feel free to replace the default data files and modify the code as needed to adapt these notebooks to your purposes.

# Loading the corpus

Your corpus should be in a .csv file, and the text to be used as training data should be in a column titled "text", with each data point on a separate line. If you are planning on using a gold set, you should also have an "Index" column with an ID number for each row.

In [None]:
# Path to the training corpus (.csv). Later cells read this variable when
# dropping the gold set and when building the Corpus object.
corpus_file = "../resources/happy_moment_corpus_small.csv"

# Optional: set up a gold set and drop it from the training file

If you have a gold set of positive and negative examples in an XML file, you can drop those examples from
the training data with the following code.

See the instructions in docs/evaluation.rst for more information on formatting data for the evaluation script.

In [None]:
# Gold examples (XML). If your positive and negative gold examples live in the
# same file, pass that file for both parameters: positive examples in the
# negative file and negative examples in the positive file are simply ignored.
positive_example_file = '../resources/meal_gold_set.xml'
negative_example_file = '../resources/meal_gold_set.xml'

# Note: for the default data set, which is abnormally small, we use a sample
# size of 5. For your own purposes, we recommend using at least 100 examples.
# NOTE(review): no sample size is passed in the call below -- confirm where
# that value is actually configured.
#
# The call rebinds corpus_file and defines gold_file; both are used by later cells.
corpus_file, gold_file = dropGold(
    corpus_file,
    positive_example_file,
    negative_example_file,
    prefix="../resources/",
)

## Build the corpus

In [None]:
# Build the Corpus object from the CSV file named by corpus_file.
#
# Corpus data should have one sentence per line in a column titled "text". Additionally, there should be a column titled
# "index" containing the row number of the datapoint (row numbers do not need to be accurate for the .csv file, but
# they do need to be unique.)
# NOTE(review): the "Loading the corpus" text calls this column "Index" while this comment says "index" --
# confirm which capitalization the loader expects.
#
# When loading a new corpus for the first time, set build_index to True to create indices necessary to process the data.
# Otherwise, this step can be safely skipped to significantly speed up runtime by setting build_index to False.
corpus = Corpus(corpus_file, build_index=True)

# Constructing a positive set for training

## A starting point for the positive set

In [None]:
# Start with an empty negative set; the optional trimming cell later may replace it.
negative_set = set()

# Template for your own data:
# positive_strings = ['example', 'strings', 'that would be in the', 'positive', 'sentences', 'for', 'the intent',
#                    'that you want', 'to extract']
positive_strings = ['breakfast', 'brunch', 'lunch', 'dinner']

# Note: build_positive_set() is for exact matches of the strings.
positive_utterances = build_positive_set(corpus, positive_strings)

# To also include matches of all tenses and plural/singular forms of all words
# in a string, use add_lemmas_to_set(); here it extends the set built above.
lemma_strings = ['restaurant', 'cafe']
positive_utterances = add_lemmas_to_set(
    corpus,
    lemma_strings,
    existing_set=positive_utterances,
)

## Optional: expand using hypernyms

A hypernym h of a word w is a more generic term that includes w as part of its semantic field. 
For example, "bird" is a hypernym of "pigeon", "eagle", "falcon", etc. "Animal" is a hypernym of "bird".

Expanding with hypernyms may not always be appropriate. You may also want to use a different set of terms than
the full list of positive_strings defined earlier.



In [None]:
# Optional step: expand the positive set using hypernyms, based on the terms in
# positive_strings. The result rebinds positive_utterances.
positive_utterances = expand_with_hypernym(positive_utterances, positive_strings, corpus)

## Sample sentences to check positive set accuracy

In [None]:
# Print up to 20 randomly chosen positive utterances for a manual spot check.
# random.sample() requires a sequence (sets are rejected on Python 3.11+) and
# raises ValueError when asked for more items than the population holds, so
# copy to a list and cap the sample size at the number of available items.
sample_pool = list(positive_utterances)
for utterance in random.sample(sample_pool, min(20, len(sample_pool))):
    print(utterance.text)
    print()

## Trimming the positive set

In [None]:
# Removes bad examples from the positive set and also creates a negative set
# that can optionally be used.
# Note: there may not be any bad examples to trim, in which case you should
# skip this cell. The entries below are placeholders -- replace them with
# strings that occur in the positive set but correspond to non-positive examples.
remove_list = [
    'strings',
    'that occur',
    'in the positive set',
    'that correspond',
    'to examples',
    'that are not positive',
]
positive_utterances, negative_set = trim_examples(positive_utterances, remove_list)

# Save the data for training

## Specify hyperparameters for training

If you like, you can customize hyperparameters for the training function. Otherwise, the function will be run with default values.

In [None]:
# Training hyperparameters. All four values are passed to
# save_frame_training_info_to_file() in the next cell.
scale_to = 700
epochs = 40
batch_size = 1400
reg_param = 0.02

In [None]:
# Name the frame and choose the file its training info is saved to.
frame_name = "Your Frame Name"
frame_info_filename = 'frame_training_info.json'

# gold_file is only defined by the optional gold-set cell earlier in this
# notebook; if you skipped that cell, define gold_file before running this one.
# NOTE(review): confirm what value the function expects when there is no gold set.
save_frame_training_info_to_file(
    frame_name,
    corpus_file,
    positive_utterances,
    negative_set,
    scale_to,
    epochs,
    batch_size,
    reg_param,
    frame_info_filename,
    gold_file,
)

Note that running the above code saves your training data set as an XML file titled "resources/Your Frame Name_interim_data.xml" (if you change the frame_name variable, it will use whatever string you've set there instead of "Your Frame Name"). If you like, you can edit the data set by hand in that file; the file will be used in the Train frame notebook to train your frame.

Stop here and use *Generic lambda_rule attribute exploration.ipynb* and/or *Generic machine-learning attribute exploration.ipynb* if you would like to train attributes for entity extraction to be used with this frame. When you've collected the necessary data for the attributes that you want to train, proceed to the *Train frame* notebook.