# Imports

In [None]:
#imports
import sys
import pandas as pd
import random
import os
import json
import pprint

from frameit.corpus import Corpus
from frameit.utils import *
from frameit.utterance import Utterance

# Pretty-printer configured with a 4-space indent.
pp = pprint.PrettyPrinter(indent=4)

# Select TensorFlow as the Keras backend through the environment.
# NOTE(review): this runs after the frameit imports; if those import Keras at
# import time the variable may be read too late -- confirm.
os.environ.update({"KERAS_BACKEND": "tensorflow"})

# Loading the positive set of a frame

You should train your attributes on the same dataset as the frame that they will be attached to. Note that this dataset consists of utterances, each of which contains word-embedding information for an entire sentence. The attribute itself will be trained on the individual tokens (representing words) in those sentences.

For optimal results, make sure that you have a minimum of 100 positive examples for training your attribute. Depending on how frequently your desired attribute occurs in the dataset, and how unique its grammatical position is compared to other tokens, you may need more examples for desirable results. If the attribute cannot be effectively trained using your available data, we recommend that you attempt to extract it using lambda_rule heuristics. A tutorial for those is available in another notebook.

In [None]:
# File passed to load_frame_pos_set below.
# NOTE(review): presumably the training-info JSON saved for the frame these
# attributes attach to -- confirm the path before running.
filename = 'frame_training_info.json'
# The loaded utterances are the input to the candidate-extraction cells below.
positive_utterances = load_frame_pos_set(filename)

# Define attributes

In [None]:
# Describe the attribute to extract via its part-of-speech and dependency tags.
# More detailed extraction constraints can be supplied in later steps.
#   name            -- str; the attribute's name, used to identify it.
#   linguistic_info -- dict with keys POS, DEP and lemma. POS and DEP map to
#                      lists of spaCy part-of-speech / dependency tags, lemma
#                      maps to a list of strings. When values are given for a
#                      key, the model only extracts attributes matching them.
#   examples        -- list; positive training examples of the attribute.
#                      Left empty here and populated later.
#   unique          -- bool; if True, only one attribute is extracted per
#                      sentence with this model.
attr1 = dict(
    name="Food",
    linguistic_info=dict(POS=["NOUN"], DEP=["NSUBJ"]),
    examples=[],
    unique=True,
)

## Method 1 of extracting attributes: dependency trees

You may find it useful to experiment with spaCy parses at https://explosion.ai/demos/displacy in order to figure out which dependency constraints to set.

In [None]:
# Constraints on the parents of candidate strings: part-of-speech,
# dependency and lemma features.
# NOTE(review): several "lemma" entries are inflected forms ("had", "made",
# "ate", ...); whether they can match depends on how get_attribute_candidates
# compares lemmas -- confirm.
dep = [
    {
        "pos": ["verb"],
        "lemma": ["had", "made", "ate", "eat", "make", "prepared", "cooked"],
    },
]

# Constraints on the candidate strings themselves.
cand = [
    {
        "pos": ["noun"],
        "dep": ["pobj", "dobj", "ccomp", "nsubj"],
    },
]

# Either "parent" or "child": selects the token that the `dep` constraints
# are applied to.
dep_type = "parent"

attr1_candidates = get_attribute_candidates(
    positive_utterances,
    dep_type,
    dep,
    cand,
)

In [None]:
# Report how many candidates were extracted and print a random preview.
print(len(attr1_candidates))
# random.sample needs a sequence (sets raise TypeError on Python 3.11+) and
# raises ValueError when asked for more items than exist, so copy to a list
# and cap the sample size at the number of candidates.
for e in random.sample(list(attr1_candidates), min(20, len(attr1_candidates))):
    print(e.text)

## Method 2 of extracting attributes: list matches

In principle, you can implement any heuristic you want to extract positive training data for attributes. Here is a simple example of a heuristic that applies when you know that certain terms should always be extracted.

In [None]:
def simple_list_heuristic(doc, target_strings=None):
    """Collect the tokens of ``doc`` that match a list of target terms.

    Every token is compared against the targets, and so is every two-token
    window ``doc[i:i+2]`` (by its ``.text``) so that two-word terms can match.
    Both comparisons are case-insensitive.

    Args:
        doc: A parsed sentence. It must support ``len()``, iteration over
            hashable tokens that have a ``.text`` attribute, and slicing that
            returns an iterable span with a ``.text`` attribute
            (presumably a spaCy ``Doc`` -- confirm against callers).
        target_strings: Optional iterable of strings to extract as
            attributes. Defaults to a built-in list of food and meal words.

    Returns:
        list: The matching tokens without duplicates, in arbitrary order.
    """
    if target_strings is None:
        target_strings = ["hamburger", "breakfast", "lunch", "dinner", "brunch", "tea", "coffee", "pizza"]
    # Normalise once so both checks below share the same case-insensitive,
    # constant-time lookup.
    targets = {term.lower() for term in target_strings}

    cands = set()
    for i, token in enumerate(doc):
        if token.text.lower() in targets:
            cands.add(token)
        # Two-token window starting at this token; at the final position the
        # slice holds just one token.
        span = doc[i:i + 2]
        if span.text.lower() in targets:
            cands.update(span)
    return list(cands)

In [None]:
# Run the list heuristic over every positive utterance and fold whatever it
# finds into the existing candidate collection.
for sent in positive_utterances:
    list_extraction_result = simple_list_heuristic(sent.spacy)
    if not list_extraction_result:
        # Nothing matched in this utterance.
        continue
    attr1_candidates.update(list_extraction_result)


In [None]:
# Report the candidate count after adding list matches and print a random preview.
print(len(attr1_candidates))
# random.sample needs a sequence (sets raise TypeError on Python 3.11+) and
# raises ValueError when asked for more items than exist, so copy to a list
# and cap the sample size at the number of candidates.
for e in random.sample(list(attr1_candidates), min(20, len(attr1_candidates))):
    print(e.text)

## Trimming the attribute positive example set

In [None]:
# Strings that ended up in the candidate set during the previous steps but
# should not be extracted as this attribute. Replace with your own list.
del_list = [
    "husband",
    "wife",
    "brother",
    "sister",
    "son",
    "daughter",
]
attr1_candidates = remove_attribute_examples(
    attr1_candidates,
    del_list,
)

In [None]:
# Report the candidate count after trimming and print a random preview.
print(len(attr1_candidates))
# random.sample needs a sequence (sets raise TypeError on Python 3.11+) and
# raises ValueError when asked for more items than exist, so copy to a list
# and cap the sample size at the number of candidates.
for e in random.sample(list(attr1_candidates), min(20, len(attr1_candidates))):
    print(e.text)

# Save the attribute data for training

In [None]:
# Output file for this attribute's training data.
# Note: this rebinds the `filename` variable used earlier for the frame file.
filename = 'attr1.json'
# Hand the attribute definition and its positive example candidates to the
# project helper that saves them for training.
save_ml_attr_data_to_file(attr1, attr1_candidates, filename)