# Imports

In [None]:
#imports
import sys
import pandas as pd
import random
import os
import json
import pprint

from frameit.corpus import Corpus
from frameit.utils import *

# Shared pretty-printer configured with a 4-space indent.
pp = pprint.PrettyPrinter(indent=4)
# Select TensorFlow as the Keras backend via the environment.
# NOTE(review): Keras reads KERAS_BACKEND when it is first imported. If the frameit
# imports above already pull in Keras, this assignment happens too late to take
# effect and should be moved before them -- confirm.
os.environ["KERAS_BACKEND"] = "tensorflow"

# Loading the positive set of a frame

The positive set is needed later in this notebook to test the attribute-extraction function on real utterances.

In [None]:
# JSON file holding the frame's training information.
filename = 'frame_training_info.json'
# Load the frame's positive utterances from that file. load_frame_pos_set is not
# defined in this notebook -- presumably it comes from the frameit.utils star import.
positive_utterances = load_frame_pos_set(filename)

# Define attributes

In [None]:
# Specify the part-of-speech and dependency tags that correspond to the attribute you are trying to extract.
# You will be able to provide more detailed information to extract attributes later.
# name: str, the attribute's name, used to identify it.
# linguistic_info: dict whose keys are POS, DEP, and lemma. The values for POS and DEP are lists of spaCy
#     part-of-speech and dependency tags; the value for lemma is a list of strings. If values are passed
#     for a key, only attributes matching those values will be extracted by the model.
# unique: bool, if True only one attribute will be extracted per sentence using this model.
# In this example only POS is constrained (to PROPN); DEP and lemma are left unspecified.
proper_noun_attr = {"name": "Proper Noun Attribute", 
                    "linguistic_info": {"POS": ["PROPN"]}, 
                    "unique": False}

## Alternative method of attribute extraction: lambda rules

In [None]:
#lambda rules are functions that are added to an attribute and called when a sentence is passed to the frame for classification
#They can include any heuristic you want, but need to import any dependencies inside the function
#The function is passed to the attribute when calling its training function
#(this notebook saves the function to a json file from which the training script can load it)
#The following is an example heuristic that extracts proper nouns.
def get_prop_name(doc):
    """Example lambda rule: collect the tokens of every proper-noun candidate in *doc*.

    The import lives inside the function on purpose: lambda rules are saved and
    later reloaded by the training script, so they must bring their own
    dependencies with them.

    Args:
        doc: The parsed document to search (this notebook passes ``sent.spacy``;
            presumably a spaCy ``Doc`` -- confirm).

    Returns:
        A flat list with every element of every non-``None`` candidate span, in
        the order ``extract_candidates_by_parent`` produced them.
    """
    from frameit import TextProcessing

    parent_constraints = [{}]
    child_constraints = [{"pos": ["propn"]}]
    candidates = TextProcessing().extract_candidates_by_parent(
        doc, parent_constraints, child_constraints
    )
    # Each candidate is a 3-tuple; only its middle element (the span) is used,
    # and candidates without a span are skipped.
    return [
        token
        for _, span, _ in candidates
        if span is not None
        for token in span
    ]

You should test your lambda rule function on a few utterances from the corpus before saving it.

In [None]:
# Try the lambda rule on up to 10 utterances from the positive set.
# The count is capped at the number of utterances available so that a small
# positive set does not make pop() raise on an empty collection.
# NOTE: pop() takes each utterance out of positive_utterances (list/set semantics),
# so re-running this cell works through further utterances.
for _ in range(min(10, len(positive_utterances))):
    sent = positive_utterances.pop()
    print(sent.text)
    print(get_prop_name(sent.spacy))
    # Blank separator between utterances (the original printed the literal text "/n").
    print('\n')

# Save the frame and attribute data for training

In [None]:
# Output file for the attribute definition and its lambda rule; as noted earlier in
# this notebook, the training script loads the function from this JSON file.
# This rebinds `filename`, which previously pointed at the frame training info file.
filename = 'attr2.json'
# Save the attribute dict together with the lambda rule function to `filename`.
save_lambda_attr_data_to_file(proper_noun_attr, get_prop_name, filename)
