## FrameNet API in nltk

In [None]:
import nltk
'''
    nltk.download('all')
o in alternativa
    nltk.download('framenet')
'''
from nltk.corpus import framenet as fn
from nltk.corpus.reader.framenet import PrettyList

from operator import itemgetter
from pprint import pprint
import numpy as np  

from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet as wn
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from pprint import pprint

#### Credits

- Collin F. Baker, Nathan Schneider, Miriam R. L. Petruck, and Michael Ellsworth. Tutorial *Getting the Roles Right: Using FrameNet in NLP* tenuto presso la North American Chapter of the Association for Computational Linguistics - Human Language Technology (NAACL HLT 2015), 
    - http://naacl.org/naacl-hlt-2015/tutorial-framenet.html 
- documentazione NLTK, 
    - https://www.nltk.org/api/nltk.corpus.reader.html 

Documentazione all'URL [https://www.nltk.org/api/nltk.corpus.reader.html](https://www.nltk.org/api/nltk.corpus.reader.html)

- nltk.corpus.reader.framenet module, Corpus reader for the FrameNet 1.7 lexicon and corpus.

#### API Entry Points 
```
    frames([nameRegex])
    frame(exactName)
    frames_by_lemma(lemmaRegex)

    lus([nameRegex])
    fes([nameRegex])

    semtypes()
    propagate_semtypes()

    frame_relations([frame, [frame2,]] [type]) frame_relation_types()
    fe_relations()
```


### Pretty{List,Dict}
```
    >>> fn.frames('noise')
    [<frame ID=801 name=Cause_to_make_noise>, <frame ID=60 name=Motion_noise>, ...]
    >>> type(fn.frames('noise'))
    <class 'nltk.corpus.reader.framenet.PrettyList'>
```

PrettyList does 2 things: 
- limits the number of elements shown, and suppresses printing of their full details
    - Otherwise, it is just a list
- Similarly, PrettyDict suppresses printing of its values' details 

In [None]:
print(fn.frames(r'(?i)medical'))

In [None]:
print(fn.frames('Medical_specialties'))

In [None]:
f = fn.frame(256)
f.name

In [None]:
f.definition

In [None]:
def print_sep():
    print('\n_________________________________________________________\n\n')

f = fn.frame_by_name('Medical_specialties')
print(f)
print_sep()
print(fn.frame_by_name('Perception'))
print_sep()
print(fn.frame_by_name('Complaining'))

## Struttura interna del frame

The dict that is returned from the `frame` function will contain the
        following information about the Frame:

        - 'name'       : the name of the Frame (e.g. 'Birth', 'Apply_heat', etc.)
        - 'definition' : textual definition of the Frame
        - 'ID'         : the internal ID number of the Frame
        - 'semTypes'   : a list of semantic types for this frame
           - Each item in the list is a dict containing the following keys:
              - 'name' : can be used with the semtype() function
              - 'ID'   : can be used with the semtype() function

        - 'lexUnit'    : a dict containing all of the LUs for this frame.
                         The keys in this dict are the names of the LUs and
                         the value for each key is itself a dict containing
                         info about the LU (see the lu() function for more info.)

        - 'FE' : a dict containing the Frame Elements that are part of this frame
                 The keys in this dict are the names of the FEs (e.g. 'Body_system')
                 and the values are dicts containing the following keys
              - 'definition' : The definition of the FE
              - 'name'       : The name of the FE e.g. 'Body_system'
              - 'ID'         : The id number
              - '_type'      : 'fe'
              - 'abbrev'     : Abbreviation e.g. 'bod'
              - 'coreType'   : one of "Core", "Peripheral", or "Extra-Thematic"
              - 'semType'    : if not None, a dict with the following two keys:
                 - 'name' : name of the semantic type. can be used with
                            the semtype() function
                 - 'ID'   : id number of the semantic type. can be used with
                            the semtype() function
              - 'requiresFE' : if not None, a dict with the following two keys:
                 - 'name' : the name of another FE in this frame
                 - 'ID'   : the id of the other FE in this frame
              - 'excludesFE' : if not None, a dict with the following two keys:
                 - 'name' : the name of another FE in this frame
                 - 'ID'   : the id of the other FE in this frame

        - 'frameRelation'      : a list of objects describing frame relations
        - 'FEcoreSets'  : a list of Frame Element core sets for this frame
           - Each item in the list is a list of FE objects

        :param fn_fid_or_fname: The Framenet name or id number of the frame
        :type fn_fid_or_fname: int or str
        :param ignorekeys: The keys to ignore. These keys will not be
            included in the output. (optional)
        :type ignorekeys: list(str)
        :return: Information about a frame
        :rtype: dict



In [None]:
print(f)
print_sep()
# print(len(f.lexUnit))
# print(sorted([x for x in f.lexUnit]))
print_sep()
print(sorted([x for x in f.FE]))
print_sep()
print(f.frameRelations)
print_sep()

You can also search for Frames by their Lexical Units (LUs). The **frames_by_lemma()** function returns a list of all frames that contain LUs in which the 'name' attribute of the LU matches the given regular expression. Note that LU names are composed of "lemma.POS", where the "lemma" part can be made up of either a single lexeme (e.g. 'run') or multiple lexemes (e.g. 'a little') (see below).

In [None]:
print(fn.frames_by_lemma(r'(?i)epidemiol'))
print(fn.frames_by_lemma(r'(?i)accident'))

In [None]:
frame_list = PrettyList(fn.frames(r'(?i)crim'), maxReprSize=0, breakLines=True)
frame_list.sort(key=itemgetter('ID'))

for f in frame_list:
    print('======================\nNAME: ' + str(f.name))
    print('======================\nDEF:  ' + str(f.definition))
    print('======================\nFEs:  ' + str(f.FE))
#     print('======================\nLUs:  ' + str(f.lexUnit))

In [None]:
""" Also see the ``frame()`` function for details about what is
    contained in the dict that is returned.
"""

f = fn.frame_by_id(256)

print('NAME: {}[{}]\tDEF: {}'.format(f.name, f.ID, f.definition))

print('\n____ FEs ____')
FEs = f.FE.keys()
for fe in FEs:
    fed = f.FE[fe]
    print('\tFE: {}\tDEF: {}'.format(fe, fed.definition))
    # print(fed.definition)
    
print('\n____ LUs ____')
LUs = f.lexUnit.keys()
for lu in LUs:
    print(lu)

#    print('\tFE-DEF: ' + fe.definition)

### Lexical Units

A lexical unit (LU) is a pairing of a word with a meaning. For example, the "Apply_heat" Frame describes a common situation involving a Cook, some Food, and a Heating Instrument, and is _evoked_ by words such as bake, blanch, boil, broil, brown, simmer, steam, etc. These frame-evoking words are the LUs in the Apply_heat frame. Each sense of a polysemous word is a different LU.

We have used the word "word" in talking about LUs. The reality is actually rather complex. When we say that the word "bake" is polysemous, we mean that the lemma "bake.v" (which has the word-forms "bake", "bakes", "baked", and "baking") is linked to three different frames:

- Apply_heat: "Michelle baked the potatoes for 45 minutes."
- Cooking_creation: "Michelle baked her mother a cake for her birthday."
- Absorb_heat: "The potatoes have to bake for more than 30 minutes."

These constitute three different LUs, with different definitions.

Framenet provides multiple annotated examples of each sense of a word (i.e. each LU). Moreover, the set of examples (approximately 20 per LU) illustrates all of the combinatorial possibilities of the lexical unit.

Each LU is linked to a Frame, and hence to the other words which evoke that Frame. This makes the FrameNet database similar to a thesaurus, grouping together semantically similar words.

In the simplest case, frame-evoking words are verbs such as "fried" in:

"Matilde fried the catfish in a heavy iron skillet."
Sometimes event nouns may evoke a Frame. For example, "reduction" evokes "Cause_change_of_scalar_position" in:

"...the reduction of debt levels to $665 million from $2.6 billion."
Adjectives may also evoke a Frame. For example, "asleep" may evoke the "Sleep" frame as in:

"They were asleep for hours."

Many common nouns, such as artifacts like "hat" or "tower", typically serve as dependents rather than clearly evoking their own frames.

Details for a specific lexical unit can be obtained using this class's lus() function, which takes an optional regular expression pattern that will be matched against the name of the lexical unit:

In [None]:
print(fn.lus(r'(?i)a little'))
print(fn.lus(r'foresee'))

print(fn.frames_by_lemma(r'(?i)little'))


quante LUs sono presenti in FN??

In [None]:
print(len(fn.lus()))

consideriamo la LU di `foresee.v`

In [None]:
print(fn.lu(256).frame.name)
print(fn.lu(256).definition)
print(fn.lu(256).lexemes[0].name)

---

### Vendetta!

Immaginiamo di accedere al frame 'Revenge'. Prima visualizziamo tutto il suo contenuto, e poi accediamo a Frame Elements (FEs) e Lexical Units (LUs).



In [None]:
f = fn.frame('Revenge')

print(f)

In [None]:
print(f.FE)

è possibile inoltre accedere selettivamente alla definzione associata a un certo FE

In [None]:
f.FE['Injury'].definition

elenco delle LUs del frame

In [None]:
f.lexUnit.keys()

selezione di tutti i frame che hanno un FE che ha a che fare con 'location':

In [None]:
fn.fes('location')

{fe.name for fe in fn.fes("location")}

e per ciascuno dei FEs che ha a che fare con 'location' possiamo risalire al relativo Frame:

In [None]:
for fe in fn.fes("location"):
    print(fe.frame.name + '.' + fe.name)

### Frame relations

Elenco delle possibili relazioni fra frame

In [None]:
import nltk
import re
import sys

def get_fn_relations(fn_rel_list):
    frame_rels = []

    for f in fn_rel_list:
        text = str(f)
        try:
            found = re.search('.*-- (.+?) ->.*', text).group(1)
            # print(found)
            frame_rels.append(found)
        except AttributeError:
            print('the expression \n\t{}\n does not contain the searched pattern'.format(f))
            sys.exit(1)
    
    # rels_set = set(frame_rels)
    # print(rels_set)
    return set(frame_rels)

fn_rels = get_fn_relations(fn.frame_relations())
for fr in fn_rels:
    print('\t' + fr)

Possibile utilizzo: che cosa viene causato da 'Make_noise'?

In [None]:
fn.frame_relations(frame='Make_noise', type='Causative_of')

e più in generale, con quali altri frame è in relazione '`Make_noise`'?

In [None]:
rels = fn.frame_relations(frame='Make_noise')
for rel in rels:
    print(rel)


Accesso alle **annotazioni**

In [None]:
input_term = 'revenge'
count = 0

while count < 10:
    print(fn.exemplars(input_term)[count].FE)
    # print(fn.exemplars(input_term)[count].POS)
    print(fn.exemplars(input_term)[count].annotationSet[0])
    count += 1
    print_sep()

---
### getFrameSetForStudent

Funzione per assegnare a ciascuno un insieme di frame.

In [None]:
import hashlib
import random
from random import randint
from random import seed

def print_frames_with_IDs():
    for x in fn.frames():
        print('{}\t{}'.format(x.ID, x.name))

def get_frams_IDs():
    return [f.ID for f in fn.frames()]   

def getFrameSetForStudent(surname, list_len=5):
    nof_frames = len(fn.frames())
    base_idx = (abs(int(hashlib.sha512(surname.encode('utf-8')).hexdigest(), 16)) % nof_frames)
    print('\nstudent: ' + surname)
    framenet_IDs = get_frams_IDs()
    i = 0
    offset = 0 
    seed(1)
    while i < list_len:
        fID = framenet_IDs[(base_idx+offset)%nof_frames]
        f = fn.frame(fID)
        fNAME = f.name
        print('\tID: {a:4d}\tframe: {framename}'.format(a=fID, framename=fNAME))
        offset = randint(0, nof_frames)
        i += 1        


getFrameSetForStudent('LuCIAnI')
getFrameSetForStudent('FanCELlu')


#### Metodi utili per fare pre-processing

In [None]:
def lemmatize_words(text):
    result = []
    lemmatizer = WordNetLemmatizer()
    for word in text:
        result.append(lemmatizer.lemmatize(word))
    return result

def remove_punctuation(s):
    return re.sub(r'[^\w\s]', '', s)

def remove_stop_words(row):
    stop_words = set(stopwords.words('english'))
    filtered_sentence = [w for w in row if not w.lower() in stop_words]
    return filtered_sentence

Id dei frame estratti 

In [71]:
ids = [2724, 244, 5, 1612, 1360, 1481, 2524, 2569, 1514, 1750]

#### Per ciascun Frame assegno un WN synset a:
    1. Frame Name
    2. Frame Elements (FEs)
    3. Lexical Units (LUs)

In [None]:
def get_synset_from_frames(index):
    frame_name = fn.frame_by_id(index).name
    return wn.synsets(frame_name.split('_')[0])

#### Estrapolazione del contesto dal synset
Obiettivo è creare una lista che contenga i termini rilevanti per creare un contesto del synset. I termini rilevanti sono all'interno di:
- definizione del synset;
- esempi del synset;
- lemmi;
- definizioni di iponimi e iperonimi.
- esempi di iponimi e iperonimi

In [None]:
def get_synset_ctx(synset):
    defs = remove_punctuation(synset.definition())
    exs = remove_punctuation(' '.join(synset.examples()))
    defs = remove_stop_words(lemmatize_words(defs.split()))
    exs = remove_stop_words(lemmatize_words(exs.split()))

    hypo = synset.hyponyms()
    hyper = synset.hypernyms()
    lemmas = synset.lemmas()
    
    lems = []
    for l in lemmas:
        lems = lems + l.name().split("_")

    for h in hypo:
        defs = defs + remove_stop_words(lemmatize_words(remove_punctuation(h.definition()).split()))
        exs = exs + remove_stop_words(lemmatize_words(remove_punctuation(' '.join(h.examples())).split()))
    
    for h in hyper:
        defs = defs + remove_stop_words(lemmatize_words(remove_punctuation(h.definition()).split()))
        exs = exs + remove_stop_words(lemmatize_words(remove_punctuation(' '.join(h.examples())).split()))

    return defs + exs + lems

#### Estrapolazione del contesto dal frame
Obiettivo è creare una lista che contenga i termini rilevanti per creare un contesto del frame. I termini rilevanti sono presi sia dalla definizione del frame sia dalla definizione dei FEs

In [None]:
def get_frame_ctx(id):
    ctx = []
    frame = fn.frame_by_id(id)
    ctx = ctx + frame.name.split(' ')
    defs = remove_punctuation(frame.definition)
    defs = remove_stop_words(defs.split(" "))
    ctx = ctx + lemmatize_words(defs)
    
    for fe in frame.FE:
        fe_defs = remove_punctuation(frame.FE[fe].definition)
        fe_defs = remove_stop_words(fe_defs.split(" "))
        ctx = ctx + lemmatize_words(fe_defs)
    return ctx

#### Approccio Bag of Words

In [None]:
def bag_of_words(frame_context, synset_context):
    return len([word for word in frame_context if word in synset_context]) + 1

In [None]:
def get_evaluation(result_dict, annotation_dict):
    count_eval = 0
    len_total = 0
    for k in result_dict.keys():
        len_total += len(result_dict[k].keys())
        for el in result_dict[k].keys():
            if result_dict[k][el] == annotation_dict[k][el]:
                count_eval += 1

    return count_eval / len_total

#### Esecuzione

In [80]:
annotation_dict = {
    'Chemical_potency': {
        'Chemical_potency':'chemical.n.01',
        'Chemical_entity':'chemical.n.01',
        'Degree':'degree.n.02',
        'Time':'time.n.03',
        'Circumstances':'circumstance.n.01',
        'Place':'place.n.02',
        'strong.a':'strong.a.01',
        'potent.a':'potent.s.02',
        'stiff.a':'potent.a.03'
    },
    'Fullness': {
        'Fullness': 'fullness.n.03',
        'Container' : 'container.n.01',
        'Contents': 'content.n.01',
        'Degree': 'degree.n.01',
        'Time': 'time.n.03',
        'Frequency': 'frequency.n.01',
        'Duration': 'duration.n.01',
        'full.a': 'full.a.01',
        'empty.a': 'empty.a.01',
        'emptiness.n': 'emptiness.n.01',
        'fullness.n': 'fullness.n.03'
    },
    'Causation': {
        'Causation': 'causing.n.01',
        'Cause': 'cause.n.01',
        'Affected': 'affected.a.01',
        'Effect': 'consequence.n.01',
        'Place': 'place.n.02',
        'Time': 'time.n.03',
        'Actor': 'actor.n.02',
        'Circumstances': 'context.n.02',
        'Manner': 'manner.n.01',
        'Explanation': 'explanation.n.01',
        'Means': 'means.n.01',
        'Frequency': 'frequency.n.01',
        'Concessive': 'concessive.a.01',
        'cause.v':'cause.v.01',
        'cause.n':'cause.n.01',
        'make.v':'make.v.03',
        'lead (to).v':'lead.v.03',
        'reason.n':'cause.n.02',
        'send.v':'send.v.01',
        'bring about.v':'bring.v.03',
        'precipitate.v':'precipitate.v.01',
        'causative.a':'causative.a.1',
        'render.v':'render.v.01',
        'bring.v':'bring.v.02',
        'bring on.v':'bring.v.02',
        'induce.v':'induce.v.01',
        'wreak.v':'bring.v.03',
        'put.v':'put.v.02',
        'since.c': None,
        'because.c': None,
        'because of.prep': None,
        'raise.v':'raise.v.03',
        'result (in).v':'result.v.01'
    },
    'Disgraceful_situation': {
        'Disgraceful_situation': 'disgraceful.s.01',
        'State_of_affairs': 'state.n.02',
        'Protagonist': 'protagonist.n.02',
        'Degree': 'degree.n.01',
        'Explanation': 'explanation.n.01',
        'Judge': None,
        'disgraceful.a': 'disgraceful.s.01',
        'shameful.a': 'disgraceful.s.01'
    },
    'Obviousness': {
        'Obviousness': 'obviousness.n.01',
        'Phenomenon': 'phenomenon.n.01',
        'Attribute': 'property.n.04',
        'Degree': 'degree.n.01',
        'Time': 'time.n.03',
        'Circumstances': 'circumstance.n.01',
        'Perceiver': 'perceiver.n.01',
        'Evidence': 'evidence.n.02',
        'Group': 'group.n.01',
        'Location_of_protagonist': 'location.n.01',
        'Particular_iteration': 'particular.s.06',
        'Direction': 'direction.n.03',
        'obvious.a': 'obvious.a.01',
        'evident.a': 'apparent.s.01',
        'manifest.a': 'apparent.s.01',
        'visible.a': 'visible.a.01',
        'audible.a': 'audible.a.01',
        'unclear.a': 'unclear.a.02',
        'clear.a': 'clear.a.01',
        'clearly.adv': 'clearly.r.01',
        'obviously.adv': 'obviously.r.01',
        'clarity.n': 'clarity.n.01',
        'show.v': 'show.v.04',
        'show up.v': 'show.v.04',
        'stand out.v': None,
        'noticeable.a': 'noticeable.a.01'
    },
    'Infrastructure': {
        'Infrastructure':'infrastructure.n.02',
        'Activity':'activity.n.01',
        'Place':'topographic_point.n.01',
        'Possessor':'owner.n.02',
        'Resource':'resource.n.02',
        'User':'user.n.01',
        'Descriptor':'descriptor.n.02',
        'Infrastructure':'infrastructure.n.01',
        'infrastructure.n':'infrastructure.n.01',
        'base.n':'basis.n.02'
    },
    'Product_line': {
        'Product_line': 'merchandise.n.01',
        'Brand' : 'trade_name.n.01',
        'Collection': 'collection.n.01',
        'Products': 'merchandise.n.01',
        'Descriptor': 'descriptor.n.02',
        'Collection_name': 'collection.n.01',
        'Designer': 'couturier.n.01',
        'line.n': 'line.n.22',
        'collection.n': 'collection.n.01'
    },
    'Gusto': {
        'Gusto': 'gusto.n.01',
        'Person': 'person.n.01',
        'Degree': 'degree.n.01',
        'life.n': 'liveliness.n.02',
        'vim.n': 'energy.n.05',
        'spirit.n': 'spirit.n.03'
    },
    'Military': {
        'Military': 'military.n.01',
        'Force': 'force.n.04',
        'Possessor': 'owner.n.02',
        'Descriptor': None,
        'Members': 'member.n.04',
        'Domain': 'domain.n.02',
        'Goal': 'goal.n.01',
        'Period_of_existence': 'time_period.n.01',
        'military.n': 'military.n.01',
        'force.n': 'force.n.04',
        'navy.n': 'navy.n.01',
        'air force.n': None,
        'army.n': 'army.n.01',
        'naval.a': 'naval.a.01',
        'armed forces.n': None,
        'military.a': 'military.a.01',
        'military forces.n': 'military.n.01',
        'militia.n': 'militia.n.01',
        'national guard.n': None,
        'marines.n': 'marines.n.01',
        'coast guard.n': None
    },
    'Terrorism': {
        'Terrorism': 'terrorism.n.01',
        'Terrorist': 'terrorist.n.01',
        'Act': 'act.n.02',
        'Victim': 'victim.n.01',
        'Organization': 'organization.n.01',
        'Descriptor': 'descriptor.n.02',
        'Manner': 'manner.n.01',
        'Means': 'means.n.01',
        'Time': 'time.n.01',
        'Place': 'topographic_point.n.01',
        'Purpose': 'purpose.n.01',
        'Instrument': 'instrument.n.02',
        'terrorism.n': 'terrorism.n.01',
        'terrorist.n': 'terrorist.n.01',
        'ecoterrorism [environmentalism].n': 'ecoterrorism.n.01',
        'ecoterrorist [environmentalist].n': None,
        'bioterrorism.n': 'bioterrorism.n.01',
        'bioterrorist.n': None,
        'ecoterrorist.n': None,
        'ecoterrorism.n': 'ecoterrorism.n.01',
        'obviously.adv': 'obviously.r.01',
        'terror.n': 'terror.n.04'
    }
}

result_dict = {}
for id in ids:
    frame = fn.frame_by_id(id)
    result_dict[frame.name] = {}
    frame_context = get_frame_ctx(id)
    synsets = get_synset_from_frames(id)
    res_max = 0
    best_syn = None
    
    for syn in synsets:
        synset_context = get_synset_ctx(syn)
        res = bag_of_words(frame_context, synset_context)
        if res > res_max:
            res_max = res
            best_syn = syn
    if best_syn is not None:
        result_dict[frame.name][frame.name] = best_syn.name()
    else:
        result_dict[frame.name][frame.name] = None
    
    for fe in frame.FE:
        res_max = 0
        best_syn = None
        for syn in wn.synsets(fe.split("_")[0]):
            synset_context = get_synset_ctx(syn)
            res = bag_of_words(frame_context, synset_context)
            if res > res_max:
                res_max = res
                best_syn = syn
        if best_syn is not None:
            result_dict[frame.name][fe] = best_syn.name()
        else:
            result_dict[frame.name][fe] = None
    i = 0
    for lu in frame.lexUnit.keys():
        res_max = 0
        best_syn = None
        if i<20:
            for syn in wn.synsets(lu.split(".")[0].split(" ")[0]):
                synset_context = get_synset_ctx(syn)
                res = bag_of_words(frame_context, synset_context)
                if res > res_max:
                    res_max = res
                    best_syn = syn
            i += 1
            if best_syn is not None:        
                result_dict[frame.name][lu] = best_syn.name()
            else:
                result_dict[frame.name][lu] = None

print(f"La valutazione del sistema è del: {round(get_evaluation(result_dict, annotation_dict)*100, 2)}%")


La valutazione del sistema è del: 49.67%
