## Load data

Set up basic connection with MariaDB server

In [49]:
import getpass

user = "fnbrasil" # getpass.getuser()
password = getpass.getpass()

········


In [50]:
password

'OssracF1982'

In [51]:
from sqlalchemy import create_engine

conn_str = f'mysql+pymysql://{user}:{password}@localhost:3307/fnbr_db'
engine = create_engine(conn_str, pool_recycle=3600)

Load frames, frame elements, lexical units (in Portuguese and English) and frame relations from FrameNet Brasil's database into different DataFrames

In [52]:
import pandas as pd


frames = pd.read_sql('''
    select f.idFrame, e.name, e.description, tf.frameTop
    from view_frame f
    join entry e on e.entry = f.entry
    left join topframe tf on tf.frameBase = f.entry and tf.frameTop in ('frm_event', 'frm_entity', 'frm_attributes')
    where e.idLanguage = 2 and exists(
        select 1
        from view_relation r
        join domain d on d.`idEntity`  = r.idEntity2
        where r.idEntity1 = f.`idEntity` and d.entry ='dom_framenet' 
    );
''', engine).set_index("idFrame", drop=False)


fes = pd.read_sql('''
    select fe.idFrameElement, fe.typeEntry, e.name, e.description, fe.idFrame
    from view_frameelement fe
    join entry e on e.entry = fe.entry
    where idLanguage = 2;
''', engine).set_index("idFrameElement", drop=False)


lus_en = pd.read_sql('''
    select lu.idLU, lu.name, lu.senseDescription, lu.idFrame 
    from view_lu lu
    where lu.idLanguage = 2;
''', engine).set_index("idLU", drop=False)


lus_pt = pd.read_sql('''
    select lu.idLU, lu.name, lu.senseDescription, lu.idFrame 
    from view_lu lu
    where lu.idLanguage = 1;
''', engine).set_index("idLU", drop=False)


frame_relations = pd.read_sql('''
    select r.entry, f1.idFrame as 'idFrame1', f2.idFrame as 'idFrame2'
    from entityrelation e
    join relationtype r on r.idRelationType = e.idRelationType  
    join frame f1 on e.idEntity1 = f1.`idEntity` 
    join frame f2 on e.idEntity2 = f2.`idEntity`;
''', engine)

Load FrameNet+ from the local file system into a dataframe, then convert it into a **Dict[int, set[str]]**, where int is the numerical frame id and the set represents the LUs included by FrameNet+.

In [53]:
import os
import pandas as pd


fn_plus = pd.read_csv(os.path.join('FN+', 'all-data', 'frameindexLU'),
                     sep='\t', header=None, usecols=[2, 3], names=['frameName', 'luName'])

# Merge with frame DataFrame
fn_plus = fn_plus.merge(frames, how='left', left_on='frameName', right_on='name')[['idFrame', 'luName']]
# Remove those that were not found in the current database
fn_plus = fn_plus[~fn_plus['idFrame'].isnull()]

# Remove POS because we're not going to ask that from the AI assistants
fn_plus['idFrame'] = fn_plus['idFrame'].astype('int')
fn_plus['luName'] = fn_plus['luName'].apply(lambda l: l.split('.')[0])

fn_plus = fn_plus.groupby('idFrame')['luName'].apply(set).to_dict()

Build a similar dict but with the FrameNet Brasil LUs instead, to evaluate the other experiment.

In [None]:
fnbr = lus_pt.merge(frames.reset_index(drop=True), on="idFrame")[["name_x", "idFrame"]]
fnbr["name_x"] = fnbr["name_x"].apply(lambda l: l.lower().split('.')[0])

fnbr = fnbr.groupby("idFrame")["name_x"].apply(set).to_dict()

We then use the FrameNet+ dict to filter out frames from the main DataFrame. We want to make sure that we only consider frames that exist in FrameNet+ for the experiments.

In [None]:
frames = frames[frames['idFrame'].isin(fnp)]

Build FrameNet graph based on frame relations. This data will be used to find child frames when generating some prompts

In [54]:
import networkx as nx

inheritances = frame_relations[frame_relations["entry"] == "rel_inheritance"]

frame_network = nx.DiGraph()
frame_network.add_nodes_from(
    pd.concat([inheritances['idFrame1'], inheritances['idFrame2']]).unique())
frame_network.add_edges_from(
    [(j, i) for i, j in zip(inheritances['idFrame1'], inheritances['idFrame2'])])

## Preprocess

In [55]:
import re
import xml.etree.ElementTree as ET

def clean_def(text):
    """
    Cleans the definitions of frames and FEs of XML tags.
    A lot of times, XML tags are used to designate where the text refers to the actual definition
    or some example sentences.
    """
    text = re.sub(r'\<ex\>.*', '', text, flags=re.DOTALL)
    text = text.replace('<def-root>', '')
    text = text.replace('</def-root>', '')
    def_root = ET.fromstring('<def>' + text + '</def>')
    def_str = def_root.text if def_root.text is not None else ""
    for child in def_root:
        if child.tag == "ex":
            break
        if child.text is not None:
            def_str += child.text
        if child.tail is not None:
            def_str += child.tail

    return def_str.strip()

# Remove XML tags used to refer to frame elements in frame and FE definitions
frames["description"] = frames["description"].apply(clean_def)
fes["description"] = fes["description"].apply(clean_def)
# Remove redundant spaces and "#", sometimes used to refer to frame elements
frames["description"] = frames["description"].str.replace('\s+', r' ', regex=True)
frames["description"] = frames["description"].str.replace('#(\w+)', r'\1', regex=True)
fes["description"] = fes["description"].str.replace('\s+', r' ', regex=True)
fes["description"] = fes["description"].str.replace('#(\w+)', r'\1', regex=True)

# Stripping and removing "FN: " from LU sense descriptions
lus_en["senseDescription"] = lus_en["senseDescription"].str.replace('FN: ', '')
lus_en["senseDescription"] = lus_en["senseDescription"].str.strip()
lus_pt["senseDescription"] = lus_pt["senseDescription"].str.replace('FN: ', '')
lus_pt["senseDescription"] = lus_pt["senseDescription"].str.strip()
# Removing other symbols that we don't want on prompts (e.g. underscores instead of spaces in LU names)
lus_en["name"] = lus_en["name"].str.replace('_', ' ')
lus_en["name"] = lus_en["name"].str.replace('[', '(', regex=False)
lus_en["name"] = lus_en["name"].str.replace(']', ')', regex=False)
lus_pt["name"] = lus_pt["name"].str.replace('_', ' ')
lus_pt["name"] = lus_pt["name"].str.replace('[', '(', regex=False)
lus_pt["name"] = lus_pt["name"].str.replace(']', ')', regex=False)

# Strip frame names
frames['name'] = frames['name'].str.strip()

## Sample FrameNet

Here, we split frames into four different DataFrames based on their "type". Events, entities and attributes are already indicated in FrameNet Brasil's database. For artifacts we do it manually, following the inheritance tree of the Artifcat frame (id: 390)

In [56]:
def find_child(frame_id, network):
    """
    Recursively iterates over the frame network to find all
    child frames of a given frame and returns them as a set.
    """
    child = set()
    
    for pred in network.predecessors(frame_id):
        child.add(pred)
        child.update(find_child(pred, network))
    
    return child

In [58]:
frm_event = set(frames[frames['frameTop'] == 'frm_event']['idFrame'])
frm_entity = set(frames[frames['frameTop'] == 'frm_entity']['idFrame'])
frm_attribute = set(frames[frames['frameTop'] == 'frm_attributes']['idFrame'])

frm_artifact = find_child(390, frame_network)

Now we find the FE and LU count of each frame...

In [67]:
core_fes = fes[fes['typeEntry'].isin(['cty_core', 'cty_core-unexpressed'])]

fe_count = core_fes.groupby('idFrame').count()['idFrameElement']
lu_count = lus_en.groupby('idFrame').count()['idLU']

...and split them into different scenarios based on LU and FE counts. Here we have 4 scenarios based on the combination of low vs. high counts on FEs and LUs. Each frame type has its own splits, so we end up with 16 scenarios (of which 12 were reported in our Element)

In [77]:
def get_splits_by_scenario(frames, fe_count, lu_count):
    """
    This function takes a frame set and the global FE and
    LU counts and returns a 4-tuple containing lists of frame ids.
    Each list containts the frames that belong to that split/scenario.
    """
    fe_count = fe_count[fe_count.index.isin(frames)]
    lu_count = lu_count[lu_count.index.isin(frames)]
    
    lower_fe = fe_count[fe_count <= fe_count.quantile(.25)]
    upper_fe = fe_count[fe_count >= fe_count.quantile(.75)]
    
    lower_lu = lu_count[lu_count <= lu_count.quantile(.25)]
    upper_lu = lu_count[lu_count >= lu_count.quantile(.75)]
    
    return (
        lower_fe.index.intersection(lower_lu.index),
        lower_fe.index.intersection(upper_lu.index),
        upper_fe.index.intersection(lower_lu.index),
        upper_fe.index.intersection(upper_lu.index)
    )

event_splits = get_splits_by_scenario(frm_event, fe_count, lu_count)
entity_splits = get_splits_by_scenario(frm_entity, fe_count, lu_count)
attribute_splits = get_splits_by_scenario(frm_attribute, fe_count, lu_count)
artifact_splits = get_splits_by_scenario(frm_artifact, fe_count, lu_count)

## Generate prompts

These are the utility functions used by the prompt generators

In [93]:
import math
from random import random
from collections import OrderedDict
from networkx import all_neighbors

POS = {
    "v": "verb",
    "a": "adjective",
    "n": "noun",
    "adv": "adverb",
    "pron": "pronoun",
    "prep": "preposition",
    "idio": "idiomatic expression"
}

def get_prompt_data(idFrame):
    """
    Gets all the data required to build prompts for a given frame.
    """
    frm = frames.loc[idFrame]
    core_fes = fes[(fes["idFrame"] == idFrame) & (fes["typeEntry"] == "cty_core")]
    unexp_fes = fes[(fes["idFrame"] == idFrame) & (fes["typeEntry"] == "cty_core-unexpressed")]
    lus = lus_en[lus_en["idFrame"] == idFrame]
    
    core_fes = core_fes[["name", "description"]].to_dict('records')
    unexp_fes = unexp_fes[["name", "description"]].to_dict('records')
    lus = lus[["name", "senseDescription"]].to_dict('records')
    
    return {
        "id": frm["idFrame"],
        "name": frm["name"],
        "description": frm["description"],
        "core_fes": core_fes,
        "unexp_fes": unexp_fes,
        "lus": lus
    }


def fmt_entity(text):
    return text.replace('_', ' ')

def fmt_definition(text):
    return text.strip()[:-1] if text.strip()[-1] == '.' else text

def build_comma_string(arr):
    """
    Builds a string with commas and "and" of the array elements
    """
    if len(arr) == 1:
        return f'"{fmt_entity(arr[0])}"'
    
    if len(arr) == 2:
        return f'"{fmt_entity(arr[0])}" and "{fmt_entity(arr[-1])}"'
    
    other = ''.join([f', "{fmt_entity(w)}"' for w in arr[1:-1]])

    return f'"{fmt_entity(arr[0])}"{other} and "{fmt_entity(arr[-1])}"'


def count_pos(lus):
    """
    Counts the number of POS occurences in the LU set
    """
    counts = OrderedDict()
    
    for lu in lus:
        pos = lu[1]
        if pos in counts:
            counts[pos] += 1
        else:
            counts[pos] = 1
    
    return counts


def get_pos_names(counts):
    """
    Gets the POS names based on their counts, i.e.,
    turns them into plurals when necessary
    """
    pos_dict = POS.copy()
    
    for k, v in counts.items():
        if v > 1:
            # add plural
            pos_dict[k] += 's'
            
    return pos_dict


def build_fe_text(core_fes, unexp_fes = []):
    """
    Builds the basic FE text used in all prompts.
    """
    core_fes = [f["name"] for f in core_fes]
    unexp_fes = [f["name"] for f in unexp_fes]
    
    prompt = ''
    
    if len(core_fes) == 1:
        prompt += f' The core frame element in this frame is "{fmt_entity(core_fes[0])}".'
    else:
        prompt += f' Core frame elements in this frame are {to_comma(core_fes)}.'
    
    if len(unexp_fes) == 1:
        prompt += f' The core unexpressed frame element in this frame is "{fmt_entity(unexp_fes[0])}".'
    elif len(unexp_fes) > 1:
        prompt += f' Core unexpressed frame elements in this frame are {to_comma(unexp_fes)}.'
        
    return prompt


def build_lu_text(lus):
    """
    Builds the basic LU text used in all prompts.
    """
    lus = sorted((lu["name"].split('.') for lu in lus), key=lambda lu: lu[1])
    
    if len(lus) == 1:
        return f' This frame is evoked by the {POS[lus[0][1]]} "{lus[0][0]}"'
    else:
        pos = lus[0][1]
        pos_i = 1
        counts = count_pos(lus)
        names = pos_names(counts)
        order = list(counts.keys())
        text = f' Words evoking this frame are the {names[pos]} "{lus[0][0]}"'
        
        for i, lu in enumerate(lus[1:]):
            if lu[1] == pos:
                pos_i += 1
                last = i+2 == len(lus) or pos_i == counts[pos]
                sep = ' and' if last else ','
                text += f'{sep} "{lu[0]}"'
            else:
                pos = lu[1]
                pos_i = 0
                sep = ' and' if pos == order[-1] else ','
                text += f'{sep} the {names[pos]} "{lu[0]}"'
        
        return text + '.'


def get_random_child(frame_id, network):
    """
    Gets a random child frame of the given frame_id
    """
    options = list(network.predecessors(frame_id))
    return options[math.floor(random() * len(options))]

Functions used to build different types of prompts.

In [96]:
def prompt_suggest_lus(data):
    """
    Builds a prompt that asks the assistant to propose new LUs.
    """
    # Frame definition
    prompt = f'The semantic frame for "{fmt_entity(data["name"])}" is defined as follows:'
    prompt += f' "{fmt_definition(data["description"])}".'
    
    # FEs
    prompt += fe_text(data["core_fes"], data["unexp_fes"])
    
    # FE definitions
    for fe in data["core_fes"]:
        prompt += f' {fmt_definition(fe["name"])}: {fmt_definition(fe["description"])}.'
    for fe in data["unexp_fes"]:
        prompt += f' {fmt_definition(fe["name"])}: {fmt_definition(fe["description"])}.'
        
    # LU
    prompt += lu_text(data["lus"])
    
    # Request part
    prompt += f' Please propose 10 additional words that evoke the "{fmt_entity(data["name"])}" semantic frame.'
    prompt += f' Present them as a JSON array.'
    
    return prompt


def prompt_create_from_lus(data, new_lus):
    """
    Builds a prompt that asks the assistant to create a new subframe
    of the given frame based on a set of LUs that the new frame must evoke.
    """
    # Frame definition
    prompt = f'The semantic frame for "{fmt_entity(data["name"])}" is defined as follows:'
    prompt += f' "{fmt_definition(data["description"])}".'
    
    # FEs
    fes = [f["name"] for f in data["core_fes"]]
    
    if len(fes) == 1:
        prompt += f' The semantic frame for "{fmt_entity(data["name"])}" has one core frame element:'
        prompt += f' "{fmt_entity(fes[0])}".'
    else:
        prompt += f' The semantic frame for "{fmt_entity(data["name"])}" has {len(fes)} core elements:'
        prompt += f' {to_comma(fes)}.'

    for fe in data["core_fes"]:
        prompt += f' The definition of the "{fmt_entity(fe["name"])}" frame element is as follows:'
        prompt += f' "{fmt_definition(fe["description"])}".'

    # LUs
    prompt += lu_text(data["lus"])
    
    # Request part
    prompt += f' First, propose a semantic frame evoked by words such as {to_comma(new_lus)}.'
    prompt += f' Second, please propose semantic frames for other kinds of "{fmt_entity(data["name"])}".'
    prompt += ' Present them as table in which columns are "Frame Name", "Frame Definition", "Frame Elements", "Frame Element Definition" and "Words evoking the frame".'
        
    return prompt


def prompt_create_from_inheritance(data, child_data):
    """
    Builds a prompt that asks the assistant to propose subframes of
    the given frame.
    """
    # Frame definition
    prompt = f'There is a semantic frame for "{fmt_entity(data["name"])}", whose definition is as follows:'
    prompt += f' "{fmt_definition(data["description"])}".'
    
    # FEs
    prompt += fe_text(data["core_fes"], data["unexp_fes"])
    
    # Child frame
    prompt += f' The "{fmt_entity(child_data["name"])}" frame inherits the "{fmt_entity(data["name"])}" frame.'
    
    # Child FEs
    prompt += fe_text(data["core_fes"])
    
    # Child LUs
    prompt += lu_text(child_data["lus"])
    
    # Request part
    prompt += f' Please propose other semantic frames inheriting the "{fmt_entity(data["name"])}" frame.'
    prompt += ' Present them as a table in which columns are "Frame Name", "Frame Definition", "Frame Elements", "Frame Element Definition" and "Words evoking the frame".'
        
    return prompt


A small test...

In [106]:
frames[frames["name"] == "Entity"]

Unnamed: 0_level_0,idFrame,name,description,frameTop
idFrame,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
226,226,Entity,This frame is for words that denote highly sch...,frm_entity


In [97]:
print(prompt_create_from_lus(get_prompt_data(226), ["god", "saint", "deity", "goddess"]))

The semantic frame for "Entity" is defined as follows: "This frame is for words that denote highly schematic entities". The semantic frame for "Entity" has one core frame element: "Entity". The definition of the "Entity" frame element is as follows: "A thing (either abstract or physical) that exists with some degree of permanence". Words evoking this frame are the adverb "anything", the nouns "item", "entity", "object", "thing", "individual", "what", "material", "something", "article", "stuff", "paradox", "page", "plate", "rainbow", "trash", "waste", "label", "resource", "fuse", "grocery" and the pronoun "everything". First, propose a semantic frame evoked by words such as "god", "saint", "deity" and "goddess". Second, please propose semantic frames for other kinds of "Entity". Present them as table in which columns are "Frame Name", "Frame Definition", "Frame Elements", "Frame Element Definition" and "Words evoking the frame".


In [107]:
frames[frames["name"] == "Intentionally_act"]

Unnamed: 0_level_0,idFrame,name,description,frameTop
idFrame,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
178,178,Intentionally_act,This is an abstract frame for acts performed b...,frm_event


In [None]:
child = get_prompt_data(get_random_child(178, frame_network))

In [110]:
frames.loc[child["id"]]

idFrame                                                      783
name                                                Execute_plan
description    An Agent acts according to a Plan, carrying it...
frameTop                                               frm_event
Name: 783, dtype: object

In [111]:
print(prompt_create_from_inheritance(get_prompt_data(178), child))

There is a semantic frame for "Intentionally act", whose definition is as follows: "This is an abstract frame for acts performed by sentient beings". The core frame element in this frame is "Agent". The core unexpressed frame element in this frame is "Act". The "Execute plan" frame inherits the "Intentionally act" frame. The core frame element in this frame is "Agent". Words evoking this frame are the nouns "implementation", "effect (into effect)" and "force (into force)" and the verbs "implement" and "institute". Please propose other semantic frames inheriting the "Intentionally act" frame. Present them as a table in which columns are "Frame Name", "Frame Definition", "Frame Elements", "Frame Element Definition" and "Words evoking the frame".


### Generate full experiment file (assistant suggest new files)

In [113]:
import numpy as np

def generate_split_prompt(splits, random=True, size=5):
    """
    Generates all prompts asking the assistant to suggest new LUs.
    These are organized into different splits and returned as a single string.
    """
    full_prompt = ''
    
    for i, split in enumerate(splits):
        full_prompt += "=================================================\n"
        full_prompt += f'---------------      SPLIT {i}      ---------------\n'
        full_prompt += "=================================================\n"
        
        if random:
            frame_ids = np.random.choice(split, size=size, replace=False)
        else:
            frame_ids = split
        
        for idFrame in frame_ids:
            full_prompt += f'{str(idFrame)} - {prompt_suggest_lus(get_prompt_data(idFrame))}'
            full_prompt += '\n\n\n'

    return full_prompt

In [115]:
all_prompts = generate_split_prompt(entity_splits)

ValueError: Cannot take a larger sample than population when 'replace=False'

## Evaluate

Here we use the compiled responses we got from ChatGPT and OpenAssistant to evaluate their LU suggestions

In [37]:
import os
import json
import pandas as pd

ASSISTANT = "ChatGPT" # OpenAssistant
LANG = "en" # or pt

with open(os.path.join(ASSISTANT, f'event_{LANG}.json')) as fp:
    event_lus = json.load(fp)
    
with open(os.path.join(ASSISTANT, f'entity_{LANG}.json')) as fp:
    entity_lus = json.load(fp)
    
with open(os.path.join(ASSISTANT, f'attribute_{LANG}.json')) as fp:
    attribute_lus = json.load(fp)
    
# with open(os.path.join(ASSISTANT, f'artifact_{LANG}.json')) as fp:
#     artifact_lus = json.load(fp)

In [39]:
def create_df(type_lus):
    return pd.DataFrame.from_records([
        { "idFrame": int(idFrame), "lu": clean_lu(lu), "split": split }
        for split, frames in type_lus.items()
        for idFrame, lus in frames.items()
        for lu in lus
    ])


def is_in_fn_plus(row, dataset):
    frame_id = row["idFrame"]
    return frame_id in fn_plus and row["lu"] in fn_plus[frame_id]

def is_in_fnbr(row, dataset):
    frame_id = row["idFrame"]
    return frame_id in fnbr and row["lu"] in fnbr[frame_id]

In [40]:
df = create_df(artifact_lus)

df['split'] = df['split'].str.replace('split-', '')

if LANG == "en":
    df['FN+?'] = df.apply(is_in_fn_plus, axis='columns')
else:
    df['FN-Br?'] = df.apply(is_in_fnbr, axis='columns')

df = df.merge(df_frm.reset_index(drop=True), on='idFrame')

df[['name', 'lu', 'split', 'FN+?']].to_csv('lus.csv', index=False)