## Load data

In [1]:
import getpass
# Ask for the database password interactively so it is not stored in the notebook source.
password = getpass.getpass()

········


In [2]:
from urllib.parse import quote_plus

from sqlalchemy import create_engine

# Percent-encode the password before embedding it in the URL: characters such as
# "@", ":" or "/" would otherwise be read as URL delimiters and break the connection string.
conn_str = f'mysql+pymysql://fnbrasil:{quote_plus(password)}@localhost:3307/fnbr_db'

# pool_recycle=3600 is forwarded to the engine's connection pool.
engine = create_engine(conn_str, pool_recycle=3600)

In [3]:
import pandas as pd

# Frames whose entry text has idLanguage = 2 and that are related (through
# view_relation) to the 'dom_framenet' domain. frameTop comes from a left join,
# so it is only filled for frames matched to one of the three listed top frames.
# The result is indexed by idFrame while idFrame is also kept as a regular column.
df_frm = pd.read_sql('''
    select f.idFrame, e.name, e.description, tf.frameTop
    from view_frame f
    join entry e on e.entry = f.entry
    left join topframe tf on tf.frameBase = f.entry and tf.frameTop in ('frm_event', 'frm_entity', 'frm_attributes')
    where e.idLanguage = 2 and exists(
        select 1
        from view_relation r
        join domain d on d.`idEntity`  = r.idEntity2
        where r.idEntity1 = f.`idEntity` and d.entry ='dom_framenet' 
    );
''', engine).set_index("idFrame", drop=False)

In [4]:
# Frame elements joined with their entry name/description for idLanguage = 2.
# Indexed by idFrameElement, which is also kept as a regular column.
df_fe = pd.read_sql('''
    select fe.idFrameElement, fe.typeEntry, e.name, e.description, fe.idFrame
    from view_frameelement fe
    join entry e on e.entry = fe.entry
    where idLanguage = 2;
''', engine).set_index("idFrameElement", drop=False)

In [5]:
# Lexical units with idLanguage = 2, indexed by idLU (column kept).
# NOTE(review): presumably the English LUs, since df_lu_pt is loaded with idLanguage = 1 — confirm.
df_lu = pd.read_sql('''
    select lu.idLU, lu.name, lu.senseDescription, lu.idFrame 
    from view_lu lu
    where lu.idLanguage = 2;
''', engine).set_index("idLU", drop=False)

In [6]:
# Lexical units with idLanguage = 1, indexed by idLU (column kept).
# NOTE(review): presumably the Portuguese LUs, going by the `_pt` suffix — confirm.
df_lu_pt = pd.read_sql('''
    select lu.idLU, lu.name, lu.senseDescription, lu.idFrame 
    from view_lu lu
    where lu.idLanguage = 1;
''', engine).set_index("idLU", drop=False)

In [7]:
# Frame-to-frame relations: one row per entityrelation whose two entities both
# resolve to a frame, labelled with the relation type's entry.
df_rel = pd.read_sql('''
    select r.entry, f1.idFrame as 'idFrame1', f2.idFrame as 'idFrame2'
    from entityrelation e
    join relationtype r on r.idRelationType = e.idRelationType  
    join frame f1 on e.idEntity1 = f1.`idEntity` 
    join frame f2 on e.idEntity2 = f2.`idEntity`;
''', engine)

In [8]:
import os
import pandas as pd


# Read the FN+ frame/LU index (columns 2 and 3 of the tab-separated file), attach
# the idFrame of the frame with the same name, and keep only rows that matched.
# Built as a single chain so no column is assigned on a filtered slice
# (which triggers SettingWithCopyWarning) and the cell is idempotent on re-run.
# NOTE(review): df_frm['name'] is only stripped later, in the Preprocess section,
# so frame names carrying stray whitespace will not match here — confirm.
df_fnp = (
    pd.read_csv(os.path.join('FN+', 'all-data', 'frameindexLU'),
                sep='\t', header=None, usecols=[2, 3], names=['frameName', 'luName'])
    .merge(df_frm, how='left', left_on='frameName', right_on='name')[['idFrame', 'luName']]
    .dropna(subset=['idFrame'])
    .assign(
        idFrame=lambda d: d['idFrame'].astype('int'),
        # Keep only the text before the first "."; the .str accessor tolerates missing values.
        luName=lambda d: d['luName'].str.split('.').str[0],
    )
)

# idFrame -> set of LU names listed for that frame in FN+
fnp = df_fnp.groupby('idFrame')['luName'].apply(set).to_dict()

In [9]:
import networkx as nx

# Keep only the inheritance relations.
df_inh = df_rel.loc[df_rel["entry"] == "rel_inheritance"]

# Directed graph over frame ids; every edge points from idFrame2 to idFrame1.
fn = nx.DiGraph()
fn.add_nodes_from(pd.concat([df_inh['idFrame1'], df_inh['idFrame2']]).unique())
fn.add_edges_from(zip(df_inh['idFrame2'], df_inh['idFrame1']))

## Preprocess

In [10]:
import re
import xml.etree.ElementTree as ET

def clean_def(text):
    """Extract the plain-text definition from an XML-marked-up description.

    Everything from the first ``<ex>`` tag onwards is discarded, the
    ``<def-root>`` wrapper is removed, and the remaining markup is flattened
    to its text content.

    Args:
        text: Raw description markup. Non-string values (e.g. ``None`` or NaN
            coming from a NULL column) are treated as an empty description.

    Returns:
        The definition text with surrounding whitespace stripped.

    Raises:
        xml.etree.ElementTree.ParseError: If the remaining markup is not
            well-formed XML.
    """
    if not isinstance(text, str):
        return ""

    text = re.sub(r'<ex>.*', '', text, flags=re.DOTALL)
    text = text.replace('<def-root>', '').replace('</def-root>', '')
    def_root = ET.fromstring('<def>' + text + '</def>')

    parts = [def_root.text or ""]
    for child in def_root:
        if child.tag == "ex":
            break
        # itertext() also yields text nested deeper inside the child, which
        # reading child.text alone would silently drop.
        parts.append("".join(child.itertext()))
        parts.append(child.tail or "")

    return "".join(parts).strip()

# Remove "meta"
# Frame and FE descriptions: flatten the markup, collapse runs of whitespace and
# drop the leading "#" of #Name references. The patterns are raw strings because
# "\s" and "\w" are invalid escape sequences in ordinary string literals.
for described_df in (df_frm, df_fe):
    described_df["description"] = (
        described_df["description"]
        .apply(clean_def)
        .str.replace(r'\s+', ' ', regex=True)
        .str.replace(r'#(\w+)', r'\1', regex=True)
    )

# The same clean-up is applied to both LU tables; literal replacements are
# marked regex=False so they do not depend on the pandas default.
for lu_df in (df_lu, df_lu_pt):
    lu_df["senseDescription"] = (
        lu_df["senseDescription"]
        .str.replace('FN: ', '', regex=False)
        .str.strip()
    )

    # Other
    lu_df["name"] = (
        lu_df["name"]
        .str.replace('_', ' ', regex=False)
        .str.replace('[', '(', regex=False)
        .str.replace(']', ')', regex=False)
    )

df_frm['name'] = df_frm['name'].str.strip()

## Sample FrameNet

In [11]:
# Only frames that also have an entry in the FN+ dictionary.
df_filtered = df_frm.loc[df_frm['idFrame'].isin(fnp)]


def _frame_ids_with_top(top_entry):
    """Return the set of idFrame values in df_filtered whose frameTop equals ``top_entry``."""
    return set(df_filtered.loc[df_filtered['frameTop'] == top_entry, 'idFrame'])


frm_event = _frame_ids_with_top('frm_event')
frm_entity = _frame_ids_with_top('frm_entity')
frm_attribute = _frame_ids_with_top('frm_attributes')

In [12]:
def recur_pred(n):
    """Collect every transitive predecessor of ``n`` in the module-level graph ``fn``.

    The walk is iterative and remembers visited nodes, so a node reachable
    through several paths is expanded only once and a cycle in the graph cannot
    cause unbounded recursion.

    Args:
        n: A node of ``fn``.

    Returns:
        A new set with all nodes from which ``n`` can be reached by following
        edges of ``fn``.
    """
    pred = set()
    pending = [n]

    while pending:
        for p in fn.predecessors(pending.pop()):
            if p not in pred:
                pred.add(p)
                pending.append(p)

    return pred

# Hard-coded frame Id; the result is restricted to frames present in df_frm (must be in BFN).
frm_artifact = recur_pred(390) & set(df_frm['idFrame'])

In [13]:
# Number of frames in the artifact group.
len(frm_artifact)

25

In [14]:
# Core frame elements only (expressed and unexpressed).
df_corefe = df_fe[df_fe['typeEntry'].isin(['cty_core', 'cty_core-unexpressed'])]


def _rows_per_frame(df, frame_ids, id_col):
    """Count, per idFrame, the rows of ``df`` whose idFrame is in ``frame_ids``.

    Returns the ``id_col`` column of the grouped count, indexed by idFrame.
    """
    return df[df['idFrame'].isin(frame_ids)].groupby('idFrame').count()[id_col]


# Core-FE counts per frame, one Series per frame group.
frm_event_fe_count = _rows_per_frame(df_corefe, frm_event, 'idFrameElement')
frm_entity_fe_count = _rows_per_frame(df_corefe, frm_entity, 'idFrameElement')
frm_attribute_fe_count = _rows_per_frame(df_corefe, frm_attribute, 'idFrameElement')
frm_artifact_fe_count = _rows_per_frame(df_corefe, frm_artifact, 'idFrameElement')

# LU counts per frame, one Series per frame group.
frm_event_lu_count = _rows_per_frame(df_lu, frm_event, 'idLU')
frm_entity_lu_count = _rows_per_frame(df_lu, frm_entity, 'idLU')
frm_attribute_lu_count = _rows_per_frame(df_lu, frm_attribute, 'idLU')
frm_artifact_lu_count = _rows_per_frame(df_lu, frm_artifact, 'idLU')


In [15]:
def get_frm_splits(fe_count, lu_count):
    """Split frames by whether their FE and LU counts fall in the outer quartiles.

    Args:
        fe_count: Series of FE counts indexed by frame id.
        lu_count: Series of LU counts indexed by frame id.

    Returns:
        A 4-tuple of indexes, in the order (low FE, low LU), (low FE, high LU),
        (high FE, low LU), (high FE, high LU). "Low" means at or below the
        first quartile, "high" means at or above the third quartile.
    """
    fe_q1, fe_q3 = fe_count.quantile(.25), fe_count.quantile(.75)
    lu_q1, lu_q3 = lu_count.quantile(.25), lu_count.quantile(.75)

    fe_low = fe_count[fe_count <= fe_q1].index
    fe_high = fe_count[fe_count >= fe_q3].index
    lu_low = lu_count[lu_count <= lu_q1].index
    lu_high = lu_count[lu_count >= lu_q3].index

    return tuple(
        fe_idx.intersection(lu_idx)
        for fe_idx in (fe_low, fe_high)
        for lu_idx in (lu_low, lu_high)
    )

def print_stats(splits):
    """Print the size of each of the four FE x LU splits, one per line."""
    labels = (
        'Low FE x Low LU:',
        'Low FE x Upp LU:',
        'Upp FE x Low LU:',
        'Upp FE x Upp LU:',
    )
    for position, label in enumerate(labels):
        print(label, len(splits[position]))

# One 4-tuple of splits per frame group, computed from that group's FE and LU counts.
event_splits, entity_splits, attribute_splits, artifact_splits = (
    get_frm_splits(fe_count, lu_count)
    for fe_count, lu_count in (
        (frm_event_fe_count, frm_event_lu_count),
        (frm_entity_fe_count, frm_entity_lu_count),
        (frm_attribute_fe_count, frm_attribute_lu_count),
        (frm_artifact_fe_count, frm_artifact_lu_count),
    )
)

In [16]:
# Inspect the row for idFrame 1156 (df_frm is indexed by idFrame).
df_frm.loc[1156]

idFrame                                                     1156
name                                                         Key
description    This frame is for objects, physical or otherwi...
frameTop                                              frm_entity
Name: 1156, dtype: object

In [31]:
# Core-FE count per frame for the artifact group.
frm_artifact_fe_count

idFrame
123     1
144     1
157     1
232     1
272     1
273     1
388     1
395     1
398     1
431     6
482     3
505     1
656     5
696     1
717     3
756     1
791     1
1084    1
1085    2
1098    1
1099    1
1104    1
1122    1
1156    4
1181    1
Name: idFrameElement, dtype: int64

In [32]:
# The four FE x LU splits computed for the artifact group.
artifact_splits

(Int64Index([1098, 1104, 1181], dtype='int64', name='idFrame'),
 Int64Index([123, 144, 157, 272, 395, 398], dtype='int64', name='idFrame'),
 Int64Index([656, 717, 1085, 1098, 1104, 1156, 1181], dtype='int64', name='idFrame'),
 Int64Index([123, 144, 157, 272, 395, 398, 431], dtype='int64', name='idFrame'))

## Generate prompts

These are the utility functions used by the prompt generators.

In [19]:
import math
from random import random
from collections import OrderedDict
from networkx import all_neighbors

# Maps the part-of-speech tag found after the "." in an LU name to the word
# used for it in the generated prompts (see lu_text / pos_names).
POS = {
    "v": "verb",
    "a": "adjective",
    "n": "noun",
    "adv": "adverb",
    "pron": "pronoun",
    "prep": "preposition",
    "idio": "idiomatic expression"
}


def get_prompt_data(idFrame):
    """Gather everything the prompt builders need for one frame.

    Reads the module-level tables ``df_frm``, ``df_fe`` and ``df_lu``.

    Args:
        idFrame: Id of the frame; must be present in the index of ``df_frm``.

    Returns:
        A dict with the frame's ``id``, ``name`` and ``description``, plus
        ``core_fes`` and ``unexp_fes`` (lists of name/description records) and
        ``lus`` (list of name/senseDescription records).
    """
    frame_row = df_frm.loc[idFrame]
    in_frame = df_fe["idFrame"] == idFrame

    def fe_records(type_entry):
        # FEs of this frame with the given typeEntry, as plain dict records.
        selected = df_fe[in_frame & (df_fe["typeEntry"] == type_entry)]
        return selected[["name", "description"]].to_dict('records')

    lu_rows = df_lu[df_lu["idFrame"] == idFrame]

    return {
        "id": frame_row["idFrame"],
        "name": frame_row["name"],
        "description": frame_row["description"],
        "core_fes": fe_records("cty_core"),
        "unexp_fes": fe_records("cty_core-unexpressed"),
        "lus": lu_rows[["name", "senseDescription"]].to_dict('records'),
    }


def fmt_entity(text):
    """Return ``text`` with every underscore turned into a space."""
    return ' '.join(text.split('_'))

def fmt_definition(text):
    """Trim a definition and drop its final period, if any.

    Callers append their own closing punctuation, so a trailing "." is removed.
    Empty or whitespace-only text yields an empty string instead of raising,
    and the result is stripped whether or not a period was present.

    Args:
        text: The definition string.

    Returns:
        The stripped text without a trailing period.
    """
    stripped = text.strip()
    return stripped[:-1] if stripped.endswith('.') else stripped

def to_comma(arr):
    """Render a sequence of names as a quoted, human-readable enumeration.

    Each item is passed through ``fmt_entity`` and wrapped in double quotes.
    Items are separated by ", " except the last two, which are joined by
    " and ". An empty sequence yields an empty string instead of raising.

    Args:
        arr: Sequence of strings.

    Returns:
        The enumeration as a single string.
    """
    quoted = [f'"{fmt_entity(w)}"' for w in arr]

    if len(quoted) <= 1:
        return ''.join(quoted)

    return f'{", ".join(quoted[:-1])} and {quoted[-1]}'


def pos_counts(lus):
    """Count how many LUs carry each part-of-speech tag.

    Args:
        lus: Iterable of sequences whose second item is the POS tag.

    Returns:
        An OrderedDict mapping each tag to its count, in order of first appearance.
    """
    tally = OrderedDict()

    for entry in lus:
        tag = entry[1]
        tally[tag] = tally.get(tag, 0) + 1

    return tally


def pos_names(counts):
    """Build the POS-tag -> display-name mapping, pluralised where needed.

    Starts from a copy of ``POS`` and appends "s" to the name of every tag that
    occurs more than once in ``counts``. A tag that is not in ``POS`` falls back
    to the raw tag text rather than raising ``KeyError``.

    Args:
        counts: Mapping from POS tag to number of LUs with that tag.

    Returns:
        A new dict from POS tag to the (possibly plural) display name.
    """
    pos_dict = POS.copy()

    for tag, count in counts.items():
        label = pos_dict.get(tag, tag)
        # add plural
        pos_dict[tag] = label + 's' if count > 1 else label

    return pos_dict


def fe_text(core_fes, unexp_fes=None):
    """Build the prompt sentences listing a frame's core FEs.

    Args:
        core_fes: List of FE records (dicts with a ``"name"`` key).
        unexp_fes: Optional list of core-unexpressed FE records. ``None`` (the
            default) is treated as an empty list; a ``None`` default avoids a
            shared mutable default argument.

    Returns:
        Zero, one or two sentences, each starting with a space. A group with no
        FEs contributes no sentence.
    """
    def sentence(names, singular, plural):
        # One sentence for a group of FE names; nothing when the group is empty.
        if not names:
            return ''
        if len(names) == 1:
            return f' {singular} "{fmt_entity(names[0])}".'
        return f' {plural} {to_comma(names)}.'

    core_names = [f["name"] for f in core_fes]
    unexp_names = [f["name"] for f in (unexp_fes or [])]

    return (
        sentence(core_names,
                 'The core frame element in this frame is',
                 'Core frame elements in this frame are')
        + sentence(unexp_names,
                   'The core unexpressed frame element in this frame is',
                   'Core unexpressed frame elements in this frame are')
    )


def lu_text(lus):
    """Build the prompt sentence listing the LUs that evoke a frame.

    LU names are expected to look like ``lemma.pos``. LUs are grouped by POS
    tag (sorted by tag); inside each group, and between groups, the final item
    is introduced by "and" and the others are separated by commas.

    Args:
        lus: List of LU records (dicts with a ``"name"`` key).

    Returns:
        A sentence starting with a space and ending with a period, or an empty
        string when ``lus`` is empty.

    Raises:
        IndexError: If an LU name contains no ".".
    """
    if not lus:
        return ''

    # Split on the LAST dot so lemmas that themselves contain dots stay intact.
    entries = sorted((lu["name"].rsplit('.', 1) for lu in lus), key=lambda lu: lu[1])
    counts = pos_counts(entries)
    names = pos_names(counts)

    if len(entries) == 1:
        lemma, pos = entries[0]
        # The closing period matters: callers append the next sentence directly.
        return f' This frame is evoked by the {names[pos]} "{lemma}".'

    groups = []
    for pos in counts:
        quoted = [f'"{lemma}"' for lemma, lu_pos in entries if lu_pos == pos]
        if len(quoted) > 1:
            listing = f'{", ".join(quoted[:-1])} and {quoted[-1]}'
        else:
            listing = quoted[0]
        groups.append(f'the {names[pos]} {listing}')

    if len(groups) == 1:
        body = groups[0]
    else:
        body = f'{", ".join(groups[:-1])} and {groups[-1]}'

    return f' Words evoking this frame are {body}.'


def random_child_frm(anchor, graph):
    """Pick at random a neighbor of ``anchor`` that has an edge pointing to ``anchor``.

    Raises IndexError when no neighbor qualifies.
    """
    # Get "neighbors" with the correct direction
    options = []
    for candidate in all_neighbors(graph, anchor):
        if graph.has_edge(candidate, anchor):
            options.append(candidate)

    position = math.floor(random() * len(options))
    return options[position]

In [20]:
def prompt_suggest_lus(data):
    """Build the prompt asking for 10 additional LUs for a frame.

    Args:
        data: Frame data as returned by ``get_prompt_data``.

    Returns:
        The full prompt as a single string.
    """
    frame_name = fmt_entity(data["name"])

    # Frame definition
    parts = [
        f'The semantic frame for "{frame_name}" is defined as follows:',
        f' "{fmt_definition(data["description"])}".',
    ]

    # FEs
    parts.append(fe_text(data["core_fes"], data["unexp_fes"]))

    # FE definitions, core FEs first and then the unexpressed ones.
    # NOTE(review): FE names go through fmt_definition here (not fmt_entity), so
    # underscores in names are kept — confirm this is intended.
    for fe_group in (data["core_fes"], data["unexp_fes"]):
        for fe in fe_group:
            parts.append(f' {fmt_definition(fe["name"])}: {fmt_definition(fe["description"])}.')

    # LU
    parts.append(lu_text(data["lus"]))

    # Request part
    parts.append(f' Please propose 10 additional words that evoke the "{frame_name}" semantic frame.')
    parts.append(' Present them as a JSON array.')

    return ''.join(parts)


def prompt_create_from_lus(data, new_lus):
    """Build the prompt asking for new frames based on a frame and a list of words.

    Args:
        data: Frame data as returned by ``get_prompt_data``.
        new_lus: Words that the first proposed frame should be evoked by.

    Returns:
        The full prompt as a single string.
    """
    frame_name = fmt_entity(data["name"])
    core_fes = data["core_fes"]
    fe_names = [fe["name"] for fe in core_fes]

    # Frame definition
    parts = [
        f'The semantic frame for "{frame_name}" is defined as follows:',
        f' "{fmt_definition(data["description"])}".',
    ]

    # FEs
    if len(fe_names) == 1:
        parts.append(f' The semantic frame for "{frame_name}" has one core frame element:')
        parts.append(f' "{fmt_entity(fe_names[0])}".')
    else:
        parts.append(f' The semantic frame for "{frame_name}" has {len(fe_names)} core elements:')
        parts.append(f' {to_comma(fe_names)}.')

    for fe in core_fes:
        parts.append(f' The definition of the "{fmt_entity(fe["name"])}" frame element is as follows:')
        parts.append(f' "{fmt_definition(fe["description"])}".')

    # LUs
    parts.append(lu_text(data["lus"]))

    # Request part
    parts.append(f' First, propose a semantic frame evoked by words such as {to_comma(new_lus)}.')
    parts.append(f' Second, please propose semantic frames for other kinds of "{frame_name}".')
    parts.append(' Present them as table in which columns are "Frame Name", "Frame Definition", "Frame Elements", "Frame Element Definition" and "Words evoking the frame".')

    return ''.join(parts)


def prompt_create_from_inheritance(data, child_data):
    """Build the prompt asking for further frames that inherit a given frame.

    The prompt describes the parent frame and its FEs, then one existing child
    frame with the child's own core FEs and LUs as an example, and finally asks
    for other inheriting frames.

    Args:
        data: Parent frame data as returned by ``get_prompt_data``.
        child_data: Data of a frame inheriting the parent, same structure.

    Returns:
        The full prompt as a single string.
    """
    # Frame definition
    prompt = f'There is a semantic frame for "{fmt_entity(data["name"])}", whose definition is as follows:'
    prompt += f' "{fmt_definition(data["description"])}".'

    # FEs
    prompt += fe_text(data["core_fes"], data["unexp_fes"])

    # Child frame
    prompt += f' The "{fmt_entity(child_data["name"])}" frame inherits the "{fmt_entity(data["name"])}" frame.'

    # Child FEs: taken from the child frame; using the parent's list here would
    # merely repeat the sentence already emitted above.
    prompt += fe_text(child_data["core_fes"])

    # Child LUs
    prompt += lu_text(child_data["lus"])

    # Request part
    prompt += f' Please propose other semantic frames inheriting the "{fmt_entity(data["name"])}" frame.'
    prompt += ' Present them as a table in which columns are "Frame Name", "Frame Definition", "Frame Elements", "Frame Element Definition" and "Words evoking the frame".'

    return prompt


### Prompt for new LUs

In [21]:
import numpy as np

def generate_split_prompt(splits, random=True, size=5):
    """Generate the LU-suggestion prompts for every frame of every split.

    Args:
        splits: Sequence of splits, each a sequence of frame ids.
        random: When true, sample frames from each split without replacement;
            otherwise use every frame of the split in its given order.
        size: Number of frames to sample per split when ``random`` is true. A
            split holding fewer frames contributes all of them instead of
            raising.

    Returns:
        One string with a banner per split followed by one
        ``"<idFrame> - <prompt>"`` entry per selected frame.
    """
    banner = "=================================================\n"
    chunks = []

    for i, split in enumerate(splits):
        chunks.append(banner)
        chunks.append(f'---------------      SPLIT {i}      ---------------\n')
        chunks.append(banner)

        if random and len(split) > 0:
            # np.random.choice(..., replace=False) raises ValueError when asked
            # for more items than the population holds, so clamp the sample size.
            frame_ids = np.random.choice(split, size=min(size, len(split)), replace=False)
        else:
            frame_ids = split

        for idFrame in frame_ids:
            chunks.append(f'{str(idFrame)} - {prompt_suggest_lus(get_prompt_data(idFrame))}')
            chunks.append('\n\n\n')

    return ''.join(chunks)



In [22]:
# Prompts for every frame of every artifact split (random=False: no sampling).
prompt = generate_split_prompt(artifact_splits, random=False)

In [23]:
# print() is used so the newlines inside the generated text are rendered.
print(prompt)

---------------      SPLIT 0      ---------------
1098 - The semantic frame for "Information display" is defined as follows: "A human-made artifact, digital or physical, which is used to display information presented via the object". The core frame element in this frame is "Display". Display: It identifies the entity or substance that is designed to have a particular Use. Words evoking this frame are the nouns "monitor", "board", "board (forum)" and "board (white/black/chalk)". Please propose 10 additional words that evoke the "Information display" semantic frame. Present them as a JSON array.


1104 - The semantic frame for "Distant operated IED" is defined as follows: "A improvised explosive device, the Bomb, is rigged to detonate when it receives a signal, typically by a wire that leads to a trigger. The trigger may be operated directly by the Detonator, in which case the weapon is known as a CWIED (command wire IED). Or, the triggering mechanism may be activated wirelessly, such as

In [24]:
# Replaces the attribute_splits computed by get_frm_splits with four fixed lists
# of five frame ids each, then generates prompts for exactly those frames.
# NOTE(review): presumably a previously drawn random sample pinned by hand — confirm.
attribute_splits = (
    [318, 220, 46, 731, 1015],
    [345, 1000, 453, 838, 326],
    [882, 959, 685, 725, 601],
    [342, 74, 332, 106, 125],
)

prompt = generate_split_prompt(attribute_splits, random=False)

In [25]:
# Look up frames whose name contains "Avoidin".
df_frm[df_frm["name"].str.contains("Avoidin")]

Unnamed: 0_level_0,idFrame,name,description,frameTop
idFrame,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
249,249,Avoiding,An Agent avoids an Undesirable_situation under...,frm_event


In [26]:
# print(prompt_create_from_lus(get_prompt_data(226), ["god", "saint", "deity", "goddess"]))
# print(prompt_create_from_inheritance(get_prompt_data(178), get_prompt_data(random_child_frm(178, fn))))

## Evaluate

In [37]:
import os
import json
import pandas as pd

def _load_lus(filename):
    """Load one JSON file from the ``ChatGPT`` directory and return its parsed content."""
    # Explicit UTF-8 so non-ASCII characters decode the same way on every
    # platform instead of depending on the locale's default encoding.
    with open(os.path.join('ChatGPT', filename), encoding='utf-8') as fp:
        return json.load(fp)


event_lus = _load_lus('event_pt.json')
entity_lus = _load_lus('entity_pt.json')
attribute_lus = _load_lus('attribute_pt.json')
artifact_lus = _load_lus('artifact_en.json')

FrameNet Br

In [38]:
def clean_lu(text):
    """Lower-case ``text`` and keep only the part before the first "."."""
    head, _, _ = text.lower().partition('.')
    return head

# LUs from df_lu_pt paired with the frames present in df_frm; after the merge
# the LU name is the 'name_x' column.
fnbr_merged = (
    df_lu_pt
    .merge(df_frm.reset_index(drop=True), on='idFrame')
    [['name_x', 'idFrame']]
)
fnbr_merged['name_x'] = fnbr_merged['name_x'].map(clean_lu)

# idFrame -> set of cleaned LU names
fnbr = fnbr_merged.groupby('idFrame')['name_x'].apply(set).to_dict()

In [39]:
def create_df(type_lus):
    """Flatten a ``{split: {idFrame: [lu, ...]}}`` mapping into a DataFrame.

    Returns one row per LU with the columns ``idFrame`` (converted to int),
    ``lu`` (cleaned with ``clean_lu``) and ``split``.
    """
    records = []

    for split, frames in type_lus.items():
        for idFrame, lus in frames.items():
            for lu in lus:
                records.append({"idFrame": int(idFrame), "lu": clean_lu(lu), "split": split})

    return pd.DataFrame.from_records(records)


def is_in_fnp(row):
    """Tell whether the row's LU is listed for the row's frame in the ``fnp`` dictionary."""
    known_lus = fnp.get(row['idFrame'])
    return False if known_lus is None else row['lu'] in known_lus

def is_in_fnbr(row):
    """Tell whether the row's LU is listed for the row's frame in the ``fnbr`` dictionary."""
    known_lus = fnbr.get(row['idFrame'])
    return False if known_lus is None else row['lu'] in known_lus

In [40]:
# One row per generated artifact LU.
df = create_df(artifact_lus)

# Drop the "split-" prefix from the split label and flag LUs already listed in FN+.
df = df.assign(split=df['split'].str.replace('split-', ''))
df['FN+?'] = df.apply(is_in_fnp, axis=1)

# Bring in the frame name for the export.
df = df.merge(df_frm.reset_index(drop=True), on='idFrame')

export_columns = ['name', 'lu', 'split', 'FN+?']
df[export_columns].to_csv('lus.csv', index=False)