## Load data

In [35]:
import getpass
# Ask for the database password interactively instead of hard-coding it in the notebook.
password = getpass.getpass(prompt='Password: ')

········


In [116]:
import pymysql
from sqlalchemy import create_engine

from urllib.parse import quote

# Percent-encode the password before embedding it in the URL: characters such
# as "@", ":" or "/" would otherwise be interpreted as URL delimiters and the
# connection string would be parsed incorrectly.
conn_str = f'mysql+pymysql://fnbrasil:{quote(password, safe="")}@localhost:3307/fnbr_db'

# Engine shared by every pd.read_sql call below.
engine = create_engine(conn_str, pool_recycle=3600)

In [117]:
import pandas as pd

# Frames from view_frame with the name/description of their entry (idLanguage = 2).
# frameTop is filled only when a topframe row for 'frm_event', 'frm_entity' or
# 'frm_attributes' matches (left join). idFrame is kept both as index and column.
df_frm = pd.read_sql(
    sql='''
    select f.idFrame, e.name, e.description, tf.frameTop
    from view_frame f
    join entry e on e.entry = f.entry
    left join topframe tf on tf.frameBase = f.entry and tf.frameTop in ('frm_event', 'frm_entity', 'frm_attributes')
    where e.idLanguage = 2;
''',
    con=engine,
).set_index(keys="idFrame", drop=False)

In [118]:
# Frame elements from view_frameelement with the name/description of their entry
# (idLanguage = 2). idFrameElement is kept both as index and column.
df_fe = pd.read_sql(
    sql='''
    select fe.idFrameElement, fe.typeEntry, e.name, e.description, fe.idFrame
    from view_frameelement fe
    join entry e on e.entry = fe.entry
    where idLanguage = 2;
''',
    con=engine,
).set_index(keys="idFrameElement", drop=False)

In [119]:
# Lexical units from view_lu (idLanguage = 2). idLU is kept both as index and column.
df_lu = pd.read_sql(
    sql='''
    select lu.idLU, lu.name, lu.senseDescription, lu.idFrame 
    from view_lu lu
    where lu.idLanguage = 2;
''',
    con=engine,
).set_index(keys="idLU", drop=False)

In [120]:
# Entity relations between two frames: the relation type entry plus the ids of
# the frames matched through idEntity1 (idFrame1) and idEntity2 (idFrame2).
df_rel = pd.read_sql(
    sql='''
    select r.entry, f1.idFrame as 'idFrame1', f2.idFrame as 'idFrame2'
    from entityrelation e
    join relationtype r on r.idRelationType = e.idRelationType  
    join frame f1 on e.idEntity1 = f1.`idEntity` 
    join frame f2 on e.idEntity2 = f2.`idEntity`;
''',
    con=engine,
)

In [121]:
import networkx as nx

# Keep only the relations whose type entry is "rel_inheritance".
df_inh = df_rel.loc[df_rel["entry"] == "rel_inheritance"]

# Directed graph over every frame id that takes part in an inheritance relation.
fn = nx.DiGraph()
fn.add_nodes_from(
    pd.concat([df_inh['idFrame1'], df_inh['idFrame2']]).unique()
)
# Each relation row becomes one edge idFrame2 -> idFrame1.
fn.add_edges_from(zip(df_inh['idFrame2'], df_inh['idFrame1']))

## Preprocess

In [122]:
import re
import xml.etree.ElementTree as ET

def clean_def(text):
    """Extract the plain-text definition from a description stored as XML-like markup.

    Everything from the first ``<ex>`` tag onward is discarded, the
    ``<def-root>`` wrapper tags are removed, and the text of the remaining
    top-level elements is concatenated with their tags stripped.

    Args:
        text: Raw description string. Non-string values (for example ``None``
            or NaN coming from a NULL column) are treated as an empty
            description instead of raising.

    Returns:
        The definition text with leading/trailing whitespace removed.

    Raises:
        xml.etree.ElementTree.ParseError: If what remains of ``text`` is not
            well-formed XML.
    """
    if not isinstance(text, str):
        return ""

    # Drop the examples section (first <ex> up to the end of the text).
    text = re.sub(r'\<ex\>.*', '', text, flags=re.DOTALL)
    text = text.replace('<def-root>', '').replace('</def-root>', '')

    # Wrap in a synthetic root so mixed text/element content can be parsed.
    def_root = ET.fromstring('<def>' + text + '</def>')

    parts = [def_root.text or ""]
    for child in def_root:
        if child.tag == "ex":
            break
        # Keep the element's own text and the text following its closing tag.
        parts.append(child.text or "")
        parts.append(child.tail or "")

    return "".join(parts).strip()

# Remove "meta": strip the markup with clean_def, collapse whitespace runs into
# a single space and drop the "#" prefix from "#word" references.
# Patterns are raw strings so that "\s" and "\w" are not treated as (invalid)
# string escape sequences.
df_frm["description"] = (
    df_frm["description"]
    .apply(clean_def)
    .str.replace(r'\s+', ' ', regex=True)
    .str.replace(r'#(\w+)', r'\1', regex=True)
)

df_fe["description"] = (
    df_fe["description"]
    .apply(clean_def)
    .str.replace(r'\s+', ' ', regex=True)
    .str.replace(r'#(\w+)', r'\1', regex=True)
)

# regex=False is explicit because the default of str.replace differs between
# pandas versions; these are all literal replacements.
df_lu["senseDescription"] = (
    df_lu["senseDescription"]
    .str.replace('FN: ', '', regex=False)
    .str.strip()
)

# Other: underscores become spaces and square brackets become parentheses in LU names.
df_lu["name"] = (
    df_lu["name"]
    .str.replace('_', ' ', regex=False)
    .str.replace('[', '(', regex=False)
    .str.replace(']', ')', regex=False)
)

## Sample FrameNet

In [123]:
# Sets of frame ids grouped by the frameTop value attached to each frame.
frm_event = set(df_frm.loc[df_frm['frameTop'] == 'frm_event', 'idFrame'])
frm_entity = set(df_frm.loc[df_frm['frameTop'] == 'frm_entity', 'idFrame'])
frm_attribute = set(df_frm.loc[df_frm['frameTop'] == 'frm_attributes', 'idFrame'])

In [124]:
# Frame elements whose type is core or core-unexpressed.
df_corefe = df_fe.loc[df_fe['typeEntry'].isin(['cty_core', 'cty_core-unexpressed'])]

# Number of such frame elements per frame, for each group of frames.
frm_event_fe_count = df_corefe[df_corefe['idFrame'].isin(frm_event)].groupby('idFrame')['idFrameElement'].count()
frm_entity_fe_count = df_corefe[df_corefe['idFrame'].isin(frm_entity)].groupby('idFrame')['idFrameElement'].count()
frm_attribute_fe_count = df_corefe[df_corefe['idFrame'].isin(frm_attribute)].groupby('idFrame')['idFrameElement'].count()

# Number of lexical units per frame, for each group of frames.
frm_event_lu_count = df_lu[df_lu['idFrame'].isin(frm_event)].groupby('idFrame')['idLU'].count()
frm_entity_lu_count = df_lu[df_lu['idFrame'].isin(frm_entity)].groupby('idFrame')['idLU'].count()
frm_attribute_lu_count = df_lu[df_lu['idFrame'].isin(frm_attribute)].groupby('idFrame')['idLU'].count()

In [125]:
def get_frm_splits(fe_count, lu_count, lower_q=.25, upper_q=.75):
    """Split frames into four groups by low/high FE count and low/high LU count.

    A frame is "low" for a measure when its count is at or below the
    ``lower_q`` quantile of that measure and "upper" when it is at or above
    the ``upper_q`` quantile.

    Args:
        fe_count: pandas Series of frame-element counts indexed by frame id.
        lu_count: pandas Series of lexical-unit counts indexed by frame id.
        lower_q: Quantile used as the "low" cut-off (default .25).
        upper_q: Quantile used as the "upper" cut-off (default .75).

    Returns:
        A 4-tuple of pandas Index objects with the frame ids in
        (low FE & low LU, low FE & upper LU, upper FE & low LU,
        upper FE & upper LU).
    """
    def tails(counts):
        # Index of the entries in the lower tail and in the upper tail.
        low = counts[counts <= counts.quantile(lower_q)]
        high = counts[counts >= counts.quantile(upper_q)]
        return low.index, high.index

    low_fe, high_fe = tails(fe_count)
    low_lu, high_lu = tails(lu_count)

    return (
        low_fe.intersection(low_lu),
        low_fe.intersection(high_lu),
        high_fe.intersection(low_lu),
        high_fe.intersection(high_lu),
    )

def print_stats(splits):
    """Print the number of frames in each of the four splits, one per line.

    Args:
        splits: Sequence whose first four items are sized collections, in the
            order low/low, low/upper, upper/low, upper/upper (FE x LU).
    """
    labels = (
        'Low FE x Low LU:',
        'Low FE x Upp LU:',
        'Upp FE x Low LU:',
        'Upp FE x Upp LU:',
    )
    for position, label in enumerate(labels):
        print(label, len(splits[position]))

# Low/upper FE x LU splits for each group of frames.
event_splits = get_frm_splits(fe_count=frm_event_fe_count, lu_count=frm_event_lu_count)
entity_splits = get_frm_splits(fe_count=frm_entity_fe_count, lu_count=frm_entity_lu_count)
attribute_splits = get_frm_splits(fe_count=frm_attribute_fe_count, lu_count=frm_attribute_lu_count)

## Generate prompts

These are the utility functions used by the prompt generators.

In [126]:
import math
from random import random
from collections import OrderedDict
from networkx import all_neighbors

# Part-of-speech suffix of an LU name (the part after the dot) -> English label
# used when writing the prompts.
POS = dict(
    v="verb",
    a="adjective",
    n="noun",
    adv="adverb",
    prep="preposition",
    idio="idiomatic expression",
)


def get_prompt_data(idFrame):
    """Collect everything the prompt builders need about one frame.

    Reads the module-level ``df_frm``, ``df_fe`` and ``df_lu`` frames.

    Args:
        idFrame: Label of the frame in the index of ``df_frm``.

    Returns:
        dict with the keys ``id``, ``name`` and ``description`` of the frame,
        ``core_fes`` / ``unexp_fes`` (lists of ``{"name", "description"}``
        records for the "cty_core" / "cty_core-unexpressed" frame elements)
        and ``lus`` (list of ``{"name", "senseDescription"}`` records).
    """
    frame_row = df_frm.loc[idFrame]
    belongs_to_frame = df_fe["idFrame"] == idFrame

    def fe_records(type_entry):
        # Frame elements of this frame with the given typeEntry, as plain dicts.
        rows = df_fe[belongs_to_frame & (df_fe["typeEntry"] == type_entry)]
        return rows[["name", "description"]].to_dict('records')

    lu_rows = df_lu[df_lu["idFrame"] == idFrame]

    return {
        "id": frame_row["idFrame"],
        "name": frame_row["name"],
        "description": frame_row["description"],
        "core_fes": fe_records("cty_core"),
        "unexp_fes": fe_records("cty_core-unexpressed"),
        "lus": lu_rows[["name", "senseDescription"]].to_dict('records'),
    }


def fmt_entity(text):
    """Return ``text`` with every underscore turned into a space."""
    return ' '.join(text.split('_'))

def fmt_definition(text):
    """Normalise a definition so callers can append their own punctuation.

    Surrounding whitespace is always removed and a single trailing period,
    if present, is dropped. Empty or whitespace-only text yields ``''``
    instead of raising ``IndexError``.

    Args:
        text: Definition string.

    Returns:
        The stripped definition without its final period.
    """
    stripped = text.strip()
    return stripped[:-1] if stripped.endswith('.') else stripped

def to_comma(arr):
    """Render names as a quoted English enumeration.

    Each item is passed through ``fmt_entity`` and wrapped in double quotes;
    items are separated by ", " with " and " before the last one, e.g.
    ``"a", "b" and "c"``.

    Args:
        arr: Iterable of strings.

    Returns:
        The enumeration, or ``''`` when ``arr`` is empty (instead of raising
        ``IndexError``).
    """
    quoted = [f'"{fmt_entity(item)}"' for item in arr]

    if not quoted:
        return ''

    if len(quoted) == 1:
        return quoted[0]

    return ', '.join(quoted[:-1]) + ' and ' + quoted[-1]


def pos_counts(lus):
    """Count how many entries carry each part-of-speech tag.

    Args:
        lus: Iterable of sequences whose item at index 1 is the tag.

    Returns:
        OrderedDict mapping each tag to its number of occurrences, in order
        of first appearance.
    """
    tally = OrderedDict()

    for entry in lus:
        tag = entry[1]
        tally[tag] = tally.get(tag, 0) + 1

    return tally


def pos_names(counts):
    """Return a copy of ``POS`` with an "s" appended where a tag occurs more than once.

    Args:
        counts: Mapping of part-of-speech tag to number of occurrences.

    Returns:
        dict of tag -> label; labels of tags counted more than once are
        pluralised, every other label is taken from ``POS`` unchanged.
    """
    labels = dict(POS)

    for tag in [t for t, amount in counts.items() if amount > 1]:
        # add plural
        labels[tag] = labels[tag] + 's'

    return labels


def fe_text(core_fes, unexp_fes=None):
    """Build the sentences listing a frame's core and core-unexpressed frame elements.

    Args:
        core_fes: List of dicts with a ``"name"`` key (core frame elements).
        unexp_fes: Optional list of dicts with a ``"name"`` key
            (core-unexpressed frame elements). ``None`` means none.

    Returns:
        A string made of zero, one or two sentences, each starting with a
        space. A sentence is omitted when its list is empty, so a frame
        without core frame elements no longer raises.
    """
    core_names = [fe["name"] for fe in core_fes]
    unexp_names = [fe["name"] for fe in (unexp_fes or [])]

    prompt = ''

    if len(core_names) == 1:
        prompt += f' The core frame element in this frame is "{fmt_entity(core_names[0])}".'
    elif len(core_names) > 1:
        prompt += f' Core frame elements in this frame are {to_comma(core_names)}.'

    if len(unexp_names) == 1:
        prompt += f' The core unexpressed frame element in this frame is "{fmt_entity(unexp_names[0])}".'
    elif len(unexp_names) > 1:
        prompt += f' Core unexpressed frame elements in this frame are {to_comma(unexp_names)}.'

    return prompt


def lu_text(lus):
    """Build the sentence listing the words that evoke a frame, grouped by part of speech.

    Args:
        lus: List of dicts whose ``"name"`` has the form ``"<lemma>.<pos>"``,
            where ``<pos>`` is a key of ``POS``.

    Returns:
        A sentence starting with a space and ending with a period, or ``''``
        when ``lus`` is empty.

    Raises:
        IndexError: If a name contains no dot.
        KeyError: If a part-of-speech tag is not a key of ``POS``.
    """
    # Split on the last dot only, so lemmas that contain a dot keep their tag.
    lus = sorted((lu["name"].rsplit('.', 1) for lu in lus), key=lambda lu: lu[1])

    if not lus:
        return ''

    if len(lus) == 1:
        return f' This frame is evoked by the {POS[lus[0][1]]} "{lus[0][0]}".'

    pos = lus[0][1]
    pos_i = 1  # how many words of the current POS group have been written
    counts = pos_counts(lus)
    names = pos_names(counts)
    order = list(counts.keys())
    text = f' Words evoking this frame are the {names[pos]} "{lus[0][0]}"'

    for lu in lus[1:]:
        if lu[1] == pos:
            pos_i += 1
            # " and" goes before the last word of its POS group.
            sep = ' and' if pos_i == counts[pos] else ','
            text += f'{sep} "{lu[0]}"'
        else:
            pos = lu[1]
            # This word is the first of its group, so the counter restarts at
            # 1 (restarting at 0 kept later groups from ever getting " and").
            pos_i = 1
            # " and" goes before the last POS group.
            sep = ' and' if pos == order[-1] else ','
            text += f'{sep} the {names[pos]} "{lu[0]}"'

    return text + '.'


def random_child_frm(anchor, graph):
    """Pick, uniformly at random, one node that has an edge pointing to ``anchor``.

    Args:
        anchor: Node of ``graph``.
        graph: Directed graph exposing ``predecessors(node)``.

    Returns:
        One node ``n`` such that the edge ``n -> anchor`` exists.

    Raises:
        IndexError: If no node has an edge pointing to ``anchor``.
    """
    # Local import: at module level the name ``random`` is bound to the
    # random.random function, not to the module.
    from random import choice

    # predecessors() yields each node once; scanning all neighbours listed a
    # node twice when edges existed in both directions, biasing the draw.
    options = list(graph.predecessors(anchor))
    return choice(options)

In [127]:
def prompt_suggest_lus(data):
    """Build the prompt asking for 10 additional words that evoke a frame.

    Args:
        data: Frame dict with the keys ``name``, ``description``,
            ``core_fes``, ``unexp_fes`` and ``lus``.

    Returns:
        The prompt as a single string: frame definition, frame-element list,
        frame-element definitions, evoking words and the request.
    """
    frame_name = fmt_entity(data["name"])

    pieces = [
        # Frame definition
        f'The semantic frame for "{frame_name}" is defined as follows:',
        f' "{fmt_definition(data["description"])}".',
        # FEs
        fe_text(data["core_fes"], data["unexp_fes"]),
    ]

    # FE definitions: core ones first, then core-unexpressed ones
    for group in (data["core_fes"], data["unexp_fes"]):
        pieces.extend(f' {fmt_definition(fe["description"])}.' for fe in group)

    # LU
    pieces.append(lu_text(data["lus"]))

    # Request part
    pieces.append(f' Please propose 10 additional words that evoke the "{frame_name}" semantic frame.')

    return ''.join(pieces)


def prompt_create_from_lus(data, new_lus):
    """Build the prompt asking for new frames based on a list of words.

    Args:
        data: Frame dict with the keys ``name``, ``description``,
            ``core_fes`` and ``lus``.
        new_lus: Words the first requested frame should be evoked by.

    Returns:
        The prompt as a single string: frame definition, core frame elements
        with their definitions, evoking words and the two-part request.
    """
    frame_name = fmt_entity(data["name"])
    fe_names = [fe["name"] for fe in data["core_fes"]]

    # Frame definition
    pieces = [
        f'The semantic frame for "{frame_name}" is defined as follows:',
        f' "{fmt_definition(data["description"])}".',
    ]

    # FEs
    if len(fe_names) == 1:
        pieces.append(f' The semantic frame for "{frame_name}" has one core frame element:')
        pieces.append(f' "{fmt_entity(fe_names[0])}".')
    else:
        pieces.append(f' The semantic frame for "{frame_name}" has {len(fe_names)} core elements:')
        pieces.append(f' {to_comma(fe_names)}.')

    for fe in data["core_fes"]:
        pieces.append(f' The definition of the "{fmt_entity(fe["name"])}" frame element is as follows:')
        pieces.append(f' "{fmt_definition(fe["description"])}".')

    # LUs
    pieces.append(lu_text(data["lus"]))

    # Request part
    pieces.append(f' First, propose a semantic frame evoked by words such as {to_comma(new_lus)}.')
    pieces.append(f' Second, please propose semantic frames for other kinds of "{frame_name}".')
    # NOTE(review): this sentence says "as table" while prompt_create_from_inheritance
    # says "as a table" - confirm which wording is intended before changing the prompt.
    pieces.append(' Present them as table in which columns are "Frame Name", "Frame Definition", "Frame Elements", "Frame Element Definition" and "Words evoking the frame".')

    return ''.join(pieces)


def prompt_create_from_inheritance(data, child_data):
    """Build the prompt asking for further frames that inherit a parent frame.

    The prompt describes the parent frame (definition and frame elements),
    then one example child frame (its core frame elements and evoking words),
    and finally asks for other inheriting frames.

    Args:
        data: Parent frame dict with the keys ``name``, ``description``,
            ``core_fes`` and ``unexp_fes``.
        child_data: Child frame dict with the keys ``name``, ``core_fes``
            and ``lus``.

    Returns:
        The prompt as a single string.
    """
    # Frame definition
    prompt = f'There is a semantic frame for "{fmt_entity(data["name"])}", whose definition is as follows:'
    prompt += f' "{fmt_definition(data["description"])}".'

    # FEs
    prompt += fe_text(data["core_fes"], data["unexp_fes"])

    # Child frame
    prompt += f' The "{fmt_entity(child_data["name"])}" frame inherits the "{fmt_entity(data["name"])}" frame.'

    # Child FEs: taken from the child frame (the parent's core FEs were
    # previously repeated here).
    prompt += fe_text(child_data["core_fes"])

    # Child LUs
    prompt += lu_text(child_data["lus"])

    # Request part
    prompt += f' Please propose other semantic frames inheriting the "{fmt_entity(data["name"])}" frame.'
    prompt += ' Present them as a table in which columns are "Frame Name", "Frame Definition", "Frame Elements", "Frame Element Definition" and "Words evoking the frame".'

    return prompt


### Prompt for new LUs

In [145]:
import numpy as np

def generate_split_prompt(splits, n_per_split=5):
    """Generate "suggest LUs" prompts for frames sampled from each split.

    Args:
        splits: Iterable of collections of frame ids, one per split.
        n_per_split: Maximum number of frames sampled from each split.

    Returns:
        A single string: for every split a three-line banner followed by one
        prompt per sampled frame, each prompt followed by three newlines.
        Frames are drawn without replacement, so a frame appears at most once
        per split; a split with fewer than ``n_per_split`` frames contributes
        all of them, and an empty split contributes only its banner.
    """
    full_prompt = ''

    # Iterate the argument (the global event_splits was used before, so the
    # parameter had no effect).
    for i, split in enumerate(splits):
        full_prompt += "=================================================\n"
        full_prompt += f'---------------      SPLIT {i}      ---------------\n'
        full_prompt += "=================================================\n"

        if len(split) == 0:
            continue

        sample_size = min(n_per_split, len(split))
        for idFrame in np.random.choice(split, size=sample_size, replace=False):
            full_prompt += prompt_suggest_lus(get_prompt_data(idFrame))
            full_prompt += '\n\n\n'

    return full_prompt



In [146]:
# Prompt text for frames sampled from the event splits.
a = generate_split_prompt(splits=event_splits)

20945


In [130]:
# Look up frames whose name contains "Avoidin".
df_frm.loc[df_frm["name"].str.contains("Avoidin")]

Unnamed: 0_level_0,idFrame,name,description,frameTop
idFrame,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
249,249,Avoiding,An Agent avoids an Undesirable_situation under...,


In [131]:
# print(prompt_create_from_lus(get_prompt_data(226), ["god", "saint", "deity", "goddess"]))
# print(prompt_create_from_inheritance(get_prompt_data(178), get_prompt_data(random_child_frm(178, fn))))