In [3]:
# Get Vote frame
import pandas as pd
import xml.etree.ElementTree as ET
import re

def get_vote_frame_info(vote_file):
    fn_prefix = '{http://framenet.icsi.berkeley.edu}'
    
    vote_root = ET.parse(vote_file).getroot()
    raw_def = vote_root.find(f'{fn_prefix}definition').text

    clean_def = raw_def.split('\n')[0].strip()
    clean_def = re.sub(r'<.*?>', '', clean_def).strip()

    vote_fe_nodes = vote_root.findall(f'{fn_prefix}FE')
    vote_fes = [(x.attrib['name'], x.find(f'{fn_prefix}definition').text) for x in vote_fe_nodes]
    clean_vote_fes = [(x[0], re.sub(r'<.*?>', '', x[1].split('\n')[0]).strip()) for x in vote_fes]

    # frame_info = {}
    # frame_info['Vote'] = {
    return {
        'definition': clean_def,
        'FEs': clean_vote_fes,
        'raw_definition': raw_def,
        'raw_FEs': vote_fes
    }
    
    # return frame_info

vote_frame_info = get_vote_frame_info('../data/fsp/Vote.xml')

In [5]:
tree = ET.parse('../data/fsp/FactChecks.xml')
root = tree.getroot()


In [6]:
fn_prefix = '{http://framenet.icsi.berkeley.edu}'
sent_nodes = root.findall(f'{fn_prefix}sentence')

vote_samples = []

for sent_node in sent_nodes[1:]:
    sent_text = sent_node.find(f'{fn_prefix}text').text
    # get annotations with status="MANUAL"
    sent_annotations = sent_node.findall(f'{fn_prefix}annotationSet[@status="MANUAL"]')
    
    for annotation in sent_annotations:
        # get the frame name
        frame_name = annotation.attrib['frameName']
        
        if frame_name != 'Vote':
            continue
        
        lu_name = annotation.attrib['luName']
        # get the target
        target = annotation.find(f'{fn_prefix}layer[@name="Target"]').find(f'{fn_prefix}label[@name="Target"]')
        start = int(target.attrib['start'])
        end = int(target.attrib['end'])+1
        target_text = sent_text[start:end]
        # get the frame elements
        frame_elements = annotation.findall(f'{fn_prefix}layer[@name="FE"]/{fn_prefix}label')
        fe_dict = {}
        for fe in frame_elements:
            fe_name = fe.attrib['name']
            fe_start = int(fe.attrib['start'])
            fe_end = int(fe.attrib['end'])+1
            fe_text = sent_text[fe_start:fe_end]
            fe_dict[fe_name] = {
                'text': fe_text,
                'start': fe_start,
                'end': fe_end
            }
        
        vote_samples.append((sent_text, frame_name, target_text, *[(fe_dict.get(fe, {}).get('start', -1), fe_dict.get(fe, {}).get('end', -1)) if fe in fe_dict else None for fe, _ in vote_frame_info['FEs']]))
        
        print(sent_text)
        print(frame_name)
        print(target_text)
        print(frame_elements)
        
    

Bernie Sanders was against the auto bailout and voted against the money that ended up saving the auto industry .
Vote
voted
[<Element '{http://framenet.icsi.berkeley.edu}label' at 0x7f4a62858bd0>, <Element '{http://framenet.icsi.berkeley.edu}label' at 0x7f4a62858c20>, <Element '{http://framenet.icsi.berkeley.edu}label' at 0x7f4a62858c70>]
Bernie Sanders voted for the bill that gave special protection - immunity from liability - to the gun makers and sellers , limiting the Sandy Hook lawsuit .
Vote
voted
[<Element '{http://framenet.icsi.berkeley.edu}label' at 0x7f4a6268d130>, <Element '{http://framenet.icsi.berkeley.edu}label' at 0x7f4a6268d180>, <Element '{http://framenet.icsi.berkeley.edu}label' at 0x7f4a6268d1d0>]
He was the only Republican to vote against creating a House panel to investigate Planned Parenthood .
Vote
vote
[<Element '{http://framenet.icsi.berkeley.edu}label' at 0x7f4a62574180>, <Element '{http://framenet.icsi.berkeley.edu}label' at 0x7f4a625741d0>, <Element '{http:/

In [7]:
vote_samples_df = pd.DataFrame(vote_samples, columns=['sentence', 'frame', 'target', *[fe for fe, _ in vote_frame_info['FEs']]])
vote_samples_df = vote_samples_df[vote_samples_df.target.apply(lambda x: x.lower() in {'vote', 'voted', 'votes'})]
vote_samples_df = vote_samples_df.drop_duplicates()


In [5]:
# Get all permutations of the frame elements
from itertools import permutations

vote_prompts = []

vote_fes = [fe for fe, _ in vote_frame_info['FEs']]

vote_permutations = []

for i, sample in vote_samples_df.iterrows():
    present_fes = sample[vote_fes][sample[vote_fes].notnull()]
    skip = False
    
    fe_str = '\n- '.join([f'{fe}: {sample.sentence[fe_span[0]:fe_span[1]]}' for fe, fe_span in present_fes.sort_values().items()])
    
    for perm in permutations(present_fes.items()):
        prev_start = -1
        for _, fe_span in perm:
            if fe_span[0] < prev_start:
                skip = False
                break
            prev_start = fe_span[0]
            skip = True
            
        if skip:
            skip = False
            continue
        
        prompt = f'Frame: Vote\n\nSentence: {sample.sentence}\n\nFrame Elements:\n- {fe_str}\n\n' \
                 f'New order of frame elements: {", ".join([fe_name for fe_name, _ in perm])}\n\n'
                #  f'This sentence presents the frame elements above in the order they appear in the sentence. ' \
                #  f'Your task is to create a new high quality sentence which retains the same meaning and uses all of the same frame elements such that the order of the frame elements are in the order of: {", ".join([fe_name for fe_name, _ in perm])}.'
        vote_prompts.append(prompt)

        vote_permutations.append((*sample.values.tolist(), prompt))

In [12]:
frame_definitions = 'test definition.'
sentence = 'this is a claim'

prompt = f"Frames and their elements:\n{frame_definitions}\n\nAnnotate the following sentence:\n\"{sentence}\"\n\nFormat the response as follows:\n\nFormat: Annotated Sentence:\nFrame: Occupy_rank\nElements: Dimension, Item, Rank, Comparison_set, Time\n\nAnalysis:\n- Dimension: [Dimension]\n- Item 1: [Item 1]\n- Rank 1: [Rank 1]\n- Item 2: [Item 2]\n- Rank 2: [Rank 2]\n- Comparison set: [Comparison_set]\n- Time: [Time]\n\nAnnotation:\n\"[Sentence]\""

print(prompt)

Frames and their elements:
test definition.

Annotate the following sentence:
"this is a claim"

Format the response as follows:

Format: Annotated Sentence:
Frame: Occupy_rank
Elements: Dimension, Item, Rank, Comparison_set, Time

Analysis:
- Dimension: [Dimension]
- Item 1: [Item 1]
- Rank 1: [Rank 1]
- Item 2: [Item 2]
- Rank 2: [Rank 2]
- Comparison set: [Comparison_set]
- Time: [Time]

Annotation:
"[Sentence]"


In [152]:
# from openai import OpenAI

# client = OpenAI()

# gpt_outputs = []

# for prompt in vote_prompts:
#     completion = client.chat.completions.create(
#         model="gpt-3.5-turbo",
#         messages=[
#             {"role": "system", "content": "You are a bot helping a user create a new high quality sentence with the same meaning as the original sentence. The original sentence presents the frame elements in a specific order. Your task is to create a new sentence that retains the same meaning using a different order of the frame elements provided by the user. The output shall not modify the frame elements."},
#             {"role": "user", "content": prompt},
#         ]
#     )
    
#     gpt_outputs.append((prompt, completion.choices[0].message.content))

#     # Save outputs to a file every 10 prompts
#     if len(gpt_outputs) % 10 == 0:
#         with open('../data/fsp/vote_order_augment.txt', 'w') as f:
#             for prompt, output in gpt_outputs:
#                 f.write(f'#@Prompt:\n{prompt}\n\n#@Output:\n{output}\n\n\n\n')

In [6]:
# gpt_outputs_df = pd.DataFrame(gpt_outputs, columns=['prompt', 'output'])
# gpt_outputs_df.to_csv('../data/fsp/vote_order_augment.csv', index=False)

gpt_outputs_df = pd.read_csv('../data/fsp/vote_order_augment.csv')

new_vote_samples_df = pd.DataFrame(vote_permutations, columns=['og_sentence', 'frame', 'target', *[f'og_{x}' for x in vote_fes], 'prompt'])

# right side should be new_
print(gpt_outputs_df.shape, new_vote_samples_df.shape)

new_vote_samples_df['new_sentence'] = gpt_outputs_df.output


(675, 2) (675, 12)


In [7]:
new_vote_samples_df.head(1)

Unnamed: 0,og_sentence,frame,target,og_Agent,og_Issue,og_Side,og_Position,og_Frequency,og_Time,og_Place,og_Support_rate,prompt,new_sentence
0,Bernie Sanders was against the auto bailout an...,Vote,voted,"(0, 14)","(62, 110)",,"(54, 61)",,,,,Frame: Vote\n\nSentence: Bernie Sanders was ag...,Bernie Sanders voted against the money that en...


In [8]:
valid_new_samples = new_vote_samples_df[new_vote_samples_df.apply(lambda x: all([x.og_sentence[x[f'og_{fe}'][0]:x[f'og_{fe}'][1]].lower() in x.new_sentence.lower() 
                                                                                 if x[f'og_{fe}'] is not None else True for fe in vote_fes]), axis=1)].copy()

for fe in vote_fes:
    valid_new_samples[f'new_{fe}'] = valid_new_samples.apply(lambda x: (x.new_sentence.lower().index(x.og_sentence[x[f'og_{fe}'][0]:x[f'og_{fe}'][1]].lower()), 
                                                                        x.new_sentence.lower().index(x.og_sentence[x[f'og_{fe}'][0]:x[f'og_{fe}'][1]].lower())+len(x.og_sentence[x[f'og_{fe}'][0]:x[f'og_{fe}'][1]])) 
                                                             if x[f'og_{fe}'] is not None else None, axis=1)

In [9]:
augmented_samples = valid_new_samples[['og_sentence', 'new_sentence', *[f'new_{fe}' for fe in vote_fes]]].copy()
augmented_samples.rename(columns={x:x[4:] for x in augmented_samples.keys() if 'og_' not in x}, inplace=True)
augmented_samples

Unnamed: 0,og_sentence,sentence,Agent,Issue,Side,Position,Frequency,Time,Place,Support_rate
0,Bernie Sanders was against the auto bailout an...,Bernie Sanders voted against the money that en...,"(0, 14)","(29, 77)",,"(21, 28)",,,,
1,Bernie Sanders was against the auto bailout an...,The money that ended up saving the auto indust...,"(70, 84)","(0, 48)",,"(59, 66)",,,,
2,Bernie Sanders was against the auto bailout an...,The money that ended up saving the auto indust...,"(58, 72)","(0, 48)",,"(77, 84)",,,,
3,Bernie Sanders was against the auto bailout an...,Against the money that ended up saving the aut...,"(58, 72)","(8, 56)",,"(0, 7)",,,,
4,Bernie Sanders was against the auto bailout an...,Against the money that ended up saving the aut...,"(58, 72)","(8, 56)",,"(0, 7)",,,,
...,...,...,...,...,...,...,...,...,...,...
670,I 've always voted for keeping the government ...,Keeping the government open is the issue I've ...,"(4, 5)","(0, 27)",,"(59, 62)",,,,
671,I 've always voted for keeping the government ...,"For keeping the government open, I 've always ...","(8, 9)","(4, 31)",,"(0, 3)",,,,
672,I 've always voted for keeping the government ...,"For keeping the government open, I've always v...","(8, 9)","(4, 31)",,"(0, 3)",,,,
673,Congressman DeSantis voted to cut Social Secur...,"To cut Social Security and Medicare, Congressm...","(37, 57)","(0, 35)",,,,,,


In [10]:
vote_re = re.compile(r'\bvot(ed|es|e|ing){1}\b', re.IGNORECASE)

# Count number of times the word "vote" appears in the new sentence
# augmented_samples.sentence.apply(lambda x: len([x.span() for x in vote_re.finditer(x)]) == 1).sum()
augmented_samples['target_span'] = augmented_samples.sentence.apply(lambda x: [x.span() for x in vote_re.finditer(x)][0])

In [46]:
# augmented_samples.to_csv('data/fe_order_permutation_samples.csv', index=False)

In [11]:
# Compare old permutation counts to new ones.

# Get old permutation counts:
print('Old permutation counts:')
print(vote_samples_df.apply(lambda x: [z[0] for z in sorted([(fe, x[fe]) for fe in vote_fes if x[fe] is not None], key=lambda y: y[-1])], axis=1).value_counts().head(25))

# Get new permutation counts:
print('\n\nNew permutation counts:')
print(augmented_samples.drop_duplicates().apply(lambda x: [z[0] for z in sorted([(fe, x[fe]) for fe in vote_fes if x[fe] is not None], key=lambda y: y[-1])], axis=1).value_counts().head(25))

# Merge old and new
print('\n\nMerged permutation counts:')
print(pd.concat([vote_samples_df, augmented_samples]).drop_duplicates().apply(lambda x: [z[0] for z in sorted([(fe, x[fe]) for fe in vote_fes if x[fe] is not None], key=lambda y: y[-1])], axis=1).value_counts().head(25))


# TODO: New prompt which can handle changed FEs, e.g., ask GPT to do something like: 
# (she) vote (against) (raising the minimum wage) -> On the vote to (raise the minimum wage), (she) cast a vote (against) it
# Could then either do edit distance to match the spans, or have it directly output the span, e.g., [Issue](raise the minimum wage)

Old permutation counts:
[Agent, Position, Issue]                 45
[Agent, Issue]                           35
[Agent, Position, Issue, Time]            4
[Agent, Side, Support_rate]               4
[Agent, Frequency, Issue]                 3
[Agent, Position, Side, Support_rate]     2
[Agent, Position, Issue, Frequency]       2
[Agent, Frequency, Position, Issue]       2
[Time, Agent, Issue]                      2
[Position, Issue, Agent]                  1
[Agent, Support_rate, Side, Issue]        1
[Agent, Issue, Frequency, Time]           1
[Issue, Agent, Position]                  1
[Agent, Side, Issue]                      1
[Time, Agent, Position, Issue]            1
[Place, Agent, Side, Issue]               1
[Agent, Time, Issue]                      1
[Agent, Time, Position, Issue]            1
[Agent, Support_rate, Side]               1
Name: count, dtype: int64


New permutation counts:
[Position, Issue, Agent]               36
[Issue, Agent]                         32
[Iss

In [None]:
augmented_samples.drop_duplicates().sentence.values