We want to:
- Load the snapshots of X and BlueSky data
- Format them into threads (give replies/quotes their necessary context)
- Filter by the politician-focused keyword lists
- Export for narrative extraction

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import io
import json
import os
import pickle
import re
import uuid

from itertools import product 

import openai
from openai import OpenAI
from pydantic import BaseModel

from typing import List

from sentence_transformers import SentenceTransformer, util
from scipy.optimize import linear_sum_assignment

from dotenv import load_dotenv
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
load_dotenv('./.env')

True

In [4]:
key_openai = os.getenv('KEY_OPENAI')

# Load data
Load annotated samples

In [5]:
fname = f'./data/annotated/df_x_sample_filtered_20250226_20250227_annotated.xlsx'
df_x = pd.read_excel(fname)
print(df_x.shape)
df_x = df_x[:52]
print(df_x.shape)
df_x[:3]

(305, 19)
(52, 19)


Unnamed: 0,bucket,file,data_idx,matching_rules,data.author_id,data.conversation_id,data.text,data.referenced_tweets,includes.media,agent,agent_norm,action_or_event,action_or_event_norm,object,object_norm,good_example,notes,narrative,sentiment
0,x,./data/snapshots/x/x-2025-02/26/03/x-1-2025-02...,36.0,"[{'id': '1894436145368317952', 'tag': 'keyword...",1.347794e+18,1.894598e+18,So @MarkJCarney just lied on national tv @CBC ...,,"[{'height': 2048, 'media_key': '3_189459794642...",@MarkJCarney,Mark Carney,lied,lied,on national tv,Canadian national TV,,,,
1,x,./data/snapshots/x/x-2025-02/26/03/x-1-2025-02...,21.0,"[{'id': '1894436145368317952', 'tag': 'keyword...",8.42176e+17,1.894592e+18,Trudeau's MISTRESS Publicly HUMILIATED By Repo...,,,Trudeau's MISTRESS,Trudeau's mistress,HUMILIATED By,humiliated by,Reporter,reporter,,,,
2,x,./data/snapshots/x/x-2025-02/26/03/x-1-2025-02...,31.0,"[{'id': '1894436145368317952', 'tag': 'keyword...",1.384664e+18,1.894589e+18,HILARIOUS! Mark Carney BOTCHES French Liberal ...,,,Mark Carney,Mark Carney,BOTCHES,botches,French Liberal Debate,French Liberal Debate,,,,


In [6]:
fname = f'./data/annotated/df_bluesky_sample_filtered_20250226_20250227_annotated.xlsx'
df_bluesky = pd.read_excel(fname)
print(df_bluesky.shape)
df_bluesky = df_bluesky[:114]
print(df_bluesky.shape)
df_bluesky[:3]

(10518, 18)
(114, 18)


Unnamed: 0,bucket,file,data_idx,commit.record.reply.parent.uri,commit.record.reply.root.uri,commit.record.text,commit.record.title,commit.record.embed.external.uri,agent,agent_norm,action_or_event,action_or_event_norm,object,object_norm,good_example,notes,narrative,sentiment
0,bluesky,./data/snapshots/bluesky/bluesky-2025-02/26/03...,5.0,,,I called Governor Stitt’s (Oklahoma Governor) ...,,,,,,,,,,,,
1,bluesky,./data/snapshots/bluesky/bluesky-2025-02/26/03...,8.0,,,When I say I want you\nMay every inch of you b...,,,,,,,,,,,,
2,bluesky,./data/snapshots/bluesky/bluesky-2025-02/26/03...,16.0,,,Toast by Streetband may be a rap song by a whi...,,https://youtu.be/cmeby-7YpLk,,,,,,,,,,


# Extraction

In [7]:
client = OpenAI(api_key=key_openai)

In [8]:
model_name = 'gpt-4o-mini-2024-07-18'

## Define json schema for extractions

In [9]:
class NarrativeEvent(BaseModel):
    agent: str
    agent_norm: str
    action_or_event: str
    action_or_event_norm: str
    object: str
    object_norm: str
    narrative: str
    sentiment: int

class NarrativeExtraction(BaseModel):
    events: List[NarrativeEvent]

## Few-shot examples

In [10]:
cols = [
    'agent', 'agent_norm',
    'action_or_event', 'action_or_event_norm',
    'object', 'object_norm',
    'narrative', 'sentiment',
]

In [11]:
def join_if_list(value):
    if isinstance(value, (list, tuple)) or (hasattr(value, 'ndim') and value.ndim == 1):
        return ', '.join(str(x) for x in value)
    if pd.isna(value):
        return ""
    return value

def is_list_like(value):
    return isinstance(value, (list, tuple)) or (hasattr(value, 'ndim') and value.ndim == 1)

In [12]:
df_x_examples = df_x[df_x['good_example'] == 1].groupby(['data.text']).agg({
    col: list for col in cols
}).reset_index()

fewshot_inputs_x = []
fewshot_outputs_x = []
for index, row in df_x_examples.iterrows():
    fewshot_inputs_x.append(row['data.text'])
    prepared = {}
    max_len = 1  # minimum 1 event per row
    
    # Process each column value. If it's missing, assign a default list;
    # if list-like, use it as is; otherwise, wrap it in a one-element list.
    for col in cols:
        val = row[col]
        if is_list_like(val):
            prepared[col] = list(val)
            if len(prepared[col]) > max_len:
                max_len = len(prepared[col])
        else:
            prepared[col] = [val]
    
    # Pad any lists shorter than max_len with empty defaults
    for col in cols:
        if len(prepared[col]) < max_len:
            pad_val = 0 if col == "sentiment" else ""
            prepared[col].extend([pad_val] * (max_len - len(prepared[col])))
    
    # Create one event per index position and append to a list for that row
    events = []
    for i in range(max_len):
        event = {
            "agent": prepared["agent"][i] if not pd.isna(prepared["agent"][i]) else None,
            "agent_norm": prepared["agent_norm"][i] if not pd.isna(prepared["agent_norm"][i]) else None,
            "action_or_event": prepared["action_or_event"][i] if not pd.isna(prepared["action_or_event"][i]) else None,
            "action_or_event_norm": prepared["action_or_event_norm"][i] if not pd.isna(prepared["action_or_event_norm"][i]) else None,
            "object": prepared["object"][i] if not pd.isna(prepared["object"][i]) else None,
            "object_norm": prepared["object_norm"][i] if not pd.isna(prepared["object_norm"][i]) else None,
            "narrative": prepared["narrative"][i] if not pd.isna(prepared["narrative"][i]) else None,
            "sentiment": prepared["sentiment"][i] if not pd.isna(prepared["sentiment"][i]) else None,
        }
        events.append(event)
    
    fewshot_outputs_x.append(events)
fewshot_outputs_x[1] = []

In [13]:
df_bluesky_examples = df_bluesky[df_bluesky['good_example'] == 1].groupby(['commit.record.text']).agg({
    col: list for col in cols
}).reset_index()

fewshot_inputs_bluesky = []
fewshot_outputs_bluesky = []
for index, row in df_bluesky_examples.iterrows():
    fewshot_inputs_bluesky.append(row['commit.record.text'])
    prepared = {}
    max_len = 1  # minimum 1 event per row
    
    # Process each column value. If it's missing, assign a default list;
    # if list-like, use it as is; otherwise, wrap it in a one-element list.
    for col in cols:
        val = row[col]
        if is_list_like(val):
            prepared[col] = list(val)
            if len(prepared[col]) > max_len:
                max_len = len(prepared[col])
        else:
            prepared[col] = [val]
    
    # Pad any lists shorter than max_len with empty defaults
    for col in cols:
        if len(prepared[col]) < max_len:
            pad_val = 0 if col == "sentiment" else ""
            prepared[col].extend([pad_val] * (max_len - len(prepared[col])))
    
    # Create one event per index position and append to a list for that row
    events = []
    for i in range(max_len):
        event = {
            "agent": prepared["agent"][i] if not pd.isna(prepared["agent"][i]) else None,
            "agent_norm": prepared["agent_norm"][i] if not pd.isna(prepared["agent_norm"][i]) else None,
            "action_or_event": prepared["action_or_event"][i] if not pd.isna(prepared["action_or_event"][i]) else None,
            "action_or_event_norm": prepared["action_or_event_norm"][i] if not pd.isna(prepared["action_or_event_norm"][i]) else None,
            "object": prepared["object"][i] if not pd.isna(prepared["object"][i]) else None,
            "object_norm": prepared["object_norm"][i] if not pd.isna(prepared["object_norm"][i]) else None,
            "narrative": prepared["narrative"][i] if not pd.isna(prepared["narrative"][i]) else None,
            "sentiment": prepared["sentiment"][i] if not pd.isna(prepared["sentiment"][i]) else None,
        }
        events.append(event)
    
    fewshot_outputs_bluesky.append(events)
fewshot_outputs_bluesky[2] = []

## Prompting

In [None]:
zeroshot_system_prompt = '''
You are an expert at structured data extraction and narrative understanding from social media data, specializing in the 2025 Canadian Presidential election. You will be given unstructured text from a social media post and should convert it into the given structure, a list of events where each event contains the following elements, focusing on figures in Canadian (and related international) politics:

agent: One who is/has done the event.
agent_norm: Normalized form of agent.
action_or_event: Action which the agent has taken.
action_or_event_norm: Normalized form of action_or_event.
object: One who is receiving the action or being acted upon.
object_norm: Normalized form of object.
narrative: Short, 1-sentence description of the larger narrative that this agent-action-object triple seems to be a part of.

A post may contain no events or multiple. Extract all identified events in the post. Elements may be explicitly found in the post or implicit. Elements that cannot be filled should be left as None. If the social media post's poster is extracted as an element, they should be referred to as "User". All other people can be identifed by their name and social media handle (if found in the post).
'''.strip()
print(zeroshot_system_prompt)

You are an expert at structured data extraction and narrative understanding from social media data, specializing in the 2025 Canadian Presidential election. You will be given unstructured text from a social media post and should convert it into the given structure, a list of events where each event contains the following elements, focusing on figures in Canadian (and related international) politics:

agent: One who is/has done the event.
agent_norm: Normalized form of agent.
action_or_event: Action which the agent has taken.
action_or_event_norm: Normalized form of action_or_event.
object: One who is receiving the action or being acted upon.
object_norm: Normalized form of object.
narrative: Short, 1-sentence description of the larger narrative that this agent-action-object triple seems to be a part of.
sentiment: 1, 0, -1. Whether the social media post is referring to the agent-action-object triple in a positive (1), neutral (0), or negative (-1) manner.

A post may contain no events 

In [None]:
fewshot_system_prompt = '''
You are an expert at structured data extraction and narrative understanding from social media data, specializing in the 2025 Canadian Presidential election. You will be given unstructured text from a social media post and should convert it into the given structure, a list of events where each event contains the following elements, focusing on figures in Canadian (and related international) politics:

agent: One who is/has done the event.
agent_norm: Normalized form of agent.
action_or_event: Action which the agent has taken.
action_or_event_norm: Normalized form of action_or_event.
object: One who is receiving the action or being acted upon.
object_norm: Normalized form of object.
narrative: Short, 1-sentence description of the larger narrative that this agent-action-object triple seems to be a part of.

A post may contain no events or multiple. Extract all identified events in the post. Elements may be explicitly found in the post or implicit. Elements that cannot be filled should be left as None. If the social media post's poster is extracted as an element, they should be referred to as "User". All other people can be identifed by their name and social media handle (if found in the post).

Here are some examples of valid extractions:
'''.strip()
print(fewshot_system_prompt)

You are an expert at structured data extraction and narrative understanding from social media data, specializing in the 2025 Canadian Presidential election. You will be given unstructured text from a social media post and should convert it into the given structure, a list of events where each event contains the following elements, focusing on figures in Canadian (and related international) politics:

agent: One who is/has done the event.
agent_norm: Normalized form of agent.
action_or_event: Action which the agent has taken.
action_or_event_norm: Normalized form of action_or_event.
object: One who is receiving the action or being acted upon.
object_norm: Normalized form of object.
narrative: Short, 1-sentence description of the larger narrative that this agent-action-object triple seems to be a part of.
sentiment: 1, 0, -1. Whether the social media post is referring to the agent-action-object triple in a positive (1), neutral (0), or negative (-1) manner.

A post may contain no events 

In [16]:
# Bluesky-specific few-shot prompt
fewshot_system_prompt_bluesky = f'''
{fewshot_system_prompt}

Example Input 1:
{fewshot_inputs_bluesky[0]}
Example Output 1:
{fewshot_outputs_bluesky[0]}

Example Input 2:
{fewshot_inputs_bluesky[1]}
Example Output 2:
{fewshot_outputs_bluesky[1]}

Example Input 3:
{fewshot_inputs_bluesky[2]}
Example Output 3:
{fewshot_outputs_bluesky[2]}
'''.strip()

In [17]:
# X-specific few-shot prompt
fewshot_system_prompt_x = f'''
{fewshot_system_prompt}

Example Input 1:
{fewshot_inputs_x[0]}
Example Output 1:
{fewshot_outputs_x[0]}

Example Input 2:
{fewshot_inputs_x[1]}
Example Output 2:
{fewshot_outputs_x[1]}

Example Input 3:
{fewshot_inputs_x[2]}
Example Output 3:
{fewshot_outputs_x[2]}
'''.strip()

# Run

## Bluesky

### Zeroshot

In [None]:
processed_data_dir = os.path.join('./data', 'processed')
fname = 'gpt4omini_df_bluesky_sample_filtered_20250226_20250227_extractions_zeroshot'
fname_parsed_rounds_in_prog = os.path.join(
    processed_data_dir, f'in_prog_{fname}.pkl'
)
save_every = 10

id2processed_text = {}
for idx, row in enumerate(df_bluesky.iterrows()):
    id_, row = row[0], row[1]
    try:
        completion = client.beta.chat.completions.parse(
            model=model_name,
            messages=[
                {"role": "system", "content": zeroshot_system_prompt},
                {"role": "user", "content": row['commit.record.text']}
            ],
            response_format=NarrativeExtraction,
        )
        message = completion.choices[0].message
        try:
            parsed_round = message.content
        except:
            # Parsing error
            print(f'error_parsing round: {round_n}')
            parsed_round = message
    except:
        # Invalid JSON
        parsed_round = {}
    id2processed_text[id_] = parsed_round

    # Save intermittently
    if (idx + 1) % save_every == 0:
        with open(fname_parsed_rounds_in_prog, 'wb') as f:
            pickle.dump(id2processed_text, f)
print('DONE')

processed_data_dir = os.path.join('./data', 'processed')
fname_parsed_rounds = os.path.join(processed_data_dir, f'{fname}.pkl')

with open(fname_parsed_rounds, 'wb') as f:
    pickle.dump(id2processed_text, f)
print('Final export to:', fname)

DONE
Final export to: gpt4omini_df_bluesky_sample_filtered_20250226_20250227_extractions_zeroshot


### Fewshot

In [None]:
processed_data_dir = os.path.join('./data', 'processed')
fname = 'gpt4omini_df_bluesky_sample_filtered_20250226_20250227_extractions_fewshot'
fname_parsed_rounds_in_prog = os.path.join(
    processed_data_dir, f'in_prog_{fname}.pkl'
)
save_every = 10

id2processed_text = {}
for idx, row in enumerate(df_bluesky.iterrows()):
    id_, row = row[0], row[1]
    try:
        completion = client.beta.chat.completions.parse(
            model=model_name,
            messages=[
                {"role": "system", "content": fewshot_system_prompt_bluesky},
                {"role": "user", "content": row['commit.record.text']}
            ],
            response_format=NarrativeExtraction,
        )
        message = completion.choices[0].message
        try:
            parsed_round = message.content
        except:
            # Parsing error
            print(f'error_parsing round: {round_n}')
            parsed_round = message
    except:
        # Invalid JSON
        parsed_round = {}
    id2processed_text[id_] = parsed_round

    # Save intermittently
    if (idx + 1) % save_every == 0:
        with open(fname_parsed_rounds_in_prog, 'wb') as f:
            pickle.dump(id2processed_text, f)
print('DONE')

processed_data_dir = os.path.join('./data', 'processed')
fname_parsed_rounds = os.path.join(processed_data_dir, f'{fname}.pkl')

with open(fname_parsed_rounds, 'wb') as f:
    pickle.dump(id2processed_text, f)
print('Final export to:', fname)

## X

### Zeroshot

In [102]:
processed_data_dir = os.path.join('./data', 'processed')
fname = 'gpt4omini_df_x_sample_filtered_20250226_20250227_extractions_zeroshot'
fname_parsed_rounds_in_prog = os.path.join(
    processed_data_dir, f'in_prog_{fname}.pkl'
)
save_every = 10

id2processed_text = {}
for idx, row in enumerate(df_x.iterrows()):
    id_, row = row[0], row[1]
    try:
        completion = client.beta.chat.completions.parse(
            model=model_name,
            messages=[
                {"role": "system", "content": zeroshot_system_prompt},
                {"role": "user", "content": row['data.text']}
            ],
            response_format=NarrativeExtraction,
        )
        message = completion.choices[0].message
        try:
            parsed_round = message.content
        except:
            # Parsing error
            print(f'error_parsing round: {round_n}')
            parsed_round = message
    except:
        # Invalid JSON
        parsed_round = {}
    id2processed_text[id_] = parsed_round

    # Save intermittently
    if (idx + 1) % save_every == 0:
        with open(fname_parsed_rounds_in_prog, 'wb') as f:
            pickle.dump(id2processed_text, f)
print('DONE')

processed_data_dir = os.path.join('./data', 'processed')
fname_parsed_rounds = os.path.join(processed_data_dir, f'{fname}.pkl')

with open(fname_parsed_rounds, 'wb') as f:
    pickle.dump(id2processed_text, f)
print('Final export to:', fname)

DONE
Final export to: gpt4omini_df_x_sample_filtered_20250226_20250227_extractions_zeroshot


### Fewshot

In [106]:
processed_data_dir = os.path.join('./data', 'processed')
fname = 'gpt4omini_df_x_sample_filtered_20250226_20250227_extractions_fewshot'
fname_parsed_rounds_in_prog = os.path.join(
    processed_data_dir, f'in_prog_{fname}.pkl'
)
save_every = 10

id2processed_text = {}
for idx, row in enumerate(df_x.iterrows()):
    id_, row = row[0], row[1]
    try:
        completion = client.beta.chat.completions.parse(
            model=model_name,
            messages=[
                {"role": "system", "content": fewshot_system_prompt_x},
                {"role": "user", "content": row['data.text']}
            ],
            response_format=NarrativeExtraction,
        )
        message = completion.choices[0].message
        try:
            parsed_round = message.content
        except:
            # Parsing error
            print(f'error_parsing round: {round_n}')
            parsed_round = message
    except:
        # Invalid JSON
        parsed_round = {}
    id2processed_text[id_] = parsed_round

    # Save intermittently
    if (idx + 1) % save_every == 0:
        with open(fname_parsed_rounds_in_prog, 'wb') as f:
            pickle.dump(id2processed_text, f)
print('DONE')

processed_data_dir = os.path.join('./data', 'processed')
fname_parsed_rounds = os.path.join(processed_data_dir, f'{fname}.pkl')

with open(fname_parsed_rounds, 'wb') as f:
    pickle.dump(id2processed_text, f)
print('Final export to:', fname)

DONE
Final export to: gpt4omini_df_x_sample_filtered_20250226_20250227_extractions_fewshot


# Evaluate extractions

## Utilities

In [18]:
# Load the sentence-transformers model.
model = SentenceTransformer('all-mpnet-base-v2')

In [19]:
def extraction_to_text_by_fields(extraction, keys):
    """
    Convert an extraction dict into a single string by concatenating
    the values from the specified keys.
    If a value is a list or tuple, its elements are joined with a space.
    """
    parts = []
    for key in keys:
        value = extraction.get(key, "")
        if isinstance(value, (list, tuple)):
            parts.append(" ".join(map(str, value)))
        else:
            parts.append(str(value))
    return " ".join(parts).strip()

In [20]:
def compute_extraction_similarity_by_fields(gt_extractions, new_extractions, model, keys):
    """
    Compute a similarity score based on comparing extracted information.
    
    - `gt_extractions`: list of ground-truth extraction dicts.
    - `new_extractions`: list of new extraction dicts.
    - `keys`: list of keys to use for building the text string
       (e.g. ["agent_norm", "action_or_event_norm", "object_norm"]).
       
    Steps:
      * Convert each extraction (ground-truth and new) into a text string by concatenating
        the specified fields.
      * Encode these strings using the sentence-transformers model.
      * Compute a cosine similarity matrix.
      * Use the Hungarian algorithm to optimally match ground-truth to new extraction pairs.
      * Penalize for any extra or missing extractions by dividing the sum by the maximum
        number of extractions on either side.
        
    Returns a float score between 0 and 1.
    """
    # If either side is empty, return 0.
    if not gt_extractions or not new_extractions:
        return 0.0

    # Build the text representations.
    gt_texts = [extraction_to_text_by_fields(e, keys) for e in gt_extractions]
    new_texts = [extraction_to_text_by_fields(e, keys) for e in new_extractions]
    
    # Encode texts into embeddings.
    gt_embeddings = model.encode(gt_texts, convert_to_tensor=True)
    new_embeddings = model.encode(new_texts, convert_to_tensor=True)
    
    # Compute cosine similarity matrix.
    cosine_scores = util.cos_sim(gt_embeddings, new_embeddings)
    sim_matrix = cosine_scores.cpu().numpy()

    # Use Hungarian algorithm (linear_sum_assignment) to maximize similarity.
    # Since linear_sum_assignment minimizes cost, we use negative similarities.
    row_ind, col_ind = linear_sum_assignment(-sim_matrix)
    sum_sim = sim_matrix[row_ind, col_ind].sum()

    # Divide by the maximum number of extractions to penalize unpaired events.
    total_possible = max(len(gt_extractions), len(new_extractions))
    score = sum_sim / total_possible
    return float(score)

In [48]:
def is_list_like(x):
    """Check if x is list-like (list, tuple, or a 1D numpy array)."""
    return isinstance(x, (list, tuple)) or (hasattr(x, 'ndim') and x.ndim == 1)

In [49]:
def convert_df_to_extractions_dict(df):
    """
    Convert the ground-truth dataframe into a mapping of the form:
    
        { row_index: [extraction_dict, extraction_dict, ...], ... }
        
    Each extraction_dict contains the keys:
        "agent", "agent_norm", "action_or_event", "action_or_event_norm",
        "object", "object_norm", "narrative", "sentiment"
    
    For each row, scalar values are converted into a single-element list; 
    list-like values remain unchanged. If the number of extraction items differ 
    across fields, shorter lists are padded with default values ("" for text, 0 for sentiment).
    
    Parameters:
      - df: pd.DataFrame containing your ground-truth extraction columns.
    
    Returns:
      - A dictionary mapping each row index (from the original dataframe) to a list of extraction dictionaries.
    """
    result = {}
    
    extraction_fields = [
        "agent", "agent_norm", "action_or_event", "action_or_event_norm",
        "object", "object_norm", "narrative", "sentiment"
    ]
    
    for row_idx, row in df.iterrows():
        extraction_data = {}
        max_len = 1  # Minimum one extraction per row
        
        # Wrap each field into a list if necessary (or use defaults if missing or NaN)
        for col in extraction_fields:
            if col not in row or pd.isna(row[col]):
                extraction_data[col] = [0] if col == "sentiment" else [""]
            else:
                value = row[col]
                if is_list_like(value):
                    extraction_data[col] = list(value)
                else:
                    extraction_data[col] = [value]
            
            # Update max_len if any column has more extractions
            if len(extraction_data[col]) > max_len:
                max_len = len(extraction_data[col])
        
        # Pad columns that have fewer than max_len extraction items
        for col in extraction_fields:
            if len(extraction_data[col]) < max_len:
                pad_val = 0 if col == "sentiment" else ""
                extraction_data[col].extend([pad_val] * (max_len - len(extraction_data[col])))
        
        # Create a list of extraction dictionaries, one per extraction event
        extractions = []
        for i in range(max_len):
            # Convert sentiment to int if possible
            sentiment_val = extraction_data["sentiment"][i]
            try:
                sentiment_val = int(sentiment_val)
            except (ValueError, TypeError):
                sentiment_val = 0
            
            extraction = {
                "agent": extraction_data["agent"][i],
                "agent_norm": extraction_data["agent_norm"][i],
                "action_or_event": extraction_data["action_or_event"][i],
                "action_or_event_norm": extraction_data["action_or_event_norm"][i],
                "object": extraction_data["object"][i],
                "object_norm": extraction_data["object_norm"][i],
                "narrative": extraction_data["narrative"][i],
                "sentiment": sentiment_val
            }
            extractions.append(extraction)
        
        # Map the original row index to its list of extraction dictionaries.
        result[row_idx] = extractions
    
    return result

In [66]:
def evaluate_outputs(id2processed, id2groundtruth):
    id2score_type1 = {}  # Textualized triples
    id2score_type2 = {}  # Agent
    id2score_type3 = {}  # Action or event
    id2score_type4 = {}  # Object

    for id_, gt_extractions in id2groundtruth.items():
        new_extractions = id2processed.get(id_, [])
        
        # Ensure both sides are lists.
        if not isinstance(gt_extractions, list):
            gt_extractions = [gt_extractions] if gt_extractions else []
        if not isinstance(new_extractions, list):
            new_extractions = [new_extractions] if new_extractions else []
        
        # Compute score 1
        score1 = compute_extraction_similarity_by_fields(
            gt_extractions, new_extractions, model,
            keys=["agent_norm", "action_or_event_norm", "object_norm"]
        )
        # Compute score 2
        score2 = compute_extraction_similarity_by_fields(
            gt_extractions, new_extractions, model,
            keys=["agent_norm"]
        )
        # Compute score 3
        score3 = compute_extraction_similarity_by_fields(
            gt_extractions, new_extractions, model,
            keys=["action_or_event_norm"]
        )
        # Compute score 4
        score4 = compute_extraction_similarity_by_fields(
            gt_extractions, new_extractions, model,
            keys=["object_norm"]
        )
        
        id2score_type1[id_] = score1
        id2score_type2[id_] = score2
        id2score_type3[id_] = score3
        id2score_type4[id_] = score4

    # Combine into one dataframe and return
    df_results = pd.DataFrame(list(id2score_type1.items()), columns=['id', 'triple_score']).set_index('id').join(
        pd.DataFrame(list(id2score_type2.items()), columns=['id', 'agent_score']).set_index('id')
    ).join(
        pd.DataFrame(list(id2score_type3.items()), columns=['id', 'action_score']).set_index('id')
    ).join(
        pd.DataFrame(list(id2score_type4.items()), columns=['id', 'object_score']).set_index('id')
    )
    return df_results

## Ground truth

In [47]:
gt_x = convert_df_to_extractions_dict(df_x)
print(len(gt_x))

gt_bluesky = convert_df_to_extractions_dict(df_bluesky)
print(len(gt_bluesky))

52
114


## Bluesky

### Zeroshot

In [68]:
processed_data_dir = os.path.join('./data', 'processed')
fname_parsed_rounds = os.path.join(processed_data_dir, f'{fname}.pkl')

fname_parsed_rounds = '/nas/ckgfs/users/eboxer/complexdata/data/processed/gpt4omini_df_bluesky_sample_filtered_20250226_20250227_extractions_zeroshot.pkl'
with open(fname_parsed_rounds, 'rb') as f:
    id2processed_text = pickle.load(f)
id2processed_text = {id_: json.loads(text)['events'] for id_, text in id2processed_text.items()}
print('Read from:', fname)

Read from: ./data/annotated/df_bluesky_sample_filtered_20250226_20250227_annotated.xlsx


In [69]:
df_results = evaluate_outputs(id2processed_text, gt_bluesky)
df_results.describe()

Unnamed: 0,triple_score,agent_score,action_score,object_score
count,114.0,114.0,114.0,114.0
mean,0.164413,0.193534,0.107973,0.161881
std,0.263555,0.295711,0.165689,0.250512
min,-0.043402,0.0,-0.009241,0.0
25%,0.0,0.0,0.0,0.0
50%,0.045057,0.085689,0.045701,0.067722
75%,0.195917,0.255459,0.161195,0.221194
max,1.0,1.0,1.0,1.0


### Fewshot

In [70]:
processed_data_dir = os.path.join('./data', 'processed')
fname_parsed_rounds = os.path.join(processed_data_dir, f'{fname}.pkl')

fname_parsed_rounds = '/nas/ckgfs/users/eboxer/complexdata/data/processed/gpt4omini_df_bluesky_sample_filtered_20250226_20250227_extractions_fewshot.pkl'
with open(fname_parsed_rounds, 'rb') as f:
    id2processed_text = pickle.load(f)
id2processed_text = {id_: json.loads(text)['events'] for id_, text in id2processed_text.items()}
print('Read from:', fname)

Read from: ./data/annotated/df_bluesky_sample_filtered_20250226_20250227_annotated.xlsx


In [71]:
df_results = evaluate_outputs(id2processed_text, gt_bluesky)
df_results.describe()

Unnamed: 0,triple_score,agent_score,action_score,object_score
count,114.0,114.0,114.0,114.0
mean,0.16501,0.184633,0.102467,0.159101
std,0.296769,0.320898,0.184931,0.290299
min,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0
75%,0.176765,0.25,0.118678,0.189173
max,1.0,1.0,1.0,1.0


## X

### Zeroshot

In [72]:
processed_data_dir = os.path.join('./data', 'processed')
fname_parsed_rounds = os.path.join(processed_data_dir, f'{fname}.pkl')

fname_parsed_rounds = '/nas/ckgfs/users/eboxer/complexdata/data/processed/gpt4omini_df_x_sample_filtered_20250226_20250227_extractions_zeroshot.pkl'
with open(fname_parsed_rounds, 'rb') as f:
    id2processed_text = pickle.load(f)
id2processed_text = {id_: json.loads(text)['events'] for id_, text in id2processed_text.items()}
print('Read from:', fname)

Read from: ./data/annotated/df_bluesky_sample_filtered_20250226_20250227_annotated.xlsx


In [73]:
df_results = evaluate_outputs(id2processed_text, gt_x)
df_results.describe()

Unnamed: 0,triple_score,agent_score,action_score,object_score
count,52.0,52.0,52.0,52.0
mean,0.380657,0.419167,0.27785,0.353249
std,0.331304,0.359794,0.254691,0.310749
min,0.0,0.0,0.0,0.0
25%,0.09353,0.111735,0.100797,0.113141
50%,0.280021,0.325341,0.194944,0.242954
75%,0.743036,0.592641,0.372361,0.5
max,0.966931,1.0,1.0,1.0


### Fewshot

In [74]:
processed_data_dir = os.path.join('./data', 'processed')
fname_parsed_rounds = os.path.join(processed_data_dir, f'{fname}.pkl')

fname_parsed_rounds = '/nas/ckgfs/users/eboxer/complexdata/data/processed/gpt4omini_df_x_sample_filtered_20250226_20250227_extractions_fewshot.pkl'
with open(fname_parsed_rounds, 'rb') as f:
    id2processed_text = pickle.load(f)
id2processed_text = {id_: json.loads(text)['events'] for id_, text in id2processed_text.items()}
print('Read from:', fname)

Read from: ./data/annotated/df_bluesky_sample_filtered_20250226_20250227_annotated.xlsx


In [75]:
df_results = evaluate_outputs(id2processed_text, gt_x)
df_results.describe()

Unnamed: 0,triple_score,agent_score,action_score,object_score
count,52.0,52.0,52.0,52.0
mean,0.401956,0.441171,0.293919,0.391227
std,0.362698,0.369074,0.280289,0.351624
min,0.0,0.0,0.0,0.0
25%,0.062911,0.111642,0.08549,0.092142
50%,0.348325,0.393829,0.220333,0.306221
75%,0.798207,0.745034,0.446169,0.5
max,1.0,1.0,1.0,1.0


0-shot for Bluesky, fewshot for X