Reading data from ComplexData S3 buckets and getting familiar with structure.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import io
import json
import os
import pickle
import re
import uuid

from itertools import product 

import boto3

from isanlp_rst.parser import Parser
from keybert import KeyBERT

import openai
from openai import OpenAI
from pydantic import BaseModel

from typing import List

from dotenv import load_dotenv
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


## Load credentials

In [3]:
load_dotenv('./.env')

True

In [4]:
region_name = os.getenv("COMPLEXDATA_DEFAULT_REGION")
aws_access_key_id = os.getenv("COMPLEXDATA_S3_ACCESS_KEY_ID")
aws_secret_access_key = os.getenv("COMPLEXDATA_S3_SECRET_ACCESS_KEY")
aws_session_token = os.getenv("COMPLEXDATA_S3_SESSION_TOKEN")
key_openai = os.getenv('KEY_OPENAI')

In [5]:
s3_client = boto3.client(
    's3',
    region_name=region_name,
    aws_access_key_id=aws_access_key_id,
    aws_secret_access_key=aws_secret_access_key,
    aws_session_token=aws_session_token,
)

## TODO Query

In [6]:
# Example: List all buckets
buckets = s3_client.list_buckets()
bucket_names = []
print("Buckets:")
for bucket in buckets.get('Buckets', []):
    print(f" - {bucket['Name']}")
    bucket_names.append(bucket['Name'])

ClientError: An error occurred (InvalidToken) when calling the ListBuckets operation: The provided token is malformed or otherwise invalid.

In [12]:
key = '2025/03/01/00/x-1-2025-03-01-00-01-57-ff8f6d7a-957b-4377-b2d7-2ca7bacc2408'
obj = s3_client.get_object(Bucket='x-misinfo', Key=key)
obj_bytes = obj['Body'].read()
data = obj_bytes.decode('utf-8')
print(f"Data from {key}:\n{data}\n")

ClientError: An error occurred (InvalidToken) when calling the GetObject operation: The provided token is malformed or otherwise invalid.

## Load 1-day worth of data from BlueSky, Reddit, and X

In [44]:
# Pull data from specified buckets and dates
buckets = [
    'bluesky-misinfo',
    'reddit-misinfo',
    'x-misinfo',
]
years = ['2025']
months = ['03']
days = ['01']
hours = [
    '00', #'01', '02', '03', '04', '05',
    # '06', '07', '08', '09', '10', '11',
    # '12', '13', '14', '15', '16', '17',
    # '18', '19', '20', '21', '22', '23',
]

bucket2file2data = {}

for bucket in buckets:
    bucket2file2data[bucket] = {}
    for date in product(years, months, days, hours):
        dir_path = '/'.join(p)
        files = s3_client.list_objects_v2(Bucket=bucket, Prefix=dir_path)
        for file in tqdm(files['Contents']):
            # Load file
            file_key = file['Key']
            obj = s3_client.get_object(Bucket=bucket, Key=file_key)
            obj_bytes = obj['Body'].read()
            data = obj_bytes.decode('utf-8')

            bucket2file2data[bucket][file_key] = data
        print(f'Finished: {date}')
    print(f'Bucket: {bucket}; # of files: {len(bucket2file2data[bucket])}')

100%|██████████| 667/667 [05:13<00:00,  2.13it/s]


Finished: ('2025', '03', '01', '00')
Bucket: bluesky-misinfo; # of files: 667


100%|██████████| 242/242 [00:41<00:00,  5.87it/s]


Finished: ('2025', '03', '01', '00')
Bucket: reddit-misinfo; # of files: 242


100%|██████████| 282/282 [01:12<00:00,  3.91it/s]

Finished: ('2025', '03', '01', '00')
Bucket: x-misinfo; # of files: 282





In [20]:
def escape_newlines_in_json(json_str):
    return json_str.replace('\n', '\\n')

def load_jsonl_str(json_str):
    # Split into newlines and load as list of Python dicts
    json_chunks = re.split(r'\n(?=\{)', json_str.strip())

    data_objects = []
    for chunk in json_chunks:
        escaped_chunk = escape_newlines_in_json(chunk)
        try:
            obj = json.loads(escaped_chunk)
            data_objects.append(obj)
        except json.JSONDecodeError as e:
            print("Error decoding a chunk:", e)
    
    return data_objects

In [77]:
# Convert to dataframe
bucket = 'reddit-misinfo'
df_reddit = pd.DataFrame({
    'bucket': [bucket]*len(bucket2file2data[bucket]),
    'file': bucket2file2data[bucket].keys(),
    'data': [
        load_jsonl_str(bucket2file2data[bucket][key])
        for key in list(bucket2file2data[bucket].keys())
    ]
})
print(df_reddit.shape)

(242, 3)


In [78]:
# Convert to dataframe
bucket = 'x-misinfo'
df_x = pd.DataFrame({
    'bucket': [bucket]*len(bucket2file2data[bucket]),
    'file': bucket2file2data[bucket].keys(),
    'data': [
        load_jsonl_str(bucket2file2data[bucket][key])
        for key in list(bucket2file2data[bucket].keys())
    ]
})
print(df_x.shape)

(282, 3)


In [79]:
# Convert to dataframe
bucket = 'bluesky-misinfo'
df_bluesky = pd.DataFrame({
    'bucket': [bucket]*len(bucket2file2data[bucket]),
    'file': bucket2file2data[bucket].keys(),
    'data': [
        load_jsonl_str(bucket2file2data[bucket][key])
        for key in list(bucket2file2data[bucket].keys())
    ]
})
print(df_bluesky.shape)

(667, 3)


### Postprocessing

In [102]:
pd.set_option('display.max_columns', 50)

In [95]:
# Explode jsonl and melt data
df_exploded = df_x.explode('data').reset_index(drop=True)
df_exploded['data_idx'] = df_exploded.groupby(['bucket', 'file']).cumcount()
data_norm = pd.json_normalize(df_exploded['data'])
df_x_norm = pd.concat([df_exploded.drop(columns=['data']), data_norm], axis=1)
print(df_x_norm.shape)

(17274, 49)


In [96]:
# Explode jsonl and melt data
df_exploded = df_reddit.explode('data').reset_index(drop=True)
df_exploded['data_idx'] = df_exploded.groupby(['bucket', 'file']).cumcount()
data_norm = pd.json_normalize(df_exploded['data'])
df_reddit_norm = pd.concat([df_exploded.drop(columns=['data']), data_norm], axis=1)
print(df_reddit_norm.shape)

(47802, 46)


In [97]:
# Explode jsonl and melt data
df_exploded = df_bluesky.explode('data').reset_index(drop=True)
df_exploded['data_idx'] = df_exploded.groupby(['bucket', 'file']).cumcount()
data_norm = pd.json_normalize(df_exploded['data'])
df_bluesky_norm = pd.concat([df_exploded.drop(columns=['data']), data_norm], axis=1)
print(df_bluesky_norm.shape)

(7270126, 271)


### View

In [104]:
df_x_norm['data.context_annotations'].iloc[0]

[{'domain': {'id': '10',
   'name': 'Person',
   'description': 'Named people in the world like Nelson Mandela'},
  'entity': {'id': '1052324708752883712',
   'name': 'Yashar Ali',
   'description': 'Yashar Ali'}},
 {'domain': {'id': '94',
   'name': 'Journalist',
   'description': "A journalist like 'Anderson Cooper'"},
  'entity': {'id': '1052324708752883712',
   'name': 'Yashar Ali',
   'description': 'Yashar Ali'}},
 {'domain': {'id': '131',
   'name': 'Unified Twitter Taxonomy',
   'description': 'A taxonomy of user interests. '},
  'entity': {'id': '1046545033657081857',
   'name': 'News',
   'description': 'News'}},
 {'domain': {'id': '131',
   'name': 'Unified Twitter Taxonomy',
   'description': 'A taxonomy of user interests. '},
  'entity': {'id': '1052324708752883712',
   'name': 'Yashar Ali',
   'description': 'Yashar Ali'}},
 {'domain': {'id': '131',
   'name': 'Unified Twitter Taxonomy',
   'description': 'A taxonomy of user interests. '},
  'entity': {'id': '107475536953

In [105]:
df_x_norm['data.entities.annotations'].iloc[0]

[{'start': 29,
  'end': 37,
  'probability': 0.9061,
  'type': 'Person',
  'normalized_text': 'Netanyahu'}]

In [106]:
df_x_norm['data.text'].iloc[0]

'@yashar I was looking to see Netanyahu’s response but I’m not surprised, he has his own crisis going on as well and also the orange man is his friend so never mind.'

In [109]:
df_reddit_norm['body'].iloc[0]

'Amen! Also… I thought I was the only person on the planet whose favourite pizza was Kraft pizza, nice!'

In [113]:
df_reddit_norm[df_reddit_norm['text'].notna()]['text'].iloc[0]

"Anyone notice the Kraft Pizza Kits changed their sauce? I can tolerate a lot like rising heating costs, threats of invasion from the USA and paper straws but don't mess with my favorite pizza!"

In [143]:
df_bluesky_norm['commit.record.body'].value_counts()[:10]

commit.record.body
# Test\nTest post, do not like.\n                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    

In [131]:
df_bluesky_norm['commit.record.title'].value_counts()[:10]

commit.record.title
2025/02/28 とのフラ 違法日記                                                    6
                                                                        1
ATプロトコルコミュニティファンドのご紹介                                                   1
文字起こし                                                                   1
'Be Thankful!' Trump Berates Zelensky in Insane Oval Office Meltdown    1
There Is No Antimemetics Divisionのあらすじを探して日本語訳して                        1
There Is No Antimemetics Divisionあらすじ                                   1
2025/02/28                                                              1
bob                                                                     1
Name: count, dtype: int64

# Load sample

In [9]:
fname = './data/processed/df_x_norm_20250301_parsed_keywords.pkl'

df_x_norm = pd.read_pickle(fname)
print(df_x_norm.shape)
print('Read from:', fname)

(17274, 51)
Read from: ./data/processed/df_x_norm_20250301_parsed_keywords.pkl


In [10]:
pd.set_option('display.max_columns', 100)

In [13]:
# Get subsample of N tweets
N = 1_000
df_x_sample = df_x_norm.sample(N, random_state=0)[[
    'bucket', 'file', 'matching_rules',
    'data.author_id', 'data.conversation_id',
    'data.text', 'data.referenced_tweets', 'includes.media',
]]
print(df_x_sample.shape)
df_x_sample[:3]

(1000, 8)


Unnamed: 0,bucket,file,matching_rules,data.author_id,data.conversation_id,data.text,data.referenced_tweets,includes.media
5322,x-misinfo,2025/03/01/05/x-1-2025-03-01-05-57-44-d6423c97...,"[{'id': '1895531213122588673', 'tag': 'keyword...",1497096338277744641,1895655864536813792,@stavridisj @CNN @jimsciutto This puts Canada ...,"[{'type': 'replied_to', 'id': '189565586453681...","[{'duration_ms': 305386, 'height': 720, 'media..."
74,x-misinfo,2025/03/01/00/x-1-2025-03-01-00-01-57-ff8f6d7a...,"[{'id': '1895531213122588673', 'tag': 'keyword...",1780186210264100865,1895601454225744172,@yashar Awesome! Go fight in the war then UK a...,"[{'type': 'replied_to', 'id': '189560145422574...","[{'height': 2000, 'media_key': '3_189560145011..."
3078,x-misinfo,2025/03/01/02/x-1-2025-03-01-02-49-07-33bbb6c4...,"[{'id': '1895531213122588673', 'tag': 'keyword...",712236479767040000,1895667483769782502,"@TezcatliOcta “Not only that, their operations...","[{'type': 'replied_to', 'id': '189566748376978...","[{'height': 2048, 'media_key': '3_189566747664..."


In [None]:
# fname = f'./data/to_annotate/df_x_sample_N{N}.xlsx'
# df_x_sample.to_excel(fname, index=False)
# print('Wrote to:', fname)

Wrote to: ./data/to_annotate/df_x_sample_N1000.xlsx


# OpenAI Assignment

In [52]:
client = OpenAI(api_key=key_openai)

In [57]:
model_name = 'gpt-4o-mini-2024-07-18'

## Define json schema for extractions

In [55]:
class NarrativeEvent(BaseModel):
    teller: str
    mode_of_telling: str
    recipient: str
    social_situation: str

    agent: str
    action_or_event: str
    object: str
    location: str
    time_frame: str
    motivation_or_cause: str

class NarrativeExtraction(BaseModel):
    events: List[NarrativeEvent]

## Prompting

In [66]:
system_prompt = '''
You are an expert at structured data extraction and narrative understanding from social media data. You will be given unstructured text from a social media post and should convert it into the given structure, a list of events where each event contains the following elements:

teller: One who is recounting the event.
mode_of_telling: How the teller is recounting the event.
recipient: Audience of the teller.
social_situation: In what context the teller is recounting the event to the recipient.
agent: One who is/has done the event.
action_or_event: Action which the agent has taken.
object: One who is receiving the action or being acted upon.
location: Where the action is taking/took place.
time_frame: When the action is taking/took place.
motivation_or_cause: Why the action is taking/took place.

A post may contain no events or multiple. Extract all identified events in the post. Elements may be explicitly found in the post or implicit. Elements that cannot be filled should be left as empty strings. If the social media post's poster is extracted as an element, they should be referred to as "User". All other people can be identifed by their name and Twitter handle (if found in the post).
'''.strip()
print(system_prompt)

You are an expert at structured data extraction and narrative understanding from social media data. You will be given unstructured text from a social media post and should convert it into the given structure, a list of events where each event contains the following elements:

teller: One who is recounting the event.
mode_of_telling: How the teller is recounting the event.
recipient: Audience of the teller.
social_situation: In what context the teller is recounting the event to the recipient.
agent: One who is/has done the event.
action_or_event: Action which the agent has taken.
object: One who is receiving the action or being acted upon.
location: Where the action is taking/took place.
time_frame: When the action is taking/took place.
motivation_or_cause: Why the action is taking/took place.

A post may contain no events or multiple. Extract all identified events in the post. Elements may be explicitly found in the post or implicit. Elements that cannot be filled should be left as emp

In [67]:
text = df_x_sample['data.text'].iloc[0]
print(text)

@stavridisj @CNN @jimsciutto This puts Canada in a very difficult spot  ....  Any advice, Admiral?


In [68]:
completion = client.beta.chat.completions.parse(
    model=model_name,
    messages=[
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": text}
    ],
    response_format=NarrativeExtraction,
)
message = completion.choices[0].message

In [69]:
json.loads(message.content)

{'events': [{'teller': 'User',
   'mode_of_telling': 'Questioning',
   'recipient': 'Admiral Stavridis',
   'social_situation': 'Seeking advice on a difficult situation regarding Canada',
   'agent': 'User',
   'action_or_event': 'Seeks advice',
   'object': 'Admiral Stavridis',
   'location': '',
   'time_frame': '',
   'motivation_or_cause': 'Canada being in a difficult spot'}]}

In [71]:
processed_data_dir = os.path.join('./data', 'processed')
fname_parsed_rounds_in_prog = os.path.join(
    processed_data_dir, 'in_prog_gpt4omini_df_x_sample_extractions.pkl'
)

In [88]:
save_every = 10

id2processed_text = {}
for idx, row in enumerate(df_x_sample.iterrows()):
    row = row[1]
    id_ = row['data.conversation_id'] + '|' + row['file']
    try:
        completion = client.beta.chat.completions.parse(
            model=model_name,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": row['data.text']}
            ],
            response_format=NarrativeExtraction,
        )
        message = completion.choices[0].message
        try:
            parsed_round = message.content
        except:
            # Parsing error
            print(f'error_parsing round: {round_n}')
            parsed_round = message
    except:
        # Invalid JSON
        parsed_round = {}
    id2processed_text[id_] = parsed_round

    # Save intermittently
    if (idx + 1) % save_every == 0:
        with open(fname_parsed_rounds_in_prog, 'wb') as f:
            pickle.dump(id2processed_text, f)
print('DONE')

DONE


In [89]:
processed_data_dir = os.path.join('./data', 'processed')
fname = os.path.join(
    processed_data_dir, 'gpt4omini_df_x_sample_extractions.xlsx'
)

with open(fname, 'wb') as f:
    pickle.dump(file2processed_text, f)
print('Final export to:', fname)

# with open(fname, 'rb') as f:
#     file2processed_text = pickle.load(f)
# print('Read from:', fname)

Final export to: ./data/processed/gpt4omini_df_x_sample_extractions.xlsx


In [90]:
df_x_sample[:3]

Unnamed: 0,bucket,file,matching_rules,data.author_id,data.conversation_id,data.text,data.referenced_tweets,includes.media
5322,x-misinfo,2025/03/01/05/x-1-2025-03-01-05-57-44-d6423c97...,"[{'id': '1895531213122588673', 'tag': 'keyword...",1497096338277744641,1895655864536813792,@stavridisj @CNN @jimsciutto This puts Canada ...,"[{'type': 'replied_to', 'id': '189565586453681...","[{'duration_ms': 305386, 'height': 720, 'media..."
74,x-misinfo,2025/03/01/00/x-1-2025-03-01-00-01-57-ff8f6d7a...,"[{'id': '1895531213122588673', 'tag': 'keyword...",1780186210264100865,1895601454225744172,@yashar Awesome! Go fight in the war then UK a...,"[{'type': 'replied_to', 'id': '189560145422574...","[{'height': 2000, 'media_key': '3_189560145011..."
3078,x-misinfo,2025/03/01/02/x-1-2025-03-01-02-49-07-33bbb6c4...,"[{'id': '1895531213122588673', 'tag': 'keyword...",712236479767040000,1895667483769782502,"@TezcatliOcta “Not only that, their operations...","[{'type': 'replied_to', 'id': '189566748376978...","[{'height': 2048, 'media_key': '3_189566747664..."


In [112]:
df_output = pd.DataFrame({
    'id': id2processed_text.keys(),
    'output': id2processed_text.values(),
})
df_output.loc[:,'data.conversation_id'] = df_output['id'].apply(lambda x: x.split('|')[0])
df_output.loc[:,'file'] = df_output['id'].apply(lambda x: x.split('|')[1])
print(df_output.shape)
df_output[:3]

(100, 4)


Unnamed: 0,id,output,data.conversation_id,file
0,1895655864536813792|2025/03/01/05/x-1-2025-03-...,"{""events"":[{""teller"":""User"",""mode_of_telling"":...",1895655864536813792,2025/03/01/05/x-1-2025-03-01-05-57-44-d6423c97...
1,1895601454225744172|2025/03/01/00/x-1-2025-03-...,"{""events"":[{""teller"":""User"",""mode_of_telling"":...",1895601454225744172,2025/03/01/00/x-1-2025-03-01-00-01-57-ff8f6d7a...
2,1895667483769782502|2025/03/01/02/x-1-2025-03-...,"{""events"":[{""teller"":""User"",""mode_of_telling"":...",1895667483769782502,2025/03/01/02/x-1-2025-03-01-02-49-07-33bbb6c4...


In [113]:
# Convert response to json
def safe_eval(text: str, default_value = dict()):
    # Call eval safely
    try:
        output = json.loads(text)
    except:
        output = default_value
    return output

df_output.loc[:,'output'] = df_output['output'].apply(safe_eval)
df_output.loc[:,'output'] = df_output['output'].apply(lambda x: x['events'])

In [115]:
# Join
to_index = ['data.conversation_id', 'file']
df_x_sample_output = df_x_sample.set_index(to_index).join(
    df_output.set_index(to_index)[['output']]
).reset_index()
print(df_x_sample_output.shape)
df_x_sample_output[:3]

(100, 9)


Unnamed: 0,data.conversation_id,file,bucket,matching_rules,data.author_id,data.text,data.referenced_tweets,includes.media,output
0,1895655864536813792,2025/03/01/05/x-1-2025-03-01-05-57-44-d6423c97...,x-misinfo,"[{'id': '1895531213122588673', 'tag': 'keyword...",1497096338277744641,@stavridisj @CNN @jimsciutto This puts Canada ...,"[{'type': 'replied_to', 'id': '189565586453681...","[{'duration_ms': 305386, 'height': 720, 'media...","[{'teller': 'User', 'mode_of_telling': 'Direct..."
1,1895601454225744172,2025/03/01/00/x-1-2025-03-01-00-01-57-ff8f6d7a...,x-misinfo,"[{'id': '1895531213122588673', 'tag': 'keyword...",1780186210264100865,@yashar Awesome! Go fight in the war then UK a...,"[{'type': 'replied_to', 'id': '189560145422574...","[{'height': 2000, 'media_key': '3_189560145011...","[{'teller': 'User', 'mode_of_telling': 'Exclam..."
2,1895667483769782502,2025/03/01/02/x-1-2025-03-01-02-49-07-33bbb6c4...,x-misinfo,"[{'id': '1895531213122588673', 'tag': 'keyword...",712236479767040000,"@TezcatliOcta “Not only that, their operations...","[{'type': 'replied_to', 'id': '189566748376978...","[{'height': 2048, 'media_key': '3_189566747664...","[{'teller': 'User', 'mode_of_telling': 'Statem..."


In [126]:
# Explode and export
df_export = df_x_sample_output.explode(['output']).reset_index(drop=True)
df_export = df_export.join(pd.json_normalize(df_export['output']))
print(df_export.shape)

fname = './data/to_annotate/df_x_sample_narrative_extractions.xlsx'
df_export.to_excel(fname, index=False)
print('Wrote to:', fname)

(110, 19)
Wrote to: ./data/to_annotate/df_x_sample_narrative_extractions.xlsx


# Evaluation

In [127]:
# Read annotations
fname = './data/annotated/df_x_sample_narrative_extractions.xlsx'
df_annot = pd.read_excel(fname)
print(df_annot.shape)
df_annot[:3]

(113, 41)


Unnamed: 0,data.conversation_id,file,bucket,matching_rules,data.author_id,data.text,data.referenced_tweets,includes.media,output,teller,teller_correct,teller_corrected,mode_of_telling,mode_of_telling_correct,mode_of_telling_corrected,recipient,recipient_correct,recipient_corrected,social_situation,social_situation_correct,social_situation_corrected,agent,agent_correct,agent_corrected,action_or_event,action_or_event_correct,action_or_event_correct_ed,object,object_correct,object_corrected,location,location_correct,location_corrected,time_frame,time_frame_correct,time_frame_corrected,motivation_or_cause,motivation_or_cause_correct,motivation_or_cause_corrected,good_example,notes
0,1.895656e+18,2025/03/01/05/x-1-2025-03-01-05-57-44-d6423c97...,x-misinfo,"[{'id': '1895531213122588673', 'tag': 'keyword...",1.497096e+18,@stavridisj @CNN @jimsciutto This puts Canada ...,"[{'type': 'replied_to', 'id': '189565586453681...","[{'duration_ms': 305386, 'height': 720, 'media...","{'teller': 'User', 'mode_of_telling': 'Direct ...",User,1.0,,Direct question,1.0,,Admiral Stavridis,1.0,,Online conversation on Twitter,1.0,,Canada,1.0,,faces a difficult situation,1.0,,,,,,,,,,,Unspecified context affecting Canada,1.0,,,
1,1.895601e+18,2025/03/01/00/x-1-2025-03-01-00-01-57-ff8f6d7a...,x-misinfo,"[{'id': '1895531213122588673', 'tag': 'keyword...",1.780186e+18,@yashar Awesome! Go fight in the war then UK a...,"[{'type': 'replied_to', 'id': '189560145422574...","[{'height': 2000, 'media_key': '3_189560145011...","{'teller': 'User', 'mode_of_telling': 'Exclama...",User,1.0,,Exclamatory,1.0,,@yashar,1.0,,Responding to a post,1.0,,@yashar,-1.0,,Fight in the war,-1.0,,UK and Europe,-1.0,,,,,,,,Expressing enthusiasm and sarcasm about the du...,1.0,,,
2,1.895667e+18,2025/03/01/02/x-1-2025-03-01-02-49-07-33bbb6c4...,x-misinfo,"[{'id': '1895531213122588673', 'tag': 'keyword...",7.122365e+17,"@TezcatliOcta “Not only that, their operations...","[{'type': 'replied_to', 'id': '189566748376978...","[{'height': 2048, 'media_key': '3_189566747664...","{'teller': 'User', 'mode_of_telling': 'Stateme...",User,1.0,,Statement,1.0,,General public,1.0,,Commentary on trade relations,1.0,,President Donald Trump,1.0,,threatened to impose tariffs,1.0,,Canada,1.0,,United States,1.0,,,,,to address flooding his country with dangerous...,1.0,,1.0,


In [134]:
output_cols = [
    'agent', 'action_or_event', 'object', 'location', 'time_frame',
    'motivation_or_cause',
]
annot_cols = [col+'_correct' for col in output_cols]

In [139]:
# Grading (including null responses)
df_annot[annot_cols].fillna(1).replace({-1: 0}).mean()

agent_correct                  0.750000
action_or_event_correct        0.730769
object_correct                 0.769231
location_correct               0.942308
time_frame_correct             0.942308
motivation_or_cause_correct    0.846154
dtype: float64

In [140]:
# Grading (excluding null responses)
df_annot[annot_cols].replace({-1: 0}).mean()

agent_correct                  0.711111
action_or_event_correct        0.708333
object_correct                 0.700000
location_correct               0.700000
time_frame_correct             0.400000
motivation_or_cause_correct    0.733333
dtype: float64