In [223]:
import pandas as pd 
import warnings
import pyperclip
import json
import os 
from openai import OpenAI
import ast

# Install a blanket "ignore" filter: no category is given, so every warning is silenced.
# NOTE(review): this also hides pandas warnings (e.g. SettingWithCopyWarning) that could
# point at real problems further down — confirm the blanket filter is intended.
warnings.simplefilter(action='ignore')
def simple_json_parse(j):
    """Parse a JSON-like string, returning ``None`` when it cannot be parsed.

    Strict JSON is tried first (``json.loads``). If that fails, the text is
    handed to ``ast.literal_eval`` so Python-literal payloads (single quotes,
    ``None``/``True``/``False``) are accepted as well.

    Parameters
    ----------
    j : str
        Text to parse. Non-string input (e.g. ``None``) yields ``None``.

    Returns
    -------
    object or None
        The parsed value, or ``None`` if both parsers reject the input.
    """
    # `except Exception` (not a bare `except`) so KeyboardInterrupt / SystemExit
    # still propagate instead of being silently turned into a None result.
    try:
        return json.loads(j)
    except Exception:
        pass
    try:
        return ast.literal_eval(j)
    except Exception:
        return None


# Read the project API key from a local file and expose it via OPENAI_API_KEY before the
# client is constructed. The file is opened in a `with` block so the handle is closed
# right away instead of being left open until garbage collection.
# NOTE(review): the path is absolute and machine-specific — consider making it configurable.
with open('/Users/spangher/.openai-bloomberg-project-key.txt') as key_file:
    os.environ['OPENAI_API_KEY'] = key_file.read().strip()
client = OpenAI()

def prompt_openai(prompt, model='gpt-4o-mini'):
    """Send ``prompt`` as a single user message and return the reply text.

    Parameters
    ----------
    prompt : str
        Text placed in the ``content`` of the one user-role message.
    model : str, default 'gpt-4o-mini'
        Model name forwarded to ``client.chat.completions.create``.

    Returns
    -------
    The ``message.content`` of the first choice in the completion.
    """
    user_message = {"role": "user", "content": prompt}
    response = client.chat.completions.create(model=model, messages=[user_message])
    first_choice = response.choices[0]
    return first_choice.message.content

# Get Centrality and Narrative Role

In [126]:
# Fetch the contents of three OpenAI files and save each one as a local JSONL file.
# Keys are the remote file ids, values the paths the text is written to.
batch_output_files = {
    "file-KVq4w5yxV4Y1tpxpCcjcRA": 'openai_batched_data/source_perspective_and_narrative_function_questions.jsonl',
    "file-1B1uL3HyhNd3pkfmwVnrKE": 'openai_batched_data/source_centrality_redo.jsonl',
    "file-YEdr7o96DNihEyvNNm5Ave": 'openai_batched_data/source_centrality_redo_2.jsonl',
}

for file_id, output_path in batch_output_files.items():
    # Create the target directory if needed so open() does not fail on a fresh checkout.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    file_response = client.files.content(file_id)
    with open(output_path, 'w') as f:
        f.write(file_response.text)

In [140]:
# Load the saved JSON Lines files (one JSON record per line).
response_df = pd.read_json(
    'openai_batched_data/source_perspective_and_narrative_function_questions.jsonl',
    lines=True,
)
# Centrality/perspective responses are read from the "redo_2" file.
cent_persp_df = pd.read_json(
    'openai_batched_data/source_centrality_redo_2.jsonl',
    lines=True,
)

In [187]:
# Load the previously parsed source-extraction results (JSON Lines, one record per line).
source_info_df = pd.read_json('../data/output_data/parsed_source_info.jsonl', lines=True)

In [141]:
# Keep only the rows whose custom_id marks them as narrative-function responses.
# Centrality/perspective rows are not taken from `response_df`; `cent_persp_df` is
# loaded from its own file instead.
narr_df = response_df.loc[response_df['custom_id'].str.contains('narrative-understanding')]

In [144]:
def _parse_batch_responses(batch_df):
    """Return a new frame with a ``parsed_responses`` column, keeping only parseable rows.

    The message text is read from ``response -> body -> choices[0] -> message -> content``,
    Markdown code fences around the payload are removed, and the remainder is parsed with
    ``simple_json_parse``. Rows for which parsing returned ``None`` are dropped.

    The column is added with ``assign`` instead of being written into ``batch_df``:
    one of the inputs is a filtered slice of another frame, and assigning a column
    on such a slice is the pattern pandas flags with SettingWithCopyWarning.
    """
    message_text = (
        batch_df['response']
            .str.get('body')
            .str.get('choices')
            .str.get(0)
            .str.get('message')
            .str.get('content')
            # literal (non-regex) removal, independent of the pandas version's regex default
            .str.replace('```json', '', regex=False)
            .str.replace('```', '', regex=False)
            .str.strip()
    )
    return (
        batch_df
            .assign(parsed_responses=message_text.apply(simple_json_parse))
            .loc[lambda df: df['parsed_responses'].notnull()]
    )


cent_persp_df = _parse_batch_responses(cent_persp_df)
narr_df = _parse_batch_responses(narr_df)

In [145]:
import ast 

In [146]:
# Size of the centrality/perspective table after unparseable responses were dropped.
cent_persp_df.shape

(1036, 5)

# Narrative Function Exploration

In [201]:
# Flatten the parsed responses to one row per list element: explode the list column,
# drop rows left empty by the explode, and expand each element into columns
# (elements look dict-like, judging by the resulting Name / Narrative Function columns).
# `custom_id` travels along as the index and is restored as a regular column at the end.
narr_df_exp_df = (
    narr_df
        .explode('parsed_responses')
        .loc[lambda df: df['parsed_responses'].notnull()]
        .pipe(lambda df: pd.DataFrame(df['parsed_responses'].tolist(), index=df['custom_id']))
        .reset_index()
)

In [202]:
# Display the flattened narrative-function table.
narr_df_exp_df

Unnamed: 0,custom_id,Name,Narrative Function
0,narrative-understanding__0,Dr. Cameron Clifford,"""Protagonist/First-person Witness"": This sourc..."
1,narrative-understanding__0,Cal Clifford,"""Emotional Anchor"": This source serves as the ..."
2,narrative-understanding__0,Kelley Voss,"""Expert Critic"": This source is used as a subj..."
3,narrative-understanding__0,Meg Mindlin,"""Science Communicator"": This source provides a..."
4,narrative-understanding__0,Vincent Nijman,"""Industry Analyst"": This source contextualizes..."
...,...,...,...
9042,narrative-understanding__1099,Samuel Scarpino,"""Expert Interpreter"": This source is used to g..."
9043,narrative-understanding__1099,Andrew Bowman,"""Independent Corroborator"": This source is use..."
9044,narrative-understanding__1099,Federal government scientists (unnamed),"""Process Describers"": This source provides the..."
9045,narrative-understanding__1099,Experts (unnamed),"""Safety Emphasizers"": This source is used to p..."


# Centrality Exploration

In [203]:
# Sanity check: count null vs. non-null parsed responses.
cent_persp_df['parsed_responses'].isnull().value_counts()

parsed_responses
False    1036
Name: count, dtype: int64

In [204]:
# Flatten the centrality/perspective responses to one row per list element, keyed by the
# originating `custom_id`. No filtering on `Is_Error` happens here; the column is kept.
cent_persp_exp_df = (
    cent_persp_df
        .explode('parsed_responses')
        .dropna(subset=['parsed_responses'])
        .set_index('custom_id')['parsed_responses']
        .pipe(lambda records: pd.DataFrame(records.tolist(), index=records.index))
        .reset_index()
)

In [205]:
# `Perspective` holds a list of labels per source; explode to one row per (source, label).
persp_exp_df = cent_persp_exp_df.explode('Perspective')

In [206]:
# Perspective labels that occur more than 10 times overall; rarer labels are ignored below.
to_keep = persp_exp_df['Perspective'].value_counts().loc[lambda s: s  > 10].index

In [207]:
# Per-document count of each frequent perspective label.
persp_counts = (
    persp_exp_df
        .assign(c=1)
        # summing the constant 1 per (custom_id, Perspective) cell yields occurrence counts
        .pivot_table(index='custom_id', columns='Perspective', values='c', aggfunc='sum')
        [to_keep]
        .fillna(0)
        .astype(int)
        # fold the 'Against' label into 'Skeptical', then remove the separate column
        .assign(Skeptical=lambda counts: counts[['Skeptical', 'Against']].sum(axis=1))
        .drop(columns=['Against'])
)

In [208]:
# Display the per-document perspective counts.
persp_counts

Perspective,Informative,Authoritative,Supportive,Neutral,Skeptical
custom_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
centrality-perspective__0,7,5,6,2,2
centrality-perspective__1,15,9,1,1,6
centrality-perspective__10,14,10,9,5,11
centrality-perspective__100,4,2,2,1,0
centrality-perspective__1000,9,8,0,1,0
...,...,...,...,...,...
centrality-perspective__995,7,5,1,0,0
centrality-perspective__996,4,8,5,1,4
centrality-perspective__997,5,3,0,1,2
centrality-perspective__998,8,4,0,1,0


In [209]:
# Collapse `Spoken_to` to a binary flag: exactly 'No' stays 'No', anything else counts as 'Yes'.
(
    cent_persp_exp_df['Spoken_to']
        .eq('No')
        .map({True: 'No', False: 'Yes'})
        .value_counts()
)

Spoken_to
No     4651
Yes    3572
Name: count, dtype: int64

In [210]:
# Inspect one example row (the 10th) among sources marked as not spoken to.
cent_persp_exp_df.loc[lambda df: df['Spoken_to'] == 'No'].iloc[9].to_dict()

{'custom_id': 'centrality-perspective__1',
 'Name': 'Dr. Peter Marks',
 'Spoken_to': 'No',
 'Information_Channel': 'Resignation letter was quoted by the reporter to present his perspective.',
 'Perspective': ['Skeptical', 'Informative'],
 'Centrality': 'High',
 'Is_Error': 'No'}

# Source Info

In [211]:
# Preview the first two rows of the source-extraction results.
source_info_df.head(2)

Unnamed: 0,id,custom_id,response,error,parsed_responses,sources
0,batch_req_680af09b2fdc8190a3a9a21607ace969,source-extraction__0,"{'status_code': 200, 'request_id': '18ecbdfafb...",,"[{'Name': 'Dr. Cameron Clifford', 'Biography':...","[Dr. Cameron Clifford, Cal Clifford, Kelley Vo..."
1,batch_req_680af09b517881908429b76ae227ecca,source-extraction__1,"{'status_code': 200, 'request_id': '37c6b7cbda...",,"[{'Name': 'Robert F. Kennedy Jr.', 'Biography'...","[Robert F. Kennedy Jr., National Institutes of..."


In [212]:
# Flatten the source-extraction responses to one row per source. This must be defined
# before the loop below, which references `source_info_exp_df`; otherwise a fresh
# Restart & Run All fails with NameError.
source_info_exp_df = (
    source_info_df
        .explode('parsed_responses')
        .loc[lambda df: df['parsed_responses'].notnull()]
        .pipe(lambda df: pd.DataFrame(df['parsed_responses'].tolist(), index=df['custom_id']))
        .reset_index()
)

# `custom_id` has the form '<task>__<n>'; the integer after '__' is stored as `doc_id`
# on all three tables so they can later be joined on it.
for df in [narr_df_exp_df, cent_persp_exp_df, source_info_exp_df]:
    df['doc_id'] = df['custom_id'].str.split('__').str.get(1).astype(int)

In [221]:
# Join the three per-source tables on (doc_id, Name). A source must have a
# narrative-function row to be kept (inner join); centrality/perspective fields are
# attached where a match exists (left join). `custom_id` differs per table, so it is dropped.
full_source_df = pd.merge(
    pd.merge(
        source_info_exp_df.drop(columns='custom_id'),
        narr_df_exp_df.drop(columns='custom_id'),
        on=['doc_id', 'Name'],
        how='inner',
    ),
    cent_persp_exp_df.drop(columns='custom_id'),
    on=['doc_id', 'Name'],
    how='left',
)

In [249]:
# Display the source-extraction results table.
source_info_df

Unnamed: 0,id,custom_id,response,error,parsed_responses,sources
0,batch_req_680af09b2fdc8190a3a9a21607ace969,source-extraction__0,"{'status_code': 200, 'request_id': '18ecbdfafb...",,"[{'Name': 'Dr. Cameron Clifford', 'Biography':...","[Dr. Cameron Clifford, Cal Clifford, Kelley Vo..."
1,batch_req_680af09b517881908429b76ae227ecca,source-extraction__1,"{'status_code': 200, 'request_id': '37c6b7cbda...",,"[{'Name': 'Robert F. Kennedy Jr.', 'Biography'...","[Robert F. Kennedy Jr., National Institutes of..."
2,batch_req_680af09b764c81908d0f50966152633c,source-extraction__2,"{'status_code': 200, 'request_id': '56777596ce...",,"[{'Name': 'Jonathan McDowell', 'Biography': 'J...","[Jonathan McDowell, The New York Times (interv..."
3,batch_req_680af09ba4608190982521d444f14687,source-extraction__3,"{'status_code': 200, 'request_id': '75ac704d99...",,"[{'Name': 'Robert F. Kennedy Jr.', 'Biography'...","[Robert F. Kennedy Jr., Andrew Nixon, Dr. Mart..."
4,batch_req_680af09bcf848190ae04f3472b231d10,source-extraction__4,"{'status_code': 200, 'request_id': '68f71952f4...",,"[{'Name': 'Trump administration', 'Biography':...","[Trump administration, The New York Times, Ear..."
...,...,...,...,...,...,...
1095,batch_req_680af13dbc04819086c28a0094b36f3b,source-extraction__1095,"{'status_code': 200, 'request_id': '76684a8ab1...",,"[{'Name': 'Philips Respironics', 'Biography': ...","[Philips Respironics, Plaintiffs’ lawyers Sand..."
1096,batch_req_680af13ddf28819089bb88f3a254a9c6,source-extraction__1096,"{'status_code': 200, 'request_id': '66737daf47...",,"[{'Name': 'Researchers in Taiwan', 'Biography'...","[Researchers in Taiwan, Dr. Vivek H. Murthy, t..."
1097,batch_req_680af13e01e48190921fc8ef24f1c126,source-extraction__1097,"{'status_code': 200, 'request_id': 'bad19e60c2...",,"[{'Name': 'Karla Bloem', 'Biography': 'Executi...","[Karla Bloem, Bob Sallinger, Wayne Pacelle, Co..."
1098,batch_req_680af13e324081909d77fbc3b88a4748,source-extraction__1098,"{'status_code': 200, 'request_id': 'f381632006...",,[{'Name': 'Center for Disease Analysis Foundat...,"[Center for Disease Analysis Foundation, Cente..."


In [240]:
# Columns exported for every source record.
to_group = [
    'Name', 'Biography', 'Information', 'Narrative Function',
    'Spoken_to', 'Information_Channel', 'Perspective', 'Centrality',
]

# One entry per doc_id: the list of that document's source records (dicts limited to `to_group`).
doc_source_data = (
    full_source_df
        # any Centrality value outside the three known levels (including missing) becomes 'Low'
        .assign(Centrality=lambda df: df['Centrality'].map(
            lambda level: level if level in ('High', 'Medium', 'Low') else 'Low'
        ))
        # keep only rows explicitly flagged as non-errors; rows with no flag are dropped as well
        .loc[lambda df: df['Is_Error'].eq('No')]
        .drop(columns=['Is_Error'])
        .groupby('doc_id')
        .apply(lambda sources: sources[to_group].to_dict('records'))
)

# NOTE(review): written without `lines=True`, so despite the `.jsonl` name this is presumably
# a single JSON document keyed by doc_id — confirm that is what the app expects.
doc_source_data.to_json('../app/app_data/doc_source_data.jsonl')

In [245]:
# Peek at the first source record stored under label 0, with strings cut to 100 characters.
# NOTE(review): '...' is appended to every string, including ones shorter than 100 characters.
pd.Series(doc_source_data[0][0]).apply(lambda x: x[:100] + '...' if isinstance(x, str) else x).to_dict()

{'Name': 'Dr. Cameron Clifford...',
 'Biography': 'A dentist from Oklahoma and the father of Cal Clifford, who became widely known in 2023 after his fa...',
 'Information': 'Dr. Cameron Clifford explained the entire timeline of events: buying a pet octopus named Terrance fo...',
 'Narrative Function': '"Protagonist/First-person Witness": This source acts as the main character of the narrative and a fi...',
 'Spoken_to': 'Yes...',
 'Information_Channel': None,
 'Perspective': ['Authoritative', 'Informative', 'Supportive'],
 'Centrality': 'High...'}

In [247]:
# Number of source records stored under label 34.
len(doc_source_data[34])

4

In [251]:
ls ../data/

file-cache.csv
found-science-articles/
output_data/
science_articles-with-parsed-files.json.gz
science_articles.json.gz


In [253]:
# Persist the merged per-source table as JSON Lines (one record per line).
full_source_df.to_json('../data/full-parsed-source-df.jsonl', orient='records', lines=True)