In [22]:
import glob
import json
import os

import pandas as pd
from tqdm.auto import tqdm

# Load the project API key from a local file into the process environment before
# the client is built. A context manager is used so the key file handle is closed
# immediately instead of lingering until garbage collection.
# NOTE(review): the path is absolute and machine-specific; the notebook will not
# run elsewhere without changing it.
with open('/Users/spangher/.openai-bloomberg-project-key.txt') as key_file:
    os.environ['OPENAI_API_KEY'] = key_file.read().strip()
from openai import OpenAI
# Constructed with no arguments; presumably picks up OPENAI_API_KEY set above.
client = OpenAI()

def prompt_openai(prompt, model='gpt-4o-mini'):
    """Send `prompt` as a single user message and return the reply text.

    Args:
        prompt: Text placed in the `content` of the one user message.
        model: Model name forwarded to the chat completions call.

    Returns:
        The `message.content` of the first choice in the response.
    """
    user_message = {"role": "user", "content": prompt}
    response = client.chat.completions.create(model=model, messages=[user_message])
    first_choice = response.choices[0]
    return first_choice.message.content

In [None]:
# Here are the things that the science article mentions that we didn't. Do you want to go deeper into any of them?
# Here are images from the file and a description of each
# Do you want to know more about the image? 

# News Article/Science Article Diff

In [41]:
# Prompt template for the article-level comparison.
# Filled with str.format; placeholders: {science_article} and {news_article}.
# Asks the model for two ~100-word paragraphs: the angle the news article took,
# then what the science article contains that the news article did not cover.
OVERALL_CONTENT_DIFF_PROMPT = """
You are a fun and descriptive helper who helps me interpret science articles. Here is the text of a scientific article

<science_article>
{science_article}
</science_article>

Here is a news article the New York Times wrote based on the scientific article:

<news_article>
{news_article}
</news_article>

Write a 100 word paragraph from the New York Times' perspective summarizing the angle they took on the science article.
In a new paragraph, give a 100-word high-level summary of the information that's in the science article that 
they chose not to cover directly in the news article.

Be descriptive and informative, but also be approachable. Use plain English, don't use overly flowery language. 

This is intended for a general audience.

Your response:
"""

# Prompt template for the category-level comparison.
# Filled with str.format; placeholders: {science_article} and {news_article}.
# Asks the model to return only a JSON object whose keys are content categories
# and whose values describe what the science article has that the news article lacks.
# NOTE(review): the prompt text contains the typo "aritcle"; it is left as-is here
# because editing the string changes what is sent to the model.
CONTENT_ITEMS_DIFF_PROMPT = """
You are a fun and descriptive helper who helps me interpret science articles.

Here is the text of a scientific article

<science_article>
{science_article}
</science_article>

Here is a news article that is written off of the scientific article:

<news_article>
{news_article}
</news_article>

Describe the content that is in the scientific article but not the news article. 
Group each category of content separately (e.g. "Description of Methods", "Background", etc.)
Be descriptive and informative, but also be approachable. Use plain English, don't use overly flowery language, 
and explain the significance of each content type in the science aritcle. 
This is intended for a general audience. 

Return this as a JSON with keys being the categories of information and a description of the content differences for each category. 
Return just the JSON, nothing else.

Your response:
"""

In [9]:
# Load the article table from a gzipped JSON Lines file (one record per line).
# Later cells read the columns 'id', 'bodyText', 'txt_file', 'pdf_file', 'file',
# 'headline' and 'bylines' from this frame.
science_df_with_files = pd.read_json('../data/science_articles-with-parsed-files.json.gz', lines=True)

In [44]:
# Generate, for every article row, (1) a free-text overall diff summary and
# (2) a category -> description JSON diff, using the two prompt templates.
RUN_OVERALL_SUMMARY = True
RUN_SPECIFIC_CONTENT_DIFF = True
# Upper bound on attempts per article for the category-level diff, so a persistent
# failure (API error or unparseable JSON) cannot loop forever.
MAX_CONTENT_DIFF_ATTEMPTS = 5
content_diff_summary_output = []
all_specific_content_diff_output = []
for _, row in tqdm(science_df_with_files.iterrows(), total=len(science_df_with_files)):
    # Inputs are loaded once per article; the context manager closes the file handle.
    with open(row['txt_file']) as science_file:
        science_file_text = science_file.read().strip()
    news_article_text = row['bodyText']
    ##############################################################
    if RUN_OVERALL_SUMMARY:
        ## overall summary
        # Requested exactly once per article, outside the retry loop below, so this
        # list receives one entry per DataFrame row even when the JSON step retries.
        overall_diff_summ = prompt_openai(
            OVERALL_CONTENT_DIFF_PROMPT.format(
                news_article=news_article_text,
                science_article=science_file_text
            ), model='gpt-4.1'
        )
        content_diff_summary_output.append(overall_diff_summ)
    ##############################################################
    if RUN_SPECIFIC_CONTENT_DIFF:
        for _attempt in range(MAX_CONTENT_DIFF_ATTEMPTS):
            try:
                ## specific pieces of items
                content_diff_output = prompt_openai(
                    CONTENT_ITEMS_DIFF_PROMPT.format(
                        news_article=news_article_text,
                        science_article=science_file_text
                    ), model='gpt-4.1'
                )
                # Strip Markdown code fences the model may wrap around the JSON.
                content_diff_output = content_diff_output.replace('```json', '').replace('```', '')

                parsed = json.loads(content_diff_output)
                all_specific_content_diff_output.append({
                    'id': row['id'],
                    'output': parsed
                })
                break
            except Exception as e:
                # Best-effort: report the failure and try again (bounded above).
                print(f'error: {str(e)}')
        else:
            # Every attempt failed; records are keyed by 'id', so skipping this
            # article leaves the remaining records valid.
            print(f"giving up on article {row['id']} after {MAX_CONTENT_DIFF_ATTEMPTS} attempts")

  0%|          | 0/10 [00:00<?, ?it/s]

In [85]:
# Attach the overall summaries as a new column. This relies on the list holding
# exactly one summary per DataFrame row, in row order.
science_df_with_files['diff_summary'] = content_diff_summary_output

In [59]:
# Spot-check one generated summary (the entry at index 9).
print(content_diff_summary_output[9])

In their coverage, The New York Times focused on the clever ways spiders adapt to urban noise by engineering their webs to suit their noisy surroundings. They highlighted the spiders’ use of webs both as amplifiers and mufflers of sound—a blend of hearing aid and noise-canceling headphones. The story zeroed in on how urban spiders soften vibrations to tune out constant clamors, while rural spiders boost sensitivity during sudden noise spikes. The article drew broad parallels to human strategies for coping with noise, and emphasized the ability of spiders to rapidly adjust their behavior in response to their environment.

The original scientific article takes a deeper dive into the technical and experimental aspects of the research. It explains the experimental design—how spiders from both rural and urban backgrounds were raised under controlled “loud” or “quiet” vibratory conditions and how researchers measured exactly how vibrations travel across the webs. The study examined specifics

In [62]:
# Persist the category-level diffs (list of {'id', 'output'} records) as one JSON document.
with open('../data/output_data/v3_science_diff.json', 'w') as specific_diff_file:
    specific_diff_file.write(json.dumps(all_specific_content_diff_output))

In [63]:
# Persist the overall summaries (a plain list of strings) as one JSON document.
with open('../data/output_data/science_diff_summaries.json', 'w') as summaries_file:
    summaries_file.write(json.dumps(content_diff_summary_output))

In [88]:
# Export the 'id' and 'diff_summary' columns for the app as JSON Lines (one record per line).
science_df_with_files[['id', 'diff_summary']].to_json('../app/app_data/diffs_summaries.jsonl', lines=True, orient='records')

In [144]:
# Build a flattened copy of the category-level diffs: whenever a category's value
# is itself a dict, keep only its 'content' entry; other values pass through as-is.
all_specific_content_diff_output_copy = [
    {
        'id': record['id'],
        'output': {
            category: (description['content'] if isinstance(description, dict) else description)
            for category, description in record['output'].items()
        },
    }
    for record in all_specific_content_diff_output
]

In [152]:
# Export the flattened category-level diffs for the app as JSON Lines (one record per line).
pd.DataFrame(all_specific_content_diff_output_copy).to_json('../app/app_data/diffs_specific_categories.jsonl', lines=True, orient='records')

# Describe Each Image

In [81]:
from PIL import Image
# NOTE(review): not referenced by any cell visible in this notebook — confirm whether it is still needed.
image_mapper = {}

In [82]:
# Prompt template for describing a single figure from a science article.
# Filled with str.format; the only placeholder is {science_article}.
# Asks the model to return only JSON covering "Caption", "Significance" and "Fun fact".
# NOTE(review): the prompt text contains typos ("do the follow", "fun an engaging");
# they are left as-is here because editing the string changes what is sent to the model.
CAPTION_PROMPT = """
This image is from the following scientific article.

<science_article>
{science_article}
</science_article>

Can you do the follow:
1. "Caption": Please caption this image for a general audience. Use plain English. Be descriptive. Describe what it shows.
2. "Significance": Please describe why the authors used this image.
3. "Fun fact": Please note a fun fact about the image that you notice. Make it fun an engaging!

Return your response as a JSON. Say nothing else.

Your response:
"""

In [None]:
# For every article: convert its extracted figures to PDF, upload each PDF to
# OpenAI, and ask the model for a JSON caption of each uploaded figure.
# NOTE(review): prompt_openai_with_files is not defined in any cell visible in this
# notebook — it must be defined before this cell can run.
# Upper bound on attempts per figure when the response is not valid JSON.
MAX_CAPTION_ATTEMPTS = 5
all_image_output = []
for _, row in tqdm(science_df_with_files.iterrows(), total=len(science_df_with_files)):
    # The context manager closes the article text file right after reading it.
    with open(row['txt_file']) as science_file:
        science_article_text = science_file.read().strip()
    img_path = '../data/found-science-articles/figures'
    folder_name = os.path.basename(row['pdf_file'].replace('.pdf', ''))
    images = glob.glob(f'{img_path}/{folder_name}/*')
    images = [image_file for image_file in images if '.pdf' not in image_file]

    for image_file in images:
        # Swap only the file extension for '.pdf'. A chained .replace('.png', ...)
        # leaves any other extension (e.g. '.jpeg') untouched, which would write
        # the PDF over the original image file.
        pdf_target = os.path.splitext(image_file)[0] + '.pdf'
        with Image.open(image_file) as image:
            # Images in RGBA or palette mode are converted to RGB before the PDF save.
            pdf_ready = image.convert("RGB") if image.mode in ("RGBA", "P") else image
            pdf_ready.save(pdf_target, "PDF")

    pdf_images = glob.glob(f'{img_path}/{folder_name}/*.pdf')
    uploaded_file_ids = []
    for pdf_file in tqdm(pdf_images):
        # Close the upload handle as soon as the request has completed.
        with open(pdf_file, "rb") as pdf_handle:
            file = client.files.create(
                file=pdf_handle,
                purpose="user_data"
            )
        uploaded_file_ids.append({
            'id': os.path.basename(pdf_file),
            'openai_id': file.id
        })

    # The prompt is identical for every figure of this article, so build it once.
    prompt = CAPTION_PROMPT.format(science_article=science_article_text)
    for uploaded_image in tqdm(uploaded_file_ids):
        for _attempt in range(MAX_CAPTION_ATTEMPTS):
            output = prompt_openai_with_files(prompt, uploaded_image['openai_id'])
            # Strip Markdown code fences the model may wrap around the JSON.
            output = output.replace('```json', '').replace('```', '')
            try:
                parsed = json.loads(output)
            except json.JSONDecodeError:
                # Only malformed JSON triggers a retry; other errors propagate.
                print('failed, trying again...')
                continue
            all_image_output.append({
                'id': row['id'],
                'image_id': uploaded_image['id'],
                'output': parsed
            })
            break
        else:
            # No parseable response within the attempt budget; skip this figure.
            print(f"giving up on image {uploaded_image['id']} after {MAX_CAPTION_ATTEMPTS} attempts")

In [None]:
# Keep only records that carry an 'image_id', then persist them as one JSON document.
# The filter reads from all_image_output itself: the previously referenced name
# `all_output` is not defined in any cell visible in this notebook, so a fresh
# kernel would raise NameError here.
all_image_output = [record for record in all_image_output if 'image_id' in record]
with open('../data/output_data/image_output_data.json', 'w') as f:
    json.dump(all_image_output, f)

In [None]:
# Export the per-image caption records for the app as JSON Lines (one record per line).
pd.DataFrame(all_image_output).to_json('../app/app_data/image_descriptions.jsonl', lines=True, orient='records')

In [66]:
# Export the news-article fields ('id', 'headline', 'bylines', 'bodyText') for the app as JSON Lines.
science_df_with_files[['id', 'headline', 'bylines', 'bodyText']].to_json(path_or_buf='../app/app_data/science_article_sample.json', orient='records', lines=True)

In [79]:
# Map each article id to a folder name: the last path component of the 'file'
# column with the literal '.pdf' removed.
# regex=False is passed explicitly so '.pdf' is always matched literally; on
# pandas versions where str.replace defaults to regex=True the '.' would match
# any character.
id_folder_mapper = (
    science_df_with_files
         .assign(image_file_name=lambda df: df['file'].str.split('/').str.get(-1).str.replace('.pdf', '', regex=False))
         [['id', 'image_file_name']]
         .set_index('id')['image_file_name'].to_dict()
)

In [80]:
# Persist the article-id -> figure-folder mapping for the app as one JSON document.
with open('../app/app_data/article_id_folder_mapper.json', 'w') as mapper_file:
    mapper_file.write(json.dumps(id_folder_mapper))

In [68]:
ls ../app/app_data/science_article_figures/

[34mcompositionality-in-bonobos[m[m/     [34mshingles-vaccine[m[m/
[34mfoie-gras-without-force-feeding[m[m/ [34mstentor-colonies[m[m/
[34mfunnel-web-spiders[m[m/              [34mwarty-birch-caterpillars[m[m/
[34mground-sloths-extinct-species[m[m/   [34mwater-abundance-moon[m[m/
[34mmedieval-manuscripts[m[m/
