In [315]:
import os 
from openai import OpenAI
from pydantic import BaseModel

os.environ['OPENAI_API_KEY'] = open(os.path.expanduser('~/.openai-bloomberg-project-key.txt')).read().strip()
client = OpenAI()

def prompt_openai(prompt: str, response_format: BaseModel = None):
    if response_format:
        response = client.beta.chat.completions.parse(
            model="gpt-4o",
            messages=[
                {
                    "role": "user",
                    "content": prompt
                }
            ],
            response_format=response_format
        )    
        return response.choices[0].message.parsed
    else:
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[
                {
                    "role": "user",
                    "content": prompt
                }
            ]
        )
        return response.choices[0].message.content

# Parse News Sources

In [114]:
EXTRACT_NEWS_SOURCES = """
You are a helpful news assistant. Here is a news article:

<article>
{news_article}
</article>

Please summarize each informational source providing information in the article. 
Include unnamed or passively expressed sources (e.g. "witnesses", "price signals") if there is information attributable to them.
Include any facts that might have come from the source.
Make sure each source you return refer to just one source. For example: if "John and Jane" both contribute the same information, generate two separate summaries, one for "John" and one for "Jane". 
Generate only ONE summary per source.

For each source, provide the following information:
    (1) Name: just the name of the source.
    (2) Biography: A brief biography of the source mentioned in the article.
    (3) Information: Restate the facts provided by the source. Be as SPECIFIC and be as VERBOSE as possible. 
        Contextualize ALL the information the source describes. State the full names of all people, places, events and ideas mentioned and
        everything the source says with AS MUCH BACKGROUND INFORMATION from the article so I can fully understand the information
        the source is giving. I will look at each source independently without looking at any others, so help me understand the context.

Here are some examples:
example 1:
{{ "Name": "Supermarkets around the country",
   "Biography": "Retail stores that sell food and other household items",
   "Information": "Supermarkets around the country alerted shoppers that prices are likely to continue going up due to the avian flu outbreak, with eggs now average $2.88 per dozen, up 52% since the first confirmed case of avian influenza in February."
}}

example 2:
{{
  "Name": "The article's author (unnamed)",
  "Biography": "The author of the article",
  "Information": "The author stated that Wing, which is collaborating with FedEx and Walgreens on drone delivery, was the first to receive a limited Part 135 certificate. Wing is launching operations in Virginia this month, and the Standard certification allows UPS to send an unlimited number of drones to the skies, for their cargo load to exceed 55 pounds and for them to fly at night."
}}

example 3:
{{
   "Name": "Delta's customers",
   "Biography": "People who travel with Delta Air Lines",
   "Information": "Delta's customers suggested that they preferred more space on flights amid the COVID-19 pandemic, and they continue to tell Delta that more space provides more peace of mind."
}}

example 4:
{{
   "Name": "European Union countries",
   "Biography": "Countries that are part of the European Union",
   "Information": "European Union countries are working on adopting copyright rules that allow news companies and publishers to negotiate payments with large tech companies like Facebook, Microsoft and Google that use their content on their platforms."
}}

Output the summary in a list of python dictionaries as in the examples. Don't say anything else.
"""

In [115]:
from pydantic import BaseModel
from typing import List

class NewsSource(BaseModel):
    Name: str
    Biography: str
    Information: str

class NewsSourceOutput(BaseModel):
    sources: List[NewsSource]

In [138]:
import pandas as pd
import glob

news_articles_df = pd.read_csv('../data/all-articles-parsed.csv')
all_articles_on_disk = glob.glob('../data/**/*.txt', recursive=True)
news_articles_df = news_articles_df.assign(path_name=
    lambda df: df.apply(lambda x: list(filter(lambda y: x['filename'] in y and x['issue'] in y, all_articles_on_disk))[0], axis=1)
)

In [None]:
import os
import json
from tqdm.auto import tqdm

all_sources = []
for _, row in tqdm(news_articles_df.iterrows(), total=len(news_articles_df)):
    prompt = EXTRACT_NEWS_SOURCES.format(news_article=row['content'])
    output = prompt_openai(prompt, NewsSourceOutput)
    sources = json.loads(output.model_dump_json())['sources']
    all_sources.append({
        'url': row['url'],
        'sources': sources
    })

In [26]:
all_sources_df = pd.DataFrame(all_sources)

In [28]:
news_articles_df.head(2)

Unnamed: 0,title,author,url,publication_date,content,issue,filename
0,Australia kills off carbon tax,Lenore Taylor,https://www.theguardian.com/environment/2014/j...,Wed 16 Jul 2014 21.51 EDT,"Australia’s carbon price has been repealed, le...",australia-carbon-tax,guardian.txt
1,Carbon Pricing in Australia,Wikipedia Contributors,https://en.wikipedia.org/wiki/Carbon_pricing_i...,Unknown,A carbon pricing scheme in Australia was intro...,australia-carbon-tax,wikipedia.txt


In [None]:
(news_articles_df
 [['title', 'author', 'content', 'issue', 'url']]
 .merge(all_sources_df)
 .to_json(
     orient='records',
     path_or_buf='../data/all-news-articles-with-parsed-sources.json', 
     lines=True
    )
 )

# Summarize sources Wikipedia 

In [347]:
import jsonlines
wiki_citations = list(jsonlines.open('../data/all-wikipedia-citations-parsed.json'))
wiki_citation_df = pd.DataFrame(wiki_citations[0])

In [157]:
wiki_articles = (
    news_articles_df
      .loc[lambda df: df['filename'] == 'wikipedia.txt']
 )

test_article = wiki_articles.iloc[2]['content']

In [None]:
wiki_citations_with_articles = (
    wiki_citation_df
        .assign(join_key=lambda df: df['citation_file'].str.split('/').str.get(2))
        .merge(wiki_articles, left_on='join_key', right_on='issue')
        .drop(columns=['join_key', 'filename', 'path_name', 'citation_file'])
)

In [167]:
for _, row in wiki_citations_with_articles.iterrows():
    row['content']
    break

In [None]:
article_text, citations = row['content'], row['citations']
paragraphs = article_text.split('\n\n')

In [176]:
import itertools
from collections import defaultdict

In [180]:
paragraphs_by_citation = defaultdict(list)
for citation in citations:
    for paragraph in paragraphs:
        if f"[{citation['citation_number']}]" in paragraph:
            paragraphs_by_citation[citation['citation_number']].append(paragraph)

In [187]:
pd.Series(paragraphs_by_citation).str.len().value_counts()

1    93
2    10
4     1
Name: count, dtype: int64

In [195]:
paragraphs_by_citation[5]

['The carbon price was part of a broad energy reform package called the Clean Energy Futures Plan, which aimed to reduce greenhouse gas emissions in Australia by 5 % below 2000 levels by 2020 and 80 % below 2000 levels by 2050. Although Australia does not levy a direct carbon price \\[5], the plan set out to achieve these targets by encouraging Australia’s largest emitters to increase energy efficiency and invest in sustainable energy. The scheme was administered by the Clean Energy Regulator. Compensation to industry and households was funded by the revenue derived from the charge. The scheme required entities which emit over 25,000 tonnes of carbon dioxide equivalent greenhouse gases per year, and which were not in the transport or agriculture sectors, to obtain emissions permits, called carbon units. Carbon units were either purchased from the government or issued free as part of industry assistance measures. As part of the scheme, personal income tax was reduced for those earning l

In [295]:
AUGMENT_WIKIPEDIA_SOURCES = """
You are a helpful writing assistant. I will give you a Wikipedia article and {k} citations from that article:

<article>
{wikipedia_article}
</article>

<citations>
{citations}
</citations>

For each citation, please search for [citation_id] in the article and summarize the information provided by the the citation in the article. 
Look for every time that citation is referenced in the article, by searching for [citation_id] in the article.
Summarize all the information provided by the citation in the article.
Generate only ONE summary per citation.

For each citation, provide the following information:
    (1) citation_id: The id of the citation.
    (2) Information: Restate the facts provided by the citation. Be as SPECIFIC and be as VERBOSE as possible. 
        Contextualize ALL the information the citation describes. State the full names of all people, places, events and ideas mentioned and
        everything the citation says with AS MUCH BACKGROUND INFORMATION from the article so I can fully understand the information
        the citation is giving. I will look at each citation independently without looking at any others, so help me understand the context.
Output the summaries in a list of python dictionaries as in the examples. Don't say anything else.

Here are some examples:
example 1:
{{ "citation_id": "1",
   "Information": "Supermarkets around the country alerted shoppers that prices are likely to continue going up due to the avian flu outbreak, with eggs now average $2.88 per dozen, up 52% since the first confirmed case of avian influenza in February."
}}

example 2:
{{
  "citation_id": "5",
  "Information": "The author stated that Wing, which is collaborating with FedEx and Walgreens on drone delivery, was the first to receive a limited Part 135 certificate. Wing is launching operations in Virginia this month, and the Standard certification allows UPS to send an unlimited number of drones to the skies, for their cargo load to exceed 55 pounds and for them to fly at night."
}}

example 3:
{{
   "citation_id": "2",
   "Information": "Delta's customers suggested that they preferred more space on flights amid the COVID-19 pandemic, and they continue to tell Delta that more space provides more peace of mind."
}}

example 4:
{{
   "citation_id": "20",
   "Information": "European Union countries are working on adopting copyright rules that allow news companies and publishers to negotiate payments with large tech companies like Facebook, Microsoft and Google that use their content on their platforms."
}}

Output the summaries in a list of python dictionaries as in the examples. Don't say anything else.
"""

from pydantic import BaseModel, create_model
from typing import Annotated
from annotated_types import Len

class WikipediaCitation(BaseModel):
    citation_id: str
    Information: str

def make_wikipedia_citation_structure(min_len: int, max_len: int):
    return create_model(
        'WikipediaCitationOutput',
        citations=(Annotated[list[WikipediaCitation], Len(min_length=min_len, max_length=max_len)], ...),
    )

In [206]:
def format_citation(citation):    
    return f"""[{citation['citation_number']}] {citation['text']}, url: {citation['url']}"""

In [207]:
def batch_list(input_list, batch_size):
    """Yield successive n-sized chunks from input_list."""
    for i in range(0, len(input_list), batch_size):
        yield input_list[i:i + batch_size]

In [263]:
def filter_wiki_article_with_citations(article_text, citations, start_idx=0, max_citations=15):
    paragraphs = article_text.split('\n\n')
    paragraphs_with_citations, found_citations = [],[]
    for idx, citation in enumerate(citations[start_idx:]):
        found = False
        for paragraph in paragraphs:
            if f"[{citation['citation_number']}]" in paragraph:
                paragraphs_with_citations.append(paragraph)
                found = True
        if found:
            found_citations.append(citation)
            if len(found_citations) >= max_citations:
                break
    return '...'.join(paragraphs_with_citations), found_citations, (idx + start_idx) == len(citations) - 1


In [268]:
all_citation_summaries = []
batch_size = 500
TREAT_LONG_ARTICLES = True

long_wiki_articles = wiki_citations_with_articles.loc[lambda df: df['num_citations'] >= 200]
for _, row in tqdm(long_wiki_articles.iterrows(), total=len(long_wiki_articles)):
    citations = row['citations']
    citation_summaries = []
    batches = list(batch_list(citations, batch_size))
    start_idx = 0
    while True:
        paragraphs_with_citations, found_citations, all_citations_found = filter_wiki_article_with_citations(
            row['content'], citations, start_idx=start_idx, max_citations=15
        )
        print(f"Found {len(found_citations)} ({found_citations[-1]['citation_number']}) of {len(citations)}  citations...")
        citations_formatted = [format_citation(citation) for citation in found_citations]
        if TREAT_LONG_ARTICLES:
            prompt = AUGMENT_WIKIPEDIA_SOURCES.format(
                k=len(citations_formatted),
                wikipedia_article=paragraphs_with_citations,
                citations='\n'.join(citations_formatted)
            )
        else:
            prompt = AUGMENT_WIKIPEDIA_SOURCES.format(
                k=len(citations_formatted),
                wikipedia_article=row['content'],
                citations='\n'.join(citations_formatted)
            )
        output = prompt_openai(
            prompt, 
            make_wikipedia_citation_structure(len(citations_formatted), len(citations_formatted))
        )
        citation_summaries.extend(list(map(lambda x: json.loads(x.model_dump_json()), output.citations)))
        start_idx = found_citations[-1]['citation_number'] + 1
        if all_citations_found:
            break

    all_citation_summaries.append({
        'issue': row['issue'],
        'citation_summaries': citation_summaries
    })


  0%|          | 0/1 [00:00<?, ?it/s]

Found 15 (29) of 709  citations...
Found 15 (45) of 709  citations...
Found 15 (61) of 709  citations...
Found 15 (77) of 709  citations...
Found 15 (93) of 709  citations...
Found 15 (109) of 709  citations...
Found 15 (125) of 709  citations...
Found 15 (141) of 709  citations...
Found 15 (157) of 709  citations...
Found 15 (173) of 709  citations...
Found 15 (189) of 709  citations...
Found 15 (205) of 709  citations...
Found 15 (221) of 709  citations...
Found 15 (237) of 709  citations...
Found 15 (253) of 709  citations...
Found 15 (269) of 709  citations...
Found 15 (285) of 709  citations...
Found 15 (301) of 709  citations...
Found 15 (317) of 709  citations...
Found 15 (333) of 709  citations...
Found 15 (349) of 709  citations...
Found 15 (365) of 709  citations...
Found 15 (381) of 709  citations...
Found 15 (397) of 709  citations...
Found 15 (413) of 709  citations...
Found 15 (429) of 709  citations...
Found 15 (445) of 709  citations...
Found 15 (461) of 709  citations.

In [269]:
pd.DataFrame(all_citation_summaries).to_json(
    '../data/all-wikipedia-citations-parsed-summarized.json', orient='records', lines=True)

# Deep Research Citations Summarized

In [319]:
AUGMENT_DEEP_RESEARCH_SOURCES = """
You are a helpful writing assistant. I will give you a high-level summary of the research report, and then
I'll give you snippets from a research report and {k} citations from the report. The snippets will specifically contain sentences pertaining to the citations.

<summary>
{summary}
</summary>

<article_snippets>
{deep_research_article_snippets}
</article_snippets>

<citations>
{citations}
</citations>

For each citation, please search for citation_id in the article and summarize the information provided by the the citation in the article. 
The citation will referenced with a number at the end of sentences (e.g. "The treaty was signed yesterday.2").
Summarize all the information provided by the citation in the article.
Generate only ONE summary per citation.

For each citation, provide the following information:
    (1) citation_id: The id of the citation.
    (2) Information: Restate the facts provided by the citation. Be as SPECIFIC and be as VERBOSE as possible. 
        Contextualize ALL the information the citation describes. State the full names of all people, places, events and ideas mentioned and
        everything the citation says with AS MUCH BACKGROUND INFORMATION from the article so I can fully understand the information
        the citation is giving. I will look at each citation independently without looking at any others, so help me understand the context.
Output the summaries in a list of python dictionaries as in the examples. Don't say anything else.

Here are some examples:
example 1:
{{ "citation_id": "1",
   "Information": "Supermarkets around the country alerted shoppers that prices are likely to continue going up due to the avian flu outbreak, with eggs now average $2.88 per dozen, up 52% since the first confirmed case of avian influenza in February."
}}

example 2:
{{
  "citation_id": "5",
  "Information": "The author stated that Wing, which is collaborating with FedEx and Walgreens on drone delivery, was the first to receive a limited Part 135 certificate. Wing is launching operations in Virginia this month, and the Standard certification allows UPS to send an unlimited number of drones to the skies, for their cargo load to exceed 55 pounds and for them to fly at night."
}}

example 3:
{{
   "citation_id": "2",
   "Information": "Delta's customers suggested that they preferred more space on flights amid the COVID-19 pandemic, and they continue to tell Delta that more space provides more peace of mind."
}}

example 4:
{{
   "citation_id": "20",
   "Information": "European Union countries are working on adopting copyright rules that allow news companies and publishers to negotiate payments with large tech companies like Facebook, Microsoft and Google that use their content on their platforms."
}}

Output the summaries in a list of python dictionaries as in the examples. Don't say anything else.
"""


SUMMARY_PROMPT = """
Please summarize this article in 1-2 sentences. Return just the summary, no other text.

<article>
{article}
</article>

Your response:
"""

from pydantic import BaseModel, create_model
from typing import Annotated
from annotated_types import Len

class DeepResearchCitation(BaseModel):
    citation_id: str
    Information: str

def make_deep_research_citation_structure(min_len: int, max_len: int):
    return create_model(
        'DeepResearchCitationOutput',
        citations=(Annotated[list[DeepResearchCitation], Len(min_length=min_len, max_length=max_len)], ...),
    )

In [308]:
deep_research_parsed = glob.glob('../data/*/deep-research/*.json')
all_deep_research_parsed = []
for f in deep_research_parsed:
    with open(f, 'r') as f:
        data = json.load(f)
        all_deep_research_parsed.append(data)

In [309]:
import numpy as np 

all_deep_research_parsed_df = (
    pd.DataFrame(all_deep_research_parsed)
        .assign(content=lambda df: df.apply(lambda x: x['content'] if pd.isnull(x['full_content']) else x['full_content'], axis=1))
        .drop(columns=['full_content'])
 )

In [335]:
all_deep_research_parsed_df.to_json('../data/all-deep-research-content.json', orient='records', lines=True)

In [310]:
def filter_deep_research_article_with_citations(article_text, citations, start_idx=0, max_citations=15):
    paragraphs = article_text.split('\n\n')
    paragraphs_with_citations, found_citations = [],[]
    for idx, citation in enumerate(citations[start_idx:]):
        found = False
        for paragraph in paragraphs:
            if f".{citation['citation_number']} " in paragraph:
                paragraphs_with_citations.append(paragraph)
                found = True
        if found:
            found_citations.append(citation)
            if len(found_citations) >= max_citations:
                break
    return '...'.join(paragraphs_with_citations), found_citations, (idx + start_idx) == len(citations) - 1


In [311]:
all_deep_research_parsed_df['content'].str.split().str.len()

0     6561
1     3310
2     6767
3    12518
4     6416
Name: content, dtype: int64

In [312]:
all_deep_research_parsed_df['citations'].iloc[0][:2]

[{'citation_number': 1,
  'text': 'Net Zero - DCCEEW',
  'retrieval_date': 'June 6, 2025',
  'url': 'https://www.dcceew.gov.au/climate-change/emissions-reduction/net-zero'},
 {'citation_number': 2,
  'text': 'Safeguard Mechanism - Clean Energy Regulator',
  'retrieval_date': 'June 6, 2025',
  'url': 'https://cer.gov.au/schemes/safeguard-mechanism'}]

In [291]:
all_deep_research_parsed_df['citations'].str.len()

0    22
1    10
2    24
3     5
4    23
Name: citations, dtype: int64

In [336]:
# all_deep_research_citation_summaries = []
batch_size = 500
TREAT_LONG_ARTICLES = True

for _, row in tqdm(
    all_deep_research_parsed_df.iloc[:1].iterrows(), 
    total=len(all_deep_research_parsed_df)
):
    citations = row['citations']
    citation_summaries = []
    batches = list(batch_list(citations, batch_size))
    summary = prompt_openai(SUMMARY_PROMPT.format(article=row['content']))
    start_idx = 0
    while True:
        paragraphs_with_citations, found_citations, all_citations_found = (
            filter_deep_research_article_with_citations(
                row['content'], citations, start_idx=start_idx, max_citations=3
            )
        )
        print(f"Found {len(found_citations)} ({found_citations[-1]['citation_number']}) of {len(citations)}  citations...")
        citations_formatted = [format_citation(citation) for citation in found_citations]
        prompt = AUGMENT_DEEP_RESEARCH_SOURCES.format(
            summary=summary,
            k=len(citations_formatted),
            deep_research_article_snippets=paragraphs_with_citations,
            citations='\n'.join(citations_formatted)
        )
        output = prompt_openai(
            prompt, 
            make_deep_research_citation_structure(len(citations_formatted), len(citations_formatted))
        )
        citation_summaries.extend(list(map(lambda x: json.loads(x.model_dump_json()), output.citations)))
        start_idx = found_citations[-1]['citation_number'] + 1
        if all_citations_found:
            break

    all_deep_research_citation_summaries.append({
        'issue': row['title'],
        'citation_summaries': citation_summaries
    })


  0%|          | 0/5 [00:00<?, ?it/s]

Found 3 (3) of 22  citations...
Found 3 (8) of 22  citations...
Found 3 (13) of 22  citations...
Found 3 (17) of 22  citations...
Found 2 (20) of 22  citations...


In [339]:
pd.DataFrame(all_deep_research_citation_summaries).to_json('../data/deep-research-citation-summaries.jsonl', orient='records', lines=True)

In [344]:
import pyperclip
pyperclip.copy(json.dumps(citation_summaries[:3], indent=2))

In [346]:
pyperclip.copy(summary)
summary

"Australia's carbon pricing journey has been marked by policy shifts from a direct carbon tax under the Clean Energy Act 2011, which was repealed in 2014, to the 'Direct Action' policy with the Emissions Reduction Fund, culminating in the reformed Safeguard Mechanism from July 2023 that now acts as a baseline-and-credit emissions trading scheme aimed at reducing emissions from the country's largest industrial emitters. This evolution reflects ongoing efforts to balance environmental goals with economic and political challenges while aiming for a 43% reduction in emissions below 2005 levels by 2030 and net-zero emissions by 2050."

In [None]:
{'url': ''}

In [24]:
news_articles.loc[3]

title               Australia's key climate policy faces an uncert...
author                                                     Tom Lowrey
url                 https://www.abc.net.au/news/2025-03-03/safegua...
publication_date                                       Sun 2 Mar 2025
content             Some business and climate groups have voiced t...
issue                                            australia-carbon-tax
filename                                                      abc.txt
Name: 3, dtype: object

In [None]:
os.listdir('../data/australia-carbon-tax/news/abc.json')