In [None]:
import os 
from openai import OpenAI
# Export the key *before* building the client: OpenAI() is called with no
# arguments, so it has to find OPENAI_API_KEY in the environment at
# construction time. The key file is read inside a `with` so it is closed.
with open(os.path.expanduser('~/.openai-bloomberg-project-key.txt')) as key_file:
    os.environ['OPENAI_API_KEY'] = key_file.read().strip()
client = OpenAI()

In [74]:
from pydantic import BaseModel

# Structured-output schema for a cleaned news article. The batch-cleaning loop
# passes this class to prompt_openai() as `response_format` for every file
# that is not named 'wikipedia.txt'. All five fields are required strings.
# (Documented with `#` comments on purpose: a class docstring on a pydantic
# model is emitted as the schema description.)
class ArticleOutput(BaseModel):
    title: str
    author: str
    url: str
    publication_date: str  # kept as free text, not parsed into a date
    content: str

# Structured-output schema used for files named 'wikipedia.txt'. It declares
# the same five required string fields as ArticleOutput; only the class name
# differs.
class WikipediaArticleOutput(BaseModel):
    title: str
    author: str
    url: str
    publication_date: str  # kept as free text, not parsed into a date
    content: str


# Prompt template for ordinary news pages. The single placeholder
# {raw_article_text} is filled with str.format() in the batch-cleaning loop;
# the field list it requests matches the fields of ArticleOutput.
CLEAN_RAW_ARTICLE_TEXT_PROMPT = """You are a helpful assistant. 
Here is raw text from a news webpage I copied from the internet.
Please extract the article title, author, publication date, and all the article content.
Remove any text that is not part of the article content.
Copy the article content exactly as is, do not summarize anything.

Return in a JSON format with the following fields: title, author, url, publication_date, content.
All the fields should be strings.

<article>
{raw_article_text}
</article>
"""

# Prompt template for Wikipedia pages. The single placeholder
# {raw_article_text} is filled with str.format() in the batch-cleaning loop.
# Unlike the news prompt it asks the model to keep bracketed citation markers
# inside the content.
# NOTE(review): the instructions mention extracting "citations", but neither
# the field list below nor WikipediaArticleOutput has a citations field -
# confirm whether that is intended.
CLEAN_WIKIPEDIA_ARTICLE_TEXT_PROMPT = """You are a helpful assistant. 
Here is raw text from a Wikipedia article I copied from the internet.
Please extract the article title, author, publication date, content and citations.
In the content field, keep ALL citation markers (e.g. [1], [2], [3], etc.), copy them exactly as they are. 
Remove any text that is not part of the article content.

Return in a JSON format with the following fields: title, author, url, publication_date, content.

<article>
{raw_article_text}
</article>
"""


In [187]:
from openai import OpenAI
# Rebinds the module-level `client` that the setup cell already created.
# OpenAI() takes no arguments here, so credentials must come from the
# environment (OPENAI_API_KEY is exported in the setup cell).
client = OpenAI()

def prompt_openai(prompt: str, response_format: type[BaseModel]):
    """Send a single-turn prompt and return the parsed structured output.

    Args:
        prompt: Full text of the user message.
        response_format: Pydantic model *class* (not an instance) describing
            the JSON structure the model must return.

    Returns:
        An instance of ``response_format`` built from the model's reply.

    Raises:
        ValueError: If the reply carries no parsed payload (for example when
            the model refuses). Without this check the function would return
            ``None`` and the failure would only surface later in the caller.
    """
    response = client.beta.chat.completions.parse(
        model="gpt-4.1",
        messages=[{"role": "user", "content": prompt}],
        response_format=response_format,
    )
    message = response.choices[0].message
    if message.parsed is None:
        refusal = getattr(message, "refusal", None)
        raise ValueError(
            f"No parsed output returned by the model (refusal: {refusal!r})"
        )
    return message.parsed

In [None]:
import glob 
from tqdm.auto import tqdm
# Every raw text dump lives at ../data/<issue>/<filename>.txt.
files = glob.glob('../data/*/*.txt')

all_articles = []

for file in tqdm(files):
    # Take the issue folder and file name from the path itself instead of
    # unpacking a fixed four-way split on '/', which raises ValueError as soon
    # as the path depth or the separator differs.
    issue = os.path.basename(os.path.dirname(file))
    filename = os.path.basename(file)

    # Read inside a `with` so the handle is closed; decode explicitly so the
    # result does not depend on the platform's default encoding.
    with open(file, encoding='utf-8') as article_file:
        raw_article_text = article_file.read()

    # Wikipedia dumps get their own prompt and response schema.
    if filename == 'wikipedia.txt':
        article_prompt = CLEAN_WIKIPEDIA_ARTICLE_TEXT_PROMPT.format(raw_article_text=raw_article_text)
        response_format = WikipediaArticleOutput
    else:
        article_prompt = CLEAN_RAW_ARTICLE_TEXT_PROMPT.format(raw_article_text=raw_article_text)
        response_format = ArticleOutput

    article = prompt_openai(article_prompt, response_format)
    # Flatten the parsed model into a plain dict and record where it came from.
    article_dict = dict(article)
    article_dict['issue'] = issue
    article_dict['filename'] = filename
    all_articles.append(article_dict)

  0%|          | 0/27 [00:00<?, ?it/s]

In [92]:
# pandas is only imported in a later cell; import it here so this cell also
# runs top-to-bottom on a fresh kernel instead of raising NameError on `pd`.
import pandas as pd

# One row per cleaned article, written next to the raw data.
all_articles_df = pd.DataFrame(all_articles)
all_articles_df.to_csv('../data/all-articles-parsed.csv', index=False)

In [81]:
from bs4 import BeautifulSoup

def parse_citations(html: str):
    """
    Extract every reference from a Wikipedia <div class="reflist"> block.

    Returns a list with one dict per <li> of the references list, in
    document order:
      - citation_number: int, 1-based position of the item
      - text: str, content of the 'reference-text' span ('' when absent)
      - retrieval_date: str or None, access date without the 'Retrieved' word
      - url: str or None, href of the first 'external text' link
    """
    def _citation_text(item):
        # Full citation text, whitespace-normalised by get_text.
        span = item.find('span', class_='reference-text')
        if not span:
            return ''
        return span.get_text(separator=' ', strip=True)

    def _retrieval_date(item):
        span = item.find('span', class_='reference-accessdate')
        if not span:
            return None
        # Raw text looks like ". Retrieved 24 February 2022": drop the word,
        # then any leading dots/spaces.
        raw = span.get_text(separator=' ', strip=True)
        return raw.replace('Retrieved', '').lstrip('. ').strip()

    def _first_external_url(item):
        # NOTE(review): a class_ value containing a space is presumably
        # compared against the whole attribute string, so links whose classes
        # are ordered differently would be missed - confirm against the data.
        link = item.find('a', class_='external text')
        if link and link.has_attr('href'):
            return link['href']
        return None

    soup = BeautifulSoup(html, 'html.parser')
    # All list items of the references list.
    reference_items = soup.select('div.reflist ol.references > li')
    return [
        {
            'citation_number': number,
            'text': _citation_text(item),
            'retrieval_date': _retrieval_date(item),
            'url': _first_external_url(item),
        }
        for number, item in enumerate(reference_items, start=1)
    ]

In [82]:
# Paths of the .html files one folder level below ../data; each one is later
# read and passed to parse_citations().
citations = glob.glob('../data/*/*.html')

In [85]:
all_citations = []
for citation in citations:
    # `citation` is the path of one HTML file; read it inside a `with` so the
    # handle is closed.
    with open(citation, encoding='utf-8') as html_file:
        html = html_file.read()
    # Keep the parsed result under its own name: rebinding `citations` here
    # would overwrite the list of file paths this loop iterates over, and the
    # cell could then not be re-run on its own.
    parsed_citations = parse_citations(html)
    all_citations.append({
        'citation_file': citation,
        'citations': parsed_citations
    })

In [94]:
import pandas as pd

# One row per HTML file; num_citations is the length of that row's list of
# parsed citations.
citations_df = pd.DataFrame(all_citations).assign(
    num_citations=lambda frame: frame['citations'].str.len()
)

In [98]:
# (citations_df
#  .explode('citations')
#  .assign(
#     citation_number=lambda x: x['citations'].apply(lambda x: x['citation_number']),
#     text=lambda x: x['citations'].apply(lambda x: x['text']),
#     retrieval_date=lambda x: x['citations'].apply(lambda x: x['retrieval_date']),
#     url=lambda x: x['citations'].apply(lambda x: x['url'])
#  )
#  .drop(columns=['citations'])
#  )

In [None]:
# Persist the frame as a JSON array with one record per HTML file
# (orient='records'): the file path, its list of parsed citations and
# num_citations.
citations_df.to_json('../data/all-wikipedia-citations-parsed.json', orient='records')

# Deep Research

In [146]:
from docx import Document
import glob
# NOTE(review): absolute, machine-specific download folder.
deep_research_files = glob.glob('/Users/spangher/Downloads/drive-download-20250607T210740Z-1-001/*')

all_full_texts = []
for doc_file in deep_research_files:
    doc = Document(doc_file)
    # Plain text of the document: paragraph texts in order, separated by a
    # blank line.
    full_text = '\n\n'.join(paragraph.text for paragraph in doc.paragraphs)
    all_full_texts.append(dict(file=doc_file, full_text=full_text))

# One row per input file: its path and its extracted text.
all_full_texts_df = pd.DataFrame(all_full_texts)


In [147]:
# Reduce each path in `file` to its last '/'-separated component.
all_full_texts_df = all_full_texts_df.assign(
    file=lambda frame: frame['file'].map(lambda path: path.split('/')[-1])
)

In [148]:
# Exported document name -> folder name used under ../data.
name_to_foldername = {
    'Musk, Trump Feud Overview_.docx': 'musk-trump-feud',
    'Russia-Ukraine Conflict Overview, 2022+_.docx': 'russia-ukraine-invasion',
    'COP29 Conference Overview Requested_.docx': 'cop29',
    'Panda Diplomacy_ Origins and Impacts_.docx': 'panda-diplomacy',
    'Australia_s Carbon Tax History_.docx': 'australia-carbon-tax',
}

# Attach the folder name to every row; files missing from the mapping get NaN
# and are removed by dropna().
all_full_texts_df = (
    all_full_texts_df
    .assign(foldername=all_full_texts_df['file'].map(name_to_foldername))
    .dropna()
)

In [149]:
# Write each document's extracted text to
# ../data/<foldername>/deep-research/<name>.txt.
for _, row in all_full_texts_df.iterrows():
    output_dir = f"../data/{row['foldername']}/deep-research"
    # Create the target folder on first use; without this the open() below
    # raises FileNotFoundError when the folder does not exist yet.
    os.makedirs(output_dir, exist_ok=True)
    output_name = row['file'].replace('.docx', '.txt')
    # Explicit UTF-8 so the written bytes do not depend on the platform's
    # default encoding.
    with open(f"{output_dir}/{output_name}", 'w', encoding='utf-8') as f:
        f.write(row['full_text'])

In [150]:
# Export an index of the deep-research texts: same rows, with `file` pointing
# at the .txt copy and `filepath` giving its location relative to ../data.
(all_full_texts_df
 # regex=False: '.docx' must be matched literally. The default of `regex` for
 # Series.str.replace differs between pandas versions, and as a pattern the
 # leading '.' would match any character.
 .assign(file=lambda df: df['file'].str.replace('.docx', '.txt', regex=False))
 .assign(filepath=lambda df: df['foldername'] + '/deep-research/' + df['file'])
 .to_csv('../data/all-deep-research-full-texts.csv', index=False)
)

In [194]:
import pyperclip
# Put the full text stored under index label 5 on the clipboard, with each
# real newline turned into the two-character sequence backslash + n.
escaped_full_text = all_full_texts_df.loc[5, 'full_text'].replace('\n', '\\n')
pyperclip.copy(escaped_full_text)

In [170]:
# Prompt template for Deep Research exports. The single placeholder
# {raw_article_text} is filled with str.format(); the requested fields
# (title, content, citations, thought_process) match DeepResearchArticle.
CLEAN_DEEP_RESEARCH_ARTICLE_TEXT_PROMPT = """You are a helpful assistant. 
Here is raw text from Deep Research I generated.
Please extract the title, content and thought process (which should be at the end).
In the content field, pay special attention to keep ALL citation markers (e.g. 1, 2, 3, etc.), copy them exactly as they are. 
The thought process is the chain-of-thought reasoning that occurs at the end of the article, after the citations.

Return in a JSON format with the following fields: title, content, citations, thought_process.
Citations should be a list of responses.

<article>
{raw_article_text}
</article>
"""

In [171]:
from pydantic import BaseModel

# One entry of a Deep Research citation list. It uses the same four keys as
# the dicts returned by parse_citations(), but here every field is required:
# retrieval_date and url are plain `str`, not optional.
class DeepResearchCitation(BaseModel):
    citation_number: int
    text: str
    retrieval_date: str
    url: str

# Structured-output schema for a cleaned Deep Research report; passed to
# prompt_openai() as `response_format`. All four fields are required.
class DeepResearchArticle(BaseModel):
    title: str
    content: str
    thought_process: str
    citations: list[DeepResearchCitation]  # one item per extracted citation

In [None]:
# Build the cleaning prompt for the first row (positional index 0) only; the
# batch loop further down starts from the second row.
prompt = CLEAN_DEEP_RESEARCH_ARTICLE_TEXT_PROMPT.format(raw_article_text=all_full_texts_df['full_text'].iloc[0])

In [172]:
# Single trial call using whatever `prompt` currently holds.
# NOTE(review): this result is not written to disk in the visible cells -
# confirm whether the first document is saved elsewhere.
deep_research = prompt_openai(prompt, DeepResearchArticle)

In [202]:
# Body of the document stored under index label 3: everything after the first
# blank-line break (i.e. without the first paragraph) and before the first
# occurrence of 'Works cited'.
main_content = (
    all_full_texts_df.loc[3, 'full_text']
    .partition('\n\n')[2]
    .partition('Works cited')[0]
)
# Copy it to the clipboard with newlines and double quotes backslash-escaped.
escaped_main_content = main_content.replace('\n', '\\n').replace('"', '\\"')
pyperclip.copy(escaped_main_content)

In [188]:
## Careful, does not reliably work and needs manual checking...

import json
from tqdm.auto import tqdm
# Clean every document except the first row and store each result as
# ../data/<foldername>/deep-research/<foldername>.json.
rows_after_first = all_full_texts_df.iloc[1:]
for _, row in tqdm(rows_after_first.iterrows(), total=len(all_full_texts_df) - 1):
    folder = row["foldername"]
    prompt = CLEAN_DEEP_RESEARCH_ARTICLE_TEXT_PROMPT.format(raw_article_text=row['full_text'])
    deep_research = prompt_openai(prompt, DeepResearchArticle)
    output_path = f'../data/{folder}/deep-research/{folder}.json'
    with open(output_path, 'w') as f:
        # Round-trip through model_dump_json() so json.dump receives plain
        # built-in types.
        payload = json.loads(deep_research.model_dump_json())
        json.dump(payload, f)

  0%|          | 0/5 [00:00<?, ?it/s]

In [225]:
all_full_texts_df

Unnamed: 0,file,full_text,foldername
0,"Musk, Trump Feud Overview_.docx",A Collision of Titans: The Turbulent Saga of E...,musk-trump-feud
2,"Russia-Ukraine Conflict Overview, 2022+_.docx",The Russia-Ukraine Conflict: 2022 Onwards – Ge...,russia-ukraine-invasion
3,COP29 Conference Overview Requested_.docx,"COP29 in Baku: Navigating Finance, Finalizing ...",cop29
4,Panda Diplomacy_ Origins and Impacts_.docx,The Enduring Allure and Strategic Significance...,panda-diplomacy
5,Australia_s Carbon Tax History_.docx,Australia's Enduring Pursuit of Carbon Pricing...,australia-carbon-tax
