In [1]:
import os
import json
import pandas as pd
import openai
from dotenv import load_dotenv

In [2]:
pd.options.display.max_columns=999
pd.options.display.max_rows=999

In [3]:
# Load the environment variables from .env file
load_dotenv()
openai.api_key = os.environ['OPENAI_API_KEY']

In [4]:
cache_file = '../cache_summary.json'

# Load cache from disk if it exists
try:
    with open(cache_file, 'r') as f:
        cache = json.load(f)
except FileNotFoundError:
    cache = {}

In [5]:
from langchain.chat_models import init_chat_model
from langchain.prompts import PromptTemplate

llm = init_chat_model("gpt-4.1-nano-2025-04-14", model_provider="openai", temperature=0)

In [6]:
df = pd.read_json("../data/2023_processed.json")
df = df.dropna(subset=["maintext"])
print(df.language.unique()) # Articles always in English
print(df.date_publish.map(lambda x: x[:4]).unique()) # Articles always published in 2023

['en']
['2023']


In [7]:
cols_to_keep = ["date_publish", "description", "maintext", "title", "url", "related_companies"]

In [8]:
df.head()

Unnamed: 0,authors,date_download,date_modify,date_publish,description,filename,image_url,language,localpath,maintext,source_domain,title,title_page,title_rss,url,mentioned_companies,related_companies,industries,named_entities,prev_day_price_C,next_day_price_C,curr_day_price_C,sentiment,emotion,news_outlet,prev_day_price_BA,next_day_price_BA,curr_day_price_BA,prev_day_price_GOOGL,next_day_price_GOOGL,curr_day_price_GOOGL,prev_day_price_MA,next_day_price_MA,curr_day_price_MA,prev_day_price_BABA,next_day_price_BABA,curr_day_price_BABA,prev_day_price_WFC,next_day_price_WFC,curr_day_price_WFC,prev_day_price_TSLA,next_day_price_TSLA,curr_day_price_TSLA,prev_day_price_T,next_day_price_T,curr_day_price_T,prev_day_price_GS,next_day_price_GS,curr_day_price_GS,prev_day_price_WMT,next_day_price_WMT,curr_day_price_WMT,prev_day_price_MSFT,next_day_price_MSFT,curr_day_price_MSFT,prev_day_price_JPM,next_day_price_JPM,curr_day_price_JPM,prev_day_price_V,next_day_price_V,curr_day_price_V,prev_day_price_BAC,next_day_price_BAC,curr_day_price_BAC,prev_day_price_AVGO,next_day_price_AVGO,curr_day_price_AVGO,prev_day_price_INTC,next_day_price_INTC,curr_day_price_INTC,prev_day_price_NVDA,next_day_price_NVDA,curr_day_price_NVDA,prev_day_price_MU,next_day_price_MU,curr_day_price_MU,prev_day_price_AMZN,next_day_price_AMZN,curr_day_price_AMZN,prev_day_price_MRK,next_day_price_MRK,curr_day_price_MRK,prev_day_price_COST,next_day_price_COST,curr_day_price_COST,prev_day_price_AAPL,next_day_price_AAPL,curr_day_price_AAPL,prev_day_price_CVX,next_day_price_CVX,curr_day_price_CVX,prev_day_price_PG,next_day_price_PG,curr_day_price_PG,prev_day_price_ADBE,next_day_price_ADBE,curr_day_price_ADBE,prev_day_price_PYPL,next_day_price_PYPL,curr_day_price_PYPL,prev_day_price_KO,next_day_price_KO,curr_day_price_KO,prev_day_price_GE,next_day_price_GE,curr_day_price_GE,prev_day_price_ORCL,next_day_price_ORCL,curr_day_price_ORCL,prev_day_price_DIS,next_day_price_DIS,curr_day_price_DIS,prev_day_price_CSCO,next_day_price_CSCO,curr_day_price_CSCO,prev_day_price_BRK,next_day_price_BRK,curr_day_price_BRK,prev_day_price_NFLX,next_day_price_NFLX,curr_day_price_NFLX,prev_day_price_CRM,next_day_price_CRM,curr_day_price_CRM,prev_day_price_LLY,next_day_price_LLY,curr_day_price_LLY,prev_day_price_HD,next_day_price_HD,curr_day_price_HD,prev_day_price_CMCSA,next_day_price_CMCSA,curr_day_price_CMCSA,prev_day_price_SHOP,next_day_price_SHOP,curr_day_price_SHOP,prev_day_price_ROKU,next_day_price_ROKU,curr_day_price_ROKU,prev_day_price_QCOM,next_day_price_QCOM,curr_day_price_QCOM,prev_day_price_PFE,next_day_price_PFE,curr_day_price_PFE,prev_day_price_JNJ,next_day_price_JNJ,curr_day_price_JNJ,prev_day_price_UNH,next_day_price_UNH,curr_day_price_UNH,prev_day_price_NIO,next_day_price_NIO,curr_day_price_NIO,prev_day_price_XOM,next_day_price_XOM,curr_day_price_XOM,prev_day_price_VZ,next_day_price_VZ,curr_day_price_VZ,prev_day_price_MRNA,next_day_price_MRNA,curr_day_price_MRNA,prev_day_price_SQ,next_day_price_SQ,curr_day_price_SQ
0,[HAVEN DALEY],2023-06-23 07:21:46+00:00,,2023-06-23 05:38:00,"At Tyler Malek's ice cream parlors, one cook's...",https%3A%2F%2Ffinance.yahoo.com%2Fnews%2Fus-to...,https://s.yimg.com/ny/api/res/1.2/Gwc7ViNgtrTh...,en,,"LOS GATOS, Calif. (AP) — At Tyler Malek's ice ...",finance.yahoo.com,The US has tons of leftover food. Upcycling se...,,,https://finance.yahoo.com/news/us-tons-leftove...,[C],"[BSAC, FHN, PACW, BSMX, VLY, MBRG, SMMF, GNBC,...",[6021],"[{'entity_group': 'LOC', 'word': 'Calif', 'nor...",46.63,46.24,46.02,"{'negative': 0.00045554022653900006, 'neutral'...","{'neutral': 0.714272022247314, 'disgust': 0.24...",finance.yahoo.com,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,[Simply Wall St],2023-08-26 15:00:07+00:00,,2023-08-26 14:00:17,"The worst result, after buying shares in a com...",https%3A%2F%2Ffinance.yahoo.com%2Fnews%2Fbaker...,https://media.zenfs.com/en/simply_wall_st__316...,en,,"The worst result, after buying shares in a com...",finance.yahoo.com,Baker Hughes (NASDAQ:BKR) shareholders have ea...,,,https://finance.yahoo.com/news/baker-hughes-na...,[T],"[CHU, INSG, S, TDS, DCM, TMUS, CHT, SPOK, VEON...",[4812],"[{'entity_group': 'ORG', 'word': 'T', 'normali...",,,,"{'negative': 0.00020435870101200002, 'neutral'...","{'neutral': 0.58715546131134, 'disgust': 0.258...",finance.yahoo.com,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,[Giulia Morpurgo and Libby Cherry],2023-12-06 19:54:59+00:00,,2023-12-06 16:57:28,(Bloomberg) -- An insolvency filing by Signa H...,https%3A%2F%2Ffinance.yahoo.com%2Fnews%2Fsigna...,https://s.yimg.com/ny/api/res/1.2/01_VAuYq03xX...,en,,(Bloomberg) -- An insolvency filing by Signa H...,finance.yahoo.com,Signa’s Insolvency Yields Long List of Credito...,,,https://finance.yahoo.com/news/signa-insolvenc...,[BA],[TXT],[3721],"[{'entity_group': 'PER', 'word': 'Rene Benko',...",,,,"{'negative': 0.050802122801542005, 'neutral': ...","{'neutral': 0.7576777338981621, 'fear': 0.0751...",finance.yahoo.com,234.16,237.33,236.89,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,[John Revill],2023-06-14 08:30:30+00:00,,2023-06-14 07:21:56,Swiss citizens vote this weekend on whether to...,https%3A%2F%2Ffinance.yahoo.com%2Fnews%2Flow-t...,https://media.zenfs.com/en/reuters-finance.com...,en,,By John Revill\nZURICH (Reuters) - Swiss citiz...,finance.yahoo.com,Low-tax Switzerland votes on global minimum co...,,,https://finance.yahoo.com/news/low-tax-switzer...,[GOOGL],"[IGLD, RAMP, NSR, TWTR, ACXM, COR, PINS, META,...",[7375],"[{'entity_group': 'PER', 'word': 'John Revill ...",,,,"{'negative': 0.004963840357959001, 'neutral': ...","{'neutral': 0.6667575240135191, 'disgust': 0.1...",finance.yahoo.com,,,,124.43,125.79,124.38,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,[PR Newswire],2023-01-10 21:09:03+00:00,,2023-01-10 20:23:00,Nationally recognized branding agency HAVEN Cr...,https%3A%2F%2Ffinance.yahoo.com%2Fnews%2Fnatio...,https://media.zenfs.com/en/prnewswire.com/6318...,en,,"WAXHAW, N.C., Jan. 10, 2023 /PRNewswire/ -- Na...",finance.yahoo.com,National Branding Agency HAVEN Creative Looks ...,,,https://finance.yahoo.com/news/national-brandi...,[MA],"[FIS, FRXB, AAQC, EEX, AUXO, BBOX, GHY, CTLP, ...",[7389],"[{'entity_group': 'LOC', 'word': 'WA', 'normal...",,,,"{'negative': 0.00017606733308600002, 'neutral'...","{'neutral': 0.8396999835968011, 'joy': 0.09460...",finance.yahoo.com,,,,,,,370.97,377.85999,371.01001,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [9]:
df = df[cols_to_keep]
df.head()

Unnamed: 0,date_publish,description,maintext,title,url,related_companies
0,2023-06-23 05:38:00,"At Tyler Malek's ice cream parlors, one cook's...","LOS GATOS, Calif. (AP) — At Tyler Malek's ice ...",The US has tons of leftover food. Upcycling se...,https://finance.yahoo.com/news/us-tons-leftove...,"[BSAC, FHN, PACW, BSMX, VLY, MBRG, SMMF, GNBC,..."
1,2023-08-26 14:00:17,"The worst result, after buying shares in a com...","The worst result, after buying shares in a com...",Baker Hughes (NASDAQ:BKR) shareholders have ea...,https://finance.yahoo.com/news/baker-hughes-na...,"[CHU, INSG, S, TDS, DCM, TMUS, CHT, SPOK, VEON..."
2,2023-12-06 16:57:28,(Bloomberg) -- An insolvency filing by Signa H...,(Bloomberg) -- An insolvency filing by Signa H...,Signa’s Insolvency Yields Long List of Credito...,https://finance.yahoo.com/news/signa-insolvenc...,[TXT]
3,2023-06-14 07:21:56,Swiss citizens vote this weekend on whether to...,By John Revill\nZURICH (Reuters) - Swiss citiz...,Low-tax Switzerland votes on global minimum co...,https://finance.yahoo.com/news/low-tax-switzer...,"[IGLD, RAMP, NSR, TWTR, ACXM, COR, PINS, META,..."
4,2023-01-10 20:23:00,Nationally recognized branding agency HAVEN Cr...,"WAXHAW, N.C., Jan. 10, 2023 /PRNewswire/ -- Na...",National Branding Agency HAVEN Creative Looks ...,https://finance.yahoo.com/news/national-brandi...,"[FIS, FRXB, AAQC, EEX, AUXO, BBOX, GHY, CTLP, ..."


In [10]:
df["maintext"].sample(1).values[0].split("\n")

['(Bloomberg) -- Arm Ltd., the chip designer backed by SoftBank Group Corp., is in talks with Nvidia Corp. to join a group of potential investors to anchor its initial public offering, according to the Financial Times.',
 'Most Read from Bloomberg',
 'DeSantis Says No Thanks to $377 Million in Federal Energy Funds',
 'Singapore Puts Transport Minister on Leave Amid Graft Probe',
 'Poisoned Cough Syrup Killed Kids. Authorities Cut the Investigation Short',
 'Microsoft Cleared to Buy Activision in US as UK Pauses Fight',
 'Turkey Agrees to Back Sweden’s NATO Bid in Boost to Alliance',
 'Nvidia, a longtime partner and client of Arm, is seeking to invest at a valuation of $35 billion to $40 billion, the paper said, citing unidentified people briefed on the talks. That compares with a valuation closer to $80 billion that SoftBank wants, it said.',
 'Bloomberg earlier reported that Arm was in talks with possible strategic investors including Nvidia competitor Intel Corp. to participate in th

In [11]:
prompt_template = PromptTemplate.from_template("""
As a professional summarizer for a financial newspaper, create a concise and comprehensive summary of the provided article while adhering to these guidelines:

Craft a summary that is detailed, thorough, in-depth, and complex, while maintaining clarity and conciseness.

Incorporate main ideas and essential information, eliminating extraneous language and focusing on critical aspects.

Rely strictly on the provided text, without including external information.

Format the summary in paragraph form for easy understanding.
                                               
You are creating the summary to be put in the frontpage of the newspaper, so be catchy and critical.

Use from 250 to 400 words.                                                                                         

By following this optimized prompt, you will generate an effective summary that encapsulates the essence of the given article in a clear, concise, and reader-friendly manner.

Article: {Article}
""")

In [12]:
llm.invoke(prompt_template.format(Article=df["maintext"].values[0]))

AIMessage(content='In a compelling shift toward sustainability, the upcycling movement is transforming the food industry by turning food waste into high-quality, innovative products, exemplified by Salt & Straw’s pioneering ice cream flavors. Led by head ice cream maker Tyler Malek, the chain utilizes remnants such as whey from yogurt production and brewing byproducts like rice and grains to craft unique flavors like lemon curd and chocolate barley milk, challenging traditional notions of waste. This approach aligns with a broader consumer trend demanding transparency and environmental responsibility, as Americans waste over 35 million tons of food annually—roughly 40% of production—costing the economy more than $200 billion. The Upcycled Food Association (UFA) is at the forefront, certifying products with its “Upcycling Certified” seal, which now adorns over 450 items, including Salt & Straw’s innovative offerings. The certification aims to educate consumers about reducing food waste 

In [13]:
df_limited = df.iloc[:50, :]

In [14]:
def get_summary_with_cache(x):
    if x in cache:
        print("Using cached summary")
        return cache[x]
    summary = llm.invoke(prompt_template.format(Article=x)).content
    cache[x] = summary
    with open(cache_file, 'w') as f:
        json.dump(cache, f)
    return summary

In [15]:
df_limited["summary"] = df_limited["maintext"].map(lambda x: get_summary_with_cache(x))

Using cached summary
Using cached summary
Using cached summary
Using cached summary


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_limited["summary"] = df_limited["maintext"].map(lambda x: get_summary_with_cache(x))


In [16]:
df_limited.head()

Unnamed: 0,date_publish,description,maintext,title,url,related_companies,summary
0,2023-06-23 05:38:00,"At Tyler Malek's ice cream parlors, one cook's...","LOS GATOS, Calif. (AP) — At Tyler Malek's ice ...",The US has tons of leftover food. Upcycling se...,https://finance.yahoo.com/news/us-tons-leftove...,"[BSAC, FHN, PACW, BSMX, VLY, MBRG, SMMF, GNBC,...","In a compelling shift toward sustainability, t..."
1,2023-08-26 14:00:17,"The worst result, after buying shares in a com...","The worst result, after buying shares in a com...",Baker Hughes (NASDAQ:BKR) shareholders have ea...,https://finance.yahoo.com/news/baker-hughes-na...,"[CHU, INSG, S, TDS, DCM, TMUS, CHT, SPOK, VEON...",Baker Hughes Company (NASDAQ:BKR) has demonstr...
2,2023-12-06 16:57:28,(Bloomberg) -- An insolvency filing by Signa H...,(Bloomberg) -- An insolvency filing by Signa H...,Signa’s Insolvency Yields Long List of Credito...,https://finance.yahoo.com/news/signa-insolvenc...,[TXT],"Signa Holding, the umbrella entity of Rene Ben..."
3,2023-06-14 07:21:56,Swiss citizens vote this weekend on whether to...,By John Revill\nZURICH (Reuters) - Swiss citiz...,Low-tax Switzerland votes on global minimum co...,https://finance.yahoo.com/news/low-tax-switzer...,"[IGLD, RAMP, NSR, TWTR, ACXM, COR, PINS, META,...",Switzerland faces a pivotal referendum this we...
4,2023-01-10 20:23:00,Nationally recognized branding agency HAVEN Cr...,"WAXHAW, N.C., Jan. 10, 2023 /PRNewswire/ -- Na...",National Branding Agency HAVEN Creative Looks ...,https://finance.yahoo.com/news/national-brandi...,"[FIS, FRXB, AAQC, EEX, AUXO, BBOX, GHY, CTLP, ...","Haven Creative, a nationally acclaimed brandin..."


In [17]:
df_limited.iloc[2, :].summary

'Signa Holding, the umbrella entity of Rene Benko’s sprawling property and retail empire, has filed for insolvency, shedding light on its intricate and high-stakes financial web. The court documents, obtained by Bloomberg, reveal a creditor list comprising 273 entities, including major financial institutions such as Julius Baer, BNP Paribas, and the Saudi Public Investment Fund, alongside a diverse array of creditors ranging from private security firms to local service providers. Notably, the firm’s assets include iconic trophy properties like New York’s Chrysler Building, London’s Selfridges, and Berlin’s KaDeWe, underscoring its prominence in the luxury real estate sector. However, the filing estimates potential liquidation losses at €5 billion ($5.4 billion), with significant uncertainties surrounding the origins of billions in contingent liabilities and the overall restructuring prospects.\n\nThe financial distress is attributed to a confluence of macroeconomic and sector-specific 

In [18]:
len(df_limited.iloc[2, :].description.split())

73

In [19]:
len(df_limited.iloc[2, :].maintext.split())

831

In [20]:
len(df_limited.iloc[2, :].summary.split())

322

In [21]:
import asyncio

In [22]:
async def process_single_article(article_text, prompt_template):
    """Process a single article asynchronously"""
    if article_text in cache:
        print("Using cached summary")
        return cache[article_text]
    try:
        formatted_prompt = prompt_template.format(Article=article_text)
        response = await llm.ainvoke(formatted_prompt)
        summary = response.content
        cache[article_text] = summary
        with open(cache_file, 'w') as f:
            json.dump(cache, f)
        return summary
    except Exception as e:
        print(f"Error processing article: {e}")
        return None

async def process_articles_batch(df, prompt_template, batch_size=5):
    """Process articles in batches to avoid rate limits"""
    results = []
    
    for i in range(0, len(df), batch_size):
        batch = df.iloc[i:i+batch_size]
        
        # Create tasks for the batch
        tasks = [
            process_single_article(row["maintext"], prompt_template) 
            for _, row in batch.iterrows()
        ]
        
        # Process batch concurrently
        batch_results = await asyncio.gather(*tasks, return_exceptions=True)
        results.extend(batch_results)
        
        # Optional: Add delay between batches to respect rate limits
        if i + batch_size < len(df):
            await asyncio.sleep(1)  # 1 second delay between batches
    
    return results

In [23]:
res = await process_articles_batch(df_limited, prompt_template)

Using cached summary
Using cached summary
Using cached summary
Using cached summary
Using cached summary
Using cached summary
Using cached summary
Using cached summary
Using cached summary
Using cached summary
Using cached summary
Using cached summary
Using cached summary
Using cached summary
Using cached summary
Using cached summary
Using cached summary
Using cached summary
Using cached summary
Using cached summary
Using cached summary
Using cached summary
Using cached summary
Using cached summary
Using cached summary
Using cached summary
Using cached summary
Using cached summary
Using cached summary
Using cached summary
Using cached summary
Using cached summary
Using cached summary
Using cached summary
Using cached summary
Using cached summary
Using cached summary
Using cached summary
Using cached summary
Using cached summary
Using cached summary
Using cached summary
Using cached summary
Using cached summary
Using cached summary
Using cached summary
Using cached summary
Using cached 

In [24]:
res[0]

'In a compelling shift toward sustainability, the upcycling movement is transforming the food industry by turning food waste into high-quality, innovative products, exemplified by Salt & Straw’s pioneering ice cream flavors. Led by head ice cream maker Tyler Malek, the chain utilizes remnants such as whey from yogurt production and rice and grain residues from brewing to craft unique flavors like lemon curd and chocolate barley milk, challenging traditional notions of waste and emphasizing the importance of reducing food wastage. This approach aligns with a broader consumer trend of scrutinizing ingredient sourcing and environmental impact, as over 35 million tons of food—approximately 40% of U.S. food production—are discarded annually, costing the economy over $200 billion. The upcycling trend extends beyond ice cream into mainstream markets, with products like cake mixes and veggie chips incorporating cosmetically imperfect produce—such as misshapen vegetables and blemished fruits—th

In [25]:
# res = await process_articles_batch(df, prompt_template)