In [1]:
import os
import json
import pandas as pd
import openai
from dotenv import load_dotenv

In [2]:
pd.options.display.max_columns=999
pd.options.display.max_rows=999

In [3]:
# Load the environment variables from .env file
load_dotenv()
openai.api_key = os.environ['OPENAI_API_KEY']

In [4]:
cache_file = '../cache_summary.json'

# Load cache from disk if it exists
try:
    with open(cache_file, 'r') as f:
        cache = json.load(f)
except FileNotFoundError:
    cache = {}

In [None]:
from langchain.chat_models import init_chat_model
from langchain.prompts import PromptTemplate

llm = init_chat_model("gpt-4.1-nano-2025-04-14", model_provider="openai", temperature=0)

: 

In [None]:
df = pd.read_json("../data/2023_processed.json")
df = df.dropna(subset=["maintext"])
print(df.language.unique()) # Articles always in English
print(df.date_publish.map(lambda x: x[:4]).unique()) # Articles always published in 2023

In [None]:
cols_to_keep = ["date_publish", "description", "maintext", "title", "url", "related_companies"]

In [None]:
df.head()

Unnamed: 0,authors,date_download,date_modify,date_publish,description,filename,image_url,language,localpath,maintext,source_domain,title,title_page,title_rss,url,mentioned_companies,related_companies,industries,named_entities,prev_day_price_C,next_day_price_C,curr_day_price_C,sentiment,emotion,news_outlet,prev_day_price_BA,next_day_price_BA,curr_day_price_BA,prev_day_price_GOOGL,next_day_price_GOOGL,curr_day_price_GOOGL,prev_day_price_MA,next_day_price_MA,curr_day_price_MA,prev_day_price_BABA,next_day_price_BABA,curr_day_price_BABA,prev_day_price_WFC,next_day_price_WFC,curr_day_price_WFC,prev_day_price_TSLA,next_day_price_TSLA,curr_day_price_TSLA,prev_day_price_T,next_day_price_T,curr_day_price_T,prev_day_price_GS,next_day_price_GS,curr_day_price_GS,prev_day_price_WMT,next_day_price_WMT,curr_day_price_WMT,prev_day_price_MSFT,next_day_price_MSFT,curr_day_price_MSFT,prev_day_price_JPM,next_day_price_JPM,curr_day_price_JPM,prev_day_price_V,next_day_price_V,curr_day_price_V,prev_day_price_BAC,next_day_price_BAC,curr_day_price_BAC,prev_day_price_AVGO,next_day_price_AVGO,curr_day_price_AVGO,prev_day_price_INTC,next_day_price_INTC,curr_day_price_INTC,prev_day_price_NVDA,next_day_price_NVDA,curr_day_price_NVDA,prev_day_price_MU,next_day_price_MU,curr_day_price_MU,prev_day_price_AMZN,next_day_price_AMZN,curr_day_price_AMZN,prev_day_price_MRK,next_day_price_MRK,curr_day_price_MRK,prev_day_price_COST,next_day_price_COST,curr_day_price_COST,prev_day_price_AAPL,next_day_price_AAPL,curr_day_price_AAPL,prev_day_price_CVX,next_day_price_CVX,curr_day_price_CVX,prev_day_price_PG,next_day_price_PG,curr_day_price_PG,prev_day_price_ADBE,next_day_price_ADBE,curr_day_price_ADBE,prev_day_price_PYPL,next_day_price_PYPL,curr_day_price_PYPL,prev_day_price_KO,next_day_price_KO,curr_day_price_KO,prev_day_price_GE,next_day_price_GE,curr_day_price_GE,prev_day_price_ORCL,next_day_price_ORCL,curr_day_price_ORCL,prev_day_price_DIS,next_day_price_DIS,curr_day_price_DIS,prev_day_price_CSCO,next_day_price_CSCO,curr_day_price_CSCO,prev_day_price_BRK,next_day_price_BRK,curr_day_price_BRK,prev_day_price_NFLX,next_day_price_NFLX,curr_day_price_NFLX,prev_day_price_CRM,next_day_price_CRM,curr_day_price_CRM,prev_day_price_LLY,next_day_price_LLY,curr_day_price_LLY,prev_day_price_HD,next_day_price_HD,curr_day_price_HD,prev_day_price_CMCSA,next_day_price_CMCSA,curr_day_price_CMCSA,prev_day_price_SHOP,next_day_price_SHOP,curr_day_price_SHOP,prev_day_price_ROKU,next_day_price_ROKU,curr_day_price_ROKU,prev_day_price_QCOM,next_day_price_QCOM,curr_day_price_QCOM,prev_day_price_PFE,next_day_price_PFE,curr_day_price_PFE,prev_day_price_JNJ,next_day_price_JNJ,curr_day_price_JNJ,prev_day_price_UNH,next_day_price_UNH,curr_day_price_UNH,prev_day_price_NIO,next_day_price_NIO,curr_day_price_NIO,prev_day_price_XOM,next_day_price_XOM,curr_day_price_XOM,prev_day_price_VZ,next_day_price_VZ,curr_day_price_VZ,prev_day_price_MRNA,next_day_price_MRNA,curr_day_price_MRNA,prev_day_price_SQ,next_day_price_SQ,curr_day_price_SQ
0,[HAVEN DALEY],2023-06-23 07:21:46+00:00,,2023-06-23 05:38:00,"At Tyler Malek's ice cream parlors, one cook's...",https%3A%2F%2Ffinance.yahoo.com%2Fnews%2Fus-to...,https://s.yimg.com/ny/api/res/1.2/Gwc7ViNgtrTh...,en,,"LOS GATOS, Calif. (AP) — At Tyler Malek's ice ...",finance.yahoo.com,The US has tons of leftover food. Upcycling se...,,,https://finance.yahoo.com/news/us-tons-leftove...,[C],"[BSAC, FHN, PACW, BSMX, VLY, MBRG, SMMF, GNBC,...",[6021],"[{'entity_group': 'LOC', 'word': 'Calif', 'nor...",46.63,46.24,46.02,"{'negative': 0.00045554022653900006, 'neutral'...","{'neutral': 0.714272022247314, 'disgust': 0.24...",finance.yahoo.com,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,[Simply Wall St],2023-08-26 15:00:07+00:00,,2023-08-26 14:00:17,"The worst result, after buying shares in a com...",https%3A%2F%2Ffinance.yahoo.com%2Fnews%2Fbaker...,https://media.zenfs.com/en/simply_wall_st__316...,en,,"The worst result, after buying shares in a com...",finance.yahoo.com,Baker Hughes (NASDAQ:BKR) shareholders have ea...,,,https://finance.yahoo.com/news/baker-hughes-na...,[T],"[CHU, INSG, S, TDS, DCM, TMUS, CHT, SPOK, VEON...",[4812],"[{'entity_group': 'ORG', 'word': 'T', 'normali...",,,,"{'negative': 0.00020435870101200002, 'neutral'...","{'neutral': 0.58715546131134, 'disgust': 0.258...",finance.yahoo.com,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,[Giulia Morpurgo and Libby Cherry],2023-12-06 19:54:59+00:00,,2023-12-06 16:57:28,(Bloomberg) -- An insolvency filing by Signa H...,https%3A%2F%2Ffinance.yahoo.com%2Fnews%2Fsigna...,https://s.yimg.com/ny/api/res/1.2/01_VAuYq03xX...,en,,(Bloomberg) -- An insolvency filing by Signa H...,finance.yahoo.com,Signa’s Insolvency Yields Long List of Credito...,,,https://finance.yahoo.com/news/signa-insolvenc...,[BA],[TXT],[3721],"[{'entity_group': 'PER', 'word': 'Rene Benko',...",,,,"{'negative': 0.050802122801542005, 'neutral': ...","{'neutral': 0.7576777338981621, 'fear': 0.0751...",finance.yahoo.com,234.16,237.33,236.89,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,[John Revill],2023-06-14 08:30:30+00:00,,2023-06-14 07:21:56,Swiss citizens vote this weekend on whether to...,https%3A%2F%2Ffinance.yahoo.com%2Fnews%2Flow-t...,https://media.zenfs.com/en/reuters-finance.com...,en,,By John Revill\nZURICH (Reuters) - Swiss citiz...,finance.yahoo.com,Low-tax Switzerland votes on global minimum co...,,,https://finance.yahoo.com/news/low-tax-switzer...,[GOOGL],"[IGLD, RAMP, NSR, TWTR, ACXM, COR, PINS, META,...",[7375],"[{'entity_group': 'PER', 'word': 'John Revill ...",,,,"{'negative': 0.004963840357959001, 'neutral': ...","{'neutral': 0.6667575240135191, 'disgust': 0.1...",finance.yahoo.com,,,,124.43,125.79,124.38,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,[PR Newswire],2023-01-10 21:09:03+00:00,,2023-01-10 20:23:00,Nationally recognized branding agency HAVEN Cr...,https%3A%2F%2Ffinance.yahoo.com%2Fnews%2Fnatio...,https://media.zenfs.com/en/prnewswire.com/6318...,en,,"WAXHAW, N.C., Jan. 10, 2023 /PRNewswire/ -- Na...",finance.yahoo.com,National Branding Agency HAVEN Creative Looks ...,,,https://finance.yahoo.com/news/national-brandi...,[MA],"[FIS, FRXB, AAQC, EEX, AUXO, BBOX, GHY, CTLP, ...",[7389],"[{'entity_group': 'LOC', 'word': 'WA', 'normal...",,,,"{'negative': 0.00017606733308600002, 'neutral'...","{'neutral': 0.8396999835968011, 'joy': 0.09460...",finance.yahoo.com,,,,,,,370.97,377.85999,371.01001,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [None]:
df = df[cols_to_keep]
df.head()

Unnamed: 0,date_publish,description,maintext,title,url,related_companies
0,2023-06-23 05:38:00,"At Tyler Malek's ice cream parlors, one cook's...","LOS GATOS, Calif. (AP) — At Tyler Malek's ice ...",The US has tons of leftover food. Upcycling se...,https://finance.yahoo.com/news/us-tons-leftove...,"[BSAC, FHN, PACW, BSMX, VLY, MBRG, SMMF, GNBC,..."
1,2023-08-26 14:00:17,"The worst result, after buying shares in a com...","The worst result, after buying shares in a com...",Baker Hughes (NASDAQ:BKR) shareholders have ea...,https://finance.yahoo.com/news/baker-hughes-na...,"[CHU, INSG, S, TDS, DCM, TMUS, CHT, SPOK, VEON..."
2,2023-12-06 16:57:28,(Bloomberg) -- An insolvency filing by Signa H...,(Bloomberg) -- An insolvency filing by Signa H...,Signa’s Insolvency Yields Long List of Credito...,https://finance.yahoo.com/news/signa-insolvenc...,[TXT]
3,2023-06-14 07:21:56,Swiss citizens vote this weekend on whether to...,By John Revill\nZURICH (Reuters) - Swiss citiz...,Low-tax Switzerland votes on global minimum co...,https://finance.yahoo.com/news/low-tax-switzer...,"[IGLD, RAMP, NSR, TWTR, ACXM, COR, PINS, META,..."
4,2023-01-10 20:23:00,Nationally recognized branding agency HAVEN Cr...,"WAXHAW, N.C., Jan. 10, 2023 /PRNewswire/ -- Na...",National Branding Agency HAVEN Creative Looks ...,https://finance.yahoo.com/news/national-brandi...,"[FIS, FRXB, AAQC, EEX, AUXO, BBOX, GHY, CTLP, ..."


In [None]:
df["maintext"].sample(1).values[0].split("\n")

['The Global Textured Vegetable Protein Market is projected to register a CAGR of 10.25%. Key Highlights. Largest Segment by Region - Europe : The demand for meat substitutes has soared as vegan foods market share increased and the vegan or flexitarian population in the region rose significantly in recent years.',
 'New York, Jan. 23, 2023 (GLOBE NEWSWIRE) -- Reportlinker.com announces the release of the report "Global Textured Vegetable Protein Market - SIZE, SHARE, COVID-19 IMPACT & FORECASTS UP TO 2028" - https://www.reportlinker.com/p06381342/?utm_source=GNW',
 'Largest Segment by Distribution Channel - On-Trade : The channel accounts for over two-thirds of the value of TVP consumption, with more restaurants and hotels turning to TVP-based animal substitutes to satisfy vegan diets.',
 'Fastest-growing Segment by Region - Asia-Pacific : Many international food companies along with regional players are developing innovative meat substitutes derived from pulses, wheat, potato, and oth

In [None]:
prompt_template = PromptTemplate.from_template("""
As a professional summarizer for a financial newspaper, create a concise and comprehensive summary of the provided article while adhering to these guidelines:

Craft a summary that is detailed, thorough, in-depth, and complex, while maintaining clarity and conciseness.

Incorporate main ideas and essential information, eliminating extraneous language and focusing on critical aspects.

Rely strictly on the provided text, without including external information.

Format the summary in paragraph form for easy understanding.
                                               
You are creating the summary to be put in the frontpage of the newspaper, so be catchy and critical.

Use from 150 to 200 words.                                                                                         

By following this optimized prompt, you will generate an effective summary that encapsulates the essence of the given article in a clear, concise, and reader-friendly manner.

Article: {Article}
""")

In [None]:
llm.invoke(prompt_template.format(Article=df["maintext"].values[0]))

AIMessage(content='Amid rising consumer awareness and mounting economic and environmental pressures, the upcycling movement is transforming food waste into high-quality, innovative products across the U.S. food industry. Salt & Straw, a pioneering ice cream chain, exemplifies this shift by incorporating leftover ingredients such as whey from yogurt production, rice remnants from brewing, and cacao pulp into their flavors, emphasizing the need to reframe waste as “wasted food” and reduce unnecessary disposal. This trend aligns with broader efforts to combat the staggering 35 million tons of food waste annually in the U.S., which costs over $200 billion and accounts for roughly 40% of the nation’s food production. The Upcycled Food Association, which certifies products with its “Upcycling Certified” seal, has seen its certified offerings grow from 30 in 2021 to 450 today, highlighting a surge in eco-conscious consumer products like cake mixes, veggie chips, and bakery items made from cos

In [None]:
df_limited = df.iloc[:50, :]

In [None]:
def get_summary_with_cache(x):
    if x in cache:
        print("Using cached summary")
        return cache[x]
    summary = llm.invoke(prompt_template.format(Article=x)).content
    cache[x] = summary
    with open(cache_file, 'w') as f:
        json.dump(cache, f)
    return summary

In [None]:
df_limited["summary"] = df_limited["maintext"].map(lambda x: get_summary_with_cache(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_limited["summary"] = df_limited["maintext"].map(lambda x: get_summary_with_cache(x))


In [None]:
df_limited.head()

Unnamed: 0,date_publish,description,maintext,title,url,related_companies,summary
0,2023-06-23 05:38:00,"At Tyler Malek's ice cream parlors, one cook's...","LOS GATOS, Calif. (AP) — At Tyler Malek's ice ...",The US has tons of leftover food. Upcycling se...,https://finance.yahoo.com/news/us-tons-leftove...,"[BSAC, FHN, PACW, BSMX, VLY, MBRG, SMMF, GNBC,...",Amid rising consumer awareness and mounting ec...
1,2023-08-26 14:00:17,"The worst result, after buying shares in a com...","The worst result, after buying shares in a com...",Baker Hughes (NASDAQ:BKR) shareholders have ea...,https://finance.yahoo.com/news/baker-hughes-na...,"[CHU, INSG, S, TDS, DCM, TMUS, CHT, SPOK, VEON...",Baker Hughes Company (NASDAQ:BKR) has demonstr...
2,2023-12-06 16:57:28,(Bloomberg) -- An insolvency filing by Signa H...,(Bloomberg) -- An insolvency filing by Signa H...,Signa’s Insolvency Yields Long List of Credito...,https://finance.yahoo.com/news/signa-insolvenc...,[TXT],Signa Holding’s recent insolvency filing expos...
3,2023-06-14 07:21:56,Swiss citizens vote this weekend on whether to...,By John Revill\nZURICH (Reuters) - Swiss citiz...,Low-tax Switzerland votes on global minimum co...,https://finance.yahoo.com/news/low-tax-switzer...,"[IGLD, RAMP, NSR, TWTR, ACXM, COR, PINS, META,...",Switzerland faces a pivotal referendum this we...
4,2023-01-10 20:23:00,Nationally recognized branding agency HAVEN Cr...,"WAXHAW, N.C., Jan. 10, 2023 /PRNewswire/ -- Na...",National Branding Agency HAVEN Creative Looks ...,https://finance.yahoo.com/news/national-brandi...,"[FIS, FRXB, AAQC, EEX, AUXO, BBOX, GHY, CTLP, ...","Haven Creative, a nationally acclaimed brandin..."


In [None]:
df_limited.iloc[0, :].summary

'Amid rising consumer awareness and mounting economic and environmental pressures, the upcycling movement is transforming food waste into high-quality, sustainable products, exemplified by Salt & Straw’s innovative ice cream flavors crafted from leftover ingredients. Portland-based Malek’s chain champions this trend by incorporating whey from yogurt production and remnants from beer brewing into unique flavors, emphasizing the need to reframe waste as “wasted food” and reduce unnecessary disposal. This approach aligns with broader industry shifts, as the U.S. wastes over 35 million tons of food annually—roughly 40% of its production—costing more than $200 billion and contributing significantly to environmental degradation. The Upcycled Food Association, which certifies products with its “Upcycling Certified” seal, has seen rapid growth from 30 certified items in 2021 to 450 today, highlighting increasing consumer demand for transparency and sustainability. These labels help consumers i

In [None]:
len(df_limited.iloc[2, :].description.split())

73

In [None]:
len(df_limited.iloc[2, :].maintext.split())

831

In [None]:
len(df_limited.iloc[2, :].summary.split())

255

In [None]:
import asyncio

In [None]:
async def process_single_article(article_text, prompt_template):
    """Process a single article asynchronously"""
    if article_text in cache:
        print("Using cached summary")
        return cache[article_text]
    try:
        formatted_prompt = prompt_template.format(Article=article_text)
        response = await llm.ainvoke(formatted_prompt)
        summary = response.content
        cache[article_text] = summary
        with open(cache_file, 'w') as f:
            json.dump(cache, f)
        return summary
    except Exception as e:
        print(f"Error processing article: {e}")
        return None

async def process_articles_batch(df, prompt_template, batch_size=5):
    """Process articles in batches to avoid rate limits"""
    results = []
    
    for i in range(0, len(df), batch_size):
        batch = df.iloc[i:i+batch_size]
        
        # Create tasks for the batch
        tasks = [
            process_single_article(row["maintext"], prompt_template) 
            for _, row in batch.iterrows()
        ]
        
        # Process batch concurrently
        batch_results = await asyncio.gather(*tasks, return_exceptions=True)
        results.extend(batch_results)
        
        # Optional: Add delay between batches to respect rate limits
        if i + batch_size < len(df):
            await asyncio.sleep(1)  # 1 second delay between batches
    
    return results

In [None]:
res = await process_articles_batch(df.iloc[:100, :], prompt_template)

Using cached summary
Using cached summary
Using cached summary
Using cached summary
Using cached summary


Using cached summary
Using cached summary
Using cached summary
Using cached summary
Using cached summary
Using cached summary
Using cached summary
Using cached summary
Using cached summary
Using cached summary
Using cached summary
Using cached summary
Using cached summary
Using cached summary
Using cached summary
Using cached summary
Using cached summary
Using cached summary
Using cached summary
Using cached summary
Using cached summary
Using cached summary
Using cached summary
Using cached summary
Using cached summary
Using cached summary
Using cached summary
Using cached summary
Using cached summary
Using cached summary
Using cached summary
Using cached summary
Using cached summary
Using cached summary
Using cached summary
Using cached summary
Using cached summary
Using cached summary
Using cached summary
Using cached summary
Using cached summary
Using cached summary
Using cached summary
Using cached summary
Using cached summary


In [None]:
res[0]

'Amid rising consumer awareness and mounting economic and environmental pressures, the upcycling movement is transforming food waste into high-quality, sustainable products, exemplified by Salt & Straw’s innovative ice cream flavors crafted from leftover ingredients. Portland-based Malek’s chain champions this trend by incorporating whey from yogurt production and remnants from beer brewing into unique flavors, emphasizing the need to reframe waste as “wasted food” and reduce unnecessary disposal. This approach aligns with broader industry shifts, as the U.S. wastes over 35 million tons of food annually—roughly 40% of its production—costing more than $200 billion and contributing significantly to environmental degradation. The Upcycled Food Association, which certifies products with its “Upcycling Certified” seal, has seen rapid growth from 30 certified items in 2021 to 450 today, highlighting increasing consumer demand for transparency and sustainability. These labels help consumers i

In [None]:
#res = await process_articles_batch(df, prompt_template)

In [None]:
from langchain_openai import OpenAIEmbeddings
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

In [None]:
from langchain_chroma import Chroma

vector_store = Chroma(
    collection_name="articles",
    embedding_function=embeddings,
    persist_directory="./chroma_langchain_db",
)

In [None]:
from uuid import uuid4
from langchain_core.documents import Document

In [None]:
df.head()

Unnamed: 0,date_publish,description,maintext,title,url,related_companies
0,2023-06-23 05:38:00,"At Tyler Malek's ice cream parlors, one cook's...","LOS GATOS, Calif. (AP) — At Tyler Malek's ice ...",The US has tons of leftover food. Upcycling se...,https://finance.yahoo.com/news/us-tons-leftove...,"[BSAC, FHN, PACW, BSMX, VLY, MBRG, SMMF, GNBC,..."
1,2023-08-26 14:00:17,"The worst result, after buying shares in a com...","The worst result, after buying shares in a com...",Baker Hughes (NASDAQ:BKR) shareholders have ea...,https://finance.yahoo.com/news/baker-hughes-na...,"[CHU, INSG, S, TDS, DCM, TMUS, CHT, SPOK, VEON..."
2,2023-12-06 16:57:28,(Bloomberg) -- An insolvency filing by Signa H...,(Bloomberg) -- An insolvency filing by Signa H...,Signa’s Insolvency Yields Long List of Credito...,https://finance.yahoo.com/news/signa-insolvenc...,[TXT]
3,2023-06-14 07:21:56,Swiss citizens vote this weekend on whether to...,By John Revill\nZURICH (Reuters) - Swiss citiz...,Low-tax Switzerland votes on global minimum co...,https://finance.yahoo.com/news/low-tax-switzer...,"[IGLD, RAMP, NSR, TWTR, ACXM, COR, PINS, META,..."
4,2023-01-10 20:23:00,Nationally recognized branding agency HAVEN Cr...,"WAXHAW, N.C., Jan. 10, 2023 /PRNewswire/ -- Na...",National Branding Agency HAVEN Creative Looks ...,https://finance.yahoo.com/news/national-brandi...,"[FIS, FRXB, AAQC, EEX, AUXO, BBOX, GHY, CTLP, ..."


In [None]:
documents = [Document(page_content=s, metadata={"source_id": i, "date_publish": d}) for i, (s, d) \
             in enumerate(df[["title", "date_publish"]].values)]

In [None]:
vectorstore = Chroma.from_documents(documents=documents, embedding=embeddings)

In [None]:
query = "News about Apple Iphone shipments"
results = vectorstore.similarity_search(query, k=10)

In [None]:
results

[Document(id='ae9ef45f-e848-4d12-9e83-c1c41590fdb3', metadata={'source_id': 23407, 'date_publish': '2023-07-24 10:56:33'}, page_content='Apple aims to keep iPhone shipments steady - Bloomberg News'),
 Document(id='97159fc1-3d73-4b70-ae99-5b70f6e441fb', metadata={'source_id': 6648, 'date_publish': '2023-07-24 14:01:42'}, page_content='Apple Aims to Keep iPhone Shipments Steady Despite 2023 Turmoil'),
 Document(id='dfcc8587-33be-44f0-aca0-23a0e8ce818c', metadata={'date_publish': '2023-01-27 09:00:48', 'source_id': 3919}, page_content='Apple’s iPhone Dominated China Last Quarter Despite Disruptions'),
 Document(id='e08ae6fe-00ff-4afb-b4c7-a02b01e1e0c7', metadata={'date_publish': '2023-12-15 21:25:38', 'source_id': 13722}, page_content="China's ban on Apple's iPhone accelerates- Bloomberg News"),
 Document(id='17d49f3b-9806-49b7-a1da-1050523b5e75', metadata={'source_id': 8629, 'date_publish': '2023-08-03 21:06:00'}, page_content='Apple’s Sluggish iPhone Sales Overshadow Services Growth'),


In [None]:
from langchain.chains import RetrievalQA

In [None]:
retriever = vectorstore.as_retriever(search_kwargs={"k": 10})
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    return_source_documents=True
)

In [None]:
query = "what is the situation of the Iphone shipment in 2023?"
qa_chain.invoke(query)

{'query': 'what is the situation of the Iphone shipment in 2023?',
 'result': "In 2023, Apple aimed to keep iPhone shipments steady despite various challenges. While there have been disruptions, particularly in China, where Apple's iPhone sales faced some headwinds due to a ban and increased local competition, the overall strategy was to maintain shipment levels. Notably, iPhone exports from India doubled to surpass $2.5 billion, indicating growth in that market. Additionally, some production of iPhone 15 models, such as the Pro Max, experienced delays, with deliveries slipping to November, which suggests some supply chain or demand adjustments. Despite these issues, the iPhone continued to dominate the Chinese market and remained a leading product globally, even as overall global smartphone sales remained flat.",
 'source_documents': [Document(id='97159fc1-3d73-4b70-ae99-5b70f6e441fb', metadata={'date_publish': '2023-07-24 14:01:42', 'source_id': 6648}, page_content='Apple Aims to Kee

In [None]:
vector_store = Chroma(
    collection_name="articles",
    embedding_function=embeddings,
    persist_directory="./chroma_langchain_db",
)

documents = [Document(page_content=s, metadata={"source_id": i, "date_publish": d}) for i, (s, d) \
             in enumerate(df[["description", "date_publish"]].values)]

vectorstore = Chroma.from_documents(documents=documents, embedding=embeddings)

In [None]:
retriever = vectorstore.as_retriever(search_kwargs={"k": 10})
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    return_source_documents=True
)

qa_chain.invoke(query)

{'query': 'what is the situation of the Iphone shipment in 2023?',
 'result': 'The situation of iPhone shipments in 2023 is that Apple aims to keep shipments steady despite a projected decline in the overall smartphone market. They are targeting around 85 million units, roughly in line with the previous year. Apple is also considering raising the prices for Pro models to support this goal. Additionally, production in India has increased significantly, with the country now accounting for almost 7% of total iPhone production, up from 1% in 2021. Despite market challenges, Apple has maintained a strong market share in the U.S., and the demand for the new iPhone 15 models appears high, as indicated by longer delivery times.',
 'source_documents': [Document(id='97159fc1-3d73-4b70-ae99-5b70f6e441fb', metadata={'date_publish': '2023-07-24 14:01:42', 'source_id': 6648}, page_content='Apple Aims to Keep iPhone Shipments Steady Despite 2023 Turmoil'),
  Document(id='ae9ef45f-e848-4d12-9e83-c1c41

In [None]:
query="How Meta is leveraging the AI revolution?"
qa_chain.invoke(query)

{'query': 'How Meta is leveraging the AI revolution?',
 'result': "Meta is heavily investing in artificial intelligence by developing more powerful AI systems and integrating AI into its products. The company has dedicated a new team focused on AI products as part of its 'year of efficiency' initiative. Meta aims to use AI to increase the return on investment (ROI) for advertising, especially in response to challenges like Apple's ad tracking policies. Additionally, Meta is developing advanced AI tools and systems to enhance its platform, and it has committed significant resources—around $16 billion over recent years—to defensive and protective AI systems to combat misinformation and disinformation. The company's strategic focus on AI is also part of its broader effort to turn its fortunes around by making the company leaner and more innovative in AI applications.",
 'source_documents': [Document(id='b208c717-cd53-4991-9e65-5e1496cb8af7', metadata={'date_publish': '2023-04-06 18:23:50'