In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.common.exceptions import NoSuchElementException
from webdriver_manager.chrome import ChromeDriverManager

from langchain_community.document_loaders import NewsURLLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from newspaper import Article
import re
from langchain.docstore.document import Document

import os
from dotenv import load_dotenv
load_dotenv()

from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings
from typing import List

In [2]:
def scrape_inquirer_links_selenium(pages=20):
    base_url = 'https://business.inquirer.net/category/latest-stories'
    categories = ['TRADE', 'ECONOMY', 'STOCK MARKETS', 'BONDS', 'INVESTMENTS', 'ENERGY (OIL, ELECTRICITY, COAL)', 'ENERGY', 'AVIATION', 'BANKS', 
                  'CORPORATE EARNINGS', 'RETAIL', 'LATEST STORIES']
    hrefs = []

    options = webdriver.ChromeOptions()
    options.add_argument('--headless')  
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

    try:
        for i in range(1, pages + 1):
            url = base_url if i == 1 else f'{base_url}/page/{i}'
            print(f"Scraping URL: {url}")
            driver.get(url)

            articles = driver.find_elements(By.CSS_SELECTOR, 'div#ch-ls-box')
            print(f"Found {len(articles)} articles with id 'ch-ls-box'")

            for article in articles:
                try:
                    category_div = article.find_element(By.CSS_SELECTOR, 'div#ch-cat')
                    category = category_div.text.strip()
                    print(f"Article category: {category}")

                    if category in categories:
                        a_tag = article.find_element(By.TAG_NAME, 'a')
                        hrefs.append(a_tag.get_attribute('href'))
                except NoSuchElementException as e:
                    print(f"Error finding element: {e}")
    
    finally:
        driver.quit()
    
    return hrefs

if __name__ == '__main__':
    news_urls = scrape_inquirer_links_selenium()
    print(f"Total links found: {len(news_urls)}")
    print("news_urls = [")
    for link in news_urls:
        print(f"    '{link}',")
    print("]")

Scraping URL: https://business.inquirer.net/category/latest-stories
Found 10 articles with id 'ch-ls-box'
Article category: EMPLOYMENT
Article category: LATEST STORIES
Article category: LATEST STORIES
Article category: INFLATION
Article category: CLIMATE CHANGE
Article category: LATEST STORIES
Article category: ENERGY
Article category: ENERGY
Article category: BANKS
Article category: LATEST STORIES
Scraping URL: https://business.inquirer.net/category/latest-stories/page/2
Found 10 articles with id 'ch-ls-box'
Article category: INFLATION
Article category: INVESTMENTS
Article category: STOCKS
Article category: INVESTMENTS
Article category: RATE HIKE
Article category: BANKING
Article category: LATEST STORIES
Article category: HEADLINES
Article category: RETAIL
Article category: LATEST STORIES
Scraping URL: https://business.inquirer.net/category/latest-stories/page/3
Found 10 articles with id 'ch-ls-box'
Article category: EMPLOYMENT / RECRUITMENT
Article category: STOCK MARKET
Article cate

In [3]:
news_urls

['https://business.inquirer.net/469108/allianz-to-buy-majority-stake-in-singapore-insurer',
 'https://business.inquirer.net/469100/stockholm-tests-electric-flying-ferry',
 'https://business.inquirer.net/469081/australia-pledges-to-clean-up-major-union-tainted-by-scandal',
 'https://business.inquirer.net/469094/new-transmission-lines-to-improve-bohols-power-situation',
 'https://business.inquirer.net/469087/luzon-grid-under-yellow-alert-on-wednesday-afternoon-evening',
 'https://business.inquirer.net/469080/hsbc-names-georges-elhedery-as-next-chief-executive',
 'https://business.inquirer.net/469066/t-bond-rates-ease-after-dovish-bsp-fed-signals',
 'https://business.inquirer.net/469064/northern-samar-execs-invite-robinsons-group-to-invest-in-the-province',
 'https://business.inquirer.net/469059/consing-maharlika-seen-to-boost-economic-growth-reduce-poverty',
 'https://business.inquirer.net/469032/dow-soars-to-fresh-record-while-gold-prices-hit-all-time-high',
 'https://business.inquirer.

In [4]:
loader = NewsURLLoader(urls=news_urls, nlp = True)
data = loader.load()



In [5]:
data

[Document(metadata={'title': 'Allianz to buy majority stake in Singapore insurer', 'link': 'https://business.inquirer.net/469108/allianz-to-buy-majority-stake-in-singapore-insurer', 'authors': ['Agence France-Presse'], 'language': 'en', 'description': "German insurer Allianz said Wednesday that it planned to buy a majority stake in Singapore's Income Insurance for around 1.5 billion euros", 'publish_date': datetime.datetime(2024, 7, 17, 15, 58, 41), 'keywords': ['transaction', 'singapores', 'singapore', 'billion', 'insurer', 'regionallianz', 'majority', 'insurance', 'presence', 'euros', 'allianz', 'stake', 'income', 'buy'], 'summary': 'Frankfurt, Germany — German insurer Allianz said Wednesday that it planned to buy a majority stake in Singapore’s Income Insurance for around 1.5 billion euros, as it seeks to expand its foothold in the Asia-Pacific region.\nAllianz said it had made an offer to acquire at least 51 percent of the shares in Income Insurance at 40.58 Singapore dollars (27.7

In [6]:
def clean_page_content(page_content):
    page_content = re.sub(r"\n\n(ADVERTISEMENT|READ:).*?\n\n", "", page_content, flags=re.DOTALL)
    page_content = re.sub(r"SIGN ME UP", "", page_content, flags=re.DOTALL)
    page_content = re.sub(r"READ:.*?\n\n", "", page_content, flags=re.DOTALL)
    page_content = re.sub(r"Your subscription could not be saved. Please try again. Your subscription has been successful. Subscribe to our daily newsletter ", "", page_content, flags=re.DOTALL)
    return page_content

for doc in data:
    doc.page_content = clean_page_content(doc.page_content)

In [7]:
data[0]

Document(metadata={'title': 'Allianz to buy majority stake in Singapore insurer', 'link': 'https://business.inquirer.net/469108/allianz-to-buy-majority-stake-in-singapore-insurer', 'authors': ['Agence France-Presse'], 'language': 'en', 'description': "German insurer Allianz said Wednesday that it planned to buy a majority stake in Singapore's Income Insurance for around 1.5 billion euros", 'publish_date': datetime.datetime(2024, 7, 17, 15, 58, 41), 'keywords': ['transaction', 'singapores', 'singapore', 'billion', 'insurer', 'regionallianz', 'majority', 'insurance', 'presence', 'euros', 'allianz', 'stake', 'income', 'buy'], 'summary': 'Frankfurt, Germany — German insurer Allianz said Wednesday that it planned to buy a majority stake in Singapore’s Income Insurance for around 1.5 billion euros, as it seeks to expand its foothold in the Asia-Pacific region.\nAllianz said it had made an offer to acquire at least 51 percent of the shares in Income Insurance at 40.58 Singapore dollars (27.7 

In [8]:
print(data[0].page_content)

Frankfurt, Germany — German insurer Allianz said Wednesday that it planned to buy a majority stake in Singapore’s Income Insurance for around 1.5 billion euros, as it seeks to expand its foothold in the Asia-Pacific region.

Allianz said it had made an offer to acquire at least 51 percent of the shares in Income Insurance at 40.58 Singapore dollars (27.7 euros) per share, for a total transaction value of 2.2 billion Singapore dollars.The deal would boost its presence in the “fast-growing and attractive” Singapore insurance market, Allianz said, adding that Income Insurance served around two million policyholders across a range of property, health and life insurance products.

The transaction, subject to regulatory approval, is expected to close at the end of 2024 or in the first quarter of 2025.

“This proposed transaction brings two strong businesses together for the benefit of Singapore’s customers and solidifies Allianz’s leadership position in the region,” said Renate Wagner, membe

In [9]:
def chunk_text(doc, chunk_size=2000, chunk_overlap=200):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    chunks = text_splitter.split_text(doc.page_content)
    del doc.metadata['authors']
    del doc.metadata['publish_date']
    del doc.metadata['keywords']
    del doc.metadata['language']
    del doc.metadata['summary']

    return [Document(page_content=chunk, metadata=doc.metadata) for chunk in chunks]

final_chunks = []

for doc in data:
    chunks = chunk_text(doc)
    final_chunks.extend(chunks)

for i, chunk in enumerate(final_chunks):
    print(f"Chunk {i + 1}:")
    print(chunk.page_content)
    print()

Chunk 1:
Frankfurt, Germany — German insurer Allianz said Wednesday that it planned to buy a majority stake in Singapore’s Income Insurance for around 1.5 billion euros, as it seeks to expand its foothold in the Asia-Pacific region.

Allianz said it had made an offer to acquire at least 51 percent of the shares in Income Insurance at 40.58 Singapore dollars (27.7 euros) per share, for a total transaction value of 2.2 billion Singapore dollars.The deal would boost its presence in the “fast-growing and attractive” Singapore insurance market, Allianz said, adding that Income Insurance served around two million policyholders across a range of property, health and life insurance products.

The transaction, subject to regulatory approval, is expected to close at the end of 2024 or in the first quarter of 2025.

“This proposed transaction brings two strong businesses together for the benefit of Singapore’s customers and solidifies Allianz’s leadership position in the region,” said Renate Wagn

In [10]:
final_chunks

[Document(metadata={'title': 'Allianz to buy majority stake in Singapore insurer', 'link': 'https://business.inquirer.net/469108/allianz-to-buy-majority-stake-in-singapore-insurer', 'description': "German insurer Allianz said Wednesday that it planned to buy a majority stake in Singapore's Income Insurance for around 1.5 billion euros"}, page_content='Frankfurt, Germany — German insurer Allianz said Wednesday that it planned to buy a majority stake in Singapore’s Income Insurance for around 1.5 billion euros, as it seeks to expand its foothold in the Asia-Pacific region.\n\nAllianz said it had made an offer to acquire at least 51 percent of the shares in Income Insurance at 40.58 Singapore dollars (27.7 euros) per share, for a total transaction value of 2.2 billion Singapore dollars.The deal would boost its presence in the “fast-growing and attractive” Singapore insurance market, Allianz said, adding that Income Insurance served around two million policyholders across a range of proper

In [11]:
os.environ["OPENAI_API_KEY"] = os.getenv('OPENAI_API_KEY')

In [12]:
COLLECTION_NAME = ""
CHROMA_PATH = "news_embeddings"

def embedding_function():
    return OpenAIEmbeddings(model = 'text-embedding-ada-002')

def add_to_chroma(chunks: List[Document]):
    vectorstore = Chroma(persist_directory = CHROMA_PATH,
                         embedding_function = embedding_function())
    vectorstore.add_documents(chunks)

    return vectorstore

In [13]:
vectorstore = add_to_chroma(final_chunks)