In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
from scraper.config.news_scraper_config import NewsScraperConfig
from scraper.config.news_site_config import NewsSiteConfig
from scraper.config.metadata import Schema, Quantity, Parser
from utils.general_utils import filter_not_contain_str

import pandas as pd
import numpy as np
import re

In [2]:
# Start an Edge WebDriver session; the `driver.*` calls in the cells below all go through it.
driver = webdriver.Edge()

## Google News Search 

---

In [21]:
# Open the Google News home page (the hl/gl/ceid query parameters request the en-CA edition).
driver.get("https://news.google.com/home?hl=en-CA&gl=CA&ceid=CA%3Aen")

In [22]:
# Locate the search input by the exact value of its class attribute.
# NOTE(review): these class names look auto-generated and may change between
# Google News releases — confirm the selector still matches before relying on it.
search_bar = driver.find_element(By.CSS_SELECTOR, "input[class='Ax4B8 ZAGvjd']")

In [23]:
# Type the search query into the search box (it is submitted in the next cell).
search_bar.send_keys('Vegetable oils used in biodiesel production demand')

In [24]:
# Press ENTER in the search box to submit the query.
search_bar.send_keys(Keys.ENTER)

In [25]:
# XPath for headline links: <a class='JtKRv'> elements directly under <div class='IL9Cne'>.
news_xpath_locator = ".//div[@class='IL9Cne']/a[@class='JtKRv']"

# Collect every element on the current page that matches the locator.
news = driver.find_elements(By.XPATH, news_xpath_locator)

In [None]:
# news_site_xpath_location = f".//div[@class='IL9Cne']/div[@class='MCAGUe']" # //div[@class='vr1PYe']"

# news_sites = driver.find_elements(By.XPATH, news_site_xpath_location)

In [26]:
# Keep the href of each headline anchor whose `.text` is non-empty;
# anchors with empty text are dropped.
news_links = [
    anchor.get_attribute('href')
    for anchor in news
    if anchor.text
]

In [27]:
# Display the collected article links.
news_links

['https://news.google.com/read/CBMirwFBVV95cUxNck9hdnFvckdWem9RSlhkQlZoY2lKNkVYS0VCZ3FEVHBBVkV4MGl5d3NsWExLUFN2NEktNFV3M2lTOHkwWmYwV2dLR3d6Q0txbnd0TWRsckZkNUdCcWZtWVVuTnZkYUhwc285dUFPUEppUEdsTGZvTlQ3b3RYNG9rZnB2OF95dkROZjl6SkdxUlpGWHcxTUxzeGkyODljTHZrcGprUzUzUXk4TS05dmw0?hl=en-CA&gl=CA&ceid=CA%3Aen',
 'https://news.google.com/read/CBMilAFBVV95cUxNZmJsaExaalUzN3h5eE95MkEzeWtxY0tZbnR1NGNfRzVkODdyZnY2S0hTa3ZzSEgzY2hDMlhySHR6Mnh0UmJ2aE9oWlVDY0xHSi00UzBYbl9UbXdpNHByRE1NOXRhZHV6YWtvUGx2Um1VdGpPRXZKT3h3Vkhld29mMHpuTHZXUjk5R2hzNlpVdndQS3ph?hl=en-CA&gl=CA&ceid=CA%3Aen',
 'https://news.google.com/read/CBMioAFBVV95cUxNOGZOS0JfYkNlOTFrTVo0UzBGWC0zaFh5WTFYdFdJSzVpOGVRd2xiVE1YeTZlRG8wUGxERk1aUHR1b1pXMnpPQ1hhQmZ5RnJzRlE5MTdvZ3FpYkVDU1RjenYzU0s2NFpEekl2YXJ6dy1LLVl3dzdSWVNNbzdjS3RnN252dkV3Yko5MmNmeU42bl8wX2tXVjdHV3JfdzFzYmdB?hl=en-CA&gl=CA&ceid=CA%3Aen',
 'https://news.google.com/read/CBMi6AFBVV95cUxNanBScW9KS0RhUC1VNFg4c3J3eUdGREdSU0JKWWJ4NThkVFY0aXk0dmJqeThnVi1iWXY3NW5UZlozR3dWbFROYTdKcUhfWHYzZmdtU1U

---

# Proof of Concept (POC)

---

In [3]:
# Scraper configuration; its `instruction` mapping is indexed by site name further down.
config = NewsScraperConfig()

In [None]:
# Article used as the proof-of-concept target.
site_url = "https://www.chemanalyst.com/NewsAndDeals/NewsDetails/rapeseed-oil-prices-rise-amid-tight-supplies-and-growing-demand-in-europe-28036"

# Navigate the shared browser session to that article.
driver.get(site_url)

In [None]:
news_config = NewsSiteConfig()

# Capture only the host of an http(s) URL: everything between "://" and the
# first "/", "?", "#" or the end of the URL. Anchoring at the start keeps a
# URL embedded later in the string (e.g. in a redirect parameter) from being
# picked up instead, and a bare-host URL with no path still matches.
url_pattern = r'https?://([^/?#\s]+)'
current_url = driver.current_url
url_match = re.match(url_pattern, current_url)
if url_match is None:
    # Fail with a readable message rather than an IndexError on an empty result.
    raise ValueError(f"Cannot extract a host name from URL: {current_url!r}")
base_url = url_match.group(1)

# Map the host to the site name used as a key in the scraper configuration.
site_name = news_config.get_site_name(base_url)

In [11]:
# Parsing instructions for the detected site, keyed by output column.
scrape_config = config.instruction[site_name]

# Display the instructions for inspection.
scrape_config

{'title': {'quantity': 'unique',
  'search': {'parser': 'selenium',
   'syntax': 'xpath',
   'query': ".//article[@class='blog-detail-summary']/h1"}},
 'date': {'quantity': 'unique',
  'search': {'parser': 'selenium',
   'syntax': 'xpath',
   'query': ".//article[@class='blog-detail-summary']/div[@class='relaventnewspublisheddate']//span[*[1][name()='svg']]"}},
 'author': {'quantity': 'unique',
  'search': {'parser': 'selenium',
   'syntax': 'xpath',
   'query': ".//article[@class='blog-detail-summary']/div[@class='relaventnewspublisheddate']//span[not(*[name()='svg'])]"}},
 'contents': {'quantity': 'multiple',
  'search': {'parser': 'selenium',
   'syntax': 'css selector',
   'query': "div[class='blog-list-data']"}}}

In [12]:
# Scrape the current page into six single-element lists, one per output
# column. The DataFrame cell below reads these list names directly.
title, subtitle, date, author, tags, contents = [], [], [], [], [], []

# (schema key, destination list) pairs. Keys are matched with ==, exactly as
# a chain of `output_col == Schema.X` comparisons would.
column_sinks = [
    (Schema.TITLE, title),
    (Schema.SUBTITLE, subtitle),
    (Schema.DATE, date),
    (Schema.AUTHOR, author),
    (Schema.TAGS, tags),
    (Schema.CONTENT, contents),
]

# Static snapshot of the rendered page, used by instructions whose parser is
# not Selenium.
html_body = driver.page_source
article = BeautifulSoup(html_body, 'html.parser')

# Columns the site configuration does not describe still get one (missing)
# value so that every list ends up with exactly one entry.
output_columns = scrape_config.keys()
for schema_key, sink in column_sinks:
    if schema_key not in output_columns:
        sink.append(np.nan)

for output_col, parsing_info in scrape_config.items():
    sink = next((col for key, col in column_sinks if output_col == key), None)
    if sink is None:
        # Routing an unrecognised key into `contents` would leave the lists
        # with unequal lengths and break the DataFrame construction with an
        # unrelated-looking error, so fail here with the offending key.
        raise ValueError(f"Unknown output column in scrape config: {output_col!r}")

    search = parsing_info['search']
    is_unique = parsing_info['quantity'] == Quantity.UNIQUE

    if search['parser'] == Parser.SELENIUM:
        # find_elements returns an empty list when nothing matches, whereas
        # find_element raises; this lets a missing field be recorded as NaN.
        matches = driver.find_elements(search['syntax'], search['query'])
    elif is_unique:
        # BeautifulSoup's find() returns None when nothing matches.
        first_match = article.find(**search['args'])
        matches = [] if first_match is None else [first_match]
    else:
        matches = article.find_all(**search['args'])

    if is_unique:
        # Single-valued field: text of the first match, or NaN when absent.
        parsed_data = matches[0].text.strip() if matches else np.nan
    else:
        # Multi-valued field: one line per matched element.
        parsed_data = '\n'.join([parsed.text for parsed in matches])

    sink.append(parsed_data)

In [13]:
# Column name -> scraped values, in the order the columns should appear.
article_columns = {
    'title': title,
    'subtitle': subtitle,
    'date': date,
    'author': author,
    'tags': tags,
    'contents': contents,
}

# Target dtype per column: text fields use pandas' string dtype, tags stay as objects.
article_dtypes = {
    'title': 'string',
    'subtitle': 'string',
    'date': 'string',
    'author': 'string',
    'tags': 'object',
    'contents': 'string',
}

articles_df = pd.DataFrame(article_columns).astype(article_dtypes)

In [14]:
# Display the scraped article table.
articles_df

Unnamed: 0,title,subtitle,date,author,tags,contents
0,Rapeseed Oil Prices Rise Amid Tight Supplies a...,,27-May-2024 4:13 PM,Journalist: Yage Kwon,,The European market has experienced a signific...


---

In [10]:
# Snapshot the HTML of the page the browser is currently showing.
html_body = driver.page_source

In [11]:
# Parse the snapshot with Python's built-in HTML parser.
article = BeautifulSoup(html_body, 'html.parser')

In [380]:
def _stripped_text_at(xpath):
    """Return the whitespace-trimmed text of the element ``driver`` finds for ``xpath``."""
    return driver.find_element(By.XPATH, xpath).text.strip()


# Manual lookups of the article header fields on the current page.
article_date = _stripped_text_at(
    ".//article[@class='blog-detail-summary']/div[@class='relaventnewspublisheddate']//span[*[1][name()='svg']]"
)
article_author = _stripped_text_at(
    ".//article[@class='blog-detail-summary']/div[@class='relaventnewspublisheddate']//span[not(*[name()='svg'])]"
)
article_title = _stripped_text_at(
    ".//article[@class='blog-detail-summary']/h1"
)

In [381]:
# Trimmed text of the <div class='blog-list-data'> element found on the current page.
article_content = driver.find_element(By.CSS_SELECTOR, "div[class='blog-list-data']").text.strip()
# article_content = '\n'.join([article_c.text for article_c in content])

In [385]:
# Print the extracted article text for a visual check.
print(article_content)

Germany’s largest chemical manufacturer, BASF SE, is exploring strategic options for its agricultural chemicals division, including a potential listing in either the United States or Germany. This move is part of a broader restructuring effort aimed at reducing costs by over €2 billion annually, driven by a challenging industrial outlook in Europe and increasing pressure on the company’s operations.
BASF has reportedly approached investment banks to solicit proposals for the potential public offering of its agricultural chemical’s unit, which has generated significant revenue, posting sales of approximately €10 billion in 2023. The division competes directly with global agricultural giants such as Bayer, Corteva, and Syngenta. The company’s decision to consider a listing follows previous remarks made in September 2024, where BASF highlighted that its agricultural business had been undervalued by the market, particularly in terms of its earnings potential within the larger group.
The po

In [394]:
# Look up a <div> by class 'RichTextArticleBody RichTextBody' in the parsed page.
# BeautifulSoup's find() returns None when nothing matches.
content_block = article.find(name='div', attrs={'class': 'RichTextArticleBody RichTextBody'})

In [399]:
# Print the text of the content block.
print(content_block.text)


Fastmarkets’ UCO CIF Amsterdam, Rotterdam, Antwerp (ARA) assessment reached a 2024 high of $1,070 per tonne in December, a nearly two-year high, marking a 21% increase from its 2024 low of $885 per tonne in March. 
Fastmarkets’ UCO DDP North-West Europe (NWE) price for the inland market reached €1,175 ($1,222) per tonne in December, a two-year high, up by 28% from its 2024 low of €920 per tonne assessed in January.
“So many factors pushed UCO up and down in price this year; firm demand and high vegetable oil prices kept prices at high levels at the end of the year,” a European source told Fastmarkets.
UCO prices started to uptrend in May amid expectations that the European Commission (EC) would announce preliminary anti-dumping measures against Chinese biodiesel as a result of an investigation launched in December 2023. 
Sources anticipated European UCO demand to increase in order to cover biodiesel supply shortages from China, which mostly comes in the form of used cooking oil methyl

In [None]:
# Element names to leave out when collecting text from a content block.
exclude_tags = [
    'audio', 'video', 'map', 'iframe', 'form', 'button',
    'img', 'figure', 'figcaption', 'article', 'caption',
]


def filter_tags(tag):
    """Decide whether ``tag`` should be kept.

    Intended as the filter callable passed to BeautifulSoup's ``find_all``.

    Args:
        tag: Any object exposing a ``name`` attribute.

    Returns:
        ``False`` when ``tag.name`` is listed in ``exclude_tags``, ``True`` otherwise.
    """
    is_excluded = tag.name in exclude_tags
    return not is_excluded

# Direct children of the content block that pass `filter_tags`
# (recursive=False keeps the search to the first level).
kept_children = content_block.find_all(filter_tags, recursive=False)

# One trimmed line of text per kept child.
print('\n'.join(child.text.strip() for child in kept_children))


A group of U.S. farm state senators is urging regulators to tighten controls on the rapidly increasing imports of used cooking oil, particularly from China, amid concerns over potential fraud and environmental impact. The senators' call to action comes as U.S. biofuel manufacturers increasingly turn to used cooking oil to produce biodiesel, a product eligible for lucrative federal and state climate subsidies.
In a letter addressed to several U.S. regulatory agencies and released Thursday, the senators raised alarm over the possibility that some imported cargoes may contain palm oil, a product associated with deforestation and environmental damage. The dramatic surge in imports has seen the U.S. go from importing less than 200 million pounds of used cooking oil annually to nearly 3 billion pounds in 2023, with over half originating from China.
This development occurs against the backdrop of a global edible oils market experiencing unprecedented growth and volatility. The increasing use

In [None]:


# Load the scraper configuration once and select the Fastmarkets instructions.
metadata = NewsScraperConfig()
fastmarket_scraper = metadata['Fastmarkets news']

# One list per output column; each should end up holding exactly one value.
title, subtitle, date, author, tags, contents = [], [], [], [], [], []

# (instruction key, destination list) pairs for the named columns. Any other
# key is treated as article body text and goes to `contents`.
named_columns = [
    ('title', title),
    ('subtitle', subtitle),
    ('date', date),
    ('author', author),
    ('tags', tags),
]

for data, parse_instruction in fastmarket_scraper.items():
    if parse_instruction['method'] == 'find_element':
        # Single element: its text as one string.
        parsed_data = driver.find_element(parse_instruction['by'], parse_instruction['query']).text
    else:
        # Several elements: a list with the text of each one.
        matched_elements = driver.find_elements(parse_instruction['by'], parse_instruction['query'])
        parsed_data = [parsed.text for parsed in matched_elements]

    destination = next((column for key, column in named_columns if data == key), None)
    if destination is not None:
        destination.append(parsed_data)
    elif isinstance(parsed_data, str):
        # A single-element lookup already yields one string; joining it would
        # insert a newline between every character.
        contents.append(parsed_data)
    else:
        contents.append('\n'.join(parsed_data))

# Give every column the instructions did not fill one missing value, so all
# six lists have the same length when the DataFrame is built.
for column in (title, subtitle, date, author, tags, contents):
    if not column:
        column.append(np.nan)

articles_df = pd.DataFrame({
    'title': title,
    'subtitle': subtitle,
    'date': date,
    'author': author,
    'tags': tags,
    'contents': contents
}).astype({
    'title': 'string',
    'subtitle': 'string',
    'date': 'string',
    'author': 'string',
    'tags': 'object',
    'contents': 'string'
})