In [None]:
from pathlib import Path

import pandas as pd
import numpy as np

from scraper.config.metadata import Theme
from scraper.news.google_news_scraper import GoogleNewsScraper
from scraper.news.article_parser import ArticleParser

# Scrape and Parse

In [2]:
# Search context handed to the scraper together with the article URLs below.
theme = Theme.DEMAND
keyword = "Vegetable oils used in biodiesel production demand"

# Articles to scrape for this theme/keyword pair.
urls = [
    'https://www.fas.usda.gov/data/us-renewable-diesel-production-growth-drastically-impacts-global-feedstock-trade',
    'https://www.cmegroup.com/education/articles-and-reports/increasing-volatility-drives-increased-opportunities-for-hedging-in-waste-oils.html',
    'https://www.reuters.com/markets/commodities/world-food-commodity-prices-fall-january-sugar-vegetable-oils-decline-fao-2025-02-07/',
    'https://www.cmegroup.com/articles/2022/low-carbon-fuels-drive-vegetable-oil-price-volatility.html',
    'https://www.fastmarkets.com/insights/uco-prices-reach-two-year-high-in-2024-increases-to-continue-in-2025-amid-demand/',
    'https://www.gminsights.com/industry-analysis/used-cooking-oil-market'
]

# Scrape every URL, materialise the results into a list, then tabulate them
# into a single DataFrame with one row per article.
searched_articles = GoogleNewsScraper().get_search_articles(theme, keyword, urls)
articles = list(searched_articles)
articles_df = ArticleParser(articles).tabulate_articles()

2025-02-24 11:31:34,731 - GoogleNewsScraper - INFO - Opening Edge browser
2025-02-24 11:31:43,345 - GoogleNewsScraper - INFO - Waiting for page to complete loading (maximum 120 seconds)
2025-02-24 11:31:43,445 - GoogleNewsScraper - INFO - Navigating to the Google news article URL -> https://www.fas.usda.gov/data/us-renewable-diesel-production-growth-drastically-impacts-global-feedstock-trade
2025-02-24 11:31:47,870 - GoogleNewsScraper - INFO - Waiting for page to complete loading (maximum 120 seconds)
2025-02-24 11:31:47,922 - GoogleNewsScraper - INFO - Waiting for the article URL domain to change from "news.google.com" to its original domain
2025-02-24 11:31:47,939 - GoogleNewsScraper - INFO - Getting the article's site name, domain URL, and HTML content from https://www.fas.usda.gov/data/us-renewable-diesel-production-growth-drastically-impacts-global-feedstock-trade
2025-02-24 11:31:48,009 - GoogleNewsScraper - INFO - Article info successfully scraped from https://www.fas.usda.gov/d

# All Articles

In [3]:
# Show every scraped article; fields the parser could not extract appear empty in the output.
articles_df

Unnamed: 0,site,url,theme,keyword,title,date,author,content
0,USDA Foreign Agricultural Service,https://www.fas.usda.gov/data/us-renewable-die...,Demand,Vegetable oils used in biodiesel production de...,,,Timothy O’Neil,"Executive Summary During the past few years, t..."
1,CME Group,https://www.cmegroup.com/education/articles-an...,Demand,Vegetable oils used in biodiesel production de...,,,,
2,Reuters,https://www.reuters.com/markets/commodities/wo...,Demand,Vegetable oils used in biodiesel production de...,,,,
3,CME Group,https://www.cmegroup.com/articles/2022/low-car...,Demand,Vegetable oils used in biodiesel production de...,Low carbon fuels drive vegetable oil price vol...,14 Oct 2022,By CME Group,"The availability of low carbon fuels, produced..."
4,Fastmarkets,https://www.fastmarkets.com/insights/uco-price...,Demand,Vegetable oils used in biodiesel production de...,"UCO prices reach two-year high in 2024, increa...","January 14, 2025",Veronika Prykhodko,"Fastmarkets’ UCO CIF Amsterdam, Rotterdam, Ant..."
5,Global Market Insights (GMI),https://www.gminsights.com/industry-analysis/u...,Demand,Vegetable oils used in biodiesel production de...,,,,


In [None]:
# Print the full article text of the first Fastmarkets row.
# Positional `.iloc[0]` is used instead of the hard-coded index label 4 so the
# cell keeps working when the order (or number) of scraped URLs changes.
print(articles_df.loc[articles_df['site'] == 'Fastmarkets', 'content'].iloc[0])

# Validly Parsed Articles

In [None]:
# An article counts as validly parsed only when both its title and its content
# were extracted, i.e. neither column is missing.
valid_df = articles_df.dropna(subset=['title', 'content'])
valid_df

# Unable-to-Parse Articles

In [None]:
# Build a per-article report of which fields were scraped, keeping only the
# articles where the title or the content is missing.
invalid_df = (
    articles_df
    .assign(
        title_scraped=articles_df['title'].notna(),
        date_scraped=articles_df['date'].notna(),
        author_scraped=articles_df['author'].notna(),
        content_scraped=articles_df['content'].notna(),
    )
    .loc[
        # "title missing OR content missing" == NOT (title present AND content present)
        lambda flagged: ~(flagged['title_scraped'] & flagged['content_scraped']),
        ['site', 'url', 'title_scraped', 'date_scraped', 'author_scraped', 'content_scraped'],
    ]
)

invalid_df

# Document Unable-to-Parse Articles

In [None]:
# Number of unparseable articles per site, most affected sites first.
(
    invalid_df
    .groupby('site')
    .size()
    .reset_index(name='count')
    .sort_values('count', ascending=False)
)

In [None]:
# Persist the list of unparseable articles for follow-up.
# The path is built with pathlib so it resolves correctly on every OS; the
# original backslash-separated literal was only valid on Windows.
failed_urls_path = Path('data') / 'info' / 'failed_site_urls.xlsx'
# Create the target folder if it does not exist yet so a fresh checkout can run this cell.
failed_urls_path.parent.mkdir(parents=True, exist_ok=True)
invalid_df.to_excel(excel_writer=failed_urls_path, sheet_name='news', index=False, header=True)

---

# Development

In [None]:
# articles_df = pd.DataFrame({
#     'site': [row.site for row in rows],
#     'url': [row.url for row in rows],
#     'theme': [row.theme for row in rows],
#     'keyword': [row.keyword for row in rows],
#     'title': [row.title if row.title is not None else np.nan for row in rows],
#     'date': [row.date if row.date is not None else np.nan for row in rows],
#     'author': [row.author if row.author is not None else np.nan for row in rows],
#     'content': [row.content if row.content is not None else np.nan for row in rows]
# }).astype({
#     'site': 'string',
#     'url': 'string',
#     'theme': 'string',
#     'keyword': 'string',
#     'title': 'string',
#     'date': 'string',
#     'author': 'string',
#     'content': 'string'
# })

In [None]:
# config = NewsParserConfig()

# def parse(query: str, selector: Selector) -> List[str]:
#     if '/' in query and 'text()' in query:
#         return selector.xpath(query).getall()
#     elif '/' in query:
#         return selector.xpath(query).css('*::text').getall()
#     elif '::text' in query:
#         return selector.css(query).getall()
#     else:
#         return selector.css(f'{query} *::text').getall()

# def piecewise_parse(query: List[str], selector: Selector) -> List[str]: 
#     subqueries = [parse(subquery, selector) for subquery in query]
#     return reduce(lambda x, y: x + y, subqueries)

# rows = []

# for search in articles:
    
#     try:
#         instruction = config.get_instruction(search.article.site)
        
#         if not isinstance(instruction, dict):
#             raise TypeError
#         if not config.instruction_in_scope(search.article.site):
#             raise AttributeError
#         if Schema.TITLE not in instruction:
#             title = None
#         if Schema.DATE not in instruction:
#             date = None 
#         if Schema.AUTHOR not in instruction:
#             author = None 
#         if Schema.CONTENT not in instruction:
#             content = None

#         selector = Selector(text=search.article.html_content)

#         for column, query in instruction.items():

#             if isinstance(query, str):
#                 if column == Schema.TITLE:
#                     title_value = parse(query, selector) 
#                     title = title_value[0].strip() if title_value != [] else None
#                 elif column == Schema.DATE:
#                     date_value = parse(query, selector) 
#                     date = date_value[0].strip() if date_value != [] else None
#                 elif column == Schema.AUTHOR:
#                     author_value = parse(query, selector) 
#                     author = author_value[0].strip() if author_value != [] else None
#                 else:
#                     content_values = parse(query, selector) 
#                     content = '\n'.join([text.strip() for text in content_values]).strip() if content_values != [] else None
#             else:
#                 if column == Schema.TITLE:
#                     title_values = piecewise_parse(query, selector) 
#                     title = '\n'.join([text.strip() for text in title_values]).strip() if title_values != [] else None
#                 elif column == Schema.DATE:
#                     date_values = piecewise_parse(query, selector) 
#                     date = '\n'.join([text.strip() for text in date_values]).strip() if date_values != [] else None
#                 elif column == Schema.AUTHOR:
#                     author_values = piecewise_parse(query, selector) 
#                     author = '\n'.join([text.strip() for text in author_values]).strip() if author_values != [] else None
#                 else:
#                     content_values = piecewise_parse(query, selector) 
#                     content = '\n'.join([text.strip() for text in content_values]).strip() if content_values != [] else None

#         row = Row(title, date, author, search.article.site, search.article.url, search.theme, search.keyword, content)
#         rows.append(row)
#     except (KeyError, TypeError, AttributeError):
#         # Cannot find the news site in parser config
#         row = Row(None, None, None, search.article.site, search.article.url, search.theme, search.keyword, None)
#         rows.append(row)
