In [1]:
import re
from functools import reduce
from pathlib import Path
from typing import List

import numpy as np
import pandas as pd
from parsel import Selector
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

from scraper.config.metadata import Row, Theme, Schema
from scraper.config.news_parser_config import NewsParserConfig
from scraper.config.news_site_config import SiteConfig
from scraper.news.google_news_scraper import GoogleNewsScraper

In [2]:
# Sample Google News article link (not referenced by the statements in this cell).
urls = [
    'https://news.google.com/read/CBMif0FVX3lxTE5EOUdpRG5FQjJtTGdnOHZDcWhxMVpyZDBWbS0zaHNpVTJoNEMtT3M5QXEwQ0tfa1hyZ3JqS1A1R1JuTWVFNkM4bXU3ZHEzeHdlaWwwcnVLZ3FaN0phQUJ1Tk9EajdwbzQyZ1FuSms2OXNqRko0dmM2NmVmSnRWSkk?hl=en-CA&gl=CA&ceid=CA%3Aen'
]

# Search parameters for this run.
theme = Theme.SUPPLY
keyword = "Vegetable oils used in biodiesel production demand"

scraper = GoogleNewsScraper()

# Alternative call kept for reference (not executed):
# article_urls = scraper._scrape_article_urls(keyword, True)

searched_articles = scraper.get_search_articles(theme, keyword)

# Materialise the search results into a list so they can be counted and re-iterated.
articles = list(searched_articles)

In [3]:
# Sanity check: how many search results were collected into `articles`.
len(articles)

66

In [4]:
# Parser configuration. The extraction loop below asks it for each site's
# instruction (`get_instruction`) and whether that site is in scope
# (`instruction_in_scope`).
config = NewsParserConfig()

def parse(query: str, selector: Selector) -> List[str]:
    """Run a single query against ``selector`` and return every matched string.

    The query flavour is inferred from the query text itself:

    * contains ``/`` and ``text()``: evaluated as XPath, unchanged;
    * contains ``/`` only: evaluated as XPath, then ``*::text`` is applied
      to the matches;
    * contains ``::text``: evaluated as a CSS selector, unchanged;
    * anything else: evaluated as the CSS selector ``"<query> *::text"``.

    Args:
        query: XPath or CSS query string.
        selector: Selector to run the query against.

    Returns:
        The result of ``getall()`` on the final selection.
    """
    if '/' in query:
        matches = selector.xpath(query)
        if 'text()' not in query:
            # The XPath targets elements, so pull their text in a second step.
            matches = matches.css('*::text')
        return matches.getall()

    css_query = query if '::text' in query else f'{query} *::text'
    return selector.css(css_query).getall()

def piecewise_parse(query: List[str], selector: Selector) -> List[str]:
    """Run several sub-queries and concatenate their results in query order.

    Each sub-query is evaluated with ``parse``; the per-query result lists are
    flattened into one list. An empty ``query`` yields an empty list (a
    ``reduce`` without an initial value would raise ``TypeError`` here).

    Args:
        query: Sub-queries to evaluate, in the order their results should appear.
        selector: Selector every sub-query is run against.

    Returns:
        All strings matched by the sub-queries, in order.
    """
    # A single flattening pass avoids the repeated list copies of `x + y`.
    return [text for subquery in query for text in parse(subquery, selector)]

# Columns for which a single-string query keeps only the first match. Any other
# column in an instruction is stored as the article content, with all matches joined.
FIRST_MATCH_COLUMNS = (Schema.TITLE, Schema.DATE, Schema.AUTHOR)


def _extract_field(column, query, selector):
    """Extract the text for one instruction entry, or ``None`` if nothing matched.

    Args:
        column: Schema column the query belongs to.
        query: Either one query string or a list of sub-query strings.
        selector: Selector built from the article HTML.

    Returns:
        For a string query on a title/date/author column: the first match, stripped.
        Otherwise: every match stripped and joined with newlines.
        ``None`` when the query matched nothing.
    """
    if isinstance(query, str):
        values = parse(query, selector)
        if values and column in FIRST_MATCH_COLUMNS:
            return values[0].strip()
    else:
        values = piecewise_parse(query, selector)

    if not values:
        return None
    return '\n'.join(text.strip() for text in values).strip()


rows = []

for search in articles:
    article = search.article

    try:
        instruction = config.get_instruction(article.site)

        if not isinstance(instruction, dict):
            raise TypeError(f'Parser instruction for {article.site!r} is not a dict')
        if not config.instruction_in_scope(article.site):
            raise AttributeError(f'Parser instruction for {article.site!r} is out of scope')

        selector = Selector(text=article.html_content)

        # Columns missing from the instruction stay None.
        fields = {
            Schema.TITLE: None,
            Schema.DATE: None,
            Schema.AUTHOR: None,
            Schema.CONTENT: None,
        }
        for column, query in instruction.items():
            target = column if column in FIRST_MATCH_COLUMNS else Schema.CONTENT
            fields[target] = _extract_field(column, query, selector)

        row = Row(
            fields[Schema.TITLE],
            fields[Schema.DATE],
            fields[Schema.AUTHOR],
            article.site,
            article.url,
            search.theme,
            search.keyword,
            fields[Schema.CONTENT],
        )
    except (KeyError, TypeError, AttributeError):
        # The site has no usable parser instruction. The exception types must be a
        # tuple: `except KeyError or TypeError or AttributeError` only catches KeyError.
        row = Row(None, None, None, article.site, article.url, search.theme, search.keyword, None)

    rows.append(row)


In [5]:
# Column order of the resulting frame; every column is stored with the 'string' dtype.
string_columns = ['title', 'date', 'author', 'site', 'url', 'theme', 'keyword', 'content']
# Scraped fields that may be None; those are replaced by NaN before the dtype cast.
nullable_columns = {'title', 'date', 'author', 'content'}

column_values = {}
for column in string_columns:
    values = [getattr(row, column) for row in rows]
    if column in nullable_columns:
        values = [np.nan if value is None else value for value in values]
    column_values[column] = values

articles_df = pd.DataFrame(column_values).astype(dict.fromkeys(string_columns, 'string'))

In [7]:
# Keep only the articles for which both the title and the content were extracted.
has_title = articles_df['title'].notna()
has_content = articles_df['content'].notna()
valid_df = articles_df[has_title & has_content]
valid_df

Unnamed: 0,title,date,author,site,url,theme,keyword,content
0,U.S. Renewable Diesel Production Growth Drasti...,"June 11, 2024",Timothy O’Neil,USDA Foreign Agricultural Service,https://www.fas.usda.gov/data/us-renewable-die...,Supply,Vegetable oils used in biodiesel production de...,"Executive Summary During the past few years, t..."
4,"UCO prices reach two-year high in 2024, increa...","January 14, 2025",Veronika Prykhodko,Fastmarkets,https://www.fastmarkets.com/insights/uco-price...,Supply,Vegetable oils used in biodiesel production de...,"Fastmarkets’ UCO CIF Amsterdam, Rotterdam, Ant..."
15,U.S. Senators Call for Crackdown on Surging Us...,28-Jun-2024 3:24 PM,Journalist: Li Hua,ChemAnalyst,https://www.chemanalyst.com/NewsAndDeals/NewsD...,Supply,Vegetable oils used in biodiesel production de...,A group of U.S. farm state senators is urging ...
25,EIA now estimates biodiesel production and con...,"July 24, 2020",Mickey Francis,U.S. Energy Information Administration (EIA),https://www.eia.gov/todayinenergy/detail.php?i...,Supply,Vegetable oils used in biodiesel production de...,The U.S. Energy Information Administration (EI...
26,Vegetable oils: Supply stress and biofuel blen...,"August 11, 2022",,Fastmarkets,https://www.fastmarkets.com/insights/vegetable...,Supply,Vegetable oils used in biodiesel production de...,The most prominent shift in the vegetable oils...
27,Clean Fuels outlook predicts growing supplies ...,"November 28, 2023",BY Clean Fuels Alliance America,Biodiesel Magazine,https://biodieselmagazine.com/articles/clean-f...,Supply,Vegetable oils used in biodiesel production de...,"Today, Clean Fuels Alliance America released a..."
37,Heat Exchangers in Renewable Diesel Production,"May 31, 2022",BY John Michelin,Biodiesel Magazine,https://biodieselmagazine.com/articles/heat-ex...,Supply,Vegetable oils used in biodiesel production de...,Although the COVID-19 pandemic reduced demand ...
38,Biofuel Feedstocks in the European Union,06 Nov 2023,By Paul Wightman,CME Group,https://www.cmegroup.com/articles/whitepapers/...,Supply,Vegetable oils used in biodiesel production de...,The European Union (EU) is one of the largest ...


In [9]:
# Flag, per article, which fields were successfully scraped (True = value present).
scrape_status = articles_df.assign(
    title_scraped=articles_df['title'].notna(),
    date_scraped=articles_df['date'].notna(),
    author_scraped=articles_df['author'].notna(),
    content_scraped=articles_df['content'].notna(),
)

# An article is invalid when the title or the content is missing.
missing_required = ~(scrape_status['title_scraped'] & scrape_status['content_scraped'])
report_columns = ['site', 'url', 'title_scraped', 'date_scraped', 'author_scraped', 'content_scraped']

invalid_df = scrape_status.loc[missing_required, report_columns]

In [16]:
# Show the articles whose title and/or content could not be scraped.
invalid_df

Unnamed: 0,site,url,title_scraped,date_scraped,author_scraped,content_scraped
1,Business Research Insights,https://www.businessresearchinsights.com/marke...,False,False,False,False
2,TBD,https://www.grainjournal.com/article/1058044/r...,False,False,False,False
3,GlobeNewswire,https://www.globenewswire.com/news-release/202...,False,False,False,False
5,Clean Air Task Force (CATF),https://www.catf.us/2024/08/carbon-intensity-o...,False,False,False,False
6,TBD,https://biomassmagazine.com/articles/usda-grow...,False,False,False,False
7,Transport Environment (T&E),https://www.transportenvironment.org/articles/...,False,False,False,False
8,Global Market Insights (GMI),https://www.gminsights.com/industry-analysis/u...,False,False,False,False
9,Grand View Research,https://www.grandviewresearch.com/industry-ana...,False,False,False,False
10,Resource Wise,https://www.resourcewise.com/chemicals-blog/ol...,False,False,False,False
11,Rystad Energy,https://www.rystadenergy.com/news/booming-biof...,False,False,False,False


In [12]:
# Build the output path with pathlib so the cell works on Windows, macOS and Linux.
# The literal '.\\data\\info\\...' is only a directory path on Windows; elsewhere the
# backslashes become part of a single file name.
failed_urls_path = Path('data') / 'info' / 'failed_site_urls.xlsx'
# Create the target folder if needed so the export does not fail on a fresh checkout.
failed_urls_path.parent.mkdir(parents=True, exist_ok=True)
invalid_df.to_excel(excel_writer=failed_urls_path, sheet_name='news', index=False, header=True)

In [15]:
# Number of failed articles per site, most frequent first.
(
    invalid_df
    .groupby('site', as_index=False)
    .size()
    .rename(columns={'size': 'count'})
    .sort_values(by='count', ascending=False)
)

Unnamed: 0,site,count
18,TBD,18
24,USDA Economic Research Service,8
6,GlobeNewswire,3
22,Transport Environment (T&E),2
21,The Guardian,2
26,Union of Concerned Scientists,2
4,Clean Air Task Force (CATF),1
7,Grand View Research,1
5,Global Market Insights (GMI),1
1,Business Research Insights,1
