In [None]:
from playwright.async_api import async_playwright
import pandas as pd
import numpy as np
from urllib.parse import urlparse
from transformers import pipeline

In [347]:
url_list = ['https://www.cnbc.com/world/?region=world',
            'https://finance.yahoo.com/',
            'https://www.ft.com/',
            'https://financialpost.com/']

title_list = []
h1_list = []
h2_list = []
h3_list = []
article_list = []

async def non_google_scrap():
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        context = await browser.new_context(
            user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36"
        )
        for i in range(len(url_list)):
            print(f'Scraping in {url_list[i]}')
            page = await context.new_page()
            await page.goto(url_list[i],timeout=30000, wait_until="domcontentloaded")

            title = await page.title()
            h1 = await page.locator("h1").all_text_contents()
            h2 = await page.locator("h2").all_text_contents()
            h3 = await page.locator("h3").all_text_contents()
            
            if i >= 4:
                article = await page.locator('div[aria-level="3"][role="heading"].n0jPhd.ynAwRc.MBeuO.nDgy9d, div[aria-level="3"][role="heading"]').all_text_contents()
            else:  
                article = await page.locator('a[title]').all_text_contents()

            title_list.append(title)
            h1_list.append(h1)
            h2_list.append(h2)
            h3_list.append(h3)

             
            article_list.append(article[:])
            
            print(f'Found {len(article[:])} articles in {url_list[i]}')
            print(f'\n')

            await page.close()
        await browser.close()

In [348]:
#Scrap Google News
url_list = ['https://news.google.com/rss?hl=en-US&gl=US&ceid=US:en',
            'https://news.google.com/rss/headlines/section/topic/BUSINESS?hl=en-US&gl=US&ceid=US:en',
            'https://news.google.com/rss/headlines/section/topic/TECHNOLOGY?hl=en-US&gl=US&ceid=US:en']

title_list = []
h1_list = []
h2_list = []
h3_list = []
article_list = []

async def google_scrap():
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        context = await browser.new_context(
            user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36"
        )
        for i in range(len(url_list)):
            print(f'Scraping in {url_list[i]}')
            page = await context.new_page()
            await page.goto(url_list[i],timeout=30000, wait_until="domcontentloaded")

            title = await page.title()
            h1 = await page.locator("h1").all_text_contents()
            h2 = await page.locator("h2").all_text_contents()
            h3 = await page.locator("h3").all_text_contents()
            
            article = await page.locator('item title').all_text_contents()

            title_list.append(title)
            h1_list.append(h1)
            h2_list.append(h2)
            h3_list.append(h3)

             
            article_list.append(article[:])
            
            print(f'Found {len(article[:])} articles in {url_list[i]}')
            print(f'\n')

            await page.close()
        await browser.close()

In [354]:
#Scrap Caller
choice = input("Scrap with Google News or Other News? Enter G/N")
if choice.lower() == 'g':
    await google_scrap()
elif choice.lower == 'n':
    await non_google_scrap()


Scraping in https://news.google.com/rss?hl=en-US&gl=US&ceid=US:en
Found 38 articles in https://news.google.com/rss?hl=en-US&gl=US&ceid=US:en


Scraping in https://news.google.com/rss/headlines/section/topic/BUSINESS?hl=en-US&gl=US&ceid=US:en
Found 27 articles in https://news.google.com/rss/headlines/section/topic/BUSINESS?hl=en-US&gl=US&ceid=US:en


Scraping in https://news.google.com/rss/headlines/section/topic/TECHNOLOGY?hl=en-US&gl=US&ceid=US:en
Found 57 articles in https://news.google.com/rss/headlines/section/topic/TECHNOLOGY?hl=en-US&gl=US&ceid=US:en




In [350]:
#Data Handling
flattened_list = []
article_source = []

web_index = 0

for website in article_list:
    domain = urlparse(url_list[web_index]).netloc.replace('www.','').replace('.com', '').title()
    for article in website:
        if article != "" and article != ' ':
            flattened_list.append(article)
            article_source.append(domain)
    web_index += 1

In [351]:
#DataFrame Creation
df = pd.DataFrame(list(zip(article_source,flattened_list)),columns=['Source','Title'])
#Extract only the news-like titles by length >= 30 (alphabetic)
df_news = df[df['Title'].str.len() >= 30]
df

Unnamed: 0,Source,Title
0,News.Google,No West Virginia National Guard troops deploye...
1,News.Google,"U.S. halts all asylum decisions, pauses visas ..."
2,News.Google,Northwestern Agrees to Deal With Trump Adminis...
3,News.Google,Three injured in Westfield Valley Fair shoppin...
4,News.Google,Trump claims he will nullify executive orders ...
...,...,...
117,News.Google,Museums are no longer afraid of ‘selling out’....
118,News.Google,Helldivers 2 devs correct “mistake” in Python ...
119,News.Google,Here's the MacBook You Should Buy for Black Fr...
120,News.Google,Use ChatGPT as Your iPhone's Action Button Ass...


In [352]:
#Sentiment Analysis
classifier = pipeline("sentiment-analysis")
Sentiment_Results = classifier(df_news['Title'].tolist())

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Device set to use mps:0


In [353]:
positive_count = 0
negative_count = 0

for news in Sentiment_Results:
    if news['label'] == 'POSITIVE':
        positive_count += 1
    elif news['label'] == 'NEGATIVE':
        negative_count += 1

print('Sentiment Analysis Results')
print(f'Positive News Count is {positive_count}')
print(f'Negative News Count is {negative_count}')
print(f'Ratio of Positive/Negative is {round(positive_count/negative_count,3)}')

Sentiment Analysis Results
Positive News Count is 45
Negative News Count is 77
Ratio of Positive/Negative is 0.584
