In [1]:
# Importing libs
import re
import time
import json
import requests
from bs4 import BeautifulSoup as bs
from bs4 import SoupStrainer
from urllib.parse import unquote

In [2]:
def get_news_html(search_url: str = 'https://finance.yahoo.com/topic/latest-news/'):
    """Download a page and return the HTTP response when the request succeeded.

    Parameters
    ----------
    search_url : str
        Address to fetch. Defaults to the Yahoo Finance "latest news" topic page.

    Returns
    -------
    requests.Response
        The response object, returned only when the status code is 2xx.

    Raises
    ------
    Exception
        If the server answers with a non-2xx status code; the message names
        the URL and the status code received.
    requests.RequestException
        If the request itself fails (connection error, timeout, ...).
    """
    # A timeout keeps the notebook from hanging forever on a stalled connection.
    rq = requests.get(search_url, timeout=30)

    if 200 <= rq.status_code < 300:
        print(f'Scrapping de {search_url} finalizado com sucesso')
        return rq

    # Carry the context in the error instead of raising a bare, message-less Exception.
    raise Exception(f'Request to {search_url} failed with HTTP status {rq.status_code}')


# Smoke test: fetch the default URL; the cell displays the returned Response object.
get_news_html()

Scrapping de https://finance.yahoo.com/topic/latest-news/ finalizado com sucesso


<Response [200]>

In [3]:
def get_news_soup(raw_html = None):
    """Parse the body of an HTTP response into a BeautifulSoup tree.

    Parameters
    ----------
    raw_html : requests.Response, optional
        Response whose ``.text`` is parsed. When omitted, the default page is
        downloaded with ``get_news_html()`` at call time.

    Returns
    -------
    bs4.BeautifulSoup
        The document parsed with the built-in ``'html.parser'``.
    """
    # Resolve the default lazily: a call in the signature would run once, when
    # the function is defined, and every later call would reuse that stale page.
    if raw_html is None:
        raw_html = get_news_html()

    return bs(raw_html.text, 'html.parser')


# Parse the default page; as the last expression, the whole document is displayed.
get_news_soup()

Scrapping de https://finance.yahoo.com/topic/latest-news/ finalizado com sucesso


<!DOCTYPE html>
<html class="NoJs chrome desktop failsafe" data-color-theme="light" id="atomic" lang="en-US"><head prefix="og: https://ogp.me/ns#"><script>window.performance && window.performance.mark && window.performance.mark('PageStart');</script><meta charset="utf-8"/><title>Latest News</title><meta content="401k, Business, Financial Information, Investing, Investor, Market News, Stock Research, Stock Valuation, business news, economy, finance, investment tools, mortgage, mutual funds, personal finance, quote, real estate, retirement, stock, stocks, Suze Orman, tax, track portfolio" name="keywords"/><meta content="on" http-equiv="x-dns-prefetch-control"/><meta content="on" property="twitter:dnt"/><meta content="458584288257241" property="fb:app_id"/><meta content="#037B66" name="theme-color"/><meta content="width=device-width, initial-scale=1" name="viewport"/><meta content="At Yahoo Finance, you get free stock quotes, up-to-date news, portfolio management resources, international 

In [4]:
def get_news_items_html(soup = None, max_index = 10):
    """Return the leading news entries found in a parsed news page.

    Parameters
    ----------
    soup : bs4.BeautifulSoup, optional
        Parsed page to search. When omitted, the default page is fetched and
        parsed with ``get_news_soup()`` at call time.
    max_index : int
        Highest position (inclusive) to keep, so at most ``max_index + 1``
        elements are returned.

    Returns
    -------
    list
        Elements matching the ``.js-stream-content`` selector, in the order
        the selector returned them.
    """
    # Resolve the default lazily instead of in the signature, where it would
    # trigger a download at definition time and then be reused forever.
    if soup is None:
        soup = get_news_soup()

    # Run the selector once; repeating it per item rescans the whole document.
    soup_news = soup.select('.js-stream-content')

    # max_index is inclusive. Clamp at zero so a negative value yields an
    # empty list rather than slicing from the end of the sequence.
    keep = max(max_index + 1, 0)
    return list(soup_news[:keep])

# Display the raw stream elements selected from the default page.
get_news_items_html()

[<li class="js-stream-content Pos(r)"><div class="Py(14px) Pos(r)" data-test-locator="mega"><div class="Cf"><div class="Fl(start) Pos(r) Mt(2px) W(26.5%) Maw(220px)"><div class="H(0) Ov(h) Bdrs(2px)" style="padding-bottom:56%"><img alt="" class="W(100%) Trsdu(0s)! Bdrs(2px)" data-status="LOADING" src="https://s.yimg.com/uu/api/res/1.2/tSXeAcXMKbX9.DcPrHIWlA--~B/Zmk9c3RyaW07aD0xMjM7cT04MDt3PTIyMDthcHBpZD15dGFjaHlvbg--/https://media.zenfs.com/en/reuters-finance.com/ed4a91b5fef7d302e6dbac98a95f3df9.cf.jpg" srcset="https://s.yimg.com/uu/api/res/1.2/tSXeAcXMKbX9.DcPrHIWlA--~B/Zmk9c3RyaW07aD0xMjM7cT04MDt3PTIyMDthcHBpZD15dGFjaHlvbg--/https://media.zenfs.com/en/reuters-finance.com/ed4a91b5fef7d302e6dbac98a95f3df9.cf.jpg 1x,https://s.yimg.com/uu/api/res/1.2/KgTQipPu_.M_ckFnqTftOA--~B/Zmk9c3RyaW07aD0yNDY7cT04MDt3PTQ0MDthcHBpZD15dGFjaHlvbg--/https://media.zenfs.com/en/reuters-finance.com/ed4a91b5fef7d302e6dbac98a95f3df9.cf.jpg 2x"/></div></div><div class="Ov(h) Pend(44px) Pstart(25px)"><div class

In [13]:
def get_news_data(html_raw_list: 'list | None' = None):
    """Collect metadata for every news stream item by visiting its article page.

    NOTE: a later cell (In [116]) defines ``get_news_data`` again; when the
    notebook is run top to bottom that later definition replaces this one.

    Parameters
    ----------
    html_raw_list : list, optional
        Stream elements that expose ``.h3.a['href']`` and ``.h3.text``. When
        omitted, the items are obtained from ``get_news_items_html()`` at
        call time.

    Returns
    -------
    list of dict
        One dict per item with the keys ``url``, ``author``, ``headline``,
        ``datetime_news``, ``mins_read``, ``data_site``, ``data_source``,
        ``data_type`` and ``count_comments``. Empty values are replaced by
        the string ``'None'``.

    Raises
    ------
    IndexError, AttributeError, KeyError, TypeError
        If an article page lacks one of the expected elements or attributes;
        this strict variant does not tolerate missing markup.
    requests.RequestException
        If an article page cannot be downloaded.
    """
    # Resolve the default lazily; a call in the signature would scrape the
    # site when the function is defined and freeze that result as the default.
    if html_raw_list is None:
        html_raw_list = get_news_items_html()

    search_data = []

    for item in html_raw_list:
        # Link and headline come from the stream item itself.
        url = item.h3.a['href']
        headline = item.h3.text

        # Everything else lives on the article page.
        news_page = requests.get(url, timeout=30)
        news_content = bs(news_page.text, 'html.parser')

        author = news_content.select('.caas-attr-item-author')[0].text

        # The time block holds the timestamp and a text whose first token is
        # used as the reading time.
        time_block = news_content.select('.caas-attr-time-style')[0]
        datetime_news = time_block.time['datetime']
        mins_read = time_block.select('.caas-attr-mins-read')[0].text.split()[0]

        # Look the comment counter up once and read all of its fields from it.
        reactions = news_content.select('.reactions-count')[0]

        collected = {
            'url': url,
            'author': author,
            'headline': headline,
            'datetime_news': datetime_news,
            'mins_read': mins_read,
            'data_site': reactions['data-site'],
            'data_source': reactions['data-source'],
            'data_type': reactions['data-type'],
            'count_comments': reactions.text,
        }

        # Replace empty values with the 'None' placeholder, keeping key order.
        search_data.append({
            key: value if len(value) > 0 else 'None'
            for key, value in collected.items()
        })

    return search_data

# Run the scraper with its default input and display the collected records.
get_news_data()

[{'url': 'https://finance.yahoo.com/news/oil-slips-us-growth-worries-005019294.html',
  'author': 'Georgina McCartney',
  'headline': 'Oil slips on US growth worries, ample crude supply',
  'datetime_news': '2024-06-13T00:50:19.000Z',
  'mins_read': '2',
  'data_site': 'finance',
  'data_source': 'spotIm',
  'data_type': 'commentsCount',
  'count_comments': '0'},
 {'url': 'https://finance.yahoo.com/news/chinas-premier-li-visiting-zealand-004652978.html',
  'author': 'CHARLOTTE GRAHAM-McLAY',
  'headline': "China's Premier Li is visiting New Zealand, where security fears vie with trade hopes on the agenda",
  'datetime_news': '2024-06-13T00:46:52.000Z',
  'mins_read': '4',
  'data_site': 'finance',
  'data_source': 'spotIm',
  'data_type': 'commentsCount',
  'count_comments': '0'},
 {'url': 'https://finance.yahoo.com/news/boj-weigh-bond-buying-investors-210000270.html',
  'author': 'Toru Fujioka',
  'headline': 'BOJ to Weigh Bond Buying as Investors Seek Hints for July Hike',
  'datetim

In [116]:
def get_news_data(html_raw_list: 'list | None' = None):
    """Collect metadata for every news stream item, tolerating missing fields.

    Each field is extracted independently. A field that cannot be read
    (missing element, missing attribute, empty value, failed download) is
    reported as the string ``'None'`` instead of aborting the run or leaking
    the value scraped for a previous item.

    Parameters
    ----------
    html_raw_list : list, optional
        Stream elements that expose ``.h3.a['href']`` and ``.h3.text``. When
        omitted, the items are obtained from ``get_news_items_html()`` at
        call time.

    Returns
    -------
    list of dict
        One dict per input item, in input order, with the keys ``url``,
        ``author``, ``headline``, ``datetime_news``, ``mins_read``,
        ``data_site``, ``data_source``, ``data_type`` and ``count_comments``.
    """
    def _read(getter):
        """Run one extraction step; missing markup or an empty value gives 'None'."""
        try:
            value = getter()
        except (AttributeError, IndexError, KeyError, TypeError):
            return 'None'
        return value if value else 'None'

    # Resolve the default lazily; a call in the signature would scrape the
    # site when the function is defined and freeze that result as the default.
    if html_raw_list is None:
        html_raw_list = get_news_items_html()

    search_data = []

    for item in html_raw_list:
        # Start from placeholders so every record always has every key.
        news_data = {
            'url': _read(lambda: item.h3.a['href']),
            'author': 'None',
            'headline': _read(lambda: item.h3.text),
            'datetime_news': 'None',
            'mins_read': 'None',
            'data_site': 'None',
            'data_source': 'None',
            'data_type': 'None',
            'count_comments': 'None'
        }
        search_data.append(news_data)

        # Without a link there is no article page to visit.
        if news_data['url'] == 'None':
            continue

        try:
            news_page = requests.get(news_data['url'], timeout=30)
        except requests.RequestException:
            # Keep the stream-level fields; the page-level ones stay 'None'.
            continue

        news_content = bs(news_page.text, 'html.parser')

        # Select each container once and read its fields from the result.
        time_block = news_content.select('.caas-attr-time-style')
        reactions = news_content.select('.reactions-count')

        news_data.update({
            'author': _read(lambda: news_content.select('.caas-attr-item-author')[0].text),
            'datetime_news': _read(lambda: time_block[0].time['datetime']),
            # First whitespace-separated token of the reading-time text.
            'mins_read': _read(
                lambda: time_block[0].select('.caas-attr-mins-read')[0].text.split()[0]
            ),
            'data_site': _read(lambda: reactions[0]['data-site']),
            'data_source': _read(lambda: reactions[0]['data-source']),
            'data_type': _read(lambda: reactions[0]['data-type']),
            'count_comments': _read(lambda: reactions[0].text),
        })

    return search_data

# Run the fault-tolerant scraper with its default input and display the records.
get_news_data()

[{'url': 'https://finance.yahoo.com/news/virgin-galactic-spaceplane-takes-tourists-172215142.html',
  'author': 'Reuters',
  'headline': 'Virgin Galactic spaceplane takes tourists on flight',
  'datetime_news': '2024-06-08T17:22:15.000Z',
  'mins_read': '1',
  'data_site': 'finance',
  'data_source': 'spotIm',
  'data_type': 'commentsCount',
  'count_comments': '0'},
 {'url': 'https://finance.yahoo.com/news/brazil-sugar-mogul-says-lula-171336385.html',
  'author': 'Leonardo Lara',
  'headline': 'Brazil Sugar Mogul Says Lula’s Fiscal Plan Will Keep Rates High',
  'datetime_news': '2024-06-08T17:13:36.000Z',
  'mins_read': '1',
  'data_site': 'finance',
  'data_source': 'spotIm',
  'data_type': 'commentsCount',
  'count_comments': '0'},
 {'url': 'https://finance.yahoo.com/news/biden-stinks-at-being-trumpy-170749015.html',
  'author': 'Rick Newman·Senior Columnist',
  'headline': 'Biden stinks at being Trumpy',
  'datetime_news': '2024-06-08T17:07:49.000Z',
  'mins_read': '5',
  'data_sit

In [11]:
## Text of the first paragraph inside the first stream item
# No earlier cell shown in this notebook assigns `soup` at module level (it is
# only a parameter of get_news_items_html), so build it here to keep the cell
# runnable after Restart Kernel -> Run All.
soup = get_news_soup()
soup.select('.js-stream-content')[0].p.text



'The operator and owner of brands such as Regal, Cinema City, Picturehouse and Planet has lined up AlixPartners as a consultant to work on a potential disposal of its UK operations, the report said, adding that the sale process was expected to run for several weeks.  The company surfaced from Chapter 11 bankruptcy less than a year ago after filling for U.S. bankruptcy in 2022 to restructure debt.  It is also considering alternative options, including a company voluntary arrangement which could put an unspecified number of its UK cinemas at risk of closure, Sky said.'

In [33]:
# Scratch cell: probe the author, reading-time and comment-count elements of one article page.
# NOTE(review): `news_content` is only a local variable of get_news_data in the
# cells shown above; this cell relies on a module-level copy left in the
# kernel, so it fails on a fresh kernel — confirm where it should come from.
# The next line is a CSS path to the author element, presumably copied from the browser inspector:
#caas-art-ddbc3274-d724-3010-8040-19bce6217663 > article > div > div > div > div > div > div > div:nth-child(1) > div.caas-content-byline-wrapper > div.caas-attr > div > div.caas-attr-item-author > span
author = news_content.select('.caas-attr-item-author')[0].text
# NOTE(review): with IPython's default settings only a cell's last expression
# is displayed, so this bare name shows nothing here — confirm it is still wanted.
author

# Container holding the timestamp and the reading-time label
updated_time = news_content.select('.caas-attr-time-style')

# First whitespace-separated token of the reading-time text
mins_read = updated_time[0].select('.caas-attr-mins-read')[0].text.split()[0]

# Last expression of the cell: the list of comment-counter elements
news_content.select('.reactions-count')


'Natalie  Grover'

In [34]:
# Scratch cell: inspect the container that holds the article's time information.
# NOTE(review): depends on a module-level `news_content` that no cell shown
# above creates — confirm its origin.
# The next line is a CSS path to the <time> element, presumably copied from the browser inspector:
#caas-art-ddbc3274-d724-3010-8040-19bce6217663 > article > div > div > div > div > div > div > div:nth-child(1) > div.caas-content-byline-wrapper > div.caas-attr > div > div.caas-attr-time-style > time
updated_time = news_content.select('.caas-attr-time-style')
updated_time

[<div class="caas-attr-time-style"><span>Updated </span><time class="caas-attr-meta-time" datetime="2024-05-27T14:24:54.000Z">Mon, May 27, 2024, 7:24 AM</time><span class="caas-attr-mins-read">2 min read</span></div>]

In [51]:
# Extract the article's timestamp and its reading time in minutes.
# `updated_time` is the selector result assigned in the previous scratch cells.
# The timestamp is the `datetime` attribute of the <time> tag inside the first match.
datetime_news = updated_time[0].time['datetime']
# Keep only the first whitespace-separated token of the reading-time text.
mins_read = updated_time[0].select('.caas-attr-mins-read')[0].text.split()[0]
mins_read



'2'

In [53]:
### Inspect the element that carries the article's comment count
# NOTE(review): depends on a module-level `news_content` that no cell shown
# above creates — confirm its origin.
# The next line is presumably the element id seen in the browser inspector:
#reactions-count-copy-ddbc3274-d724-3010-8040-19bce6217663
news_content.select('.reactions-count')

[<span class="reactions-count caas-dynamic-count" data-id="ddbc3274-d724-3010-8040-19bce6217663" data-site="finance" data-source="spotIm" data-type="commentsCount">1</span>]

In [57]:
# Read the metadata attributes and the text of the first comment-counter element.
data_site = news_content.select('.reactions-count')[0]['data-site']
data_source = news_content.select('.reactions-count')[0]['data-source']
data_type = news_content.select('.reactions-count')[0]['data-type']
count_comments = news_content.select('.reactions-count')[0].text

# One value per line, in the order they were extracted.
print(data_site, data_source, data_type, count_comments, sep='\n')

finance
spotIm
commentsCount
1
