# Bibliotecas

In [21]:
from bs4 import BeautifulSoup
from urllib.request import urlopen, Request
import re

# Lidando com diferentes layouts de sites

In [3]:
class Content:
    """Hold the pieces extracted from one scraped page: URL, title and body."""

    def __init__(self, url, title, body):
        # Positional pairing: each attribute receives the argument of the
        # same name.
        self.url, self.title, self.body = url, title, body

    def print(self):
        """Write the title, the URL and the body to stdout, one line each."""
        report = (
            f"TITLE: {self.title}",
            f"URL: {self.url}",
            f"BODY: {self.body}",
        )
        for line in report:
            print(line)

In [26]:
def scrapeCNN(url):
    """Scrape a CNN article page and return it as a Content object.

    The title is the text of the first <h1>; the body is the text of the
    first <div class="article__content">.

    Raises:
        AttributeError: if either element is absent (find() yields None,
            and None has no .text).
        Whatever urlopen raises when the page cannot be fetched.
    """
    # Close the HTTP response as soon as the markup has been parsed.
    with urlopen(url) as response:
        # Name the parser explicitly so the result does not depend on which
        # optional parsers happen to be installed.
        bs = BeautifulSoup(response, 'html.parser')
    title = bs.find('h1').text
    body = bs.find('div', {'class' : 'article__content'}).text
    # Debug output: echo the extracted body before returning it.
    print('body: ')
    print(body)
    return Content(url, title, body)
    
def scrapeBrookings(url):
    """Scrape a Brookings article page and return it as a Content object.

    The request is sent with a custom User-Agent header. The title is the
    text of the first <h1>; the body is the text of the first
    <div class="byo-block">.

    Raises:
        AttributeError: if either element is absent (find() yields None,
            and None has no .text).
        Whatever urlopen raises when the page cannot be fetched.
    """
    req = Request(url, headers= {'User-Agent' : 'Brave'})
    # Close the HTTP response as soon as the markup has been parsed.
    with urlopen(req) as response:
        bs = BeautifulSoup(response, 'html.parser')
    title = bs.find('h1').text
    body = bs.find('div', {'class' : 'byo-block'}).text
    return Content(url, title, body)

In [27]:
# Fetch a single Brookings article with the site-specific scraper and
# print the extracted title, URL and body.
url = 'https://www.brookings.edu/research/robotic-rulemaking/'
content = scrapeBrookings(url)
content.print()

TITLE: 
            Robotic rulemaking
          
URL: https://www.brookings.edu/research/robotic-rulemaking/
BODY: 
As it has rocketed to some 100 million active users in record time, ChatGPT is provoking conversations about the role of artificial intelligence (AI) in drafting written materials such as student exams, news articles, legal pleadings, poems, and more. The chatbot, developed by OpenAI, relies on a large language model (LLM) to respond to user-submitted requests, or “prompts” as they are known. It is an example of generative AI, a technology that upends our understanding of who creates written materials and how they do it, challenging what it means to create, analyze, and express ideas.



In [28]:
# Fetch a single CNN article with the site-specific scraper and print the
# extracted title, URL and body.
url = 'https://www.cnn.com/2023/04/03/investing/dogecoin-elon-musk-twitter/index.html'
content = scrapeCNN(url)
content.print()

body: 



New York
CNN
         — 
    


            Twitter’s traditional bird icon was booted and replaced with an image of a Shiba Inu, an apparent nod to dogecoin, the joke cryptocurrency that CEO Elon Musk is being sued over. 
    

            Musk addressed the change Monday afternoon, tweeting, “as promised” above an image of a year-old conversation in which another user suggested that Musk “just buy Twitter” and “change the bird logo to a doge.” 
    











CNN/Adobe Stock





Elon Musk's Twitter promised a purge of blue check marks. Instead he singled out one account




            The doge logo appeared on the site two days after Musk asked a judge to throw out a $258 billion racketeering lawsuit accusing him of running a pyramid scheme to support the dogecoin, according to Reuters.


            Lawyers for Musk and Tesla called the lawsuit by dogecoin investors a “fanciful work of fiction” over Musk’s “innocuous and often silly tweets.”
    

            It wasn’t 

In [29]:
class Content:
    """
    Common base class for every article/page
    """

    def __init__(self, url, title, body):
        # Positional pairing: each attribute receives the argument of the
        # same name.
        self.url, self.title, self.body = url, title, body

    def print(self):
        """
        Display hook that controls the output: writes the URL, the title
        and the body to stdout, one line each
        """
        report = (
            f"URL: {self.url}",
            f"TITLE: {self.title}",
            f"BODY: {self.body}",
        )
        for line in report:
            print(line)

In [30]:
class Website:
    """
    Describes the structure of one website: its display name, its base URL
    and the selectors that locate the title and the body of a page
    """
    def __init__(self, name, url, titleTag, bodyTag):
        # Identity of the site.
        self.name, self.url = name, url
        # Selectors used to pull the title and the body out of a page.
        self.titleTag, self.bodyTag = titleTag, bodyTag

In [36]:
class Crawler:
    """
    Stateless helpers that download a page and extract its title and body.

    All methods are static, so they can be called on the class
    (``Crawler.getContent(website, path)``) or on an instance.
    """

    @staticmethod
    def getPage(url):
        """
        Download url and return it parsed as a BeautifulSoup object.
        Returns None when the page cannot be opened for any reason.
        """
        try:
            html = urlopen(url)
        except Exception:
            # Best-effort fetch: every failure is reported to the caller
            # as None instead of being raised.
            return None
        return BeautifulSoup(html, 'html.parser')

    @staticmethod
    def safeGet(bs, selector):
        """
        Utility that extracts a content string from a Beautiful Soup object
        and a selector. The text of every matching element is joined with
        newlines. Returns an empty string when no element matches the
        given selector.
        """
        selectedElems = bs.select(selector)
        if not selectedElems:
            return ''
        return '\n'.join(elem.get_text() for elem in selectedElems)

    @staticmethod
    def getContent(website, path):
        """
        Extract the content of one specific page.

        The page URL is website.url with path appended verbatim. Returns a
        Content built from the selectors stored in website; when the page
        cannot be loaded, the Content has an empty title and an empty body.
        """
        url = website.url + path
        bs = Crawler.getPage(url)
        if bs is None:
            # Content takes (url, title, body): supply an empty string for
            # both missing fields.
            return Content(url, '', '')
        title = Crawler.safeGet(bs, website.titleTag)
        body = Crawler.safeGet(bs, website.bodyTag)
        return Content(url, title, body)

In [37]:
# One row per site: display name, base URL, selector for the page title,
# selector for the page body.
siteData = [
    ["O'Reilly", "https://www.oreilly.com", "h1", "div.title-description"],
    ["Reuters", "https://www.reuters.com", "h1", "div.ArticleBodyWrapper"],
    ["Brookings", "https://www.brookings.edu", "h1", "div.byo-block"],
    ["CNN", "https://www.cnn.com", "h1", "div.article__content"],
]

In [38]:
# Turn every configuration row into a Website object, in the same order.
websites = []
for name, url, title, body in siteData:
    site = Website(name=name, url=url, titleTag=title, bodyTag=body)
    websites.append(site)

In [None]:
# Fetch one O'Reilly page through the generic crawler and print it.
# The path is appended verbatim to the site's base URL, so it must start
# with a slash.
Crawler.getContent(
    websites[0],
    '/library/view/web-scraping-with/9781491910283'
).print()

# Equivalent calls for the other configured sites, disabled by default.
# Crawler.getContent(
#     websites[1],
#     '/article/us-usa-epa-pruitt-idUSKBN19W2D0'
# ).print()

# Crawler.getContent(
#     websites[2],
#     '/blog/techtank/2016/03/01/idea-to-retire-old-methods-of-policy-education/'
# ).print()

# Crawler.getContent(
#     websites[3],
#     '/2023/04/03/investing/dogecoin-elon-musk-twitter/index.html'
# ).print()

URL: https://www.oreilly.com/library/view/web-scraping-with/9781491910283
TITLE: Web Scraping with Python
BODY: 


      
        Book
      description
Learn web scraping and crawling techniques to access unlimited data from any web source in any format. With this practical guide, you’ll learn how to use Python scripts and web APIs to gather and process data from thousands—or even millions—of web pages at once.Ideal for programmers, security professionals, and web administrators familiar with Python, this book not only teaches basic web scraping mechanics, but also delves into more advanced topics, such as analyzing raw data or using scrapers for frontend website testing. Code samples are available to help you understand the concepts in practice.
Show and hide more

Publisher resources
View/Submit Errata






# Estruturando crawlers

## Rastreando sites por meio de pesquisa

In [64]:
class Content:
    """
    Common base class for every article/page found while searching a topic
    """

    def __init__(self, topic, url, title, body):
        self.topic  = topic
        self.url    = url
        self.title  = title
        # Store the body under its own attribute; print() reads self.body.
        self.body   = body

    def print(self):
        """
        Display hook that controls the output: writes the topic, the URL,
        the title and the body to stdout
        """
        print(f"New article found for topic: {self.topic}")
        print(f"URL: {self.url}")
        print(f"TITLE: {self.title}")
        print(f"BODY:\n{self.body}")

In [65]:
class Website:
    """Describes the structure of one searchable website"""

    def __init__(self, name, url, searchUrl, resultListing, resultUrl, 
                 absoluteUrl, titleTag, bodyTag):
        # Identity of the site.
        self.name, self.url = name, url
        # Search settings: URL prefix the topic is appended to, selector
        # for each result entry, and selector for the link inside an entry.
        self.searchUrl, self.resultListing, self.resultUrl = (
            searchUrl, resultListing, resultUrl)
        # Truthy when result links are already absolute URLs.
        self.absoluteUrl = absoluteUrl
        # Selectors for the title and the body of an article page.
        self.titleTag, self.bodyTag = titleTag, bodyTag

In [66]:
class Crawler:
    """
    Searches one website for topics and records every article it finds.

    Attributes:
        site: the Website description used for URLs and selectors.
        found: dict mapping article URL -> Content for every article seen.
    """

    def __init__(self, website):
        self.site   = website
        self.found  = {}

    @staticmethod
    def getPage(url):
        """
        Download url and return it parsed as a BeautifulSoup object.
        Returns None when the page cannot be opened for any reason.
        """
        try:
            html = urlopen(url)
        except Exception:
            # Best-effort fetch: every failure is reported to the caller
            # as None instead of being raised.
            return None
        return BeautifulSoup(html, 'html.parser')

    @staticmethod
    def safeGet(bs, selector):
        """
        Utility that extracts a content string from a Beautiful Soup object
        and a selector. The text of every matching element is joined with
        newlines. Returns an empty string when no element matches the
        given selector.
        """
        selectedElems = bs.select(selector)
        if not selectedElems:
            return ''
        return '\n'.join(elem.get_text() for elem in selectedElems)

    def getContent(self, topic, url):
        """
        Extract the content of one specific page.

        Returns a Content for topic and url; when the page cannot be
        loaded, its title and body are empty strings.
        """
        bs = Crawler.getPage(url)
        if bs is None:
            return Content(topic, url, '', '')
        title = Crawler.safeGet(bs, self.site.titleTag)
        body = Crawler.safeGet(bs, self.site.bodyTag)
        return Content(topic, url, title, body)

    def search(self, topic):
        """
        Search the website for topic and record every page found.

        topic is appended verbatim to the site's search URL. Each result is
        fetched once, stored in self.found, and printed; a result that was
        already recorded is printed again without being re-fetched.
        Does nothing when the search page cannot be loaded.
        """
        bs = Crawler.getPage(self.site.searchUrl + topic)
        if bs is None:
            # The search page failed to load: there is nothing to record.
            return
        for result in bs.select(self.site.resultListing):
            links = result.select(self.site.resultUrl)
            href = links[0].attrs.get('href') if links else None
            if not href:
                # Result entry without a usable link: skip it rather than
                # abort the whole search.
                continue
            # Relative links are resolved against the site's base URL.
            url = href if self.site.absoluteUrl else self.site.url + href
            if url not in self.found:
                self.found[url] = self.getContent(topic, url)
            self.found[url].print()

In [67]:
# One row per site, in the order the Website constructor expects:
# name, base URL, search URL prefix, selector for each search result,
# selector for the link inside a result, whether links are absolute,
# title selector, body selector.
siteData = [
    [
        "Reuters",
        # NOTE(review): the base URL uses http://reuters.com while the
        # search URL uses https://www.reuters.com — confirm this is intended.
        "http://reuters.com",
        "https://www.reuters.com/search/news?blob=",
        "div.search-result-indiv",
        # NOTE(review): "search-result-tile" looks like a typo for
        # "search-result-title" — verify against the live markup.
        "h3.search-result-tile a",
        False,
        "h1",
        "div.ArticleBodyWrapper",
    ],
    [
        "Brookings",
        "http://www.brookings.edu",
        "https://www.brookings.edu/search/?s=",
        "div.article-info",
        "h4.title a",
        True,
        "h1",
        "div.core-block",
    ],
]

In [None]:
# Build one Website per configuration row.
sites = []
for name, url, search, rListing, rUrl, absUrl, tt, bt in siteData:
    site = Website(
        name=name,
        url=url,
        searchUrl=search,
        resultListing=rListing,
        resultUrl=rUrl,
        absoluteUrl=absUrl,
        titleTag=tt,
        bodyTag=bt,
    )
    sites.append(site)

# One crawler per site; topics are appended verbatim to each search URL,
# so they are written already percent-encoded.
crawlers    = [Crawler(site) for site in sites]
topics      = ['python', "data%20science"]

# Run every topic against every site, topic by topic.
for topic in topics:
    for crawler in crawlers:
        crawler.search(topic)

## Rastreando sites por meio de links

In [85]:
class Website:
    """Describes the structure of one website that is crawled through its links"""

    def __init__(self, name, url, targetPattern, absoluteUrl, titleTag, bodyTag):
        # Identity of the site.
        self.name, self.url = name, url
        # Regular expression a link's href must match to be followed, and
        # whether those hrefs are already absolute URLs.
        self.targetPattern, self.absoluteUrl = targetPattern, absoluteUrl
        # Selectors for the title and the body of a page.
        self.titleTag, self.bodyTag = titleTag, bodyTag

In [86]:
class Content:
    """Hold the pieces extracted from one crawled page: URL, title and body."""

    def __init__(self, url, title, body):
        # Positional pairing: each attribute receives the argument of the
        # same name.
        self.url, self.title, self.body = url, title, body

    def print(self):
        """Write the URL, the title and the body to stdout; the body starts on its own line."""
        report = (
            f"URL: {self.url}",
            f"TITLE: {self.title}",
            f"BODY:\n{self.body}",
        )
        for line in report:
            print(line)

In [87]:
class Crawler:
    """
    Crawls one website by following the links on its home page.

    Attributes:
        site: the Website description used for URLs, the link pattern and
            the selectors.
        visited: dict mapping page URL -> Content for every page fetched.
    """

    def __init__(self, site):
        self.site       = site
        self.visited    = {}

    def getPage(self, url):
        """
        Download url with a browser-like User-Agent and return it parsed as
        a BeautifulSoup object. On any failure, prints an error message and
        returns None.
        """
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}
        try:
            # Request() itself rejects malformed URLs (e.g. a relative
            # href), so it must be inside the try for the error to be
            # reported instead of aborting the crawl.
            req = Request(url, headers=headers)
            # Close the HTTP response once the markup has been parsed.
            with urlopen(req) as html:
                return BeautifulSoup(html, 'html.parser')
        except Exception as e:
            print(f"Erro ao acessar {url}: {e}")
            return None

    def safeGet(self, bs, selector):
        """
        Return the text of every element matching selector, joined with
        newlines. Returns an empty string when bs is None or nothing
        matches.
        """
        if bs is None:
            return ''
        selectedElems = bs.select(selector)
        if not selectedElems:
            return ''
        return '\n'.join(elem.get_text() for elem in selectedElems)

    def getContent(self, url):
        """
        Extract the content of one specific page. Returns a Content for
        url; when the page cannot be loaded, its title and body are empty
        strings.
        """
        bs = self.getPage(url)
        if not bs:
            return Content(url, '', '')
        title = self.safeGet(bs, self.site.titleTag)
        body = self.safeGet(bs, self.site.bodyTag)
        return Content(url, title, body)

    def crawl(self):
        """
        Fetch the pages linked from the site's home page.

        Every <a> whose href matches the site's targetPattern is fetched
        once, stored in self.visited and printed. Prints a failure message
        and returns when the home page cannot be loaded.
        """
        bs = self.getPage(self.site.url)
        if not bs:
            print("Falha ao carregar página inicial.")
            return

        # Only links whose href matches the pattern are returned, so every
        # element below is guaranteed to carry an href attribute.
        hrefPattern = re.compile(self.site.targetPattern)
        for targetPage in bs.find_all('a', href=hrefPattern):
            url = targetPage.attrs['href']
            if not self.site.absoluteUrl:
                # Relative links are resolved against the site's base URL.
                url = f"{self.site.url}{url}"
            if url in self.visited:
                continue
            content = self.getContent(url)
            self.visited[url] = content
            content.print()

In [88]:
# Brookings configuration for the link-following crawler: follow links
# under /research/, /blog/ or /articles/, treating hrefs as absolute URLs.
brookings = Website(
    name="Brookings",
    url="https://brookings.edu",
    targetPattern=r"/(research|blog|articles)/",
    absoluteUrl=True,
    titleTag='h1',
    bodyTag='div.byo-block',
)

In [89]:
# Crawl the Brookings home page: every link whose href matches the site's
# target pattern is fetched and printed once.
crawler = Crawler(brookings)
crawler.crawl()

URL: https://www.brookings.edu/articles/opening-paths-to-good-jobs-welcoming-eduardo-levy-yeyati-back-to-brookings/
TITLE: Opening paths to good jobs—Welcoming Eduardo Levy Yeyati back to Brookings
BODY:

The Global Economy and Development program is pleased to welcome Eduardo Levy Yeyati back to the Brookings Institution as a senior fellow. Eduardo was previously a nonresident senior fellow with Brookings and has published extensively with us on issues from debt and monetary policy to labor markets and income inequality.
This week, Esther Lee Rosen, senior director of communications at Global, sat down with Eduardo to discuss his research agenda and the future of work in the age of artificial intelligence.

Esther Lee Rosen (ELR): Welcome to the Global Economy and Development team at Brookings! We are thrilled to have you join us.
Eduardo Levy Yeyati (ELY): I am delighted to be here, working with such a talented group at these challenging times.
ELR: You are now leading Global’s Workf