# Iniciando com Beautiful Soup

In [1]:
# Small hand-written HTML page used as the parsing sample for the first part
# of the notebook: two headings sharing the "bckblue" class, two paragraphs
# (one with id="welcome") and a single link.
html_doc = '''
<!DOCTYPE html>
<html lang="pt-br">
    <head>
        <meta charset="utf-8">
        <title>Minha primeira pag HTML</title>
    </head>
    <body>
        <h1 class="bckblue">Título do meu site!</h1>
        <h2 class="bckblue">Um subtitulo</h2>
        <h3>Um subtitulo</h3>

        <p>Olá!</p>
        <p id="welcome">Bem vindo, Anderson!</p>

        <a href="https://www.google.com" target="_blank">Google</a>
    </body>
</html>
'''

In [2]:
html_doc

'\n<!DOCTYPE html>\n<html lang="pt-br">\n    <head>\n        <meta charset="utf-8">\n        <title>Minha primeira pag HTML</title>\n    </head>\n    <body>\n        <h1 class="bckblue">Título do meu site!</h1>\n        <h2 class="bckblue">Um subtitulo</h2>\n        <h3>Um subtitulo</h3>\n\n        <p>Olá!</p>\n        <p id="welcome">Bem vindo, Anderson!</p>\n\n        <a href="https://www.google.com" target="_blank">Google</a>\n    </body>\n</html>\n'

In [3]:
from bs4 import BeautifulSoup

In [4]:
# Parse the sample document into a navigable tree using the 'html.parser' backend.
soup = BeautifulSoup(html_doc, 'html.parser')

In [5]:
soup


<!DOCTYPE html>

<html lang="pt-br">
<head>
<meta charset="utf-8"/>
<title>Minha primeira pag HTML</title>
</head>
<body>
<h1 class="bckblue">Título do meu site!</h1>
<h2 class="bckblue">Um subtitulo</h2>
<h3>Um subtitulo</h3>
<p>Olá!</p>
<p id="welcome">Bem vindo, Anderson!</p>
<a href="https://www.google.com" target="_blank">Google</a>
</body>
</html>

In [6]:
# Attribute-style access returns the <title> tag itself (markup included).
soup.title

<title>Minha primeira pag HTML</title>

In [7]:
# .text yields only the text inside the tag, without the surrounding markup.
soup.title.text

'Minha primeira pag HTML'

In [8]:
# Same attribute-style shortcut, this time for the <h1> heading.
soup.h1

<h1 class="bckblue">Título do meu site!</h1>

In [9]:
# The shortcut returns only the first <p>; the second paragraph is not included.
soup.p

<p>Olá!</p>

In [10]:
# find_all() returns every matching tag, here both <p> elements of the sample page.
soup.find_all('p')

[<p>Olá!</p>, <p id="welcome">Bem vindo, Anderson!</p>]

In [11]:
# Print the text of each <p> element, one paragraph per line.
for paragraph in soup.find_all('p'):
    print(paragraph.get_text())

Olá!
Bem vindo, Anderson!


In [12]:
# Look a single element up by its id attribute and read its text.
soup.find(id='welcome').text

'Bem vindo, Anderson!'

In [13]:
# Filter by CSS class; the keyword is spelled class_ because `class` is a
# reserved word in Python. Both the <h1> and the <h2> carry this class.
soup.find_all(class_="bckblue")

[<h1 class="bckblue">Título do meu site!</h1>,
 <h2 class="bckblue">Um subtitulo</h2>]

In [14]:
# Combining a tag name with a class narrows the match to the <h1> only.
soup.find_all('h1', class_="bckblue")

[<h1 class="bckblue">Título do meu site!</h1>]

In [15]:
# Same query expressed with an attrs dictionary instead of the class_ keyword;
# it returns the same single <h1>.
soup.find_all('h1', attrs={'class':'bckblue'})

[<h1 class="bckblue">Título do meu site!</h1>]

# Mão na massa

In [20]:
import requests

In [24]:
# Download the Climatempo forecast page for Nova Friburgo - RJ.
# The timeout keeps the cell from hanging indefinitely on a stalled connection,
# and raise_for_status() surfaces HTTP errors (4xx/5xx) here instead of letting
# an error page be parsed as if it were the forecast.
response = requests.get(
    'https://www.climatempo.com.br/previsao-do-tempo/cidade/314/novafriburgo-rj',
    timeout=30,
)
response.raise_for_status()
html = response.content  # raw response body as bytes

In [22]:
html

b'<!DOCTYPE html>\n<html>\n<head lang="pt-br">\n<title>Previs\xc3\xa3o do tempo para hoje em Nova Friburgo - RJ | Climatempo</title>\n<meta charset="UTF-8">\n\n<meta name="google-site-verification" content="f-v1CUADcZO9RTlI5wOpt11LsuNyyqWC6zHgEG43hQA" />\n\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">\n<meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1">\n<meta http-equiv="Cache-Control" content="no-cache, no-store">\n<meta http-equiv="Pragma" content="no-cache, no-store">\n<meta name="viewport" content="width=device-width, initial-scale=1.0, minimum-scale=1.0, maximum-scale=1.0">\n<meta name="author" content="Climatempo">\n<meta name="copyright" content="\xc2\xa9 Climatempo">\n<meta name="url" content="https://www.climatempo.com.br/previsao-do-tempo/cidade/314/novafriburgo-rj">\n<meta name="description" content="Saiba qual \xc3\xa9 a previs\xc3\xa3o do tempo para hoje em Nova Friburgo - RJ. Confira se haver\xc3\xa1 previs\xc3\xa3o de chuva para Nova Fr

In [26]:
# Parse the downloaded page; `html` holds the raw bytes of the response body.
soup = BeautifulSoup(html, 'html.parser')

In [27]:
print(soup.prettify())

<!DOCTYPE html>
<html>
 <head lang="pt-br">
  <title>
   Previsão do tempo para hoje em Nova Friburgo - RJ | Climatempo
  </title>
  <meta charset="utf-8"/>
  <meta content="f-v1CUADcZO9RTlI5wOpt11LsuNyyqWC6zHgEG43hQA" name="google-site-verification">
   <meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
   <meta content="IE=edge,chrome=1" http-equiv="X-UA-Compatible"/>
   <meta content="no-cache, no-store" http-equiv="Cache-Control"/>
   <meta content="no-cache, no-store" http-equiv="Pragma"/>
   <meta content="width=device-width, initial-scale=1.0, minimum-scale=1.0, maximum-scale=1.0" name="viewport"/>
   <meta content="Climatempo" name="author"/>
   <meta content="© Climatempo" name="copyright"/>
   <meta content="https://www.climatempo.com.br/previsao-do-tempo/cidade/314/novafriburgo-rj" name="url"/>
   <meta content="Saiba qual é a previsão do tempo para hoje em Nova Friburgo - RJ. Confira se haverá previsão de chuva para Nova Friburgo - RJ na Climatempo, o melh

In [56]:
soup.title.text

'Previsão do tempo para hoje em Nova Friburgo - RJ | Climatempo'

In [28]:
# Minimum temperature: text of the element whose id is "min-temp-1".
# find() returns None when nothing matches, so this line raises AttributeError
# if the page no longer contains that id.
temperatura_minima = soup.find(id='min-temp-1').text

In [29]:
temperatura_minima

'13°'

In [30]:
# Maximum temperature: text of the element whose id is "max-temp-1".
# As with the minimum, a missing id makes find() return None and .text fail.
temperatura_maxima = soup.find(id='max-temp-1').text

In [32]:
temperatura_maxima

'20°'

In [37]:
# First <ul> carrying the "variables-list" class; it groups the forecast
# variables (temperature, rain, ...) as <li class="item"> entries.
lista_variaveis = soup.find("ul", class_="variables-list")

In [38]:
lista_variaveis

<ul class="variables-list" data-end-sticky-banner-mobile="">
<li class="item">
<span class="variable -bold -gray">Temperatura</span>
<div class="_flex">
<p class="-gray _flex _align-center">
<img alt="Temperatura mínima" class="_margin-r-5 _margin-b-3" src="/dist/images/v2/svg/ic-arrow-min.svg"/>
<span class="-gray-light" id="min-temp-1">13°</span>
<img alt="Temperatura máxima" class="_margin-l-10 _margin-r-5 _margin-b-3" src="/dist/images/v2/svg/ic-arrow-max.svg"/>
<span class="-gray-light" id="max-temp-1">20°</span>
</p>
</div>
</li>
<li class="item">
<span class="variable -bold -gray">Chuva</span>
<div class="_flex _align-center">
<img alt="" class="lazyload _flex _width-auto" data-src="/dist/images/gota-azul.png" height="10" width="10"/>
<img alt="" class="lazyload _flex _width-auto" data-src="/dist/images/gota-azul.png" height="10" width="10"/>
<img alt="" class="lazyload _flex _width-auto" height="10" src="/dist/images/gota-cinza.png" width="10"/>
<span class="_margin-l-5">10mm -

In [39]:
# Every <li class="item"> inside the variables list, one per forecast variable.
itens  = lista_variaveis.find_all("li", class_="item")

In [40]:
itens

[<li class="item">
 <span class="variable -bold -gray">Temperatura</span>
 <div class="_flex">
 <p class="-gray _flex _align-center">
 <img alt="Temperatura mínima" class="_margin-r-5 _margin-b-3" src="/dist/images/v2/svg/ic-arrow-min.svg"/>
 <span class="-gray-light" id="min-temp-1">13°</span>
 <img alt="Temperatura máxima" class="_margin-l-10 _margin-r-5 _margin-b-3" src="/dist/images/v2/svg/ic-arrow-max.svg"/>
 <span class="-gray-light" id="max-temp-1">20°</span>
 </p>
 </div>
 </li>,
 <li class="item">
 <span class="variable -bold -gray">Chuva</span>
 <div class="_flex _align-center">
 <img alt="" class="lazyload _flex _width-auto" data-src="/dist/images/gota-azul.png" height="10" width="10"/>
 <img alt="" class="lazyload _flex _width-auto" data-src="/dist/images/gota-azul.png" height="10" width="10"/>
 <img alt="" class="lazyload _flex _width-auto" height="10" src="/dist/images/gota-cinza.png" width="10"/>
 <span class="_margin-l-5">10mm - 67%</span>
 </div>
 </li>,
 <li class="it

In [46]:
# Print each forecast variable next to its value, with whitespace removed.
for item in itens:
    variable = item.find("span", class_="variable").text

    # Some items have no <div class="_flex"> holding the value; find() then
    # returns None. Test for that explicitly instead of using a bare `except:`,
    # which would also hide unrelated errors and swallow KeyboardInterrupt.
    value_div = item.find("div", class_="_flex")
    value = value_div.text if value_div is not None else '-'

    print(variable, value.replace(' ', '').replace('\n', ''))

Temperatura 13°20°
Chuva 10mm-67%
Vento E-14km/h
Umidade 76%99%
Sol -


# Tabelas

In [49]:
# Wikipedia page listing the Brazilian capitals by area.
url = 'https://pt.wikipedia.org/wiki/Lista_de_capitais_do_Brasil_por_%C3%A1rea'

# Bound the wait with a timeout and fail loudly on HTTP errors so that an
# error page is never handed to the parser as if it were the article.
response = requests.get(url, timeout=30)
response.raise_for_status()
html = response.content  # raw response body as bytes

In [50]:
html

b'<!DOCTYPE html>\n<html class="client-nojs" lang="pt" dir="ltr">\n<head>\n<meta charset="UTF-8"/>\n<title>Lista de capitais do Brasil por \xc3\xa1rea \xe2\x80\x93 Wikip\xc3\xa9dia, a enciclop\xc3\xa9dia livre</title>\n<script>document.documentElement.className="client-js";RLCONF={"wgBreakFrames":!1,"wgSeparatorTransformTable":[",\\t.","\xc2\xa0\\t,"],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","janeiro","fevereiro","mar\xc3\xa7o","abril","maio","junho","julho","agosto","setembro","outubro","novembro","dezembro"],"wgRequestId":"552b5e0b-82d9-4d02-a4d3-1f90ad56d403","wgCSPNonce":!1,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"Lista_de_capitais_do_Brasil_por_\xc3\xa1rea","wgTitle":"Lista de capitais do Brasil por \xc3\xa1rea","wgCurRevisionId":59645590,"wgRevisionId":59645590,"wgArticleId":2780611,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["

In [51]:
soup = BeautifulSoup(html, 'html.parser')

In [52]:
print(soup.prettify())

<!DOCTYPE html>
<html class="client-nojs" dir="ltr" lang="pt">
 <head>
  <meta charset="utf-8"/>
  <title>
   Lista de capitais do Brasil por área – Wikipédia, a enciclopédia livre
  </title>
  <script>
   document.documentElement.className="client-js";RLCONF={"wgBreakFrames":!1,"wgSeparatorTransformTable":[",\t."," \t,"],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","janeiro","fevereiro","março","abril","maio","junho","julho","agosto","setembro","outubro","novembro","dezembro"],"wgRequestId":"552b5e0b-82d9-4d02-a4d3-1f90ad56d403","wgCSPNonce":!1,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"Lista_de_capitais_do_Brasil_por_área","wgTitle":"Lista de capitais do Brasil por área","wgCurRevisionId":59645590,"wgRevisionId":59645590,"wgArticleId":2780611,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["!Artigos com ligações inativas","Listas de municípi

In [54]:
# Locate the first table carrying both the "wikitable" and "sortable" classes.
# A CSS selector tests each class individually, whereas
# class_='wikitable sortable' only matches that exact attribute string and
# would miss the table if the classes were reordered or another one was added.
table = soup.select_one('table.wikitable.sortable')

In [56]:
print(table.prettify())

<table class="wikitable sortable">
 <tbody>
  <tr style="background:#ececec;">
   <th>
    Posição
   </th>
   <th>
    Sede de governo
   </th>
   <th>
    Código do IBGE
   </th>
   <th>
    Unidade federativa
   </th>
   <th>
    Área (km²)
   </th>
  </tr>
  <tr>
   <td align="center">
    1
   </td>
   <td>
    <a href="/wiki/Porto_Velho" title="Porto Velho">
     Porto Velho
    </a>
   </td>
   <td align="center">
    1100205
   </td>
   <td>
    <img alt="" class="thumbborder" data-file-height="1400" data-file-width="2000" decoding="async" height="14" src="//upload.wikimedia.org/wikipedia/commons/thumb/f/fa/Bandeira_de_Rond%C3%B4nia.svg/20px-Bandeira_de_Rond%C3%B4nia.svg.png" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/f/fa/Bandeira_de_Rond%C3%B4nia.svg/30px-Bandeira_de_Rond%C3%B4nia.svg.png 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/f/fa/Bandeira_de_Rond%C3%B4nia.svg/40px-Bandeira_de_Rond%C3%B4nia.svg.png 2x" width="20"/>
    <a href="/wiki/Rond%C3%B4nia" 

In [57]:
# All <tr> rows of the table, including the header row made of <th> cells.
rows = table.find_all('tr')

In [119]:
# Column values collected from the table, one entry per data row.
sede = []
codibge = []
uf = []
area = []

for row in rows:
    cols = row.find_all('td')
    # The header row has only <th> cells, so it yields no <td>. Any row with
    # fewer than five cells is skipped as well: the original `len(cols) > 1`
    # guard still let cols[4] raise IndexError on short rows.
    if len(cols) < 5:
        continue
    # cols[0] is the ranking position, which is not kept.
    # strip() removes whitespace surrounding the cell text.
    sede.append(cols[1].text.strip())
    codibge.append(cols[2].text.strip())
    uf.append(cols[3].text.strip())
    # Use '.' as the decimal separator; the value is still stored as a string.
    area.append(cols[4].text.replace('\n', '').replace(',', '.').strip())
        

In [120]:
import pandas as pd

In [121]:
# Start from an empty DataFrame; the columns are attached afterwards.
df = pd.DataFrame()

In [124]:
# Attach the scraped lists to the frame as columns, in display order.
for column_name, column_values in (
    ('Sede', sede),
    ('CodigoIbge', codibge),
    ('UF', uf),
    ('Area', area),
):
    df[column_name] = column_values

In [125]:
df

Unnamed: 0,Sede,CodigoIbge,UF,Area
0,Porto Velho,1100205,Rondônia,34 090.952
1,Manaus,1302603,Amazonas,11 401.092
2,Rio Branco,1200401,Acre,8 834.942
3,Campo Grande,5002704,Mato Grosso do Sul,8 082.978
4,Macapá,1600303,Amapá,6 563.849
5,Brasília,5300108,Distrito Federal,5 760.783
6,Boa Vista,1400100,Roraima,5 687.037
7,Cuiabá,5103403,Mato Grosso,3 266.538
8,Palmas,1721000,Tocantins,2 227.444
9,São Paulo,3550308,São Paulo,1 521.110


# IMDB

In [126]:
# Download the IMDb list page.
# The timeout bounds the wait on a stalled connection and raise_for_status()
# turns HTTP errors (4xx/5xx) into an exception instead of parsing an error page.
response = requests.get(
    'https://www.imdb.com/list/ls088181017/?ref_=hm_edcft_ft_netflix_a_2/title/tt1190080/',
    timeout=30,
)
response.raise_for_status()
html = response.content  # raw response body as bytes

In [127]:
html

b'\n\n\n<!DOCTYPE html>\n<html\n    xmlns:og="http://ogp.me/ns#"\n    xmlns:fb="http://www.facebook.com/2008/fbml">\n    <head>\n         \n        <meta charset="utf-8">\n        <meta http-equiv="X-UA-Compatible" content="IE=edge">\n\n    <meta name="apple-itunes-app" content="app-id=342792525, app-argument=imdb:///list/ls088181017?src=mdot">\n\n\n\n        <script type="text/javascript">var IMDbTimer={starttime: new Date().getTime(),pt:\'java\'};</script>\n\n<script>\n    if (typeof uet == \'function\') {\n      uet("bb", "LoadTitle", {wb: 1});\n    }\n</script>\n  <script>(function(t){ (t.events = t.events || {})["csm_head_pre_title"] = new Date().getTime(); })(IMDbTimer);</script>\n        <title>Everything Coming to Netflix in April 2021 - IMDb</title>\n  <script>(function(t){ (t.events = t.events || {})["csm_head_post_title"] = new Date().getTime(); })(IMDbTimer);</script>\n<script>\n    if (typeof uet == \'function\') {\n      uet("be", "LoadTitle", {wb: 1});\n    }\n</script>\

In [128]:
soup = BeautifulSoup(html, 'html.parser')

In [129]:
print(soup.prettify())

<!DOCTYPE html>
<html xmlns:fb="http://www.facebook.com/2008/fbml" xmlns:og="http://ogp.me/ns#">
 <head>
  <meta charset="utf-8"/>
  <meta content="IE=edge" http-equiv="X-UA-Compatible"/>
  <meta content="app-id=342792525, app-argument=imdb:///list/ls088181017?src=mdot" name="apple-itunes-app"/>
  <script type="text/javascript">
   var IMDbTimer={starttime: new Date().getTime(),pt:'java'};
  </script>
  <script>
   if (typeof uet == 'function') {
      uet("bb", "LoadTitle", {wb: 1});
    }
  </script>
  <script>
   (function(t){ (t.events = t.events || {})["csm_head_pre_title"] = new Date().getTime(); })(IMDbTimer);
  </script>
  <title>
   Everything Coming to Netflix in April 2021 - IMDb
  </title>
  <script>
   (function(t){ (t.events = t.events || {})["csm_head_post_title"] = new Date().getTime(); })(IMDbTimer);
  </script>
  <script>
   if (typeof uet == 'function') {
      uet("be", "LoadTitle", {wb: 1});
    }
  </script>
  <script>
   if (typeof uex == 'function') {
      uex(

In [130]:
# One <h3 class="lister-item-header"> per list entry; each holds the position,
# the title link (<a href="/title/...">) and the year.
filmes = soup.find_all('h3', class_='lister-item-header')

In [131]:
filmes

[<h3 class="lister-item-header">
 <span class="lister-item-index unbold text-primary">1.</span>
 <a href="/title/tt1190080/">2012</a>
 <span class="lister-item-year text-muted unbold">(I) (2009)</span>
 </h3>,
 <h3 class="lister-item-header">
 <span class="lister-item-index unbold text-primary">2.</span>
 <a href="/title/tt1385867/">Tiras em Apuros</a>
 <span class="lister-item-year text-muted unbold">(2010)</span>
 </h3>,
 <h3 class="lister-item-header">
 <span class="lister-item-index unbold text-primary">3.</span>
 <a href="/title/tt1632708/">Amizade Colorida</a>
 <span class="lister-item-year text-muted unbold">(2011)</span>
 </h3>,
 <h3 class="lister-item-header">
 <span class="lister-item-index unbold text-primary">4.</span>
 <a href="/title/tt1591095/">Sobrenatural</a>
 <span class="lister-item-year text-muted unbold">(I) (2010)</span>
 </h3>,
 <h3 class="lister-item-header">
 <span class="lister-item-index unbold text-primary">5.</span>
 <a href="/title/tt0250494/">Legalmente L

In [133]:
# Print the title of every list entry (the text of the link inside each header).
for header in filmes:
    print(header.a.get_text())

2012
Tiras em Apuros
Amizade Colorida
Sobrenatural
Legalmente Loira
O Duende
Andes Mágicos
O Pianista
Possessão
Grite, Você Está Sendo Filmado
Secrets of Great British Castles
Encontrando o Amor
Te Amarei Para Sempre
Madea's Big Happy Family
White Boy
Histórias para Vestir
Sim Senhor
Alma de Cowboy
Apenas Diga Sim
Os Segredos de Madame Claude
O Paraíso e a Serpente
Até o Céu
A Fuga do Planeta Terra
What Lies Below
Coded Bias
Reunião de Família
4 Contra o Apocalipse
The Big Day
Tributo a Dolly Parton
Dinheiro Fácil: A Série
O Maior Roubo de Arte de Todos os Tempos
Casamento sem Filtro
Gokushufudo
Sen Hiç Atesböcegi Gördün mü?
Noite no Paraíso
Esquadrão Trovão
Duas por Uma
Diana: The Interview That Shocked the World
Xin Shen Bang: Ne Zha Chongsheng
Nicky, Ricky, Dicky & Dawn
A Bela e o Padeiro
Mighty Express
My Love: Six Stories of True Love
Meu Pai e Outros Vexames
The Circle: EUA
Law School
Ji hun
Why Did You Kill Me?
Dark City Beneath the Beat
O Mestre
Ride or Die
Arlo the Alligator B

In [142]:
# Site root, used to turn the relative hrefs of the headers into absolute URLs.
url = 'https://www.imdb.com'
# One absolute link per list entry, in page order.
links = [url + filme.a['href'] for filme in filmes]

In [143]:
links

['https://www.imdb.com/title/tt1190080/',
 'https://www.imdb.com/title/tt1385867/',
 'https://www.imdb.com/title/tt1632708/',
 'https://www.imdb.com/title/tt1591095/',
 'https://www.imdb.com/title/tt0250494/',
 'https://www.imdb.com/title/tt0107387/',
 'https://www.imdb.com/title/tt11229002/',
 'https://www.imdb.com/title/tt0253474/',
 'https://www.imdb.com/title/tt0431021/',
 'https://www.imdb.com/title/tt10487072/',
 'https://www.imdb.com/title/tt5181284/',
 'https://www.imdb.com/title/tt11668994/',
 'https://www.imdb.com/title/tt0452694/',
 'https://www.imdb.com/title/tt1787759/',
 'https://www.imdb.com/title/tt5635638/',
 'https://www.imdb.com/title/tt14177168/',
 'https://www.imdb.com/title/tt1068680/',
 'https://www.imdb.com/title/tt8846176/',
 'https://www.imdb.com/title/tt12154638/',
 'https://www.imdb.com/title/tt10307724/',
 'https://www.imdb.com/title/tt7985576/',
 'https://www.imdb.com/title/tt10978398/',
 'https://www.imdb.com/title/tt0765446/',
 'https://www.imdb.com/titl

In [None]:
### Exemplo de paginação
### não funciona!

In [145]:
# Walk through the pages of the list and print every title found.
# The page number is sent as its own query parameter: concatenating 'page=N'
# onto the URL string glues it to the end of the ref_ value, so the server
# never receives a separate page parameter.
# NOTE(review): assumes IMDb paginates this list through `page` — confirm.
url = 'https://www.imdb.com/list/ls088181017/?ref_=hm_edcft_ft_netflix_a_2/title/tt1190080/'
for i in range(1,50):
    # requests appends params to the query string already present in the URL.
    response = requests.get(url, params={'page': i}, timeout=30)
    response.raise_for_status()
    html = response.content
    soup = BeautifulSoup(html, 'html.parser')
    filmes = soup.find_all('h3', class_='lister-item-header')
    if not filmes:
        # No entries on this page: stop instead of requesting the remaining pages.
        break
    for filme in filmes:
        print(filme.a.text)

https://www.imdb.com/list/ls088181017/?ref_=hm_edcft_ft_netflix_a_2/title/tt1190080/page=1
https://www.imdb.com/list/ls088181017/?ref_=hm_edcft_ft_netflix_a_2/title/tt1190080/page=2
https://www.imdb.com/list/ls088181017/?ref_=hm_edcft_ft_netflix_a_2/title/tt1190080/page=3
https://www.imdb.com/list/ls088181017/?ref_=hm_edcft_ft_netflix_a_2/title/tt1190080/page=4
https://www.imdb.com/list/ls088181017/?ref_=hm_edcft_ft_netflix_a_2/title/tt1190080/page=5
https://www.imdb.com/list/ls088181017/?ref_=hm_edcft_ft_netflix_a_2/title/tt1190080/page=6
https://www.imdb.com/list/ls088181017/?ref_=hm_edcft_ft_netflix_a_2/title/tt1190080/page=7
https://www.imdb.com/list/ls088181017/?ref_=hm_edcft_ft_netflix_a_2/title/tt1190080/page=8
https://www.imdb.com/list/ls088181017/?ref_=hm_edcft_ft_netflix_a_2/title/tt1190080/page=9
https://www.imdb.com/list/ls088181017/?ref_=hm_edcft_ft_netflix_a_2/title/tt1190080/page=10
https://www.imdb.com/list/ls088181017/?ref_=hm_edcft_ft_netflix_a_2/title/tt1190080/page=

In [None]:
# Fetch a single page of the list and print its titles.
# NOTE: `i` is not defined in this cell; it must still hold a page number left
# over from the pagination loop, so this cell fails on a fresh kernel.
# The page number goes in as a real query parameter rather than being
# concatenated onto the end of the ref_ value.
# NOTE(review): assumes IMDb paginates this list through `page` — confirm.
url = 'https://www.imdb.com/list/ls088181017/?ref_=hm_edcft_ft_netflix_a_2/title/tt1190080/'
response = requests.get(url, params={'page': i}, timeout=30)
response.raise_for_status()
html = response.content
soup = BeautifulSoup(html, 'html.parser')
filmes = soup.find_all('h3', class_='lister-item-header')
for filme in filmes:
    print(filme.a.text)