## Web Scraping

BeautifulSoup
open-source Python library
extract data from HTML files
understands HTML structure by working with a parser (lxml, html5lib, etc.)
BeautifulSoup documentation for reference
https://www.crummy.com/software/BeautifulSoup/bs4/doc/

In [3]:
import requests 
from bs4 import BeautifulSoup as bs
import numpy as np
import pandas as pd
import re
import time

import os
from urllib.parse import urlparse

In [4]:
from urllib.request import urlopen
from lxml import html

In [5]:
simple_html = """
<html>

<head>
  <style>
    li {font-size: 18px;}
  </style
</head>

<body>
  <div style="border-style: dotted; padding: 10px">
    <h1>Today's Learning Objectives</h1>
    <ul>
      <li>Decipher basic HTML</li>
      <li>Retrieve information from Internet</li>
      <li>Parse web data</li>
      <li>Gather and prepare data systematically</li>
    </ul>
    <br>
  </div>
</body>

</html>
"""

In [6]:
soup = bs(simple_html)
print(soup)

<html>
<head>
<style>
    li {font-size: 18px;}
  </style>
</head><body>
<div style="border-style: dotted; padding: 10px">
<h1>Today's Learning Objectives</h1>
<ul>
<li>Decipher basic HTML</li>
<li>Retrieve information from Internet</li>
<li>Parse web data</li>
<li>Gather and prepare data systematically</li>
</ul>
<br/>
</div>
</body>
</html>



In [7]:
type(soup)

bs4.BeautifulSoup

In [8]:
soup.find("h1")

<h1>Today's Learning Objectives</h1>

In [9]:
type(soup.find("h1"))

bs4.element.Tag

In [10]:
soup.find("h1").text

"Today's Learning Objectives"

In [11]:
type(soup.find("h1").text)

str

In [12]:
soup.find_all("li")

[<li>Decipher basic HTML</li>,
 <li>Retrieve information from Internet</li>,
 <li>Parse web data</li>,
 <li>Gather and prepare data systematically</li>]

In [13]:
type(soup.find_all("li"))

bs4.element.ResultSet

In [14]:
for item in soup.find_all("li"):
    print(item.text)

Decipher basic HTML
Retrieve information from Internet
Parse web data
Gather and prepare data systematically


In [15]:
learning_objetives = [item.text for item in soup.find_all("li")]
print(learning_objetives)

['Decipher basic HTML', 'Retrieve information from Internet', 'Parse web data', 'Gather and prepare data systematically']


In [16]:
workshop_html = """
<html>

<body>
  <h1>Today's Workshop</h1>
  <div id='agenda' style="background-color: aliceblue">
    <h2>Agenda</h2>
    <p>Today's workshop is comprised of three main sections:</p>
    <ol>
      <li>HTML Basics</li>
      <li>Scraping Basics</li>
      <li>Scraping Pipeline</li>
    </ol>
  </div>
  
  <div id='tools' style='background-color: honeydew'>
    <h2>Tools</h2>
    <p>You will be learning about two primary Python libraries:</p>  
    <ol>
      <li>BeautifulSoup</li>
      <li>requests</li>
    </ol>
  </div>
</body>

</html>
"""

In [17]:
soup1 = bs(workshop_html)
print(soup1)

<html>
<body>
<h1>Today's Workshop</h1>
<div id="agenda" style="background-color: aliceblue">
<h2>Agenda</h2>
<p>Today's workshop is comprised of three main sections:</p>
<ol>
<li>HTML Basics</li>
<li>Scraping Basics</li>
<li>Scraping Pipeline</li>
</ol>
</div>
<div id="tools" style="background-color: honeydew">
<h2>Tools</h2>
<p>You will be learning about two primary Python libraries:</p>
<ol>
<li>BeautifulSoup</li>
<li>requests</li>
</ol>
</div>
</body>
</html>



In [18]:
soup1.find("h1")

<h1>Today's Workshop</h1>

In [19]:
workshop_header=soup1.find("h1").text

In [20]:
type(workshop_header)

str

In [21]:
soup1.find_all("p")

[<p>Today's workshop is comprised of three main sections:</p>,
 <p>You will be learning about two primary Python libraries:</p>]

In [22]:
for item in soup1.find_all("p"):
    print(item.text)

Today's workshop is comprised of three main sections:
You will be learning about two primary Python libraries:


In [23]:
soup1.find_all("li")[:3]

[<li>HTML Basics</li>, <li>Scraping Basics</li>, <li>Scraping Pipeline</li>]

In [24]:
agenda_items = [li.text for li in soup1.find_all("li")[:3]]
print(agenda_items)

['HTML Basics', 'Scraping Basics', 'Scraping Pipeline']


In [29]:
url1 = 'https://raw.github.com/kimfetti/Conferences/master/PyCon_2020/pycon_info.html'
print(url1)

https://raw.github.com/kimfetti/Conferences/master/PyCon_2020/pycon_info.html


In [None]:
print(soup2)

In [31]:
pycon_html = open('pycon_info.html').read()

In [32]:

print(pycon_html)

<html>
    <head>
        <title>PyCon 2020 Info</title>

        <style>
            body {
                background-color: cornsilk;
            }

            h1 {
                font-size: 40px;
                font-family: courier new, arial;
                text-align: center;
                margin-top: 50px;
            }

            a {
                color: #411B2D;
                font-size: 20px;
            }

            p {
                font-size: 20px;
            }

            a:hover{
                color: white;
                background-color: #411B2D;
            }

            #toolbar {
                background-color: #F3B643;
                font-family: courier new, arial;
                font-weight: bold;
                font-size: 16px;
                display: flex;
                justify-content: space-around;
                flex-direction: row;
                border: 1px solid black;
                border-radius: 1px;
                marg

In [33]:
pycon_soup= bs(pycon_html)

In [34]:
pycon_soup.find_all("a")

[<a href="https://us.pycon.org/2020/about/">WHAT IS PYCON?</a>,
 <a href="https://us.pycon.org/2020/schedule/tutorials/">TUTORIAL SCHEDULE</a>,
 <a href="https://us.pycon.org/2020/speaking/">SPEAKING AT PYCON</a>,
 <a href="https://us.pycon.org/2020/psf/">PYTHON SOFTWARE FOUNDATION</a>,
 <a href="https://us.pycon.org/2020/schedule/presentation/50/">Foundations of Numerical Computing in Python</a>,
 <a href="https://us.pycon.org/2020/schedule/presentation/72/">It's Officially Legal so Let's Scrape the Web</a>,
 <a href="https://us.pycon.org/2020/schedule/presentation/54/">A Beginner's Guide to Befriending Python</a>,
 <a href="https://us.pycon.org/2020/schedule/presentation/55/">Scalable Computing with Dask</a>,
 <a href="https://us.pycon.org/2020/schedule/presentation/63/">Creating a Great Python Package</a>,
 <a href="https://us.pycon.org/2020/schedule/presentation/45/">Minimum Viable Documentation</a>,
 <a href="https://us.pycon.org/2020/schedule/presentation/74/">Effective Data Visu

## FIND BY ATTRIBUTE

In [36]:
today_div = pycon_soup.find(id="today")
print(today_div)

<div class="events" id="today">
<h2>A Selection of Today's Events</h2>
<p> Room 309, 9:00 am - <a href="https://us.pycon.org/2020/schedule/presentation/50/">Foundations of Numerical Computing in Python</a></p>
<p> Room 315, 9:00 am - <a href="https://us.pycon.org/2020/schedule/presentation/72/">It's Officially Legal so Let's Scrape the Web</a></p>
<p> Room 317, 1:20 pm - <a href="https://us.pycon.org/2020/schedule/presentation/54/">A Beginner's Guide to Befriending Python</a></p>
<p> Room 318, 1:20 pm -<a href="https://us.pycon.org/2020/schedule/presentation/55/">Scalable Computing with Dask</a></p>
</div>


In [37]:
today_div.find_all("a")

[<a href="https://us.pycon.org/2020/schedule/presentation/50/">Foundations of Numerical Computing in Python</a>,
 <a href="https://us.pycon.org/2020/schedule/presentation/72/">It's Officially Legal so Let's Scrape the Web</a>,
 <a href="https://us.pycon.org/2020/schedule/presentation/54/">A Beginner's Guide to Befriending Python</a>,
 <a href="https://us.pycon.org/2020/schedule/presentation/55/">Scalable Computing with Dask</a>]

In [38]:
pycon_soup.find_all(attrs={"class":"events","id": "tomorrow"})

[<div class="events" id="tomorrow">
 <h2>Coming Up Tomorrow</h2>
 <p> Room 316, 9:00 am - <a href="https://us.pycon.org/2020/schedule/presentation/63/">Creating a Great Python Package</a></p>
 <p> Room 319, 9:00 am - <a href="https://us.pycon.org/2020/schedule/presentation/45/">Minimum Viable Documentation</a></p>
 <p> Room 309, 1:20 pm - <a href="https://us.pycon.org/2020/schedule/presentation/74/">Effective Data Visualization</a>
 </p></div>]

In [40]:
today_text = [link.text for link in today_div.find_all("a")]
print(today_text)

['Foundations of Numerical Computing in Python', "It's Officially Legal so Let's Scrape the Web", "A Beginner's Guide to Befriending Python", 'Scalable Computing with Dask']


In [41]:
today_div.find("a")

<a href="https://us.pycon.org/2020/schedule/presentation/50/">Foundations of Numerical Computing in Python</a>

In [43]:
today_div.find("a")["href"]

'https://us.pycon.org/2020/schedule/presentation/50/'

In [45]:
today_links = [link["href"] for link in today_div.find_all("a")]
print(today_links)

['https://us.pycon.org/2020/schedule/presentation/50/', 'https://us.pycon.org/2020/schedule/presentation/72/', 'https://us.pycon.org/2020/schedule/presentation/54/', 'https://us.pycon.org/2020/schedule/presentation/55/']


In [46]:
pycon_soup.find_all(attrs={"class":"events","id": "tomorrow"})

[<div class="events" id="tomorrow">
 <h2>Coming Up Tomorrow</h2>
 <p> Room 316, 9:00 am - <a href="https://us.pycon.org/2020/schedule/presentation/63/">Creating a Great Python Package</a></p>
 <p> Room 319, 9:00 am - <a href="https://us.pycon.org/2020/schedule/presentation/45/">Minimum Viable Documentation</a></p>
 <p> Room 309, 1:20 pm - <a href="https://us.pycon.org/2020/schedule/presentation/74/">Effective Data Visualization</a>
 </p></div>]

## Create a list of tuples for each of tomorrow's events. The first element in your tuples will be the event title and the second will be the event link.

In [58]:
pycon_soup.find(id="tomorrow")

<div class="events" id="tomorrow">
<h2>Coming Up Tomorrow</h2>
<p> Room 316, 9:00 am - <a href="https://us.pycon.org/2020/schedule/presentation/63/">Creating a Great Python Package</a></p>
<p> Room 319, 9:00 am - <a href="https://us.pycon.org/2020/schedule/presentation/45/">Minimum Viable Documentation</a></p>
<p> Room 309, 1:20 pm - <a href="https://us.pycon.org/2020/schedule/presentation/74/">Effective Data Visualization</a>
</p></div>

In [59]:
pycon_soup.find(id="tomorrow").find_all("a")

[<a href="https://us.pycon.org/2020/schedule/presentation/63/">Creating a Great Python Package</a>,
 <a href="https://us.pycon.org/2020/schedule/presentation/45/">Minimum Viable Documentation</a>,
 <a href="https://us.pycon.org/2020/schedule/presentation/74/">Effective Data Visualization</a>]

In [62]:
tomorrow_tuple = [(link.text, link["href"]) for link in pycon_soup.find(id="tomorrow").find_all("a")]
print(tomorrow_tuple)

[('Creating a Great Python Package', 'https://us.pycon.org/2020/schedule/presentation/63/'), ('Minimum Viable Documentation', 'https://us.pycon.org/2020/schedule/presentation/45/'), ('Effective Data Visualization', 'https://us.pycon.org/2020/schedule/presentation/74/')]


## Find the header text for today's and tomorrow's events by referencing the events class.

In [63]:
pycon_soup.find_all(class_= "events")

[<div class="events" id="today">
 <h2>A Selection of Today's Events</h2>
 <p> Room 309, 9:00 am - <a href="https://us.pycon.org/2020/schedule/presentation/50/">Foundations of Numerical Computing in Python</a></p>
 <p> Room 315, 9:00 am - <a href="https://us.pycon.org/2020/schedule/presentation/72/">It's Officially Legal so Let's Scrape the Web</a></p>
 <p> Room 317, 1:20 pm - <a href="https://us.pycon.org/2020/schedule/presentation/54/">A Beginner's Guide to Befriending Python</a></p>
 <p> Room 318, 1:20 pm -<a href="https://us.pycon.org/2020/schedule/presentation/55/">Scalable Computing with Dask</a></p>
 </div>,
 <div class="events" id="tomorrow">
 <h2>Coming Up Tomorrow</h2>
 <p> Room 316, 9:00 am - <a href="https://us.pycon.org/2020/schedule/presentation/63/">Creating a Great Python Package</a></p>
 <p> Room 319, 9:00 am - <a href="https://us.pycon.org/2020/schedule/presentation/45/">Minimum Viable Documentation</a></p>
 <p> Room 309, 1:20 pm - <a href="https://us.pycon.org/2020/sc

In [14]:
url = 'https://www.atrapalo.com/hoteles/results/e27cc366501427ea755a61f72a8274ca/1:0:0:0:97134/'
html = urlopen(url)
soup = bs(html, 'lxml')


In [21]:
print(soup)

<!DOCTYPE html>
<html>
<head>
<title>Hoteles Atrápalo: Ofertas de hoteles baratos</title> <meta content="NOINDEX,NOFOLLOW" name="robots"/>
<meta content="ofertas,viajes,vuelos,hoteles,teatro,entradas,billetes,aviones,Madrid,Barcelona" name="keywords"/>
<meta content="ofertas de hoteles baratos" name="description"/>
<meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
<meta content="width=device-width, initial-scale=1.0, maximum-scale=1.0, user-scalable=no" name="viewport"/>
<meta content="IE=edge" http-equiv="X-UA-Compatible"/>
<meta content="telephone=no" name="format-detection"/>
<meta content="#de0000" name="theme-color"/>
<meta content="app-id=393076727" name="apple-itunes-app"/>
<meta content="app-id=com.atrapalo.restaurantes" name="google-play-app"/>
<link href="https://www.google.com" rel="preconnect dns-prefetch"/>
<link href="https://www.google.es" rel="preconnect dns-prefetch"/>
<link href="https://www.google-analytics.com" rel="preconnect dns-prefetch"/>
<lin

In [15]:
span = []
for name in Pt:
    span.append(name.get_text().strip())

In [16]:
print(span)

['', 'Si encuentras una oferta de hotel con un mejor precio en un plazo de 24h te devolvemos la diferencia (hasta un máximo de 100EUR).', 'Si encuentras una oferta de hotel con un mejor precio', 'te devolvemos la diferencia', 'Sólo tienes que enviarnos un email a mejorpreciogarantizadohot@atrapalo.com con la siguiente información:', 'Sólo tienes que enviarnos un email', 'con la siguiente información:', 'El sitio web donde lo has encontrado.', 'Un pantallazo de la oferta final (con tasas, gastos de gestión y cargos en la tarjeta) en el que conste también fecha y hora  de dicho pantallazo.', 'Tu nombre, dirección de correo electrónico y teléfono de contacto, junto con tu localizador de compra de Atrápalo.', 'Tras verificar los datos, te devolveremos el importe a través de un Vale Atrápalo El Vale será por reserva, válido para efectuar compras en el producto de hoteles, con una validez de 3 meses.', 'Tras verificar los datos, te devolveremos el importe a través de un Vale Atrápalo', 'La o

In [35]:
Pt = soup.find_all('header')
print(Pt)

[<header class="header-section header-container__headerSection__3834172512 false"><a class="logo-atrapalo logo__logoSection__3834172512 logo__searchIsNotOpen__3834172512" href="https://www.atrapalo.com/" title="atrapalo.com - Ocio al mejor precio"><img alt="atrapalo.com - Ocio al mejor precio" class="atrapalo-text logo__fullLogo__3834172512" src="https://cdn.atrapalo.com/assets/common/brand-logo-negative.svg"/><img alt="atrapalo.com - Ocio al mejor precio" class="atrapalo-text logo__atrapador__3834172512" src="https://cdn.atrapalo.com/assets/common/brand-atrapador-negative.svg"/></a><nav class="column menu__columnSearchIsNotOpen__3834172512 menu__menuWrapper__3834172512"><div class="menu__expandable__3834172512 menu__onMediumUpFlex__3834172512"> <section class="menu__productSection__3834172512"><a class="menu__productLink__3834172512" href="https://www.atrapalo.com/hoteles/"><div class="menu__productText__3834172512">Hoteles</div></a><div class="menu__moreSection__3834172512"><div clas

In [20]:
import urllib.request

datos = urllib.request.urlopen("https://www.atrapalo.com/hoteles/results/e27cc366501427ea755a61f72a8274ca/1:0:0:0:97134/").read().decode()
soup =  BeautifulSoup(datos)
tags = soup("a")
for tag in tags:
    print(tag.get("href"))

UnicodeDecodeError: 'utf-8' codec can't decode byte 0xe1 in position 62: invalid continuation byte

class ExpeHotelScraper:


    def __init__(self, url):

       self.url = url

       self.download_page()


    def download_page(self):

       # method for downloading the hotel page

       self.page = requests.get(self.url).text


    def scrape_data(self):

       #method for scraping out hotel name, address, and about

       soup = BeautifulSoup(self.page, "html.parser")

       hotel_name = soup.find("h1", {"class": "uitk-heading-3"}).text

       hotel_address = soup.find("div", {"data-stid": "content-hotel-address"}).text

       hotel_about = soup.find("div", {"data-stid": "content-markup"}).text

       return {"name": hotel_name,

               "about": hotel_about,

               "address": hotel_address

               }



urls = ["https://www.expedia.com/California-Hotels-Holiday-Inn-Express-Suites-Lexington-Park-California.h9741955.Hotel-Information?chkin=2021-10-17&chkout=2021-10-18&x_pwa=1&rfrr=HSR&pwa_ts=1633296934579&referrerUrl=aHR0cHM6Ly93d3cuZXhwZWRpYS5jb20vSG90ZWwtU2VhcmNo&useRewards=false&rm1=a2®ionId=85533&destination=California%2C+Maryland%2C+United+States+of+America&destType=MARKET&sort=RECOMMENDED&top_dp=123&top_cur=USD&semdtl=&userIntent=&selectedRoomType=201330831&selectedRatePlan=380921932",]
for url in urls:
    x = ExpeHotelScraper(url)

    print(x.scrape_data())

class ExpeHotelScraper:
    def __init__(self, url):
        self.url = url
        self.download_page()

    def download_page(self):
        # method for downloading the hotel page
        self.page = requests.get(self.url).text

    def scrape_data(self):
        #method for scraping out hotel name, address, and about
        soup = BeautifulSoup(self.page, "html.parser")
        hotel_name = soup.find("h1", {"class": "uitk-heading-3"}).text
        hotel_address = soup.find("div", {"data-stid": "content-hotel-address"}).text
        hotel_about = soup.find("div", {"data-stid": "content-markup"}).text
        return {"name": hotel_name,“about”: hotel_about, “address”: hotel_address}

urls = [“https://www.expedia.com/California-Hotels-Holiday-Inn-Express-Suites-Lexington-Park-California.h9741955.Hotel-Information?chkin=2021-10-17&chkout=2021-10-18&x_pwa=1&rfrr=HSR&pwa_ts=1633296934579&referrerUrl=aHR0cHM6Ly93d3cuZXhwZWRpYS5jb20vSG90ZWwtU2VhcmNo&useRewards=false&rm1=a2&regionId=85533&destination=California%2C+Maryland%2C+United+States+of+America&destType=MARKET&sort=RECOMMENDED&top_dp=123&top_cur=USD&semdtl=&userIntent=&selectedRoomType=201330831&selectedRatePlan=380921932”,]

for url in urls:

    x = ExpeHotelScraper(url)

    print(x.scrape_data())