# Tutorial on web scrapping

Scrap data from a website, clean it and put into a dataframe and perform analysis

https://www.dataquest.io/blog/web-scraping-tutorial-python/

In [7]:
%%html

display(http://dataquestio.github.io/web-scraping-pages/simple.htm)

In [8]:
import requests

page = requests.get("http://dataquestio.github.io/web-scraping-pages/simple.html")
page

<Response [200]>

In [9]:
page.status_code

200

In [10]:
page.content

b'<!DOCTYPE html>\n<html>\n    <head>\n        <title>A simple example page</title>\n    </head>\n    <body>\n        <p>Here is some simple content for this page.</p>\n    </body>\n</html>'

In [11]:
from bs4 import BeautifulSoup
soup = BeautifulSoup(page.content, 'html.parser')
print(soup.prettify())

<!DOCTYPE html>
<html>
 <head>
  <title>
   A simple example page
  </title>
 </head>
 <body>
  <p>
   Here is some simple content for this page.
  </p>
 </body>
</html>


In [6]:
list(soup.children)

['html', '\n', <html>
 <head>
 <title>A simple example page</title>
 </head>
 <body>
 <p>Here is some simple content for this page.</p>
 </body>
 </html>]

In [6]:
[type(item) for item in list(soup.children)]

[bs4.element.Doctype, bs4.element.NavigableString, bs4.element.Tag]

In [7]:
html = list(soup.children)[2]
list(html.children)

['\n', <head>
 <title>A simple example page</title>
 </head>, '\n', <body>
 <p>Here is some simple content for this page.</p>
 </body>, '\n']

In [8]:
body = list(html.children)[3]
list(body.children)

['\n', <p>Here is some simple content for this page.</p>, '\n']

In [9]:
p = list(body.children)[1]
p.get_text()

'Here is some simple content for this page.'

**For example, download the web page containing the forecast**

In [33]:
page = requests.get("http://forecast.weather.gov/MapClick.php?lat=37.7772&lon=-122.4168")
soup = BeautifulSoup(page.content, 'html.parser')
print(soup.prettify())

<!DOCTYPE html>
<html class="no-js">
 <head>
  <!-- Meta -->
  <meta content="width=device-width" name="viewport">
   <link href="http://purl.org/dc/elements/1.1/" rel="schema.DC"/>
   <title>
    National Weather Service
   </title>
   <meta content="National Weather Service" name="DC.title"/>
   <meta content="NOAA National Weather Service National Weather Service" name="DC.description"/>
   <meta content="US Department of Commerce, NOAA, National Weather Service" name="DC.creator"/>
   <meta content="" name="DC.date.created" scheme="ISO8601"/>
   <meta content="EN-US" name="DC.language" scheme="DCTERMS.RFC1766"/>
   <meta content="weather, National Weather Service" name="DC.keywords"/>
   <meta content="NOAA's National Weather Service" name="DC.publisher"/>
   <meta content="National Weather Service" name="DC.contributor"/>
   <meta content="http://www.weather.gov/disclaimer.php" name="DC.rights"/>
   <meta content="General" name="rating"/>
   <meta content="index,follow" name="robo

In [32]:
seven_day = soup.find(id="seven-day-forecast")
print(seven_day.prettify())

<div class="panel panel-default" id="seven-day-forecast">
 <div class="panel-heading">
  <b>
   Extended Forecast for
  </b>
  <h2 class="panel-title">
   San Francisco CA
  </h2>
 </div>
 <div class="panel-body" id="seven-day-forecast-body">
  <div id="seven-day-forecast-container">
   <ul class="list-unstyled" id="seven-day-forecast-list">
    <li class="forecast-tombstone">
     <div class="tombstone-container">
      <p class="period-name">
       Tonight
       <br>
        <br/>
       </br>
      </p>
      <p>
       <img alt="Tonight: A 50 percent chance of rain after 11pm.  Cloudy, with a low around 52. West southwest wind 10 to 16 mph becoming south southeast after midnight. Winds could gust as high as 21 mph.  New precipitation amounts between a tenth and quarter of an inch possible. " class="forecast-icon" src="newimages/medium/nra50.png" title="Tonight: A 50 percent chance of rain after 11pm.  Cloudy, with a low around 52. West southwest wind 10 to 16 mph becoming south s

In [36]:
forecast_items = seven_day.find_all(class_="tombstone-container")
forecast_items

[<div class="tombstone-container">
 <p class="period-name">Tonight<br><br/></br></p>
 <p><img alt="Tonight: A 50 percent chance of rain after 11pm.  Cloudy, with a low around 52. West southwest wind 10 to 16 mph becoming south southeast after midnight. Winds could gust as high as 21 mph.  New precipitation amounts between a tenth and quarter of an inch possible. " class="forecast-icon" src="newimages/medium/nra50.png" title="Tonight: A 50 percent chance of rain after 11pm.  Cloudy, with a low around 52. West southwest wind 10 to 16 mph becoming south southeast after midnight. Winds could gust as high as 21 mph.  New precipitation amounts between a tenth and quarter of an inch possible. "/></p><p class="short-desc">Chance Rain</p><p class="temp temp-low">Low: 52 °F</p></div>,
 <div class="tombstone-container">
 <p class="period-name">Friday<br><br/></br></p>
 <p><img alt="Friday: Rain.  High near 62. Breezy, with a south wind around 22 mph, with gusts as high as 29 mph.  Chance of preci

In [13]:
tonight = forecast_items[0]
print(tonight.prettify())

<div class="tombstone-container">
 <p class="period-name">
  Tonight
  <br>
   <br/>
  </br>
 </p>
 <p>
  <img alt="Tonight: A 50 percent chance of rain after midnight.  Cloudy, with a low around 52. West southwest wind 10 to 16 mph becoming south southeast after midnight. Winds could gust as high as 21 mph.  New precipitation amounts between a tenth and quarter of an inch possible. " class="forecast-icon" src="DualImage.php?i=nbkn&amp;j=nra&amp;jp=50" title="Tonight: A 50 percent chance of rain after midnight.  Cloudy, with a low around 52. West southwest wind 10 to 16 mph becoming south southeast after midnight. Winds could gust as high as 21 mph.  New precipitation amounts between a tenth and quarter of an inch possible. "/>
 </p>
 <p class="short-desc">
  Mostly Cloudy
  <br>
   then Chance
   <br>
    Rain
   </br>
  </br>
 </p>
 <p class="temp temp-low">
  Low: 52 °F
 </p>
</div>


# Extracting information from the page

In [26]:
period = tonight.find(class_="period-name").get_text()
short_desc = tonight.find(class_="short-desc").get_text()
temp = tonight.find(class_="temp").get_text()

print(period)
print(short_desc)
print(temp)

Tonight
Clear
Low: 44 °F


In [27]:
img = tonight.find("img")
desc = img['title']

print(desc)

Tonight: Clear, with a low around 44. West northwest wind 5 to 10 mph becoming light and variable. 


# Extracting all the information from the page

In [28]:
period_tags = seven_day.select(".tombstone-container .period-name")
periods = [pt.get_text() for pt in period_tags]
periods

['Tonight',
 'Wednesday',
 'WednesdayNight',
 'Thursday',
 'ThursdayNight',
 'Friday',
 'FridayNight',
 'Saturday',
 'SaturdayNight']

In [31]:
short_descs = [sd.get_text() for sd in seven_day.select(".tombstone-container .short-desc")]
temps = [t.get_text() for t in seven_day.select(".tombstone-container .temp")]
descs = [d["title"] for d in seven_day.select(".tombstone-container img")]

print(short_descs)
print(temps)
print(descs)

['Clear', 'Sunny', 'Clear', 'Sunny', 'Mostly Clear', 'Mostly Sunny', 'Partly Cloudy', 'Partly Sunny', 'Chance Rain']
['Low: 44 °F', 'High: 59 °F', 'Low: 44 °F', 'High: 61 °F', 'Low: 46 °F', 'High: 60 °F', 'Low: 50 °F', 'High: 59 °F', 'Low: 50 °F']
['Tonight: Clear, with a low around 44. West northwest wind 5 to 10 mph becoming light and variable. ', 'Wednesday: Sunny, with a high near 59. Calm wind becoming northwest 5 to 7 mph in the afternoon. ', 'Wednesday Night: Clear, with a low around 44. West wind 5 to 8 mph becoming calm  in the evening. ', 'Thursday: Sunny, with a high near 61. Calm wind becoming north 5 to 8 mph in the morning. ', 'Thursday Night: Mostly clear, with a low around 46. West wind 6 to 14 mph, with gusts as high as 18 mph. ', 'Friday: Mostly sunny, with a high near 60.', 'Friday Night: Partly cloudy, with a low around 50.', 'Saturday: Partly sunny, with a high near 59.', 'Saturday Night: A chance of rain.  Mostly cloudy, with a low around 50.']


In [32]:
short_descs

['Clear',
 'Sunny',
 'Clear',
 'Sunny',
 'Mostly Clear',
 'Mostly Sunny',
 'Partly Cloudy',
 'Partly Sunny',
 'Chance Rain']

In [34]:
short_descs = [sd.get_text() for sd in seven_day.select(".tombstone-container .short-desc")]
temps = [t.get_text() for t in seven_day.select(".tombstone-container .temp")]
descs = [d["title"] for d in seven_day.select(".tombstone-container img")]

print(short_descs)
print(temps)
print(descs)

['Clear', 'Sunny', 'Clear', 'Sunny', 'Mostly Clear', 'Mostly Sunny', 'Partly Cloudy', 'Partly Sunny', 'Chance Rain']
['Low: 44 °F', 'High: 59 °F', 'Low: 44 °F', 'High: 61 °F', 'Low: 46 °F', 'High: 60 °F', 'Low: 50 °F', 'High: 59 °F', 'Low: 50 °F']
['Tonight: Clear, with a low around 44. West northwest wind 5 to 10 mph becoming light and variable. ', 'Wednesday: Sunny, with a high near 59. Calm wind becoming northwest 5 to 7 mph in the afternoon. ', 'Wednesday Night: Clear, with a low around 44. West wind 5 to 8 mph becoming calm  in the evening. ', 'Thursday: Sunny, with a high near 61. Calm wind becoming north 5 to 8 mph in the morning. ', 'Thursday Night: Mostly clear, with a low around 46. West wind 6 to 14 mph, with gusts as high as 18 mph. ', 'Friday: Mostly sunny, with a high near 60.', 'Friday Night: Partly cloudy, with a low around 50.', 'Saturday: Partly sunny, with a high near 59.', 'Saturday Night: A chance of rain.  Mostly cloudy, with a low around 50.']


# Combining our data into a Pandas Dataframe

In [35]:
import pandas as pd
weather = pd.DataFrame({
        "period": periods, 
        "short_desc": short_descs, 
        "temp": temps, 
        "desc":descs
    })
weather

Unnamed: 0,desc,period,short_desc,temp
0,"Tonight: Clear, with a low around 44. West nor...",Tonight,Clear,Low: 44 °F
1,"Wednesday: Sunny, with a high near 59. Calm wi...",Wednesday,Sunny,High: 59 °F
2,"Wednesday Night: Clear, with a low around 44. ...",WednesdayNight,Clear,Low: 44 °F
3,"Thursday: Sunny, with a high near 61. Calm win...",Thursday,Sunny,High: 61 °F
4,"Thursday Night: Mostly clear, with a low aroun...",ThursdayNight,Mostly Clear,Low: 46 °F
5,"Friday: Mostly sunny, with a high near 60.",Friday,Mostly Sunny,High: 60 °F
6,"Friday Night: Partly cloudy, with a low around...",FridayNight,Partly Cloudy,Low: 50 °F
7,"Saturday: Partly sunny, with a high near 59.",Saturday,Partly Sunny,High: 59 °F
8,Saturday Night: A chance of rain. Mostly clou...,SaturdayNight,Chance Rain,Low: 50 °F
