# Scraping Data From The Web

#### Why?

Sometimes data can't be accessed from a easy csv/txt/etc. file or an API. 

In [2]:
import requests

In [3]:
page = requests.get("http://dataquestio.github.io/web-scraping-pages/simple.html")
print page

# status code that starts with 2 == good; starts with 4 or 5 == bad
print page.status_code

<Response [200]>
200


In [4]:
# view html content
page.content

'<!DOCTYPE html>\n<html>\n    <head>\n        <title>A simple example page</title>\n    </head>\n    <body>\n        <p>Here is some simple content for this page.</p>\n    </body>\n</html>'

# Cool, we have an html in Python. Now what?

### Use beautifulsoup to parse it!

In [6]:
from bs4 import BeautifulSoup
soup = BeautifulSoup(page.content, 'html.parser')

In [7]:
soup

<!DOCTYPE html>

<html>
<head>
<title>A simple example page</title>
</head>
<body>
<p>Here is some simple content for this page.</p>
</body>
</html>

In [9]:
print soup.prettify()

<!DOCTYPE html>
<html>
 <head>
  <title>
   A simple example page
  </title>
 </head>
 <body>
  <p>
   Here is some simple content for this page.
  </p>
 </body>
</html>


In [12]:
# collect all elements at top level of age
print list(soup.children)

print [type(item) for item in soup.children]

[u'html', u'\n', <html>
<head>
<title>A simple example page</title>
</head>
<body>
<p>Here is some simple content for this page.</p>
</body>
</html>]
[<class 'bs4.element.Doctype'>, <class 'bs4.element.NavigableString'>, <class 'bs4.element.Tag'>]


In [24]:
html = list(soup.children)[2]
print list(html.children)
print len(list(html.children))
print "\n"
body = list(html.children)[3]
print body

[u'\n', <head>
<title>A simple example page</title>
</head>, u'\n', <body>
<p>Here is some simple content for this page.</p>
</body>, u'\n']
5


<listiterator object at 0x10778b990>


In [27]:
print list(body.children)

[u'\n', <p>Here is some simple content for this page.</p>, u'\n']


In [29]:
p = list(body.children)[1]
p

<p>Here is some simple content for this page.</p>

In [32]:
# In one line
soup.find_all('p')

[<p>Here is some simple content for this page.</p>]

In [35]:
soup.find_all('p')[0].get_text()

u'Here is some simple content for this page.'

In [36]:
# finding only the first method
soup.find('p')

<p>Here is some simple content for this page.</p>

In [45]:
page = requests.get("http://dataquestio.github.io/web-scraping-pages/ids_and_classes.html")
soup = BeautifulSoup(page.content, 'html.parser')

In [46]:
print soup.prettify()

<html>
 <head>
  <title>
   A simple example page
  </title>
 </head>
 <body>
  <div>
   <p class="inner-text first-item" id="first">
    First paragraph.
   </p>
   <p class="inner-text">
    Second paragraph.
   </p>
  </div>
  <p class="outer-text first-item" id="second">
   <b>
    First outer paragraph.
   </b>
  </p>
  <p class="outer-text">
   <b>
    Second outer paragraph.
   </b>
  </p>
 </body>
</html>


In [54]:
# paragraphs with class 'outer-text'
soup.find_all('p', class_='outer-text')
soup.select("p.outer-text")

[<p class="outer-text first-item" id="second">
 <b>
                 First outer paragraph.
             </b>
 </p>, <p class="outer-text">
 <b>
                 Second outer paragraph.
             </b>
 </p>]

In [48]:
# any tag with class outer-text
soup.find_all(class_='outer-text')

[<p class="outer-text first-item" id="second">
 <b>
                 First outer paragraph.
             </b>
 </p>, <p class="outer-text">
 <b>
                 Second outer paragraph.
             </b>
 </p>]

In [49]:
soup.find_all(id="second")

[<p class="outer-text first-item" id="second">
 <b>
                 First outer paragraph.
             </b>
 </p>]

# Weather Example

In [56]:
page = requests.get("http://forecast.weather.gov/MapClick.php?lat=37.7772&lon=-122.4168#.WKHxhxIrKRs")
soup = BeautifulSoup(page.content, 'html.parser')

In [58]:
print soup.prettify()

<!DOCTYPE html>
<html class="no-js">
 <head>
  <!-- Meta -->
  <meta content="width=device-width" name="viewport">
   <link href="http://purl.org/dc/elements/1.1/" rel="schema.DC"/>
   <title>
    National Weather Service
   </title>
   <meta content="National Weather Service" name="DC.title"/>
   <meta content="NOAA National Weather Service National Weather Service" name="DC.description"/>
   <meta content="US Department of Commerce, NOAA, National Weather Service" name="DC.creator"/>
   <meta content="" name="DC.date.created" scheme="ISO8601"/>
   <meta content="EN-US" name="DC.language" scheme="DCTERMS.RFC1766"/>
   <meta content="weather, National Weather Service" name="DC.keywords"/>
   <meta content="NOAA's National Weather Service" name="DC.publisher"/>
   <meta content="National Weather Service" name="DC.contributor"/>
   <meta content="http://www.weather.gov/disclaimer.php" name="DC.rights"/>
   <meta content="General" name="rating"/>
   <meta content="index,follow" name="robo

In [65]:
seven_day = soup.find(id='seven-day-forecast-list')
print seven_day.prettify()

<ul class="list-unstyled" id="seven-day-forecast-list">
 <li class="forecast-tombstone">
  <div class="tombstone-container">
   <p class="period-name">
    Today
    <br>
     <br/>
    </br>
   </p>
   <p>
    <img alt="Today: Mostly sunny, with a high near 58. Calm wind becoming west northwest around 5 mph in the afternoon. " class="forecast-icon" src="newimages/medium/sct.png" title="Today: Mostly sunny, with a high near 58. Calm wind becoming west northwest around 5 mph in the afternoon. "/>
   </p>
   <p class="short-desc">
    Mostly Sunny
   </p>
   <p class="temp temp-high">
    High: 58 °F
   </p>
  </div>
 </li>
 <li class="forecast-tombstone">
  <div class="tombstone-container">
   <p class="period-name">
    Tonight
    <br>
     <br/>
    </br>
   </p>
   <p>
    <img alt="Tonight: Mostly cloudy, with a low around 48. West wind 3 to 8 mph. " class="forecast-icon" src="newimages/medium/nbkn.png" title="Tonight: Mostly cloudy, with a low around 48. West wind 3 to 8 mph. "/>


In [71]:
forecast_stuff = soup.find_all(class_='tombstone-container')
print forecast_stuff
print len(forecast_stuff)

[<div class="tombstone-container">
<p class="period-name">Today<br><br/></br></p>
<p><img alt="Today: Mostly sunny, with a high near 58. Calm wind becoming west northwest around 5 mph in the afternoon. " class="forecast-icon" src="newimages/medium/sct.png" title="Today: Mostly sunny, with a high near 58. Calm wind becoming west northwest around 5 mph in the afternoon. "/></p><p class="short-desc">Mostly Sunny</p><p class="temp temp-high">High: 58 °F</p></div>, <div class="tombstone-container">
<p class="period-name">Tonight<br><br/></br></p>
<p><img alt="Tonight: Mostly cloudy, with a low around 48. West wind 3 to 8 mph. " class="forecast-icon" src="newimages/medium/nbkn.png" title="Tonight: Mostly cloudy, with a low around 48. West wind 3 to 8 mph. "/></p><p class="short-desc">Mostly Cloudy</p><p class="temp temp-low">Low: 48 °F</p></div>, <div class="tombstone-container">
<p class="period-name">Tuesday<br><br/></br></p>
<p><img alt="Tuesday: Mostly sunny, with a high near 60. Calm wi

In [73]:
tonight = forecast_stuff[0]
print tonight.prettify()

<div class="tombstone-container">
 <p class="period-name">
  Today
  <br>
   <br/>
  </br>
 </p>
 <p>
  <img alt="Today: Mostly sunny, with a high near 58. Calm wind becoming west northwest around 5 mph in the afternoon. " class="forecast-icon" src="newimages/medium/sct.png" title="Today: Mostly sunny, with a high near 58. Calm wind becoming west northwest around 5 mph in the afternoon. "/>
 </p>
 <p class="short-desc">
  Mostly Sunny
 </p>
 <p class="temp temp-high">
  High: 58 °F
 </p>
</div>


In [75]:
period = tonight.find(class_='period-name').get_text()
shortdsc = tonight.find(class_='short-desc').get_text()
temp = tonight.find(class_ = 'temp temp-high').get_text()
print period
print shortdsc
print temp

Today
Mostly Sunny
High: 58 °F


In [79]:
img = tonight.find('img')
desc = img['title']
print desc

Today: Mostly sunny, with a high near 58. Calm wind becoming west northwest around 5 mph in the afternoon. 


In [80]:
period_tags = seven_day.select(".tombstone-container .period-name")
periods = [pt.get_text() for pt in period_tags]
periods

[u'Today',
 u'Tonight',
 u'Tuesday',
 u'TuesdayNight',
 u'Wednesday',
 u'WednesdayNight',
 u'Thursday',
 u'ThursdayNight',
 u'Friday']

In [84]:
sd_tags = seven_day.select(".tombstone-container .short-desc")
sd_tags = [sd.get_text() for sd in sd_tags]
sd_tags

[u'Mostly Sunny',
 u'Mostly Cloudy',
 u'Mostly Sunny',
 u'Mostly Cloudy',
 u'Partly Sunnythen SlightChance Rain',
 u'Rain',
 u'Rain Likely',
 u'ChanceShowers',
 u'Chance Rain']

In [86]:
temp = seven_day.select(".tombstone-container .temp")
temp = [t.get_text() for t in temp]
temp

[u'High: 58 \xb0F',
 u'Low: 48 \xb0F',
 u'High: 60 \xb0F',
 u'Low: 50 \xb0F',
 u'High: 62 \xb0F',
 u'Low: 54 \xb0F',
 u'High: 60 \xb0F',
 u'Low: 51 \xb0F',
 u'High: 58 \xb0F']

In [88]:
descs = seven_day.select(".tombstone-container img")
descs = [d['title'] for d in descs]
descs
#descs = [d["title"] for d in seven_day.select(".tombstone-container img")]

[u'Today: Mostly sunny, with a high near 58. Calm wind becoming west northwest around 5 mph in the afternoon. ',
 u'Tonight: Mostly cloudy, with a low around 48. West wind 3 to 8 mph. ',
 u'Tuesday: Mostly sunny, with a high near 60. Calm wind becoming west around 6 mph in the afternoon. ',
 u'Tuesday Night: Mostly cloudy, with a low around 50. West southwest wind 7 to 10 mph. ',
 u'Wednesday: A 10 percent chance of rain after 4pm.  Mostly cloudy, with a high near 62. Calm wind. ',
 u'Wednesday Night: Rain, mainly after 10pm.  Low around 54. Chance of precipitation is 80%. New precipitation amounts between a quarter and half of an inch possible. ',
 u'Thursday: Rain likely.  Cloudy, with a high near 60. Chance of precipitation is 60%.',
 u'Thursday Night: A chance of showers.  Mostly cloudy, with a low around 51.',
 u'Friday: A chance of rain.  Mostly cloudy, with a high near 58.']

In [93]:
import pandas as pd
weather = pd.DataFrame({
        "period": periods, 
        "short_desc": sd_tags, 
        "temp": temp, 
        "desc":descs
    })

temp_nums = weather["temp"].str.extract("(?P<temp_num>\d+)")
weather["temp_num"] = temp_nums.astype('int')
temp_nums

0    58
1    48
2    60
3    50
4    62
5    54
6    60
7    51
8    58
Name: temp_num, dtype: object

In [94]:
weather

Unnamed: 0,desc,period,short_desc,temp,temp_num
0,"Today: Mostly sunny, with a high near 58. Calm...",Today,Mostly Sunny,High: 58 °F,58
1,"Tonight: Mostly cloudy, with a low around 48. ...",Tonight,Mostly Cloudy,Low: 48 °F,48
2,"Tuesday: Mostly sunny, with a high near 60. Ca...",Tuesday,Mostly Sunny,High: 60 °F,60
3,"Tuesday Night: Mostly cloudy, with a low aroun...",TuesdayNight,Mostly Cloudy,Low: 50 °F,50
4,Wednesday: A 10 percent chance of rain after 4...,Wednesday,Partly Sunnythen SlightChance Rain,High: 62 °F,62
5,"Wednesday Night: Rain, mainly after 10pm. Low...",WednesdayNight,Rain,Low: 54 °F,54
6,"Thursday: Rain likely. Cloudy, with a high ne...",Thursday,Rain Likely,High: 60 °F,60
7,Thursday Night: A chance of showers. Mostly c...,ThursdayNight,ChanceShowers,Low: 51 °F,51
8,"Friday: A chance of rain. Mostly cloudy, with...",Friday,Chance Rain,High: 58 °F,58


In [95]:
weather['temp_num'].mean()

55.666666666666664