## Web scraping

#### Basic HTML page

```
<!DOCTYPE html>
<html>
<head>
    <title>Web Page!</title>
    <style>
        body {background-color: powderblue;}
        h1   {color: blue;}
        p    {color: red;}
    </style>
    <link rel="stylesheet" href="styles.css">
    <script>
        document.getElementById("demo").innerHTML = "Hello JavaScript!";
    </script>
</head>
<body>
    <h1>A Very Bold Header</h1>
    <div style="background-color:lightblue">
        <p>This is a paragraph.</p>
    </div>
</body>
</html>
```

### NYC weather history

http://w1.weather.gov/data/obhistory/KNYC.html

In [1]:
# NWS page of recent observed weather for New York City, Central Park (station KNYC)
knyc_link = "http://w1.weather.gov/data/obhistory/KNYC.html"

In [2]:
import requests

# Download the observation page. An explicit timeout is passed because requests
# applies none by default, so a stalled server would otherwise hang this cell
# indefinitely.
knyc_page = requests.get(knyc_link, timeout=30)
knyc_page  # show the response object (its repr includes the HTTP status code)

<Response [200]>

In [3]:
# the first 1000 bytes of the raw response body
# (.content is a bytes object, hence the b'...' prefix in the printed output)
print(knyc_page.content[:1000])

b'<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN">\r\n\t\t\t\t\t\t\t<html><meta name="Author" content="Leon Minton"><head><title>\r\n\t\t\t\t\t\t\tNational Weather Service : Observed Weather for past 3 Days : New York City, Central Park</title>\r\n\t\t\t\t\t\t\t<link rel="STYLESHEET" type="text/css" href="/images/weather/fcicons/main.css"></head>\r\n\t\t\t\t\t\t\t<body bgcolor="#ffffff" leftmargin="0" topmargin="0" marginwidth="0" marginheight="0" background="/images/weather/fcicons/gray_background.gif">\r\n\t\t\t\t\t\t\t<table cellspacing="0" cellpadding="0" border="0" width="670" background="/images/weather/fcicons/topbanner.jpg">\r\n\t\t\t\t\t\t\t<tr><td align="right" height="19"><a href="http://weather.gov"><span class="nwslink">weather.gov</span></a>&nbsp;&nbsp;&nbsp;</td></tr></table>\r\n\t\t\t\t\t\t\t<table cellspacing="0" cellpadding="0" border="0" width="670"><tr valign="top">\r\n\t\t\t\t\t\t\t<td rowspan="2"><a href="http://www.noaa.gov"><img src="/images/weathe

In [4]:
# need to parse some html!
from bs4 import BeautifulSoup

In [5]:
# Parse the raw HTML into a searchable tree. The parser is named explicitly so
# the result does not depend on which optional parsers happen to be installed,
# and so bs4 does not warn that no parser was specified.
knyc_soup = BeautifulSoup(knyc_page.content, 'html.parser')

In [6]:
# first 1000 characters of the parsed page, re-indented by prettify() so it is more legible
print(knyc_soup.prettify()[:1000])

<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN">
<html>
 <meta content="Leon Minton" name="Author"/>
 <head>
  <title>
   National Weather Service : Observed Weather for past 3 Days : New York City, Central Park
  </title>
  <link href="/images/weather/fcicons/main.css" rel="STYLESHEET" type="text/css"/>
 </head>
 <body background="/images/weather/fcicons/gray_background.gif" bgcolor="#ffffff" leftmargin="0" marginheight="0" marginwidth="0" topmargin="0">
  <table background="/images/weather/fcicons/topbanner.jpg" border="0" cellpadding="0" cellspacing="0" width="670">
   <tr>
    <td align="right" height="19">
     <a href="http://weather.gov">
      <span class="nwslink">
       weather.gov
      </span>
     </a>
    </td>
   </tr>
  </table>
  <table border="0" cellpadding="0" cellspacing="0" width="670">
   <tr valign="top">
    <td rowspan="2">
     <a href="http://www.noaa.gov">
      <img alt="NOAA logo - Click to go to the NOAA homepage" border="0" height="78" sr

In [7]:
# print the 4th table in the page (index 3, zero-based)
print(knyc_soup.find_all('table')[3])

<table border="0" cellpadding="2" cellspacing="3" width="670"><tr align="center" bgcolor="#b0c4de"><th rowspan="3" width="17">D<br/>a<br/>t<br/>e</th><th rowspan="3" width="32">Time<br/>(est)</th>
<th rowspan="3" width="80">Wind<br/>(mph)</th><th rowspan="3" width="40">Vis.<br/>(mi.)</th><th rowspan="3" width="80">Weather</th><th rowspan="3" width="65">Sky Cond.</th>
<th colspan="4">Temperature (ºF)</th><th rowspan="3" width="65">Relative<br/>Humidity</th><th rowspan="3" width="80">Wind<br/>Chill<br/>(°F)</th><th rowspan="3" width="80">Heat<br/>Index<br/>(°F)</th><th colspan="2">Pressure</th><th colspan="3">Precipitation (in.)</th></tr>
<tr align="center" bgcolor="#b0c4de"><th rowspan="2" width="45">Air</th><th rowspan="2" width="26">Dwpt</th><th colspan="2">6 hour</th>
<th rowspan="2" width="40">altimeter<br/>(in)</th><th rowspan="2" width="40">sea level<br/>(mb)</th><th rowspan="2" width="24">1 hr</th>
<th rowspan="2" width="24">3 hr</th><th rowspan="2" width="30">6 hr</th></tr>
<tr 

In [8]:
# build a dataframe from the 4th table in the page
import pandas as pd

data_table = knyc_soup.find_all('table')[3]
table_rows = data_table.find_all('tr')    # every row of the table

# the first three rows are header rows, so start from the fourth;
# each remaining row becomes a list holding the text of its <td> cells
data = [
    [cell.text for cell in table_row.find_all('td')]
    for table_row in table_rows[3:]
]

pd.DataFrame(data).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
0,16,07:51,W 10 G 17,4.0,Fog/Mist,FEW008 SCT044,47,43,,,86%,42.0,,29.86,1010.3,,,
1,16,06:51,W 12 G 17,5.0,Fog/Mist,OVC008,46,42,47.0,45.0,86%,40.0,,29.85,1010.0,,,0.11
2,16,05:51,SW 3 G 17,6.0,Fog/Mist,OVC011,46,42,,,86%,,,29.85,1009.9,,,
3,16,04:51,Vrbl 5,7.0,Overcast,OVC016,46,41,,,83%,44.0,,29.88,1010.9,0.02,,
4,16,03:51,Vrbl 5,4.0,Light Rain Fog/Mist,FEW012 FEW016 OVC025,46,43,,,89%,44.0,,29.88,1011.1,0.03,0.09,


#### Central Park weather history summary
https://www.wunderground.com/history/daily/us/ny/new-york-city/KNYC/date/2018-12-3?cm_ven=localwx_history

In [9]:
# Weather Underground daily-history page for KNYC (date segment in the URL: 2018-12-3)
wu_link = "https://www.wunderground.com/history/daily/us/ny/new-york-city/KNYC/date/2018-12-3?cm_ven=localwx_history"

In [10]:
# get the page
# An explicit timeout is passed because requests applies none by default, so a
# stalled server would otherwise hang this cell indefinitely.
wu_page = requests.get(wu_link, timeout=30)
wu_page  # show the response object (its repr includes the HTTP status code)

<Response [200]>

In [11]:
# Parse the downloaded HTML. The parser is named explicitly so the result does
# not depend on which optional parsers happen to be installed, and so bs4 does
# not warn that no parser was specified.
wu_soup = BeautifulSoup(wu_page.content, 'html.parser')

In [12]:
# first 1000 characters of the parsed page, re-indented by prettify()
print(wu_soup.prettify()[:1000])

<!DOCTYPE html>
<html lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   Weather Underground
  </title>
  <base href="/"/>
  <meta content="width=device-width, initial-scale=1" name="viewport"/>
  <meta content="IE=edge,chrome=1" http-equiv="X-UA-Compatible"/>
  <link href="/bundle-next/styles.40c95bfccea351808856.css" rel="stylesheet"/>
  <style ng-transition="app-root">
   {}  body,   p{font-size:.875rem;color:#1e2023}  :focus{outline:0!important}  a:link{color:#1088b0}  a:visited{color:#1088b0}  a:hover{color:#1088b0}  a:hover:not(.button){text-decoration:underline}  a:active{color:#1088b0}  a.button{color:#fff}  a.hook{text-decoration:underline}  a:focus,   button:focus{outline:0}  input[type=date],   input[type=datetime-local],   input[type=datetime],   input[type=email],   input[type=month],   input[type=number],   input[type=password],   input[type=search],   input[type=tel],   input[type=text],   input[type=time],   input[type=url],   input[type=week],   textarea{border-

In [13]:
# the table we want doesn't exist in the downloaded HTML! culprit: javascript
# (searching for <div> elements with class 'tablesaw-sortable' comes back empty)
wu_soup.find_all('div',class_='tablesaw-sortable')

[]

In [14]:
import re

# pull all of the text out of the page and trim leading/trailing whitespace
wu_text = wu_soup.get_text().strip()

# collapse each run of consecutive newlines into a single newline
wu_text = re.sub(r'\n+', '\n', wu_text)
print(wu_text[:1000])

Weather Underground
{}  body,   p{font-size:.875rem;color:#1e2023}  :focus{outline:0!important}  a:link{color:#1088b0}  a:visited{color:#1088b0}  a:hover{color:#1088b0}  a:hover:not(.button){text-decoration:underline}  a:active{color:#1088b0}  a.button{color:#fff}  a.hook{text-decoration:underline}  a:focus,   button:focus{outline:0}  input[type=date],   input[type=datetime-local],   input[type=datetime],   input[type=email],   input[type=month],   input[type=number],   input[type=password],   input[type=search],   input[type=tel],   input[type=text],   input[type=time],   input[type=url],   input[type=week],   textarea{border-radius:3px;-webkit-appearance:none;height:34px;margin:0;background:#f7f7f7}  input[type=date]:focus,   input[type=datetime-local]:focus,   input[type=datetime]:focus,   input[type=email]:focus,   input[type=month]:focus,   input[type=number]:focus,   input[type=password]:focus,   input[type=search]:focus,   input[type=tel]:focus,   input[type=text]:focus,   input

### Need to actually render page to process scripts!

In [16]:
# need to install chromedriver
# NOTE(review): the recorded output of this cell is a SessionNotCreatedException
# ("This version of ChromeDriver only supports Chrome version 80"), i.e. the
# chromedriver and Chrome versions did not match when this was last run.
from selenium.webdriver.chrome.options import Options
from selenium import webdriver
import chromedriver_binary  # presumably imported only for its side effect of making chromedriver findable — TODO confirm

chrome_options = Options()
chrome_options.add_argument("--headless")  # start Chrome with the --headless flag

driver = webdriver.Chrome(options=chrome_options)

SessionNotCreatedException: Message: session not created: This version of ChromeDriver only supports Chrome version 80


In [None]:
# this will actually render the page
# (loads wu_link in the Selenium-driven Chrome session held by `driver`)
driver.get(wu_link)

In [None]:
# two ways to find the table we want
# find_element(By.<strategy>, value) replaces the find_element_by_* helpers,
# which were deprecated in Selenium 4 and later removed.
from selenium.webdriver.common.by import By

wu_table = driver.find_element(By.CLASS_NAME, 'city-history-observation')
#wu_table = driver.find_element(By.ID, 'history-observation-table')

In [None]:
# text in the table (the element's .text attribute, displayed as the cell output)
wu_table.text

In [None]:
# extracting text into a dataframe
# find_elements(By.CSS_SELECTOR, ...) replaces find_elements_by_css_selector,
# which was deprecated in Selenium 4 and later removed.
from selenium.webdriver.common.by import By

# one inner list per <tr>, holding the stripped text of each <td> in that row
wu_data = [
    [td.text.strip() for td in tr.find_elements(By.CSS_SELECTOR, 'td')]
    for tr in wu_table.find_elements(By.CSS_SELECTOR, 'tr')
]
df_wu = pd.DataFrame(wu_data)
df_wu.head()

In [None]:
# visualize the rendered table by saving a screenshot of the element to ./images/test1.png
# TODO: still missing some stuff, need to debug
wu_table.screenshot('./images/test1.png')