# Scraping weather data 


This notebook scrapes historical daily weather information from https://www.wunderground.com/history/daily/. The procedure is as follows.

1. Select a date range and pass each day to the `get_data_n_soup` function.

2. Inside `get_data_n_soup`:
    - Ask Selenium to open the website and execute the inner JavaScript.
    - Parse the data into a BeautifulSoup object and look for the hourly table.
    - Parse the table into a pandas DataFrame.
    - Remove the units from the values.
    - Add a date column.
    
    
3. Use BeautifulSoup to extract the units in the `get_unit` function.  
4. Add the units to the pandas column names.
5. Combine the daily DataFrames into a single DataFrame and save it.

Reference: 
- http://stanford.edu/~mgorkove/cgi-bin/rpython_tutorials/Scraping_a_Webpage_Rendered_by_Javascript_Using_Python.php
- https://automatetheboringstuff.com/chapter11/

In [1]:
import sys
import wget
import requests
from pathlib import Path
from fastai.imports import *
from bs4 import BeautifulSoup
from selenium import webdriver 
from datetime import datetime, date
import time
import pandas as pd

## Testing the Scraper

In [2]:
# set path to the chromedriver binary and start a Selenium-controlled Chrome session
# NOTE(review): the driver path is passed positionally; newer Selenium releases may
# expect it via a Service object instead — confirm against the installed version
driver_path = "./chrome_driver/chromedriver"
browser = webdriver.Chrome(driver_path) 

In [3]:
# Bounds of the test range: a single day, so the scraper can be tried on one page
start_date = datetime(2018, 8, 26)
stop_date = datetime(2018, 8, 26)

# One entry per day between the bounds, rendered as 'YYYY-MM-DD' strings
date_range = pd.date_range(start=start_date, end=stop_date).strftime('%Y-%m-%d')

# Pick the first (here: only) day, i.e. '2018-08-26'
date_str = date_range[0]

In [4]:
# for now use the KBOS weather station (at the airport);
# date_str is the 'YYYY-MM-DD' day whose history page is requested
url=f'https://www.wunderground.com/history/daily/us/ma/boston/KBOS/date/{date_str}'

In [5]:
# open the website
browser.get(url)
# fixed 10 s pause before reading the DOM — presumably to let the page's scripts finish rendering
time.sleep(10)

# ask the browser for the current body markup
innerhtml= browser.execute_script("return document.body.innerHTML")

# parse html into beautifulsoup object
# NOTE(review): no parser is named here, so bs4 chooses one itself — confirm this is intended
soup = BeautifulSoup(innerhtml)

In [6]:
# find the hourly weather report table

# This selector worked in June 2019:
#div_table=soup.find_all(attrs={"id": "history-observation-table"})[0]

# Updated selector (Jan 2020); the [0] raises IndexError when nothing matches
div_table=soup.find_all(attrs={"class":"observation-table ng-star-inserted"})[0]

In [7]:
# parse the table markup into a pandas dataframe (read_html returns a list; take the first table)
daily_df = pd.read_html(str(div_table))[0]
# show the raw table; the values still carry their unit text at this point
daily_df

Unnamed: 0,Time,Temperature,Dew Point,Humidity,Wind,Wind Speed,Wind Gust,Pressure,Precip.,Condition
0,6:54 PM,76 F,66 F,71 %,S,12 mph,0 mph,29.99 in,0.0 in,Fair
1,7:54 PM,75 F,67 F,76 %,S,12 mph,0 mph,29.99 in,0.0 in,Fair
2,9:54 PM,75 F,67 F,76 %,SW,10 mph,0 mph,29.99 in,0.0 in,Fair
3,10:54 PM,74 F,67 F,79 %,WSW,10 mph,0 mph,30.00 in,0.0 in,Partly Cloudy


In [8]:
# Build the unit suffix for each column header from the first row of the table body
trow = div_table.tbody.find_all('tr')
f_row = trow[0]

# every "wu-label" element holds one unit; wrap its text in parentheses, e.g. 'F' -> '(F)'
units = [
    '(' + ''.join(label.stripped_strings) + ')'
    for label in f_row.find_all(class_="wu-label")
]

# columns without a unit get an empty suffix:
# position 0 (Time), position 4 (Wind) and the last column (Condition)
units.insert(0, '')
units.insert(4, '')
units.append('')

In [9]:
# preview the new header: each column name with its unit suffix appended (nothing is assigned yet)
daily_df.columns + units

Index(['Time', 'Temperature(F)', 'Dew Point(F)', 'Humidity(%)', 'Wind',
       'Wind Speed(mph)', 'Wind Gust(mph)', 'Pressure(in)', 'Precip.(in)',
       'Condition'],
      dtype='object')

In [10]:
# apply the unit-suffixed names to the dataframe
# NOTE: this cell is not idempotent — running it again appends the suffixes a second time
daily_df.columns = daily_df.columns + units
daily_df.head()

Unnamed: 0,Time,Temperature(F),Dew Point(F),Humidity(%),Wind,Wind Speed(mph),Wind Gust(mph),Pressure(in),Precip.(in),Condition
0,6:54 PM,76 F,66 F,71 %,S,12 mph,0 mph,29.99 in,0.0 in,Fair
1,7:54 PM,75 F,67 F,76 %,S,12 mph,0 mph,29.99 in,0.0 in,Fair
2,9:54 PM,75 F,67 F,76 %,SW,10 mph,0 mph,29.99 in,0.0 in,Fair
3,10:54 PM,74 F,67 F,79 %,WSW,10 mph,0 mph,30.00 in,0.0 in,Partly Cloudy


In [11]:
# Strip the unit text (a non-breaking space followed by the unit) from the value
# columns, addressed by position: (column index, suffix to remove)
unit_suffix_by_column = (
    (1, '\xa0F'),
    (2, '\xa0F'),
    (3, '\xa0%'),
    (5, '\xa0mph'),
    (6, '\xa0mph'),
    (7, '\xa0in'),
    (8, '\xa0in'),
)
for col_idx, suffix in unit_suffix_by_column:
    daily_df.iloc[:, col_idx] = daily_df.iloc[:, col_idx].str.replace(suffix, '')
# column 9 used to need ' in' stripped as well; that is missing in Jan 2020

In [12]:
# inspect the table after the unit text has been stripped from the values
daily_df

Unnamed: 0,Time,Temperature(F),Dew Point(F),Humidity(%),Wind,Wind Speed(mph),Wind Gust(mph),Pressure(in),Precip.(in),Condition
0,6:54 PM,76,66,71,S,12,0,29.99,0.0,Fair
1,7:54 PM,75,67,76,S,12,0,29.99,0.0,Fair
2,9:54 PM,75,67,76,SW,10,0,29.99,0.0,Fair
3,10:54 PM,74,67,79,WSW,10,0,30.0,0.0,Partly Cloudy


In [13]:
# Start from an empty frame and stack the day's table onto it, mirroring how
# daily tables are accumulated in the mass-scraping section
weather = pd.concat([pd.DataFrame(), daily_df])

In [14]:
# check the first rows of the combined frame
weather.head()

Unnamed: 0,Time,Temperature(F),Dew Point(F),Humidity(%),Wind,Wind Speed(mph),Wind Gust(mph),Pressure(in),Precip.(in),Condition
0,6:54 PM,76,66,71,S,12,0,29.99,0.0,Fair
1,7:54 PM,75,67,76,S,12,0,29.99,0.0,Fair
2,9:54 PM,75,67,76,SW,10,0,29.99,0.0,Fair
3,10:54 PM,74,67,79,WSW,10,0,30.0,0.0,Partly Cloudy


## Perform Mass Scraping

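Put the above code into functions and scrape a whole date range. The helper functions `get_data_n_soup` and `get_unit` are defined in the last two cells of this notebook and must be run before the scraping loop.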

In [1]:
import sys
import wget
from pathlib import Path
from fastai.imports import *
import requests
from bs4 import BeautifulSoup
from selenium import webdriver 
from datetime import datetime, date
import time
import pandas as pd

In [15]:
# set path to the chromedriver binary and start a Selenium-controlled Chrome session
# NOTE(review): the driver path is passed positionally; newer Selenium releases may
# expect it via a Service object instead — confirm against the installed version
driver_path = "./chrome_driver/chromedriver"
browser = webdriver.Chrome(driver_path) 

In [16]:
# Bounds of the scraping range (both days are included)
start_date = datetime(2019, 1, 1)
stop_date = datetime(2019, 6, 10)

# One 'YYYY-MM-DD' string per day in the range
date_range = pd.date_range(start=start_date, end=stop_date).strftime('%Y-%m-%d')

In [17]:
# Scrape every day in date_range and stack the daily tables into one dataframe.
# NOTE: despite its name, weather_2016 holds whatever range date_range covers;
# the name is kept because the save cell further down refers to it.
daily_frames = []   # one cleaned dataframe per scraped day
n_rows = 0          # running row total, printed after each day

try:
    # NOTE(review): the loop variable `date` shadows `datetime.date` from the
    # import cell; it is left unchanged in case other cells read the global `date`.
    for date in date_range:
        # obtain daily weather dataframe and the parsed table it came from
        daily_df, div_table = get_data_n_soup(date)
        # get the unit suffix for every column
        units = get_unit(div_table)

        # add the unit into the column names
        daily_df.columns = daily_df.columns + units

        print(f'Finish obtaining data on {date}')
        # collect the day's table; the frames are combined once, after the loop
        daily_frames.append(daily_df)
        n_rows += len(daily_df)
        print(n_rows)
finally:
    # Concatenate exactly once instead of re-copying the growing frame on every
    # iteration, and do it even when the loop is interrupted so the days scraped
    # so far are still available in weather_2016.
    weather_2016 = (
        pd.concat(daily_frames, axis=0, join='outer') if daily_frames else pd.DataFrame()
    )

#save weather dataframe (only reached when the loop finished without an error)
weather_2016.to_csv('data/raw/weather/weather_2019_v1.csv')

Finish obtaining data on 2019-01-01
29
Finish obtaining data on 2019-01-02
53
Finish obtaining data on 2019-01-03
82
Finish obtaining data on 2019-01-04
106
Finish obtaining data on 2019-01-05
144
Finish obtaining data on 2019-01-06
168
Finish obtaining data on 2019-01-07
192
Finish obtaining data on 2019-01-08
232
Finish obtaining data on 2019-01-09
267
Finish obtaining data on 2019-01-10
291
Finish obtaining data on 2019-01-11
313
Finish obtaining data on 2019-01-12
336
Finish obtaining data on 2019-01-13
361
Finish obtaining data on 2019-01-14
386
Finish obtaining data on 2019-01-15
409
Finish obtaining data on 2019-01-16
433
Finish obtaining data on 2019-01-17
457
Finish obtaining data on 2019-01-18
480
Finish obtaining data on 2019-01-19
512
Finish obtaining data on 2019-01-20
563
Finish obtaining data on 2019-01-21
590
Finish obtaining data on 2019-01-22
614
Finish obtaining data on 2019-01-23
641
Finish obtaining data on 2019-01-24
675
Finish obtaining data on 2019-01-25
701
Fin

In [9]:
# The scraping loop sometimes stops part-way through; run this cell to save
# whatever has been collected in weather_2016 so far.
weather_2016.to_csv('data/raw/weather/weather_2019_v1.csv')

In [6]:
def get_data_n_soup(date_str, max_tries=None):
    '''Scrape the hourly observation table of one day for station KBOS.

    Steps:
    - Ask Selenium (the module-level ``browser``) to open the page and read the
      rendered body markup.
    - Parse the markup into a BeautifulSoup object and look for the hourly table;
      reload the page until the table is present.
    - Parse the table into a pandas dataframe.
    - Remove the unit text from the value columns.
    - Add a ``date`` column combining ``date_str`` with each row's ``Time``.

    Input:
        date_str: the day to scrape, as a 'YYYY-MM-DD' string.
        max_tries: optional upper bound on page loads. ``None`` (the default)
            keeps retrying until the table shows up.

    return: daily weather dataframe and the BeautifulSoup object of that table

    Raises:
        RuntimeError: if ``max_tries`` is given and the table was not found
            within that many page loads.
    '''
    # the weather station to be used (station KBOS at airport)
    url=f'https://www.wunderground.com/history/daily/us/ma/boston/KBOS/date/{date_str}'
    table_attrs = {"class":"observation-table ng-star-inserted"}

    tries = 0
    while True:
        browser.get(url)
        time.sleep(10)  # fixed pause before the DOM is read
        innerhtml = browser.execute_script("return document.body.innerHTML")
        soup = BeautifulSoup(innerhtml)
        time.sleep(2)
        # look the table up once and reuse the match after the loop
        tables = soup.find_all(attrs=table_attrs)
        if tables:
            break
        tries += 1
        if max_tries is not None and tries >= max_tries:
            raise RuntimeError(
                f'No observation table found for {date_str} after {tries} attempts'
            )

    div_table = tables[0]
    daily_df = pd.read_html(str(div_table))[0]

    # clean up data: strip the unit text (non-breaking space + unit) from the
    # value columns, addressed by position as (column index, suffix to remove)
    unit_suffix_by_column = (
        (1, '\xa0F'),
        (2, '\xa0F'),
        (3, '\xa0%'),
        (5, '\xa0mph'),
        (6, '\xa0mph'),
        (7, '\xa0in'),
        (8, '\xa0in'),
    )
    for col_idx, suffix in unit_suffix_by_column:
        daily_df.iloc[:, col_idx] = daily_df.iloc[:, col_idx].str.replace(suffix, '')
    # column 9 used to need ' in' stripped as well; that is missing in Jan 2020

    # add date column; built from the date_str argument rather than from a
    # global named `date`, so the function no longer depends on the caller's
    # loop variable
    daily_df['date'] = pd.to_datetime(date_str + ' ' + daily_df['Time'], format="%Y-%m-%d %I:%M %p")
    return daily_df, div_table

In [7]:
def get_unit(div_table):
    '''Build the list of unit suffixes for the weather table's columns.

    The units are read from the "wu-label" elements of the first row of the
    table body and wrapped in parentheses. An empty string is placed at
    positions 0 and 4 and two more are added at the end, so that columns
    without a unit get an empty suffix.

    return: list of strings
    '''
    first_row = div_table.tbody.find_all('tr')[0]

    # one '(unit)' entry per label found in the row
    suffixes = [
        '(' + ''.join(label.stripped_strings) + ')'
        for label in first_row.find_all(class_="wu-label")
    ]

    # placeholders for the columns that carry no unit
    suffixes.insert(0, '')
    suffixes.insert(4, '')
    suffixes.extend(['', ''])
    return suffixes