# Weather Data Webscraper

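This notebook scrapes the weather observations for the past 24 hours for a single station from https://weather.gc.ca and saves them to a CSV file, merging with any previously saved records.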

In [1]:
import os
import urllib
import lxml
from lxml import html
import requests
import datetime
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from platform import python_version
# Show which Python version the kernel is running.
python_version()

'3.7.7'

In [2]:
# Station identifier; it is substituted into the `station` query parameter below.
station_code = 'vou'
# Past-conditions page for the selected station.
url = f'https://weather.gc.ca/past_conditions/index_e.html?station={station_code:s}'

In [3]:
# Top-level folder under which the CSV files are saved.
data_dir = 'data'
# CSV file name; it embeds the station code so each station gets its own file.
filename = f'past-24-hr-{station_code}-data.csv'

In [4]:
# LXML Tutorial is here: https://docs.python-guide.org/scenarios/scrape/
# A timeout keeps this cell from hanging indefinitely if the server never answers.
page = requests.get(url, timeout=30)
# Stop here on an HTTP error status rather than parsing an error page and
# failing later when the expected table cannot be found.
page.raise_for_status()
# Use page.content rather than page.text because
# html.fromstring expects bytes as input.
tree = html.fromstring(page.content)

In [5]:
import itertools

# Translation table for str.translate: every code point in 0x00-0x1F and
# 0x7F-0x9F maps to None, which deletes that character.
nonprintable = dict.fromkeys(
    itertools.chain(range(0x00, 0x20), range(0x7f, 0xa0))
)


def remove_nonprintable(text, nonprintable=nonprintable):
    """Return `text` with every character listed in `nonprintable` deleted.

    Args:
        text: String to clean.
        nonprintable: Translation table mapping code points to None.
            Defaults to the module-level table covering 0x00-0x1F and
            0x7F-0x9F (this includes tab, newline and carriage return).

    Returns:
        A copy of `text` without the listed characters.
    """
    cleaned = text.translate(nonprintable)
    return cleaned

def read_element_text(element, empty='Missing'):
    """Return an element's text with whitespace collapsed.

    Args:
        element: Object exposing ``text_content()`` whose text is read.
        empty: Placeholder returned when no text remains after cleaning.

    Returns:
        The element's text with non-printable characters removed and runs
        of whitespace collapsed to single spaces, or `empty` if the text
        is empty or contains nothing but whitespace.
    """
    text = element.text_content()
    if text:
        # Control characters (tabs and newlines included) are deleted before
        # the split, so only the remaining whitespace separates words.
        text = ' '.join(remove_nonprintable(text).split())
    # Whitespace-only content cleans down to '' and is reported as empty,
    # the same as content that was empty to begin with.
    return text if text else empty

In [7]:
# Locate the past-24-hour observations table; exactly one match is expected.
results = tree.xpath('//table[@id="past24Table"]')
assert len(results) == 1
past_24_table = results[0]

print("Reading data from past 24-hour table...")
# Column label used later as the DataFrame index name.
# NOTE(review): this is fixed to 'PST', while the scan below accepts any
# header starting with 'Date / Time' - confirm the label the page shows
# at other times of year.
datetime_label = 'Date / Time PST'
table_header = past_24_table.xpath('thead')[0]
table_headers = list(table_header.xpath('tr/th'))
print("Table headings:")
count = 0        # number of columns kept so far (imperial ones are skipped)
time_col = None  # position of the date/time column among the kept columns
col_labels = []
for item in table_headers:
    # Header id ends in 'm' for metric or 'i' for imperial
    if item.attrib['id'].endswith('i'):
        continue
    # The label is the header's own text followed by the text of each child
    # element, whitespace-trimmed, with empty pieces dropped.
    pieces = [' '.join(item.text.split()) if item.text else '']
    pieces.extend(c.text.strip() for c in item.getchildren() if c.text)
    # Filter on truthiness: comparing strings with `is not ''` tests object
    # identity, not equality.
    label = ' '.join(s for s in pieces if s)
    if label == '':
        # Unlabelled column: fall back to a positional placeholder name.
        label = f'Header{count}'
    print(f"{count:3d}: '{label}'")
    col_labels.append(label)
    if label.startswith('Date / Time'):
        time_col = count
    count += 1
n_columns = count
assert time_col is not None, "time column not recognized"

table_body = past_24_table.xpath('tbody')[0]
rows = table_body.getchildren()
print(f"Table has {len(rows)} rows.")
print("Table rows:")

Reading data from past 24-hour table...
Table headings:
  0: 'Date / Time PST'
  1: 'Conditions'
  2: 'Temperature °C'
  3: 'Wind km/h'
  4: 'Relative %'
  5: 'Dew °C'
  6: 'Pressure kPa'
  7: 'Visibility km'
Table has 27 rows.
Table rows:


In [8]:
# Start from today's date; it is replaced as soon as a date header row is read.
date = datetime.datetime.now().date()
data = []
for i, row in enumerate(rows):
    header_cells = row.xpath('th')
    if header_cells:
        # A lone <th> with the table-date class carries the calendar date
        # that applies to the observation rows following it.
        if len(header_cells) == 1 and header_cells[0].attrib['class'] == 'wxo-th-bkg table-date':
            date = pd.to_datetime(header_cells[0].text).date()
        print(f"{i:3d}: Date {date}")

    data_cells = row.xpath('td')
    if data_cells:
        # Keep metric cells only: skip any cell whose `headers` id ends in
        # 'i' or whose class mentions 'imperial'.
        row_data = [
            read_element_text(cell)
            for cell in data_cells
            if not (cell.attrib['headers'].endswith('i') or 'imperial' in cell.attrib['class'])
        ]
        count = len(row_data)
        assert count == n_columns, 'Failed to read table row data'
        print(f"{i:3d}: {row_data}")
        # Replace the bare HH:MM in the time column with a full timestamp
        # built from the current date.
        obs_time = datetime.datetime.strptime(row_data[time_col], "%H:%M").time()
        stamp = datetime.datetime.combine(date, obs_time)
        row_data[time_col] = stamp.strftime('%Y-%m-%d %H:%M')
        data.append(row_data)
        

  0: Date 2021-01-03
  1: ['21:00', 'n/a', '7 (6.8)', 'ESE 5', '86', '5', '101.1', 'n/a']
  2: ['20:00', 'n/a', '7 (7.0)', 'ESE 17 gust 27', '88', '5', '101.3', 'n/a']
  3: ['19:00', 'n/a', '7 (6.8)', 'ESE 11', '87', '5', '101.4', 'n/a']
  4: ['18:00', 'n/a', '7 (6.5)', 'SE 10', '88', '5', '101.4', 'n/a']
  5: ['17:00', 'n/a', '7 (6.7)', 'ESE 7', '88', '5', '101.4', 'n/a']
  6: ['16:00', 'n/a', '7 (7.0)', 'ESE 8', '87', '5', '101.3', 'n/a']
  7: ['15:00', 'n/a', '8 (7.6)', 'SE 17', '85', '5', '101.3', 'n/a']
  8: ['14:00', 'n/a', '8 (8.3)', 'SE 11', '82', '5', '101.2', 'n/a']
  9: ['13:00', 'n/a', '8 (8.4)', 'SE 17', '84', '6', '101.2', 'n/a']
 10: ['12:00', 'n/a', '7 (7.1)', 'ESE 18 gust 27', '90', '6', '101.1', 'n/a']
 11: ['11:00', 'n/a', '6 (6.3)', 'SE 15 gust 31', '97', '6', '101.1', 'n/a']
 12: ['10:00', 'n/a', '5 (4.9)', 'ESE 23 gust 34', '97', '5', '101.0', 'n/a']
 13: ['09:00', 'n/a', '5 (5.3)', 'SE 22 gust 37', '97', '5', '101.0', 'n/a']
 14: ['08:00', 'n/a', '6 (5.7)', 'SE 1

In [9]:
# Assemble the scraped rows into a frame, index it by the timestamp column
# and sort the index in ascending order.
df = pd.DataFrame(data, columns=col_labels)
df = df.set_index(datetime_label)
df = df.sort_index()
df

Unnamed: 0_level_0,Conditions,Temperature °C,Wind km/h,Relative %,Dew °C,Pressure kPa,Visibility km
Date / Time PST,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2021-01-02 21:00,,5 (5.1),NE 7,99,5,100.1,
2021-01-02 22:00,,5 (4.7),SE 3,100,5,100.2,
2021-01-02 23:00,,4 (4.4),calm,99,4,100.3,
2021-01-03 00:00,,4 (4.1),WNW 3,99,4,100.4,
2021-01-03 01:00,,3 (2.5),NW 3,100,3,100.4,
2021-01-03 02:00,,3 (2.6),NNW 3,98,2,100.5,
2021-01-03 03:00,,2 (1.9),NNW 3,99,2,100.6,
2021-01-03 04:00,,1 (1.1),NE 6,100,1,100.7,
2021-01-03 05:00,,2 (1.5),SE 4,100,2,100.7,
2021-01-03 06:00,,4 (4.3),ESE 8,93,3,100.8,


In [10]:
def read_data_from_file(data_dir, year, filename):
    """Load the CSV stored at ``<data_dir>/<year>/<filename>``.

    Args:
        data_dir: Root folder of the saved data.
        year: Integer year, used as the sub-folder name.
        filename: Name of the CSV file inside the year folder.

    Returns:
        The file contents as a DataFrame, read with pandas' default options.
    """
    year_dir = os.path.join(data_dir, f"{year:d}")
    return pd.read_csv(os.path.join(year_dir, filename))

def save_data_to_file(df, data_dir, year, filename):
    """Write `df` as CSV to ``<data_dir>/<year>/<filename>``.

    Args:
        df: DataFrame to save; its index is written as the first column.
        data_dir: Root folder of the saved data.
        year: Integer year, used as the sub-folder name.
        filename: Name of the CSV file inside the year folder.

    Returns:
        The DataFrame that was written (the `df` argument itself).
    """
    year_dir = os.path.join(data_dir, f"{year:d}")
    # Create the year folder (and data_dir) if this is the first save.
    os.makedirs(year_dir, exist_ok=True)
    filepath = os.path.join(year_dir, filename)
    df.to_csv(filepath)
    return df

In [11]:
# `date` is left over from the row-parsing loop (the last date header it
# read), so the whole batch is filed under that one year.
year = date.year
year_dir = os.path.join(data_dir, f"{year:d}")
filepath = os.path.join(year_dir, filename)

if not os.path.exists(filepath):
    # Creates data_dir and the year folder as needed (missing parents
    # included) and does nothing when they already exist.
    os.makedirs(year_dir, exist_ok=True)
    df.to_csv(filepath)
    print(f"Data saved to {filepath}")
else:
    print("Existing file found")
    # Read every value as a literal string. keep_default_na=False stops
    # pandas from converting text such as 'n/a' to NaN, so stored values
    # stay comparable with freshly scraped ones.
    df_existing = pd.read_csv(
        filepath, index_col=0, dtype=str, keep_default_na=False
    )
    assert df_existing.index.name == datetime_label
    df_existing = df_existing.sort_index()
    # Keep only the stored records older than the first newly scraped one;
    # everything from that timestamp onward comes from the fresh data.
    df = pd.concat(
        [df_existing.loc[df_existing.index < df.index[0]], df],
        axis=0
    )
    assert not df.index.duplicated().any(), "duplicate timestamps after merge"
    df.to_csv(filepath)
    print(f"Data merged and saved to {filepath}")

Existing file found
Data merged and saved to data/2021/past-24-hr-vou-data.csv


In [12]:
# Only defined when the save step above found an existing file on disk.
df_existing

Unnamed: 0_level_0,Conditions,Temperature °C,Wind km/h,Relative %,Dew °C,Pressure kPa,Visibility km
Date / Time PST,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2021-01-02 16:00,,8 (7.8),ENE 2,99,8,99.9,
2021-01-02 17:00,,7 (7.3),E 2,100,7,100.0,
2021-01-02 18:00,,7 (6.6),N 3,100,7,100.0,
2021-01-02 19:00,,6 (6.0),E 2,100,6,100.0,
2021-01-02 20:00,,6 (5.7),ENE 6,100,6,100.1,
2021-01-02 21:00,,5 (5.1),NE 7,99,5,100.1,
2021-01-02 22:00,,5 (4.7),SE 3,100,5,100.2,
2021-01-02 23:00,,4 (4.4),calm,99,4,100.3,
2021-01-03 00:00,,4 (4.1),WNW 3,99,4,100.4,
2021-01-03 01:00,,3 (2.5),NW 3,100,3,100.4,


In [13]:
# Final frame as written to disk (merged with stored records if a file existed).
df

Unnamed: 0_level_0,Conditions,Temperature °C,Wind km/h,Relative %,Dew °C,Pressure kPa,Visibility km
Date / Time PST,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2021-01-02 16:00,,8 (7.8),ENE 2,99,8,99.9,
2021-01-02 17:00,,7 (7.3),E 2,100,7,100.0,
2021-01-02 18:00,,7 (6.6),N 3,100,7,100.0,
2021-01-02 19:00,,6 (6.0),E 2,100,6,100.0,
2021-01-02 20:00,,6 (5.7),ENE 6,100,6,100.1,
2021-01-02 21:00,,5 (5.1),NE 7,99,5,100.1,
2021-01-02 22:00,,5 (4.7),SE 3,100,5,100.2,
2021-01-02 23:00,,4 (4.4),calm,99,4,100.3,
2021-01-03 00:00,,4 (4.1),WNW 3,99,4,100.4,
2021-01-03 01:00,,3 (2.5),NW 3,100,3,100.4,
