In [1]:
# from mpl_toolkits.basemap import Basemap
from google.cloud import bigquery
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
import pandas as pd
import numpy as np
import shutil

### Get the weather stations closest to Montreal

In [2]:
# Rank every station in the public GHCN-D station table by its distance from a
# fixed reference point (longitude -73.573570, latitude 45.522490) and keep the
# 20 nearest. Despite the `dist_ms` alias, ST_DISTANCE reports meters.
NEAREST_STATIONS_SQL = """
    SELECT
          name, id,
          state,
          latitude,
          longitude,
          ST_DISTANCE(
              ST_GEOGPOINT(-73.573570, 45.522490), 
              ST_GEOGPOINT(longitude, latitude)
          ) AS dist_ms 
        FROM
          `bigquery-public-data.ghcn_d.ghcnd_stations`
        ORDER BY
          dist_ms ASC
        LIMIT
          20
    """

# The client is kept at notebook level because later cells reuse it.
client = bigquery.Client()
stations = client.query(NEAREST_STATIONS_SQL)
# .result() blocks until the query job finishes and returns the row iterator.
results = stations.result()

In [3]:
# One line per station: name, id, state, latitude, longitude and distance,
# separated by " : ".
for station in results:
    fields = (
        station.name,
        station.id,
        station.state,
        station.latitude,
        station.longitude,
        station.dist_ms,
    )
    print(" : ".join(str(field) for field in fields))

MONTREAL LAFONTAINE            : CA007025267 : QC : 45.5167 : -73.5667 : 837.2523104850238
MONTRéAL 1.8 S - JEANNE-MANCE  : CA1QC000038 : QC : 45.5128 : -73.5641 : 1305.8981134240278
MONTRéAL 1.9 S                 : CA1QC000041 : QC : 45.5121 : -73.5601 : 1560.8342656258706
MONTREAL MCGILL                : CA007025280 : QC : 45.5 : -73.5833 : 2613.183850558025
MCTAVISH                       : CA007024745 : QC : 45.5 : -73.5833 : 2613.183850558025
MONTREAL JEAN BREBEUF          : CA007025260 : QC : 45.5 : -73.6167 : 4189.121028609436
MONT-ROYAL 1.6 NNE             : CA1QC000030 : QC : 45.5314 : -73.6386 : 5161.833885584858
MONTREAL JAR BOT               : CA007025257 : QC : 45.5667 : -73.55 : 5247.438519912239
ST LAURENT (COLLEGE)           : CA007027440 : QC : 45.5167 : -73.6667 : 7284.316511379227
MONTREAL ICE CONTROL           : CA007025245 : QC : 45.4667 : -73.5 : 8447.955917600664
LAVAL DES RAPIDES              : CA007024256 : QC : 45.5333 : -73.7 : 9921.856986002194
LA SALLE      

### Get the weather features from the weather stations

- Time period: 2018 to 2020 (no data available for 2017)
- prcp: precipitation in millimeters
- tmin: minimum temperature in degrees Celsius
- tmax: maximum temperature in degrees Celsius
- haswx: whether any significant weather event (such as fog, hail, or rain) was recorded that day

In [33]:
def get_weather(year, station_id='CA007022250'):
    """Fetch one year of daily weather observations for a single GHCN-D station.

    Runs a query against the yearly public table
    ``bigquery-public-data.ghcn_d.ghcnd_<year>`` using the notebook-level
    ``client``. The source table stores one row per (date, element); the inner
    SELECT spreads the elements of interest into separate columns and the
    outer ``GROUP BY date`` collapses them into one row per day.

    Args:
        year: Year selecting the yearly table. It is interpolated into the
            table name (table names cannot be bound as query parameters), so
            it must come from trusted code.
        station_id: Station identifier to filter on. Defaults to the station
            this notebook originally hard-coded, so existing calls behave the
            same.
            NOTE(review): that default ID is not among the nearest stations
            visible in this notebook's saved output - confirm it has data for
            the requested years.

    Returns:
        The row iterator returned by ``QueryJob.result()``, ordered by date.
        Each row exposes:
            date  - observation date
            prcp  - PRCP value divided by 10 (millimeters per the notebook notes)
            tmin  - TMIN value divided by 10 (degrees Celsius per the notebook notes)
            tmax  - TMAX value divided by 10 (degrees Celsius per the notebook notes)
            haswx - the string 'True' if any element starting with 'WT' was
                    reported that day, otherwise the string 'False'
    """
    # STARTS_WITH replaces SUBSTR(element, 0, 2): SUBSTR positions are 1-based,
    # so the 0 only worked because BigQuery treats position 0 as 1.
    query = """
          SELECT
            date,
            MAX(prcp) AS prcp,
            MAX(tmin) AS tmin,
            MAX(tmax) AS tmax,
            IF(MAX(haswx) = 'True', 'True', 'False') AS haswx
          FROM (
            SELECT
              date,
              IF (element = 'PRCP', value/10, NULL) AS prcp,
              IF (element = 'TMIN', value/10, NULL) AS tmin,
              IF (element = 'TMAX', value/10, NULL) AS tmax,
              IF (STARTS_WITH(element, 'WT'), 'True', NULL) AS haswx
            FROM
              `bigquery-public-data.ghcn_d.ghcnd_{}`
            WHERE
              id = @station_id
              AND qflag IS NULL)
          GROUP BY
            date
          ORDER BY
            date ASC
        """.format(year)

    # Bind the station ID as a query parameter rather than splicing it into
    # the SQL text, so any caller-supplied value is treated strictly as data.
    job_config = bigquery.QueryJobConfig(
        query_parameters=[
            bigquery.ScalarQueryParameter("station_id", "STRING", station_id),
        ]
    )
    weather = client.query(query, job_config=job_config)

    # .result() blocks until the job completes.
    results = weather.result()
    return results

In [34]:
# Columns returned by get_weather(), in the order they should appear in the frame.
WEATHER_COLUMNS = ('date', 'prcp', 'tmin', 'tmax', 'haswx')

# Accumulate each column as a flat list across all requested years.
data = {column: [] for column in WEATHER_COLUMNS}

for year in (2018, 2019, 2020):
    for daily_row in get_weather(year):
        for column in WEATHER_COLUMNS:
            data[column].append(getattr(daily_row, column))

df = pd.DataFrame(data=data)

# Quick sanity check: how many days did / did not report a weather event.
df['haswx'].value_counts()

Series([], Name: haswx, dtype: int64)

In [97]:
# Destination bucket and folder for the exported dataset.
gs_uri = 'gs://videotron-ai-bucket/'
dataset_path = gs_uri + 'dataset/'

# Write the daily weather table as CSV, without the DataFrame index column.
weather_csv_uri = dataset_path + 'daily_weather_montreal.csv'
df.to_csv(weather_csv_uri, index=False)