# Jupyter Notebook Purpose
- Run all code cells and you will have the most current data from each resource (through a series of HTTP requests)
    - as long as the links remain active in the future
- The data will be saved in its original file format, and any zipped archives will also be extracted for convenience
- Also the data will be processed in later Jupyter Notebooks
    - they are numbered in the order that they should be run (ex. 01, 02, etc.)

# Table of Contents

## 1. [Python Dependencies](#1)
___
## 2. [Folder Creation](#2)
___
## 3. [Toronto Fire Services Basic Incident Details (2011 - 2018)](#3)
### a. [Data Download / Saving / Unzipping](#3a)
### b. [Data Munging / Wrangling](#3b)
### c. [Initial DataFrame](#3c)
___
## 4. [Toronto Historical Weather](#4)
### a. [Data Download / Saving / Unzipping](#4a)
### b. [Data Munging / Wrangling](#4b)
### c. [Initial DataFrame](#4c)
___
## 5. [Toronto Fire Services Station Locations](#5)
### a. [Data Download / Saving / Unzipping](#5a)
___
## 6. [Toronto Fire Hydrants](#6)
### a. [Data Download / Saving / Unzipping](#6a)
___

# 1
# Python Dependencies

In [1]:
# Python Modules for Miscellaneous reasons
from zipfile import ZipFile  # to read and write to zipped folders
import requests  # simple HTTP library for Python
import os        # portable way to use operating system functionalities
import io        # Tool for working with streams (Input/Ouput data)
import datetime  # python classes for manipulating dates and times
import dateutil  # powerful extensions to standard datetime Python module
import time      # used for time.sleep() to delay the HTTP requests ever so slightly
import re        # used for Python regex library

In [2]:
# DATA ANALYSIS / VISUALIZATION Python Dependencies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# 2
# Creation of Folders
- these folders will be used to store `RAW_ZIPPED/`, `RAW_UNZIPPED/`, `PROCESSED_ZIPPED/`, `PROCESSED_UNZIPPED/` data files
- any of these folders that does not already exist will be created for you

In [3]:
# Every data folder lives directly under the current working directory
CWD_PATH = os.getcwd()

# One top-level folder per (raw | processed) x (zipped | unzipped) combination
(
    RAW_ZIPPED_DIRECTORY,
    RAW_UNZIPPED_DIRECTORY,
    PROCESSED_ZIPPED_DIRECTORY,
    PROCESSED_UNZIPPED_DIRECTORY,
) = (
    os.path.join(CWD_PATH, folder_name)
    for folder_name in ("RAW_ZIPPED", "RAW_UNZIPPED", "PROCESSED_ZIPPED", "PROCESSED_UNZIPPED")
)

In [1]:
# Make sure each of the 4 top-level data folders is present:
# report the ones that already exist and create the ones that are missing
for directory in (
    RAW_ZIPPED_DIRECTORY,
    RAW_UNZIPPED_DIRECTORY,
    PROCESSED_ZIPPED_DIRECTORY,
    PROCESSED_UNZIPPED_DIRECTORY,
):
    if os.path.isdir(directory):
        print(f"{directory} exists")
    else:
        os.makedirs(directory)
        print(f"{directory} created")

# 3
# Toronto Fire Services Basic Incident Details (2011 - 2018)
- Open Data Toronto [link](https://open.toronto.ca/dataset/fire-services-basic-incident-details/)

# 3a
# Data Download / Saving / Unzipping

In [3]:
# Name of the sub-folder that holds everything related to fire incidents
FIRE_INCIDENTS = "FIRE_INCIDENTS"

# One fire-incidents sub-folder inside each of the 4 top-level data folders
FIRE_RAW_ZIPPED_DIRECTORY = os.path.join(RAW_ZIPPED_DIRECTORY, FIRE_INCIDENTS)
FIRE_RAW_UNZIPPED_DIRECTORY = os.path.join(RAW_UNZIPPED_DIRECTORY, FIRE_INCIDENTS)
FIRE_PROCESSED_ZIPPED_DIRECTORY = os.path.join(PROCESSED_ZIPPED_DIRECTORY, FIRE_INCIDENTS)
FIRE_PROCESSED_UNZIPPED_DIRECTORY = os.path.join(PROCESSED_UNZIPPED_DIRECTORY, FIRE_INCIDENTS)

# Report the sub-folders that already exist and create the ones that are missing
for directory in (
    FIRE_RAW_ZIPPED_DIRECTORY,
    FIRE_RAW_UNZIPPED_DIRECTORY,
    FIRE_PROCESSED_ZIPPED_DIRECTORY,
    FIRE_PROCESSED_UNZIPPED_DIRECTORY,
):
    if os.path.isdir(directory):
        print(f"{directory} exists")
    else:
        os.makedirs(directory)
        print(f"{directory} created")

In [6]:
# Make a request for the fire incidents zipped folder
URL = r"https://ckan0.cf.opendata.inter.prod-toronto.ca/download_resource/64038657-6437-4a97-b6f7-b4caf135249f"

# location where the zipped folder will be saved
ZIPPED_FOLDER = os.path.join(FIRE_RAW_ZIPPED_DIRECTORY, "2011-2018-fire-incidents.zip")

# this might take a while to download the zipped file (~50 MB in size)
# the timeout stops the cell from hanging forever if the server stops responding
with requests.get(URL, stream=True, timeout=60) as response:

    # stop here on a 4xx/5xx answer; otherwise the error page would be
    # written to disk as if it were the zip archive
    response.raise_for_status()

    # save the zipped folder, streaming it in 1 MiB pieces
    # (128-byte pieces mean hundreds of thousands of tiny writes)
    with open(ZIPPED_FOLDER, "wb") as file:
        for chunk in response.iter_content(chunk_size=1024 * 1024):
            file.write(chunk)

In [7]:
# Unpack every member of the downloaded archive
# into the raw / unzipped fire-incidents folder
with ZipFile(ZIPPED_FOLDER, mode="r") as unzipped_files:
    unzipped_files.extractall(path=FIRE_RAW_UNZIPPED_DIRECTORY)

# 3b
# Data Munging / Wrangling

In [8]:
# Walk the unzipped fire-incidents folder once and collect:
#   DATA_DICTIONARY_PATH - path of the .xlsx data dictionary (metadata);
#                          if several .xlsx files exist the last one visited wins
#   CSV_FILE_PATHS       - list with the path of every raw incident .csv file
DATA_DICTIONARY_PATH = None
CSV_FILE_PATHS = []
for dirpath, subdirs, files in os.walk(FIRE_RAW_UNZIPPED_DIRECTORY):
    # Prune Jupyter's checkpoint folder in place so os.walk never descends into it.
    # This must be an equality test: `d not in '.ipynb_checkpoints'` is a substring
    # test on a string and would also prune folders named e.g. "ipynb".
    subdirs[:] = [d for d in subdirs if d != '.ipynb_checkpoints']
    for x in files:
        # splitext copes with names that contain no dot or several dots, where
        # x.split(".")[1] raises IndexError or looks at the wrong piece
        extension = os.path.splitext(x)[1]
        if extension == ".xlsx":
            DATA_DICTIONARY_PATH = os.path.join(dirpath, x)
        elif extension == ".csv":
            # this file is deliberately left out of the raw incident files
            if x != "2011-2018-TORONTO_FIRE_INCIDENTS_WEATHER.csv":
                CSV_FILE_PATHS.append(os.path.join(dirpath, x))
CSV_FILE_PATHS.sort()

# Fail here with a clear message instead of with a confusing error
# in the later cell that reads the data dictionary
if DATA_DICTIONARY_PATH is None:
    raise FileNotFoundError(
        f"no .xlsx data dictionary found under {FIRE_RAW_UNZIPPED_DIRECTORY}"
    )

## Work with the Toronto Fire Services Incident CSV Files

In [9]:
# Load every incident .csv file into its own DataFrame.
# The 3 timestamp columns are parsed as datetimes while reading.
# Each frame is keyed by the text after the last backslash of its path
# (i.e. the bare file name when the path uses Windows separators).
df_dict = {
    csv_path.split("\\")[-1]: pd.read_csv(
        csv_path,
        parse_dates=["TFS Alarm Time", "TFS Arrival Time", "Last TFS Unit Clear Time"],
    )
    for csv_path in CSV_FILE_PATHS
}

In [10]:
# Stack the per-file frames on top of each other, in the order of CSV_FILE_PATHS.
# Each frame keeps its own row labels, so labels repeat across files.
frames_in_path_order = [df_dict[csv_path.split("\\")[-1]] for csv_path in CSV_FILE_PATHS]
df = pd.concat(frames_in_path_order)
df.shape

(975175, 15)

In [11]:
# Create a Datetime column holding the alarm time floored to the start of its day
# (the time-of-day part becomes 00:00:00). This will help out later on.
# 'D' is the canonical pandas alias for "calendar day"; the lowercase 'd'
# spelling is deprecated in recent pandas releases.
df["Datetime"] = df["TFS Alarm Time"].dt.floor('D')
df["Datetime"]

0        2011-01-01
1        2011-01-01
2        2011-01-01
3        2011-01-01
4        2011-01-01
            ...    
133711   2018-12-31
133712   2018-12-31
133713   2018-12-31
133714   2018-12-31
133715   2018-12-31
Name: Datetime, Length: 975175, dtype: datetime64[ns]

# 3c
# Initial DataFrame

In [12]:
# Load the data dictionary (metadata) of the Fire Incident Data:
# every row of it describes one column of the incident files
df_meta = pd.read_excel(DATA_DICTIONARY_PATH)

# Widen the displayed column width to 400 characters
# so that long descriptions are not cut off
pd.set_option('max_colwidth', 400)
df_meta

Unnamed: 0,Column,Description,Data Source
0,Incident Number,TFS incident number,TFS RMS System
1,Initial CAD Event Type,"First event type in CAD system of this incident. \nIn situations where the initial CAD event type is medical OR the final incident type is medical, the field is set to medical",TFS RMS System
2,Initial CAD Event Call Type,"First call type in CAD system of this incident. Call type is a group of event types.\nIn situations where the initial CAD event type is medical OR the final incident type is medical, the field is set to medical",TFS RMS System
3,Final Incident Type,"Final incident type.\nIn situations where the initial CAD event type is medical OR the final incident type is medical, the field is set to medical",TFS RMS System
4,Event Alarm Level,Alarm level of the event,TFS RMS System
5,Call Source,"Source of the call to TFS (e.g., 911 call, Alarm, referal agency, etc.)",TFS RMS System
6,Incident Station Area,TFS Station area where the incident occurred,TFS CAD System
7,Incident Ward,"Ward where the incident occurred, when available",TFS CAD System
8,LATITUDE,Latitude of nearest major or minor intersection in the ward of the incident. \nFor medical calls this data is not provided.,"City Of Toronto Open Data, Intersection File"
9,LONGITUDE,"Longitude of nearest major or minor intersection in the ward of the incident. \nIn situations where the initial CAD event type is medical OR the final incident type is medical, the field is set to the Forward Sortation Area (FSA) where the event occured","City Of Toronto Open Data, Intersection File"


In [13]:
# Put the column-width display option back to the pandas default
# so that the wider setting does not affect the tables shown below
pd.reset_option('max_colwidth')

In [14]:
# Save the combined incidents table into the processed / unzipped fire folder;
# the row labels are not written to the file
df.to_csv(
    os.path.join(FIRE_PROCESSED_UNZIPPED_DIRECTORY, "2011-2018_Basic_Incident_Details.csv"),
    index=False,
)

# Preview 10 randomly chosen rows of the Toronto Fire Incidents DataFrame
df.sample(n=10)

Unnamed: 0,Incident Number,Initial CAD Event Type,Initial CAD Event Call Type,Final Incident Type,Event Alarm Level,Call Source,Incident Station Area,Incident Ward,LATITUDE,Longitude,Intersection,TFS Alarm Time,TFS Arrival Time,Last TFS Unit Clear Time,Persons Rescued,Datetime
38877,F18040391,Medical,Medical,89 - Other Medical,0,03 - From Ambulance,145.0,8,0.0,0.0,M6A,2018-04-20 19:08:56,2018-04-20 19:15:21,2018-04-20 19:29:08,0.0,2018-04-20
79897,F15083416,FAHRD - Alarm Highrise Residential Downtown,Emergency Fire,31 - Alarm Equipment - Malfunction,0,05 - Telephone from Monitoring Agency,333.0,13,43.653274,-79.369636,Richmond St E / Stonecutters Lane,2015-09-06 05:36:07,2015-09-06 05:39:20,2015-09-06 06:04:34,0.0,2015-09-06
109173,F11128412,Medical,Medical,89 - Other Medical,0,03 - From Ambulance,223.0,20,43.742488,-79.222108,Eglinton Ave E / Beachell St,2011-09-30 15:37:39,2011-09-30 15:44:41,2011-09-30 15:45:21,0.0,2011-09-30
95407,F11099162,Medical,Medical,89 - Other Medical,1,03 - From Ambulance,314.0,13,43.655774,-79.380675,Yonge St / Dundas Sq,2011-08-27 19:53:42,2011-08-27 19:58:08,2011-08-27 20:08:34,0.0,2011-08-27
102769,F18107093,FACC - Fire Alarm - Check Call,Other Emergency Events,34 - Human - Perceived Emergency,0,02 - Telephone from Civlian (other than 911),426.0,4,43.641484,-79.449511,Sunnyside Ave / Pearson Ave,2018-10-05 10:49:11,2018-10-05 10:57:43,2018-10-05 11:14:38,0.0,2018-10-05
100757,F15105151,Medical,Medical,89 - Other Medical,0,03 - From Ambulance,344.0,11,43.674952,-79.399903,Bedford Rd / Pears Ave,2015-11-11 20:05:20,2015-11-11 20:09:11,2015-11-11 20:18:09,0.0,2015-11-11
103770,F11122576,Medical,Medical,89 - Other Medical,0,03 - From Ambulance,224.0,19,43.687577,-79.31627,Glebeholme Blvd / Woodmount Ave,2011-09-17 11:41:11,2011-09-17 11:44:57,2011-09-17 11:56:51,0.0,2011-09-17
11584,F11011993,Medical,Medical,89 - Other Medical,1,03 - From Ambulance,341.0,8,43.706983,-79.45318,Dufferin St / Glencairn Ave,2011-01-29 13:32:22,2011-01-29 13:35:47,2011-01-29 13:53:15,0.0,2011-01-29
68013,F12071299,Medical,Medical,89 - Other Medical,0,03 - From Ambulance,333.0,13,43.648042,-79.371741,The Esplanade / Market St,2012-07-03 05:29:18,2012-07-03 05:33:21,2012-07-03 05:50:05,0.0,2012-07-03
116243,F11135733,Medical,Medical,89 - Other Medical,0,03 - From Ambulance,415.0,1,43.716606,-79.564059,Redwater Dr / Tofield Cres,2011-10-17 18:09:30,2011-10-17 18:13:15,2011-10-17 18:29:56,0.0,2011-10-17


# 4
# Toronto Historical Weather from the Government of Canada
- [link](https://climate.weather.gc.ca/)
- Documentation for links can be found [here](ftp://client_climate@ftp.tor.ec.gc.ca/Pub/)
    - more specifically [here](ftp://client_climate@ftp.tor.ec.gc.ca/Pub/Get_More_Data_Plus_de_donnees/Readme.txt)
    - go to this [folder](ftp://client_climate@ftp.tor.ec.gc.ca/Pub/Get_More_Data_Plus_de_donnees/) for more info
- I will be using Python Requests library to generate HTTP requests to obtain the data

# 4a
# Data Download / Saving / Unzipping

In [4]:
# Name of the sub-folder that holds everything related to the Toronto weather data
TORONTO_WEATHER = "TORONTO_WEATHER"

# Weather sub-folder inside the raw / unzipped and the processed / unzipped folders
WEATHER_RAW_UNZIPPED_DIRECTORY = os.path.join(RAW_UNZIPPED_DIRECTORY, TORONTO_WEATHER)
WEATHER_PROCESSED_UNZIPPED_DIRECTORY = os.path.join(PROCESSED_UNZIPPED_DIRECTORY, TORONTO_WEATHER)

# Report the sub-folders that already exist and create the ones that are missing
for directory in (WEATHER_RAW_UNZIPPED_DIRECTORY, WEATHER_PROCESSED_UNZIPPED_DIRECTORY):
    if os.path.isdir(directory):
        print(f"{directory} exists")
    else:
        os.makedirs(directory)
        print(f"{directory} created")

## Canada Historical Weather Station Inventory
- needs to be queried to determine which stations will output Historical Toronto Weather Data

In [16]:
# Location of the weather station inventory list
URL_Stations = r"ftp://client_climate@ftp.tor.ec.gc.ca/Pub/Get_More_Data_Plus_de_donnees/Station%20Inventory%20EN.csv"

# File (inside the raw weather folder) where a local copy of the inventory is kept
STATION_WEATHER_RAW_UNZIPPED_DIRECTORY = os.path.join(WEATHER_RAW_UNZIPPED_DIRECTORY, "Canada_Weather_Station_Inventory.csv")

# Read the inventory straight from the URL, skipping the first 3 lines of the file
df_stations = pd.read_csv(URL_Stations, skiprows=3)

# Save the local copy (without the row labels) and display the inventory
df_stations.to_csv(STATION_WEATHER_RAW_UNZIPPED_DIRECTORY, index=False)
df_stations

Unnamed: 0,Name,Province,Climate ID,Station ID,WMO ID,TC ID,Latitude (Decimal Degrees),Longitude (Decimal Degrees),Latitude,Longitude,Elevation (m),First Year,Last Year,HLY First Year,HLY Last Year,DLY First Year,DLY Last Year,MLY First Year,MLY Last Year
0,ACTIVE PASS,BRITISH COLUMBIA,1010066,14,,,48.87,-123.28,485200000,-1231700000,4.0,1984,1996,,,1984.0,1996.0,1984.0,1996.0
1,ALBERT HEAD,BRITISH COLUMBIA,1010235,15,,,48.40,-123.48,482400000,-1232900000,17.0,1971,1995,,,1971.0,1995.0,1971.0,1995.0
2,BAMBERTON OCEAN CEMENT,BRITISH COLUMBIA,1010595,16,,,48.58,-123.52,483500000,-1233100000,85.3,1961,1980,,,1961.0,1980.0,1961.0,1980.0
3,BEAR CREEK,BRITISH COLUMBIA,1010720,17,,,48.50,-124.00,483000000,-1240000000,350.5,1910,1971,,,1910.0,1971.0,1910.0,1971.0
4,BEAVER LAKE,BRITISH COLUMBIA,1010774,18,,,48.50,-123.35,483000000,-1232100000,61.0,1894,1952,,,1894.0,1952.0,1894.0,1952.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8761,WEST ST MODESTE,NEWFOUNDLAND,8504216,6803,,,51.60,-56.70,513600000,-564200000,12.2,1990,2002,,,1990.0,2002.0,1990.0,2002.0
8762,WEST ST MODESTE,NEWFOUNDLAND,8504217,6804,,,51.58,-56.72,513500000,-564300000,15.2,1984,1987,,,1984.0,1987.0,1984.0,1987.0
8763,CHURCHILL FALLS,NEWFOUNDLAND,850A131,6940,,,53.53,-63.97,533200000,-635800000,488.5,1993,1998,,,1993.0,1998.0,1993.0,1998.0
8764,MAKKOVIK (AUT),NEWFOUNDLAND,850B5HR,9025,,,55.08,-59.17,550500000,-591000000,71.3,1985,1986,1985.0,1986.0,,,,


In [17]:
# Keep only the stations inside the bounding box used for the GTA area:
# latitude from 43.5 to 44 and longitude from -79.6 to -79 (bounds included)
station_latitude = df_stations["Latitude (Decimal Degrees)"]
station_longitude = df_stations["Longitude (Decimal Degrees)"]
df_latitude_longitude = df_stations.loc[
    station_latitude.between(43.5, 44) & station_longitude.between(-79.6, -79)
]
df_latitude_longitude

Unnamed: 0,Name,Province,Climate ID,Station ID,WMO ID,TC ID,Latitude (Decimal Degrees),Longitude (Decimal Degrees),Latitude,Longitude,Elevation (m),First Year,Last Year,HLY First Year,HLY Last Year,DLY First Year,DLY Last Year,MLY First Year,MLY Last Year
6289,AURORA,ONTARIO,6150395,4855,,,43.95,-79.40,435700000,-792400000,270.1,1883,1919,,,1883.0,1919.0,1883.0,1919.0
6291,AURORA PAYMENT,ONTARIO,6150397,4857,,,43.98,-79.47,435900000,-792800000,281.9,1971,1975,,,1971.0,1975.0,1971.0,1975.0
6315,BRODDYTOWN,ONTARIO,6150997,4879,,,43.62,-79.60,433700000,-793600000,144.8,1951,1956,,,1951.0,1956.0,1951.0,1956.0
6317,BROUGHAM,ONTARIO,6151000,4880,,,43.92,-79.12,435500000,-790700000,198.1,1965,1975,,,1965.0,1975.0,1965.0,1975.0
6326,BURNHAMTHORPE,ONTARIO,6151065,4888,,,43.62,-79.60,433700000,-793600000,144.8,1951,1955,,,1951.0,1955.0,1951.0,1955.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6662,TORONTO TORBARRIE,ONTARIO,615H779,4842,,,43.73,-79.52,434400000,-793100000,139.9,1983,1984,,,1983.0,1984.0,1983.0,1984.0
6663,TORONTO YORK MILLS,ONTARIO,615HHDF,4840,,,43.75,-79.38,434500000,-792300000,153.3,1973,1987,,,1973.0,1987.0,1973.0,1987.0
6664,TORONTO BUTTONVILLE A,ONTARIO,615HMAK,4841,71639.0,YKZ,43.86,-79.37,435144000,-792212000,198.1,1986,2015,1986.0,2015.0,1986.0,2015.0,1986.0,2015.0
6665,BRUCES MILL,ONTARIO,615J02G,4843,,,44.00,-79.33,440000000,-792000000,358.1,1968,1974,,,1968.0,1974.0,1969.0,1974.0


In [18]:
# Keep the stations whose name contains "toronto" and whose province
# contains "ontario"; both comparisons ignore upper / lower case
name_has_toronto = df_latitude_longitude["Name"].str.lower().str.contains("toronto", regex=False)
province_has_ontario = df_latitude_longitude["Province"].str.lower().str.contains("ontario", regex=False)
df_toronto_stations = df_latitude_longitude.loc[name_has_toronto & province_has_ontario]
df_toronto_stations

Unnamed: 0,Name,Province,Climate ID,Station ID,WMO ID,TC ID,Latitude (Decimal Degrees),Longitude (Decimal Degrees),Latitude,Longitude,Elevation (m),First Year,Last Year,HLY First Year,HLY Last Year,DLY First Year,DLY Last Year,MLY First Year,MLY Last Year
6478,PA TORONTO NORTH YORK MOTORS,ONTARIO,6156168,52678,,L1D,43.72,-79.47,434307100,-792807400,186.5,2014,2015,2014.0,2015.0,,,,
6482,PA SCARBOROUGH TORONTO HUNT,ONTARIO,6156172,52641,,L2A,43.68,-79.27,434100000,-791614900,133.5,2014,2015,2014.0,2015.0,,,,
6485,PA TORONTO HYUNDAI,ONTARIO,6156177,52640,,L1C,43.70,-79.45,434156200,-792705700,186.5,2014,2015,2014.0,2015.0,,,,
6544,TORONTO,ONTARIO,6158350,5051,71266.0,,43.67,-79.40,434000000,-792400000,112.5,1840,2017,1953.0,1969.0,1840.0,2017.0,1840.0,2006.0
6545,TORONTO SOLAR RADIATION,ONTARIO,6158352,41863,71626.0,TRF,43.67,-79.40,434000000,-792400000,166.0,2018,2018,,,2018.0,2018.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6637,TORONTO CASTLEMERE,ONTARIO,6158M1K,5040,,,43.78,-79.32,434700000,-791900000,184.4,1971,1980,,,1971.0,1980.0,1971.0,1980.0
6662,TORONTO TORBARRIE,ONTARIO,615H779,4842,,,43.73,-79.52,434400000,-793100000,139.9,1983,1984,,,1983.0,1984.0,1983.0,1984.0
6663,TORONTO YORK MILLS,ONTARIO,615HHDF,4840,,,43.75,-79.38,434500000,-792300000,153.3,1973,1987,,,1973.0,1987.0,1973.0,1987.0
6664,TORONTO BUTTONVILLE A,ONTARIO,615HMAK,4841,71639.0,YKZ,43.86,-79.37,435144000,-792212000,198.1,1986,2015,1986.0,2015.0,1986.0,2015.0,1986.0,2015.0


In [19]:
# Keep the stations whose records start in 2010 or earlier
# and whose last year on record is 2020 or later
started_by_2010 = df_toronto_stations["First Year"] <= 2010
reporting_in_2020 = df_toronto_stations["Last Year"] >= 2020
df_year_stations = df_toronto_stations.loc[started_by_2010 & reporting_in_2020]
df_year_stations

Unnamed: 0,Name,Province,Climate ID,Station ID,WMO ID,TC ID,Latitude (Decimal Degrees),Longitude (Decimal Degrees),Latitude,Longitude,Elevation (m),First Year,Last Year,HLY First Year,HLY Last Year,DLY First Year,DLY Last Year,MLY First Year,MLY Last Year
6546,TORONTO CITY,ONTARIO,6158355,31688,71508.0,XTO,43.67,-79.4,434000000,-792400000,112.5,2002,2020,2002.0,2020.0,2002.0,2020.0,2003.0,2006.0
6547,TORONTO CITY CENTRE,ONTARIO,6158359,48549,71265.0,YTZ,43.63,-79.4,433739000,-792346000,76.8,2009,2020,2009.0,2020.0,2010.0,2020.0,,
6671,TORONTO NORTH YORK,ONTARIO,615S001,26953,,,43.78,-79.47,434648072,-792804038,187.0,1994,2020,,,1994.0,2020.0,1994.0,2006.0


In [20]:
# Build one (station name, station id, year) tuple per station and per year
# from 2010 to 2020; each tuple later becomes one HTTP request for a .CSV file.
# The tuples are grouped by station, with the years in ascending order.
years = list(range(2010, 2021))
station_name_array = df_year_stations["Name"].values
station_id_array = df_year_stations["Station ID"].values
station_years = [
    (station_name, station_id, year)
    for station_name, station_id in zip(station_name_array, station_id_array)
    for year in years
]

In [21]:
# this is how we will index for the different station and years:
# station_years holds 11 consecutive entries (2010 - 2020) per station,
# so entry 0 is the first station's first year and 1:11 are its remaining years
# (only the value of the last expression is displayed by the notebook)
station_years[0]
station_years[1:11]

[('TORONTO CITY', 31688, 2011),
 ('TORONTO CITY', 31688, 2012),
 ('TORONTO CITY', 31688, 2013),
 ('TORONTO CITY', 31688, 2014),
 ('TORONTO CITY', 31688, 2015),
 ('TORONTO CITY', 31688, 2016),
 ('TORONTO CITY', 31688, 2017),
 ('TORONTO CITY', 31688, 2018),
 ('TORONTO CITY', 31688, 2019),
 ('TORONTO CITY', 31688, 2020)]

In [22]:
# second station: entry 11 is its first year and 12:22 are its remaining years
# (only the value of the last expression is displayed by the notebook)
station_years[11]
station_years[12:22]

[('TORONTO CITY CENTRE', 48549, 2011),
 ('TORONTO CITY CENTRE', 48549, 2012),
 ('TORONTO CITY CENTRE', 48549, 2013),
 ('TORONTO CITY CENTRE', 48549, 2014),
 ('TORONTO CITY CENTRE', 48549, 2015),
 ('TORONTO CITY CENTRE', 48549, 2016),
 ('TORONTO CITY CENTRE', 48549, 2017),
 ('TORONTO CITY CENTRE', 48549, 2018),
 ('TORONTO CITY CENTRE', 48549, 2019),
 ('TORONTO CITY CENTRE', 48549, 2020)]

In [23]:
# third station: entry 22 is its first year and 23: are its remaining years
# (only the value of the last expression is displayed by the notebook)
station_years[22]
station_years[23:]

[('TORONTO NORTH YORK', 26953, 2011),
 ('TORONTO NORTH YORK', 26953, 2012),
 ('TORONTO NORTH YORK', 26953, 2013),
 ('TORONTO NORTH YORK', 26953, 2014),
 ('TORONTO NORTH YORK', 26953, 2015),
 ('TORONTO NORTH YORK', 26953, 2016),
 ('TORONTO NORTH YORK', 26953, 2017),
 ('TORONTO NORTH YORK', 26953, 2018),
 ('TORONTO NORTH YORK', 26953, 2019),
 ('TORONTO NORTH YORK', 26953, 2020)]

In [24]:
# dict that stores all the dataframes, keyed by (station name, station id, year)
# NOTE: the names df_dict, df and URL are also used by the fire-incidents cells
# above, so running this cell overwrites those variables
df_dict = {}

# a single session is shared by all requests so that the connection can be reused
with requests.Session() as s:
    for name, a, b in station_years:

        # generated URL for Toronto Historical Weather Data
        # (a is the station id, b is the year)
        URL = (f"http://climate.weather.gc.ca/climate_data/bulk_data_e.html?"+
               f"format=csv&stationID={a}&Year={b}&Month=1&Day=14&timeframe=2&submit= Download+Data")

        # download the file; the timeout stops the loop from hanging forever
        download = s.get(URL, timeout=60)
        # stop on a 4xx/5xx answer instead of parsing an error page as CSV
        download.raise_for_status()
        # decode it to utf-8 format
        decode = download.content.decode("utf-8")

        # file path is too long for windows so lets
        # just store it in a buffer
        with io.StringIO(decode) as f:
            # read it into a dataframe
            df = pd.read_csv(f)

        # place dataframe in a dictionary with tuples as keys
        df_dict[(name, a, b)] = df

        # a little delay between two consecutive requests
        time.sleep(0.02)

In [25]:
# lets see if all the keys are present:
# one (station name, station id, year) tuple is expected per downloaded file
df_dict.keys()

dict_keys([('TORONTO CITY', 31688, 2010), ('TORONTO CITY', 31688, 2011), ('TORONTO CITY', 31688, 2012), ('TORONTO CITY', 31688, 2013), ('TORONTO CITY', 31688, 2014), ('TORONTO CITY', 31688, 2015), ('TORONTO CITY', 31688, 2016), ('TORONTO CITY', 31688, 2017), ('TORONTO CITY', 31688, 2018), ('TORONTO CITY', 31688, 2019), ('TORONTO CITY', 31688, 2020), ('TORONTO CITY CENTRE', 48549, 2010), ('TORONTO CITY CENTRE', 48549, 2011), ('TORONTO CITY CENTRE', 48549, 2012), ('TORONTO CITY CENTRE', 48549, 2013), ('TORONTO CITY CENTRE', 48549, 2014), ('TORONTO CITY CENTRE', 48549, 2015), ('TORONTO CITY CENTRE', 48549, 2016), ('TORONTO CITY CENTRE', 48549, 2017), ('TORONTO CITY CENTRE', 48549, 2018), ('TORONTO CITY CENTRE', 48549, 2019), ('TORONTO CITY CENTRE', 48549, 2020), ('TORONTO NORTH YORK', 26953, 2010), ('TORONTO NORTH YORK', 26953, 2011), ('TORONTO NORTH YORK', 26953, 2012), ('TORONTO NORTH YORK', 26953, 2013), ('TORONTO NORTH YORK', 26953, 2014), ('TORONTO NORTH YORK', 26953, 2015), ('TORONT

In [26]:
# toronto city station from 2010 - 2020
# station_years[0:11] are the 11 (station, year) keys of the first station.
# pd.concat is used because DataFrame.append was removed in pandas 2.0;
# like append, it stacks the yearly frames and keeps their own row labels.
df_toronto_city = pd.concat([df_dict[x] for x in station_years[0:11]])
TORONTO_CITY_PATH = os.path.join(WEATHER_RAW_UNZIPPED_DIRECTORY, "toronto_city_2010_2020.csv")
df_toronto_city.to_csv(TORONTO_CITY_PATH, index=False)

In [27]:
# city_centre station from 2010 - 2020
# station_years[11:22] are the 11 (station, year) keys of the second station.
# pd.concat is used because DataFrame.append was removed in pandas 2.0;
# like append, it stacks the yearly frames and keeps their own row labels.
df_city_centre = pd.concat([df_dict[x] for x in station_years[11:22]])
CITY_CENTRE_PATH = os.path.join(WEATHER_RAW_UNZIPPED_DIRECTORY, "toronto_city_centre_2010_2020.csv")
df_city_centre.to_csv(CITY_CENTRE_PATH, index=False)

In [28]:
# north york station from 2010 - 2020
# station_years[22:] are the (station, year) keys of the third station.
# pd.concat is used because DataFrame.append was removed in pandas 2.0;
# like append, it stacks the yearly frames and keeps their own row labels.
df_north_york = pd.concat([df_dict[x] for x in station_years[22:]])
NORTH_YORK_PATH = os.path.join(WEATHER_RAW_UNZIPPED_DIRECTORY, "toronto_north_york_2010_2020.csv")
df_north_york.to_csv(NORTH_YORK_PATH, index=False)

In [29]:
# Print the (rows, columns) of the 3 station frames, one per line
for station_frame in (df_toronto_city, df_city_centre, df_north_york):
    print(station_frame.shape)

(4018, 31)
(4018, 31)
(4018, 31)


# 4b
# Data Munging / Wrangling

In [30]:
# Columns removed from each of the 3 station dataframes
# (the original note describes them as mostly null)
columns_dropped_everywhere = [
    'Longitude (x)', 'Latitude (y)',
    'Station Name', 'Climate ID', 'Data Quality', 'Max Temp Flag',
    'Min Temp Flag', 'Mean Temp Flag', 'Heat Deg Days Flag',
    'Cool Deg Days Flag', 'Total Rain Flag', 'Total Snow Flag',
    'Total Precip Flag', 'Snow on Grnd Flag',
    'Dir of Max Gust Flag', 'Spd of Max Gust Flag',
    'Total Snow (cm)', 'Dir of Max Gust (10s deg)',
    'Spd of Max Gust (km/h)', 'Date/Time',
]

# The city centre station additionally loses its rain and snow-on-ground columns
columns_dropped_for_centre = columns_dropped_everywhere + ['Total Rain (mm)', 'Snow on Grnd (cm)']

df_toronto_city = df_toronto_city.drop(columns=columns_dropped_everywhere)
df_city_centre = df_city_centre.drop(columns=columns_dropped_for_centre)
df_north_york = df_north_york.drop(columns=columns_dropped_everywhere)

In [31]:
# Missing values in the rain, precipitation and snow-on-ground columns are
# replaced by 0, on the assumption that a missing reading means there was none.
# Every other column keeps its missing values.
values = {
    'Total Rain (mm)': 0,
    'Total Precip (mm)': 0,
    'Snow on Grnd (cm)': 0,
}

# apply the same replacement to all 3 dataframes
df_toronto_city, df_city_centre, df_north_york = (
    station_frame.fillna(value=values)
    for station_frame in (df_toronto_city, df_city_centre, df_north_york)
)

In [32]:
# number of missing values left in each column of the toronto city frame
df_toronto_city.isnull().sum()

Year                    0
Month                   0
Day                     0
Max Temp (°C)         222
Min Temp (°C)         200
Mean Temp (°C)        231
Heat Deg Days (°C)    231
Cool Deg Days (°C)    231
Total Rain (mm)         0
Total Precip (mm)       0
Snow on Grnd (cm)       0
dtype: int64

In [33]:
# number of missing values left in each column of the city centre frame
df_city_centre.isnull().sum()

Year                    0
Month                   0
Day                     0
Max Temp (°C)         405
Min Temp (°C)         391
Mean Temp (°C)        412
Heat Deg Days (°C)    412
Cool Deg Days (°C)    412
Total Precip (mm)       0
dtype: int64

In [34]:
# number of missing values left in each column of the north york frame
df_north_york.isnull().sum()

Year                    0
Month                   0
Day                     0
Max Temp (°C)         255
Min Temp (°C)         255
Mean Temp (°C)        255
Heat Deg Days (°C)    255
Cool Deg Days (°C)    255
Total Rain (mm)         0
Total Precip (mm)       0
Snow on Grnd (cm)       0
dtype: int64

In [35]:
# functions that build the station-specific column names of each dataframe
def suffix_x(value, suffix="_city"):
    """Return the column name `value` with `suffix` appended.

    The names "Year", "Month" and "Day" are returned unchanged.

    Parameters
    ----------
    value : str
        Original column name.
    suffix : str, default "_city"
        Text appended to every other column name.

    Returns
    -------
    str
        The unchanged name or the suffixed name.
    """
    untouched_names = ("Year", "Month", "Day")
    return value if value in untouched_names else value + suffix
    
def suffix_y(value, suffix="_centre"):
    """Return the column name `value` with `suffix` appended.

    The names "Year", "Month" and "Day" are returned unchanged.

    Parameters
    ----------
    value : str
        Original column name.
    suffix : str, default "_centre"
        Text appended to every other column name.

    Returns
    -------
    str
        The unchanged name or the suffixed name.
    """
    untouched_names = ("Year", "Month", "Day")
    return value if value in untouched_names else value + suffix
    
def suffix_z(value, suffix="_york"):
    if (value == "Year") or (value == "Month") or (value == "Day"):
        return value
    else:
        return value + suffix
    
# rename the columns of each station dataframe with its own suffix function
for station_frame, add_suffix in (
    (df_toronto_city, suffix_x),
    (df_city_centre, suffix_y),
    (df_north_york, suffix_z),
):
    station_frame.columns = station_frame.columns.map(add_suffix)

In [36]:
# inner-join the 3 station dataframes on year, month and day, so only
# dates that appear in all three are kept
date_keys = ["Year", "Month", "Day"]
df_total = (
    df_toronto_city
    .merge(df_city_centre, how="inner", on=date_keys)
    .merge(df_north_york, how="inner", on=date_keys)
)

In [37]:
# assemble a single Datetime column from the separate year / month / day columns
date_parts = {
    "year": df_total["Year"],
    "month": df_total["Month"],
    "day": df_total["Day"],
}
df_total["Datetime"] = pd.to_datetime(date_parts)

In [38]:
# Collapse the per-station readings into one value per day.
# Each entry is (new column, row-wise reducer, station columns it is computed from).
station_aggregations = [
    # temperatures: highest max, lowest min and average mean across the stations
    ("MAX_TEMP", pd.DataFrame.max,
     ['Max Temp (°C)_city', 'Max Temp (°C)_centre', 'Max Temp (°C)_york']),
    ("MIN_TEMP", pd.DataFrame.min,
     ['Min Temp (°C)_city', 'Min Temp (°C)_centre', 'Min Temp (°C)_york']),
    ("MEAN_TEMP", pd.DataFrame.mean,
     ['Mean Temp (°C)_city', 'Mean Temp (°C)_centre', 'Mean Temp (°C)_york']),
    # heating degree day and cooling degree day (heating / cooling of buildings)
    ("HDD", pd.DataFrame.mean,
     ['Heat Deg Days (°C)_city', 'Heat Deg Days (°C)_centre', 'Heat Deg Days (°C)_york']),
    ("CDD", pd.DataFrame.mean,
     ['Cool Deg Days (°C)_city', 'Cool Deg Days (°C)_centre', 'Cool Deg Days (°C)_york']),
    # total rain, total precipitation and snow on ground
    # (rain and snow are averaged over the city and york columns only)
    ("RAIN_MM", pd.DataFrame.mean,
     ['Total Rain (mm)_city', 'Total Rain (mm)_york']),
    ("PRECIP_MM", pd.DataFrame.mean,
     ['Total Precip (mm)_city', 'Total Precip (mm)_centre', 'Total Precip (mm)_york']),
    ("SNOW_CM", pd.DataFrame.mean,
     ['Snow on Grnd (cm)_city', 'Snow on Grnd (cm)_york']),
]

for new_column, reducer, station_columns in station_aggregations:
    df_total[new_column] = reducer(df_total.loc[:, station_columns], axis=1)

In [39]:
# Remove the per-station columns that were aggregated above, together with
# the Year / Month / Day columns that the Datetime column was built from.
aggregated_source_columns = [
    # Toronto City station
    'Max Temp (°C)_city', 'Min Temp (°C)_city', 'Mean Temp (°C)_city',
    'Heat Deg Days (°C)_city', 'Cool Deg Days (°C)_city',
    'Total Rain (mm)_city', 'Total Precip (mm)_city', 'Snow on Grnd (cm)_city',
    # City Centre station
    'Max Temp (°C)_centre', 'Min Temp (°C)_centre', 'Mean Temp (°C)_centre',
    'Heat Deg Days (°C)_centre', 'Cool Deg Days (°C)_centre',
    'Total Precip (mm)_centre',
    # North York station
    'Max Temp (°C)_york', 'Min Temp (°C)_york', 'Mean Temp (°C)_york',
    'Heat Deg Days (°C)_york', 'Cool Deg Days (°C)_york',
    'Total Rain (mm)_york', 'Total Precip (mm)_york', 'Snow on Grnd (cm)_york',
    # date parts
    'Year', 'Month', 'Day',
]
df_total = df_total.drop(columns=aggregated_source_columns)
df_total.head()

Unnamed: 0,Datetime,MAX_TEMP,MIN_TEMP,MEAN_TEMP,HDD,CDD,RAIN_MM,PRECIP_MM,SNOW_CM
0,2010-01-01,1.9,-9.9,-3.0,21.0,0.0,0.0,0.633333,0.0
1,2010-01-02,-9.7,-18.5,-14.05,32.05,0.0,0.0,0.333333,1.0
2,2010-01-03,-9.3,-17.0,-12.9,30.9,0.0,0.0,1.9,1.0
3,2010-01-04,-6.7,-13.5,-9.85,27.85,0.0,0.0,0.266667,3.5
4,2010-01-05,-3.6,-12.5,-7.65,25.65,0.0,0.0,1.3,4.5


In [40]:
# number of missing values in each column of the aggregated dataframe
df_total.isna().sum()

Datetime       0
MAX_TEMP     168
MIN_TEMP     168
MEAN_TEMP    168
HDD          168
CDD          168
RAIN_MM        0
PRECIP_MM      0
SNOW_CM        0
dtype: int64

In [41]:
# round every aggregated measurement column to 2 decimal places
measurement_columns = [
    'MAX_TEMP', 'MIN_TEMP', 'MEAN_TEMP',
    'HDD', 'CDD',
    'RAIN_MM', 'PRECIP_MM', 'SNOW_CM',
]
df_total.loc[:, measurement_columns] = df_total.loc[:, measurement_columns].round(2)

In [42]:
# write the aggregated weather dataframe to a CSV in the processed
# (unzipped) weather directory, leaving out the dataframe index
AGGREGATED_TORONTO_WEATHER_PATH = os.path.join(
    WEATHER_PROCESSED_UNZIPPED_DIRECTORY,
    "2010-2020_Toronto_Weather.csv",
)
df_total.to_csv(AGGREGATED_TORONTO_WEATHER_PATH, index=False)

# 4c
# Initial DataFrame

In [43]:
# preview 10 randomly selected rows of the aggregated weather dataframe
df_total.sample(n=10)

Unnamed: 0,Datetime,MAX_TEMP,MIN_TEMP,MEAN_TEMP,HDD,CDD,RAIN_MM,PRECIP_MM,SNOW_CM
844,2012-04-24,8.1,1.0,4.63,13.37,0.0,0.6,1.4,0.0
3290,2019-01-04,8.0,-0.5,4.13,13.87,0.0,0.0,0.0,4.0
3370,2019-03-25,4.0,-6.5,-0.6,18.6,0.0,0.0,0.0,0.5
1617,2014-06-06,25.0,8.0,16.67,1.33,0.0,0.0,0.0,0.0
1166,2013-03-12,6.0,0.9,3.17,14.83,0.0,0.0,0.63,5.0
1943,2015-04-28,19.3,4.5,12.27,5.73,0.0,0.0,0.0,0.0
249,2010-09-07,30.0,13.0,20.6,0.0,2.6,0.0,0.0,0.0
1168,2013-03-14,1.4,-8.5,-3.33,21.33,0.0,0.0,0.0,4.0
1310,2013-08-03,25.4,14.5,20.2,0.0,2.2,0.0,0.0,0.0
2089,2015-09-21,20.6,8.5,15.47,2.53,0.0,0.0,0.0,0.0


# 5
# Toronto Fire Services Station Locations
- Toronto Open Data [link](https://open.toronto.ca/dataset/fire-station-locations/)

# 5a
# Data Download / Saving / Unzipping

In [5]:
# Build the folder paths for the fire station data
FIRE_STATIONS = "FIRE_STATIONS"
STATIONS_RAW_ZIPPED_DIRECTORY = os.path.join(RAW_ZIPPED_DIRECTORY, FIRE_STATIONS)
STATIONS_RAW_UNZIPPED_DIRECTORY = os.path.join(RAW_UNZIPPED_DIRECTORY, FIRE_STATIONS)

# Create each folder if it is missing and report what happened
for station_directory in (STATIONS_RAW_ZIPPED_DIRECTORY, STATIONS_RAW_UNZIPPED_DIRECTORY):
    if os.path.isdir(station_directory):
        print(f"{station_directory} exists")
    else:
        os.makedirs(station_directory)
        print(f"{station_directory} created")

In [45]:
# Make a request for the fire station locations zipped folder
URL = r"https://ckan0.cf.opendata.inter.prod-toronto.ca/download_resource/daf5e0ee-cff6-4661-b308-79f95c3881e9"

# location where the zipped folder will be saved
ZIPPED_FOLDER = os.path.join(STATIONS_RAW_ZIPPED_DIRECTORY, "2019_Toronto_Fire_Stations.zip")

# stream the response so the archive is written to disk chunk by chunk
# instead of being held in memory all at once
with requests.get(URL, stream=True) as response:

    # stop here on an HTTP error (4xx / 5xx) instead of saving the error body
    # as a ".zip" file that cannot be opened when it is unzipped
    response.raise_for_status()

    # save the zipped folder
    with open(ZIPPED_FOLDER, "wb") as file:
        for chunk in response.iter_content(chunk_size=8192):
            file.write(chunk)

In [46]:
# Read the downloaded archive and unpack every member into the
# unzipped fire stations directory
with ZipFile(ZIPPED_FOLDER, mode="r") as station_archive:
    station_archive.extractall(path=STATIONS_RAW_UNZIPPED_DIRECTORY)

# Geopandas will need to be used
- convert from shapefile (.shp) to a pandas DataFrame
- requires another Conda Virtual Environment
    - this is due to conflict between other Python libraries

# 6
# Toronto Fire Hydrants
- Toronto Open Data [link](https://open.toronto.ca/dataset/fire-hydrants/)

# 6a
# Data Download / Saving / Unzipping

In [6]:
# Build the folder path for the fire hydrant data
FIRE_HYDRANTS = "FIRE_HYDRANTS"
HYDRANTS_RAW_UNZIPPED_DIRECTORY = os.path.join(RAW_UNZIPPED_DIRECTORY, FIRE_HYDRANTS)

# Create the folder if it is missing and report what happened
if os.path.isdir(HYDRANTS_RAW_UNZIPPED_DIRECTORY):
    print(f"{HYDRANTS_RAW_UNZIPPED_DIRECTORY} exists")
else:
    os.makedirs(HYDRANTS_RAW_UNZIPPED_DIRECTORY)
    print(f"{HYDRANTS_RAW_UNZIPPED_DIRECTORY} created")

In [48]:
# Make a request for the fire hydrant location .csv
URL = r"https://ckan0.cf.opendata.inter.prod-toronto.ca/download_resource/beaaa552-6338-4c81-95be-411e6cef6b89?format=csv&projection=4326"

with requests.Session() as s:

    # download the file
    download = s.get(URL)
    # stop here on an HTTP error (4xx / 5xx) rather than parsing an
    # error body as if it were the CSV
    download.raise_for_status()
    # decode the response bytes as UTF-8 text
    decode = download.content.decode("utf-8")

    # file path is too long for windows so lets
    # just store it in a buffer
    with io.StringIO(decode) as f:
        # read it into a dataframe
        df_hydrants = pd.read_csv(f)

In [49]:
# save the hydrant dataframe as a CSV in the unzipped hydrants directory,
# leaving out the dataframe index
HYDRANTS_FILEPATH = os.path.join(
    HYDRANTS_RAW_UNZIPPED_DIRECTORY,
    "Toronto_Fire_Hydrants.csv",
)
df_hydrants.to_csv(HYDRANTS_FILEPATH, index=False)

# preview 10 randomly selected rows
df_hydrants.sample(n=10)

Unnamed: 0,_id,OBJECTID,ASSET_ID,ADDR_KEY,ADDR_QUAL,X_COORDINATE,Y_COORDINATE,geometry
40249,40250,69875,HY2009534,20348998,NEARCTIC DR QUEENS PLATE DR 50m WEST OF QUEEN...,297288.603,4841404.0,"{u'type': u'Point', u'coordinates': (-79.59301..."
16648,16649,27117,HY3004200,7807450,2 TAYLOR DR,318695.485,4839776.0,"{u'type': u'Point', u'coordinates': (-79.32740..."
7778,7779,12695,HY4010207,10459307,110 MILLWICK DR,299513.483,4846576.0,"{u'type': u'Point', u'coordinates': (-79.56546..."
27872,27873,57237,HY12976,995117,20 CROWLAND DR,299283.48,4842414.0,"{u'type': u'Point', u'coordinates': (-79.56827..."
11502,11503,18728,HY129170,382541,147 SANDYHOOK SQ,319164.653,4852985.0,"{u'type': u'Point', u'coordinates': (-79.32123..."
41875,41876,177999,HY32078,7950371,20 AVONDALE RD,314167.369625,4837066.0,"{u'type': u'Point', u'coordinates': (-79.38363..."
31378,31379,60819,HY2005126,1035356,8 SUNPLAINS CRES,298612.117,4831959.0,"{u'type': u'Point', u'coordinates': (-79.57648..."
33908,33909,63392,HY4042971,8006186,20 KING HIGH AVE,309368.194,4843880.0,"{u'type': u'Point', u'coordinates': (-79.44309..."
24191,24192,39389,HY1357093,3950987,737 CRAVEN RD,319091.084,4837121.0,"{u'type': u'Point', u'coordinates': (-79.32256..."
6501,6502,10495,HY1360421,853783,80 PETERBOROUGH AVE,309220.513,4836903.0,"{u'type': u'Point', u'coordinates': (-79.44498..."
