# Capstone - Toronto Shelter Occupancy Prediction

## Import Modules

In [2]:
import pandas as pd
import IPython
import plotly.graph_objects as go
import plotly.express as px
import warnings
import suntime
import datetime
from astral import Astral

# Ignore every warning from here on (no category or module filter is given,
# so library deprecation warnings are hidden as well)
warnings.filterwarnings("ignore")

---

## Data Cleaning & Feature Engineering

In [92]:
# Getting occupancy data: 2017 and 2018 are read from local CSV files,
# 2019 is downloaded as JSON from a remote URL (needs network access)
df_17 = pd.read_csv('data/daily-shelter-occupancy-2017-csv.csv')
df_18 = pd.read_csv('data/daily-shelter-occupancy-2018-csv.csv')
df_19 = pd.read_json('https://ckan0.cf.opendata.inter.prod-toronto.ca/download_resource/e4cdcaff-7c06-488a-a072-4880fbd84b88')
# ignore_index=True gives every row of the combined frame a unique label.
# Without it each yearly frame keeps its own labels, so the same label occurs
# once per year and label-based operations such as df.drop(<index>) would
# also affect unrelated rows from the other years.
df = pd.concat([df_17, df_18, df_19], ignore_index=True)

In [93]:
# Removing the '_id' and 'id' columns, which are not useful for the analysis
df.drop(columns=['_id', 'id'], inplace=True)

In [95]:
# Programs having rows with a null/zero CAPACITY together with a non-zero or
# null OCCUPANCY. Every comparison is wrapped in its own parentheses because
# `|` and `&` bind more tightly than `==` and `!=`; without them
# `isna() | CAPACITY == 0` is evaluated as `(isna() | CAPACITY) == 0`.
df[
    (df.CAPACITY.isna() | (df.CAPACITY == 0))
    & ((df.OCCUPANCY != 0) | df.OCCUPANCY.isna())
]['PROGRAM_NAME'].value_counts().index

Index(['Family Residence - AL Site (Internal Referral Only)'], dtype='object')

In [89]:
# Finding the programs that have rows with a null or zero CAPACITY but a
# non-zero OCCUPANCY. The `== 0` comparison is parenthesised because `|`
# binds more tightly than `==`; unparenthesised, the condition would be
# evaluated as `(CAPACITY.isna() | CAPACITY) == 0`.
for program in df[(df.CAPACITY.isna() | (df.CAPACITY == 0)) & (df.OCCUPANCY != 0)]['PROGRAM_NAME'].value_counts().index:
    program_rows = df.PROGRAM_NAME == program
    # Assuming the max OCCUPANCY as the CAPACITY.
    # NOTE: this is written to every row of the program, not only to the
    # rows whose CAPACITY was null or zero.
    df.loc[program_rows, 'CAPACITY'] = df[program_rows].OCCUPANCY.max()

In [100]:
# Removing the rows whose OCCUPANCY is zero or null.
# The rows are selected with a boolean mask rather than df.drop(<labels>):
# df is built with pd.concat, so its index labels are not guaranteed to be
# unique, and dropping by label would also remove every valid row that
# happens to share a label with an invalid one.
invalid_occupancy = (df.OCCUPANCY == 0) | df.OCCUPANCY.isna()
# .copy() makes the result an independent frame for the later assignments
df = df[~invalid_occupancy].copy()

In [147]:
# Collecting the distinct addresses of the rows that have no postal code
# (value_counts orders them from most to least frequent)
rows_without_postal_code = df.SHELTER_POSTAL_CODE.isna()
missing_postal_code_addresses = df.loc[rows_without_postal_code, 'SHELTER_ADDRESS'].value_counts().index

In [141]:
# Postal codes checked with Google Maps, keyed by shelter address.
# NOTE(review): '38 Bathrust St' looks like a misspelling of "Bathurst";
# presumably it mirrors the spelling used in SHELTER_ADDRESS, so confirm
# against the data before changing the key.
postal_dict = {
    '38 Bathrust St': 'M5V 3W3',
    '67 Adelaide Street East': 'M5C 1K6',
    '1673 Kingston Road': 'M1N 1S6',
    '1651 Sheppard Ave West': 'M3M 2X4',
    '129 Peter St': 'M5V 1X1',
}

In [149]:
# Writing the looked-up postal code onto every row of each affected address
for address in missing_postal_code_addresses:
    postal_code = postal_dict[address]  # KeyError if the address has no entry
    df.loc[df.SHELTER_ADDRESS == address, 'SHELTER_POSTAL_CODE'] = postal_code

In [153]:
# Listing the distinct postal codes to inspect their formatting
df['SHELTER_POSTAL_CODE'].unique()

array(['M5S 2P1', 'M6H 3Z5', 'M6G 3B1', 'M1N 1S6', 'M1P 4V8', 'M1E 2M6',
       'M5V 3W3', 'M5A 2R9', 'M5C 1K6', 'M5A-2N2', 'M5T 1H9', 'M6C 1C2',
       'M5B 2P3', 'M4C 1L7', 'M5A 2N2', 'M5V 3G9', 'M3B 2T5', 'M2N OE3',
       'M4X 1K9', 'M5B 2A1', 'M5C 1M3', 'M6A 2W9', 'M5A 1T3', 'M1K 2B5',
       'M6H 1P2', 'M1K 1M2', 'M5V 1G8', 'M6E 4X3', 'M4K 3W5', 'M6G 2N1',
       'M5S 2P9', 'M4M 2V8', 'M8Y 1A3', 'M5B 1G6', 'M2M 3S3', 'M4W 1J1',
       'M5A 2N9', 'M5A 2V2', 'M6P 1Y5', 'M6J1E6', 'M5C 2H4', 'M5A 2R5',
       'M5S 2H1', 'M5R 2T9', 'M4Y 2X9', 'M5T 1C3', 'M5V 2A9', 'M5R 2W7',
       'M4T 1C1', 'M9V 1P9', 'M3M 2X4', 'M6L 3C6', 'M4M 1J2', 'M5V 1X1',
       'M4Y 2H6', 'M4M 3C3', 'M9V 2X6', 'M1M 1P8', 'M5T 2V9', 'M5R 1K6',
       'M1L 4A1'], dtype=object)

In [38]:
# Occupancy rate: OCCUPANCY divided by CAPACITY, row by row
df['OCCUPANCY_RATE'] = df['OCCUPANCY'].div(df['CAPACITY'])

In [25]:
# Getting weather data: 2017 and 2018 are read from local CSV files,
# 2019 is downloaded as CSV from a remote URL (needs network access)
weather_2019_url = 'https://climate.weather.gc.ca/climate_data/bulk_data_e.html?format=csv&stationID=31688&Year=2019&timeframe=2&submit=Download+Data'
weather_2017_df = pd.read_csv('data/2017-toronto-weather.csv')
weather_2018_df = pd.read_csv('data/2018-toronto-weather.csv')
weather_2019_df = pd.read_csv(weather_2019_url)
# Stacking the three yearly frames on top of each other
yearly_weather_frames = [weather_2017_df, weather_2018_df, weather_2019_df]
weather_df = pd.concat(yearly_weather_frames)

In [26]:
# Cleaning weather data: removing the columns considered unnecessary
# (station/location details, the "Flag" columns and several measurements)
unnecessary_weather_columns = [
    'Longitude (x)',
    'Latitude (y)',
    'Station Name',
    'Climate ID',
    'Data Quality',
    'Max Temp Flag',
    'Min Temp Flag',
    'Mean Temp Flag',
    'Heat Deg Days Flag',
    'Cool Deg Days Flag',
    'Total Rain Flag',
    'Total Snow Flag',
    'Total Precip Flag',
    'Snow on Grnd (cm)',
    'Snow on Grnd Flag',
    'Dir of Max Gust (10s deg)',
    'Dir of Max Gust Flag',
    'Spd of Max Gust (km/h)',
    'Spd of Max Gust Flag',
    'Total Rain (mm)',
    'Total Snow (cm)',
    'Heat Deg Days (°C)',
    'Cool Deg Days (°C)',
]
weather_df = weather_df.drop(columns=unnecessary_weather_columns)

In [27]:
# Calculating the lengths of daylight each day in hours

# Coordinates of Toronto as (latitude, longitude)
latitude, longitude = 43.7001100, -79.4163000

# One Astral instance is shared by every call; creating a new one for each
# row would repeat the constructor's set-up work without changing the result.
_astral = Astral()


# Function to calculate daylight hours in Toronto on a specific day
def get_daylight_hours(year, month, day):
    """Return the number of daylight hours on the given date.

    The location is taken from the module-level ``latitude`` and
    ``longitude`` values.

    Args:
        year: Calendar year.
        month: Calendar month (1-12).
        day: Day of the month.
            All three are converted with ``int()`` so that float-typed
            values (e.g. ``2017.0`` coming from a numeric DataFrame row)
            are accepted by ``datetime.date``.

    Returns:
        The daylight duration in hours, as a float.
    """
    date = datetime.date(int(year), int(month), int(day))
    sr, ss = _astral.daylight_utc(date, latitude, longitude)
    # timedelta.seconds is the within-day component (0-86399), so the result
    # stays in [0, 24) even if the two UTC timestamps straddle midnight.
    return (ss - sr).seconds / 3600

# Adding the daylight duration of each row's date as a new column
weather_df['Daylight_Hours'] = weather_df.apply(
    lambda row: get_daylight_hours(row.Year, row.Month, row.Day),
    axis=1,
)

In [39]:
# Selecting year round shelter programs (ones that have existed since 2017-01-01),
# identified as the programs whose number of rows equals the highest row
# count reached by any program
program_value_counts = df.PROGRAM_NAME.value_counts()
program_value_counts = program_value_counts[program_value_counts == program_value_counts.max()]
year_round_programs = program_value_counts.index
# year_round_programs contains PROGRAM_NAME values, so the rows have to be
# matched on PROGRAM_NAME; matching ORGANIZATION_NAME against program names
# would select the wrong rows (or none at all)
df_year_round = df[df.PROGRAM_NAME.isin(year_round_programs)]