# Capstone - Toronto Shelter Occupancy Prediction

## Import Modules

In [2]:
import pandas as pd
import IPython
import plotly.graph_objects as go
import plotly.express as px
import warnings
import suntime
import datetime
from astral import Astral

# Ignore warnings
warnings.filterwarnings("ignore")

---

## Data Cleaning & Feature Engineering

In [92]:
# Getting occupancy data (2017 and 2018 from local CSV files, 2019 from the Toronto open data portal)
df_17 = pd.read_csv('data/daily-shelter-occupancy-2017-csv.csv')
df_18 = pd.read_csv('data/daily-shelter-occupancy-2018-csv.csv')
df_19 = pd.read_json('https://ckan0.cf.opendata.inter.prod-toronto.ca/download_resource/e4cdcaff-7c06-488a-a072-4880fbd84b88')
# ignore_index=True gives every row of the combined frame a unique label.
# Without it each yearly frame keeps its own index, so labels repeat across years
# and label-based operations (such as drop) would affect rows from every year.
df = pd.concat([df_17, df_18, df_19], ignore_index=True)

In [93]:
# Removing the record identifier columns ('_id' and 'id')
df.drop(columns=['_id', 'id'], inplace=True)

In [95]:
# Listing programs that have rows with null or zero CAPACITY but non-zero or null OCCUPANCY
# Each comparison is parenthesised because | and & bind more tightly than == and !=
df[(df.CAPACITY.isna() | (df.CAPACITY == 0)) & ((df.OCCUPANCY != 0) | df.OCCUPANCY.isna())]['PROGRAM_NAME'].value_counts().index

Index(['Family Residence - AL Site (Internal Referral Only)'], dtype='object')

In [89]:
# Finding programs that have rows with null or zero CAPACITY but non-zero OCCUPANCY
# (df.CAPACITY == 0) is parenthesised because | binds more tightly than ==
for program in df[(df.CAPACITY.isna() | (df.CAPACITY == 0)) & (df.OCCUPANCY != 0)]['PROGRAM_NAME'].value_counts().index:
    # Assuming the max OCCUPANCY as the CAPACITY (applied to every row of the program)
    df.loc[df.PROGRAM_NAME == program, 'CAPACITY'] = df[df.PROGRAM_NAME == program].OCCUPANCY.max()

In [100]:
# Dropping rows whose OCCUPANCY is zero or null
# A boolean mask is used instead of dropping by index label: if a label is shared by
# several rows, a label-based drop would remove all of them, not only the empty ones
empty_occupancy = (df.OCCUPANCY == 0) | df.OCCUPANCY.isna()
# .copy() makes the result an independent frame so later column assignments are safe
df = df[~empty_occupancy].copy()

In [109]:
# Inspecting the column labels of the occupancy DataFrame
df.columns

Index(['CAPACITY', 'FACILITY_NAME', 'OCCUPANCY', 'OCCUPANCY_DATE',
       'ORGANIZATION_NAME', 'PROGRAM_NAME', 'SECTOR', 'SHELTER_ADDRESS',
       'SHELTER_CITY', 'SHELTER_NAME', 'SHELTER_POSTAL_CODE',
       'SHELTER_PROVINCE'],
      dtype='object')

In [114]:
# Listing the addresses that appear on rows without a postal code
df.loc[df['SHELTER_POSTAL_CODE'].isna(), 'SHELTER_ADDRESS'].value_counts().index

Index(['38 Bathrust St', '67 Adelaide Street East', '1673 Kingston Road',
       '1651 Sheppard Ave West', '129 Peter St'],
      dtype='object')

In [120]:
# Looking up one known postal code recorded for '38 Bathrust St'
df.loc[(df['SHELTER_ADDRESS'] == '38 Bathrust St') & df['SHELTER_POSTAL_CODE'].notnull(), 'SHELTER_POSTAL_CODE'].head(1)

19    M5V 3W3
Name: SHELTER_POSTAL_CODE, dtype: object

In [113]:
# Filling in missing postal codes using a known postal code recorded for the same address
for address in df[df.SHELTER_POSTAL_CODE.isna()].SHELTER_ADDRESS.value_counts().index:
    same_address = df.SHELTER_ADDRESS == address
    known_codes = df.loc[same_address & df.SHELTER_POSTAL_CODE.notnull(), 'SHELTER_POSTAL_CODE']
    # Addresses with no known postal code anywhere in the data are left as missing
    if not known_codes.empty:
        df.loc[same_address & df.SHELTER_POSTAL_CODE.isna(), 'SHELTER_POSTAL_CODE'] = known_codes.iloc[0]

In [38]:
# Occupancy rate: OCCUPANCY expressed as a fraction of CAPACITY
df['OCCUPANCY_RATE'] = df['OCCUPANCY'].div(df['CAPACITY'])

In [25]:
# Loading daily weather data: 2017 and 2018 from local CSV files,
# 2019 from the climate.weather.gc.ca bulk download URL
weather_2017_df = pd.read_csv('data/2017-toronto-weather.csv')
weather_2018_df = pd.read_csv('data/2018-toronto-weather.csv')
weather_2019_df = pd.read_csv(
    'https://climate.weather.gc.ca/climate_data/bulk_data_e.html?format=csv&stationID=31688&Year=2019&timeframe=2&submit=Download+Data'
)
# Stacking the three yearly frames into a single frame
weather_df = pd.concat(
    objs=[weather_2017_df, weather_2018_df, weather_2019_df],
)

In [26]:
# Trimming the weather data by removing station metadata, flag columns
# and measurements that are not kept for the analysis
unused_weather_columns = [
    'Longitude (x)', 'Latitude (y)', 'Station Name', 'Climate ID', 'Data Quality',
    'Max Temp Flag', 'Min Temp Flag', 'Mean Temp Flag',
    'Heat Deg Days Flag', 'Cool Deg Days Flag',
    'Total Rain Flag', 'Total Snow Flag', 'Total Precip Flag',
    'Snow on Grnd (cm)', 'Snow on Grnd Flag',
    'Dir of Max Gust (10s deg)', 'Dir of Max Gust Flag',
    'Spd of Max Gust (km/h)', 'Spd of Max Gust Flag',
    'Total Rain (mm)', 'Total Snow (cm)',
    'Heat Deg Days (°C)', 'Cool Deg Days (°C)',
]
weather_df = weather_df.drop(columns=unused_weather_columns)

In [27]:
# Calculating the length of daylight each day in hours

# Coordinates of Toronto
latitude = 43.7001100
longitude = -79.4163000

# Function to calculate daylight hours in Toronto on a specific day
def get_daylight_hours(year, month, day):
    """Return the number of daylight hours at the Toronto coordinates on a given date.

    Parameters
    ----------
    year, month, day : int-like
        Calendar date components. They are cast to int because values taken from a
        DataFrame row are not guaranteed to be plain integers, and datetime.date
        rejects floats.

    Returns
    -------
    float
        Hours between the two times returned by Astral.daylight_utc for the
        module-level latitude/longitude.
    """
    astral = Astral()
    date = datetime.date(int(year), int(month), int(day))
    sr, ss = astral.daylight_utc(date, latitude, longitude)
    # .seconds is the sub-day part of the difference, which is the full span for a single day's daylight
    return (ss - sr).seconds / 3600

# The lambda parameter is named row (not df) so it does not shadow the occupancy DataFrame
weather_df['Daylight_Hours'] = weather_df.apply(lambda row: get_daylight_hours(row.Year, row.Month, row.Day), axis=1)

In [39]:
# Selecting year round shelter programs (ones that have existed since 2017-01-01)
# A program counts as year round when it has as many records as the most frequently recorded program
program_value_counts = df.PROGRAM_NAME.value_counts()
program_value_counts = program_value_counts[program_value_counts == program_value_counts.max()]
year_round_programs = program_value_counts.index
# year_round_programs holds program names, so the filter must match on PROGRAM_NAME
df_year_round = df[df.PROGRAM_NAME.isin(year_round_programs)]