# Capstone - Toronto Shelter Occupancy Prediction

## Import Modules

In [189]:
# Basics
import pandas as pd
import numpy as np
import warnings
import matplotlib.pyplot as plt

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline

# Model Evaluations
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

# Ignore warnings
# NOTE(review): this filter has no category or module restriction, so it hides
# every warning raised for the rest of the session -- presumably including
# scikit-learn convergence warnings from the models below. Consider narrowing it.
warnings.filterwarnings("ignore")

In [154]:
# Read the three input tables: cleaned occupancy records, shelter coordinates,
# and Toronto weather. Each file carries an 'Unnamed: 0' column (presumably a
# saved index -- verify against the export step) that is discarded on load.
df = pd.read_csv('data/shelter_occupancy_cleaned.csv').drop(columns=['Unnamed: 0'])
coor_df = pd.read_csv('data/shelter_coordinates.csv').drop(columns=['Unnamed: 0'])
weather_df = pd.read_csv('data/toronto_weather.csv').drop(columns=['Unnamed: 0'])

---

## Preparing Data

In [155]:
# Enrich the occupancy records with the coordinate table (joined on postal code)
# and the weather table (joined on occupancy date). Left joins keep every
# occupancy row even when the lookup table has no match.
df = (
    df
    .merge(coor_df, how='left', on='SHELTER_POSTAL_CODE')
    .merge(weather_df, how='left', on='OCCUPANCY_DATE')
)

In [156]:
# One-hot encode the categorical columns; each listed column is replaced by
# one indicator column per distinct value.
df = pd.get_dummies(
    data=df,
    columns=['SECTOR', 'sublocality'],
)

In [157]:
# Remove the columns that are not used as model inputs (names, addresses,
# dates, raw coordinates, the raw occupancy count, and the weekday).
df = df.drop(columns=[
    'FACILITY_NAME', 'OCCUPANCY_DATE', 'ORGANIZATION_NAME', 'PROGRAM_NAME', 'SHELTER_ADDRESS', 'SHELTER_CITY',
    'SHELTER_NAME', 'SHELTER_POSTAL_CODE', 'SHELTER_PROVINCE', 'lat', 'lng', 'OCCUPANCY',
    'Week_Day',
])

In [158]:
# Build the binary target (dependent variable):
# 1 when the occupancy rate is at or above 100% (rate >= 1), otherwise 0.
df['OCCUPANCY_LEVEL'] = (df['OCCUPANCY_RATE'] >= 1).astype('int64')

# The raw rate is removed once the target has been derived from it.
df = df.drop(columns=['OCCUPANCY_RATE'])

In [159]:
# The historical weather data appears to contain nulls:
# list every column that has at least one missing value.
df.columns[df.isnull().any(axis=0)]

Index(['Mean Temp (°C)', 'Total Precip (mm)'], dtype='object')

In [160]:
# Discard every row that has a missing value in any column
# (at this point only the weather columns contain nulls).
df = df.dropna(axis=0, how='any')

In [162]:
# Preparing data for model fitting.
# Features: every column from 'CAPACITY' through 'sublocality_York'. This is a
# label-based, inclusive slice, so it depends on the current column order.
# Target: the binary OCCUPANCY_LEVEL column.
X = df.loc[:, 'CAPACITY':'sublocality_York']
y = df.OCCUPANCY_LEVEL

# Created here but not applied in this cell; the features are split unscaled.
scaler = StandardScaler()

# A fixed random_state makes the split -- and every score computed from it --
# reproducible across Restart & Run All.
# NOTE(review): test_size=0.7 holds out 70% of the rows for testing and trains
# on only 30%; confirm this is intended rather than a 70/30 train/test split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.7, random_state=42)

---
## Fitting Models

In [177]:
# Baseline model: Logistic Regression with scikit-learn's default settings,
# scored by accuracy on the held-out test split.
logit = LogisticRegression()
logit.fit(X_train, y_train)
print(f'Accuracy: {logit.score(X_test, y_test)}')

Accuracy: 0.6407586792052605


In [178]:
# Share of records in each class (1 = full, 0 = not full)
y.value_counts().div(len(y))

1    0.609166
0    0.390834
Name: OCCUPANCY_LEVEL, dtype: float64

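61% of the records in the data reach full occupancy, so a model that always predicts "full" would already be about 61% accurate. An accuracy score of 64% is therefore only slightly better than this majority-class baseline.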

In [190]:
# Precision, Recall & F1 Score on the test split.
# The scorers are called with their defaults, which for binary labels report
# the metric for the positive class (label 1, i.e. records at full occupancy).
# y_pred is reused by the classification report in the next cell.
y_pred = logit.predict(X_test)
print(f'Precision Score: {precision_score(y_test, y_pred)}')
print(f'Recall Score: {recall_score(y_test, y_pred)}')
print(f'F1 Score: {f1_score(y_test, y_pred)}')

Precision Score: 0.6540055851295925
Recall Score: 0.8703870387038704
F1 Score: 0.7468390697210907


In [191]:
# Per-class precision / recall / F1 and support for the baseline predictions
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.58      0.28      0.38     22131
           1       0.65      0.87      0.75     34441

    accuracy                           0.64     56572
   macro avg       0.62      0.58      0.56     56572
weighted avg       0.63      0.64      0.60     56572



The initial logistic model shows an F1 score of 0.75. This will be used as a baseline to compare with other models.

In [None]:
# Scaler and PCA

In [None]:
# Pipeline to find better models