In [1]:
import pandas as pd

import numpy as np

from sklearn.model_selection import TimeSeriesSplit, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from imblearn.pipeline import Pipeline as imbpipeline
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report, roc_auc_score, accuracy_score

import pickle

import json

import matplotlib.pyplot as plt

import folium
from folium.plugins import HeatMap

In [2]:
%load_ext kedro.ipython
table = catalog.load("input_table")

In [3]:
# Path to the pickle file
pickle_file_path = 'C://Users//carga//Documents//KedroTutorial//montrealfirepredicition//data//07_model_output//trained_time_series_xgb_model_3months_v2.pkl'

# Open the pickle file and load the model
with open(pickle_file_path, 'rb') as file:
    model = pickle.load(file)

In [4]:
def get_mode(series):
    mode = series.mode()
    if not mode.empty:
        return mode.iloc[0]
    else:
        return None  # or a default value like 'unknown'

In [16]:
# Group by grid location and month
aggregated_data = pd.DataFrame()
aggregated_data = table.groupby(['grid_lat', 'grid_long', 'year', 'month']).agg({
    'index_mesh': 'first',
    'USE_CATEGORY': get_mode,
    'YEAR_CONSTRUCTION': 'mean',
    'is_fire': 'sum',  # Sum to count number of fire incidents
    'ABOVE_GROUND_FLOORS': 'max',
    'HOUSING_UNITS': 'mean',
    'distance_to_fire_station': 'mean',
    'building_age': 'mean',
    'AVERAGE_FAMILY_SIZE': 'mean',
    'POPULATION_DENSITY': 'mean',
    '2021_POPULATION': 'mean',
    'time_of_day': get_mode,
    'day_of_week': get_mode,
    'INCIDENT_ID': 'count',
}).reset_index()

# Fill missing categorical values with 'missing' and numerical with the mean
categorical_features = ['time_of_day', 'day_of_week', 'USE_CATEGORY']
numerical_features = ['grid_lat', 'grid_long', 'ABOVE_GROUND_FLOORS', 'YEAR_CONSTRUCTION', 'HOUSING_UNITS', 'building_age', 'distance_to_fire_station', 'AVERAGE_FAMILY_SIZE', 'POPULATION_DENSITY', '2021_POPULATION', 'INCIDENT_ID']

for col in categorical_features:
    aggregated_data[col].fillna('missing', inplace=True)

for col in numerical_features:
    aggregated_data[col].fillna(aggregated_data[col].mean(), inplace=True)

In [17]:
aggregated_data[aggregated_data['index_mesh'] == 82]

Unnamed: 0,grid_lat,grid_long,year,month,index_mesh,USE_CATEGORY,YEAR_CONSTRUCTION,is_fire,ABOVE_GROUND_FLOORS,HOUSING_UNITS,distance_to_fire_station,building_age,AVERAGE_FAMILY_SIZE,POPULATION_DENSITY,2021_POPULATION,time_of_day,day_of_week,INCIDENT_ID
0,45.403963,-73.946646,2005.0,1.0,82,missing,,0,,,1.558668,,,,,evening,2.0,1
1,45.403963,-73.946646,2005.0,2.0,82,missing,,1,,,1.558668,,,,,evening,1.0,2
2,45.403963,-73.946646,2005.0,3.0,82,missing,,0,,,1.558668,,,,,night,1.0,1
3,45.403963,-73.946646,2005.0,4.0,82,missing,,1,,,1.558668,,,,,night,1.0,1
4,45.403963,-73.946646,2005.0,5.0,82,missing,,2,,,1.558668,,,,,evening,6.0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
159,45.403963,-73.946646,2023.0,8.0,82,missing,,0,,,1.558668,,,,,evening,1.0,1
160,45.403963,-73.946646,2023.0,9.0,82,missing,,0,,,1.558668,,,,,morning,4.0,2
161,45.403963,-73.946646,2023.0,10.0,82,missing,,0,,,1.558668,,,,,evening,0.0,5
162,45.403963,-73.946646,2023.0,11.0,82,missing,,0,,,1.558668,,,,,afternoon,0.0,2


In [28]:
# Create lag features for previous months
# Number of lags to create
n_lags = 3

merged_aggregated_data = pd.DataFrame()
merged_aggregated_data = aggregated_data.copy()

In [29]:
merged_aggregated_data.columns


[1;35mIndex[0m[1m([0m[1m[[0m[32m'grid_lat'[0m, [32m'grid_long'[0m, [32m'year'[0m, [32m'month'[0m, [32m'index_mesh'[0m, [32m'USE_CATEGORY'[0m,
       [32m'YEAR_CONSTRUCTION'[0m, [32m'is_fire'[0m, [32m'ABOVE_GROUND_FLOORS'[0m, [32m'HOUSING_UNITS'[0m,
       [32m'distance_to_fire_station'[0m, [32m'building_age'[0m, [32m'AVERAGE_FAMILY_SIZE'[0m,
       [32m'POPULATION_DENSITY'[0m, [32m'2021_POPULATION'[0m, [32m'time_of_day'[0m, [32m'day_of_week'[0m,
       [32m'INCIDENT_ID'[0m[1m][0m,
      [33mdtype[0m=[32m'object'[0m[1m)[0m

In [34]:
# Create lag features for previous months
# Number of lags to create
n_lags = 3

merged_aggregated_data = pd.DataFrame()
merged_aggregated_data = aggregated_data.copy()

# Loop to create lag features
for lag in range(1, n_lags + 1):
    # Copy the original data
    lagged_features = aggregated_data.copy()

    # Shift the month by the current lag value
    lagged_features['month'] = (lagged_features['month'] - lag - 1) % 12 + 1

    # Rename columns to indicate the lag
    lagged_features.columns = [
        col + f'_lag{lag}' if col not in ['index_mesh','grid_lat', 'grid_long', 'year', 'month'] else col
        for col in lagged_features.columns
    ]

    # Merge lagged features back into the original data
    merged_aggregated_data = pd.merge(
        merged_aggregated_data, lagged_features,
        on=['index_mesh','grid_lat', 'grid_long', 'year', 'month'],
        how='left'
    )

In [35]:
merged_aggregated_data.columns


[1;35mIndex[0m[1m([0m[1m[[0m[32m'grid_lat'[0m, [32m'grid_long'[0m, [32m'year'[0m, [32m'month'[0m, [32m'index_mesh'[0m, [32m'USE_CATEGORY'[0m,
       [32m'YEAR_CONSTRUCTION'[0m, [32m'is_fire'[0m, [32m'ABOVE_GROUND_FLOORS'[0m, [32m'HOUSING_UNITS'[0m,
       [32m'distance_to_fire_station'[0m, [32m'building_age'[0m, [32m'AVERAGE_FAMILY_SIZE'[0m,
       [32m'POPULATION_DENSITY'[0m, [32m'2021_POPULATION'[0m, [32m'time_of_day'[0m, [32m'day_of_week'[0m,
       [32m'INCIDENT_ID'[0m, [32m'USE_CATEGORY_lag1'[0m, [32m'YEAR_CONSTRUCTION_lag1'[0m,
       [32m'is_fire_lag1'[0m, [32m'ABOVE_GROUND_FLOORS_lag1'[0m, [32m'HOUSING_UNITS_lag1'[0m,
       [32m'distance_to_fire_station_lag1'[0m, [32m'building_age_lag1'[0m,
       [32m'AVERAGE_FAMILY_SIZE_lag1'[0m, [32m'POPULATION_DENSITY_lag1'[0m,
       [32m'2021_POPULATION_lag1'[0m, [32m'time_of_day_lag1'[0m, [32m'day_of_week_lag1'[0m,
       [32m'INCIDENT_ID_lag1'[0m, [32m'USE_CATEGORY_l

In [36]:
merged_aggregated_data[['index_mesh','grid_lat', 'grid_long', 'year', 'month','distance_to_fire_station', 'distance_to_fire_station_lag1']][merged_aggregated_data['index_mesh'] == 82]

Unnamed: 0,index_mesh,grid_lat,grid_long,year,month,distance_to_fire_station,distance_to_fire_station_lag1
0,82,45.403963,-73.946646,2005.0,1.0,1.558668,1.558668
1,82,45.403963,-73.946646,2005.0,2.0,1.558668,1.558668
2,82,45.403963,-73.946646,2005.0,3.0,1.558668,1.558668
3,82,45.403963,-73.946646,2005.0,4.0,1.558668,1.558668
4,82,45.403963,-73.946646,2005.0,5.0,1.558668,
...,...,...,...,...,...,...,...
159,82,45.403963,-73.946646,2023.0,8.0,1.558668,1.558668
160,82,45.403963,-73.946646,2023.0,9.0,1.558668,1.558668
161,82,45.403963,-73.946646,2023.0,10.0,1.558668,1.558668
162,82,45.403963,-73.946646,2023.0,11.0,1.558668,1.558668


In [39]:
# Ensure all data types are correctly set for the pipeline
for col in categorical_features:
    merged_aggregated_data[col] = merged_aggregated_data[col].astype(str)

# Shift the target variable to the previous month to predict the next month's fire
merged_aggregated_data['is_fire'] = merged_aggregated_data.groupby(['grid_lat', 'grid_long'])['is_fire'].shift(-1)

# Drop rows with missing target
merged_aggregated_data.dropna(subset=['is_fire'], inplace=True)

# Convert 'is_fire' to a binary numerical column for classification purposes (e.g., threshold at 0.5)
merged_aggregated_data['is_fire'] = (merged_aggregated_data['is_fire'] > 0.5).astype(int)

In [40]:
# Prepare the data
features = [col for col in merged_aggregated_data.columns if col not in ['is_fire', 'year', 'month']]
X = merged_aggregated_data[features]
y = merged_aggregated_data['is_fire']

In [41]:
# Make predictions
merged_aggregated_data['predicted_fire'] = model.predict(X)

: 

In [56]:
merged_aggregated_data.head()

Unnamed: 0,grid_lat,grid_long,year,month,index_mesh,USE_CATEGORY,YEAR_CONSTRUCTION,is_fire,ABOVE_GROUND_FLOORS,HOUSING_UNITS,...,HOUSING_UNITS_lag1_lag2_lag3,distance_to_fire_station_lag1_lag2_lag3,building_age_lag1_lag2_lag3,AVERAGE_FAMILY_SIZE_lag1_lag2_lag3,POPULATION_DENSITY_lag1_lag2_lag3,2021_POPULATION_lag1_lag2_lag3,time_of_day_lag1_lag2_lag3,day_of_week_lag1_lag2_lag3,INCIDENT_ID_lag1_lag2_lag3,predicted_fire
0,45.403963,-73.946646,2005.0,1.0,82,missing,,1,,,...,,,,,,,,,,0
1,45.403963,-73.946646,2005.0,2.0,82,missing,,0,,,...,,,,,,,,,,0
2,45.403963,-73.946646,2005.0,3.0,82,missing,,1,,,...,,,,,,,,,,0
3,45.403963,-73.946646,2005.0,4.0,82,missing,,1,,,...,,,,,,,,,,0
4,45.403963,-73.946646,2005.0,5.0,82,missing,,0,,,...,,,,,,,,,,0


In [47]:
aggregated_data[['predicted_fire', 'is_fire']].describe()

Unnamed: 0,predicted_fire,is_fire
count,593573.0,593573.0
mean,0.192866,0.129649
std,0.394549,0.335917
min,0.0,0.0
25%,0.0,0.0
50%,0.0,0.0
75%,0.0,0.0
max,1.0,1.0


In [54]:
aggregated_data['predicted_fire'][aggregated_data['predicted_fire'] == 1].sum()

[1;36m114480[0m

In [55]:
aggregated_data['is_fire'][aggregated_data['is_fire'] == 1].sum()

[1;36m76956[0m

In [62]:
selected_month = aggregated_data['month'] == 8
selected_year = aggregated_data['year'] == 2023
fire_prediction_selected_date = aggregated_data[selected_month & selected_year]

In [63]:
fire_prediction_selected_date.head()

Unnamed: 0,grid_lat,grid_long,year,month,index_mesh,USE_CATEGORY,YEAR_CONSTRUCTION,is_fire,ABOVE_GROUND_FLOORS,HOUSING_UNITS,...,HOUSING_UNITS_lag1_lag2_lag3,distance_to_fire_station_lag1_lag2_lag3,building_age_lag1_lag2_lag3,AVERAGE_FAMILY_SIZE_lag1_lag2_lag3,POPULATION_DENSITY_lag1_lag2_lag3,2021_POPULATION_lag1_lag2_lag3,time_of_day_lag1_lag2_lag3,day_of_week_lag1_lag2_lag3,INCIDENT_ID_lag1_lag2_lag3,predicted_fire
159,45.403963,-73.946646,2023.0,8.0,82,missing,,0,,,...,,,,,,,,,,0
345,45.404061,-73.943198,2023.0,8.0,96,missing,,0,,,...,,1.615592,,,,,morning,1.0,2.0,0
557,45.404188,-73.950076,2023.0,8.0,68,missing,,0,,,...,,1.511289,,,,,morning,1.0,3.0,0
867,45.404592,-73.953471,2023.0,8.0,55,missing,,0,,,...,,1.490979,,,,,morning,3.0,1.0,0
1062,45.404897,-73.956301,2023.0,8.0,42,missing,,0,,,...,,,,,,,,,,0


In [61]:
fire_prediction_selected_date['predicted_fire'][fire_prediction_selected_date['predicted_fire'] == 1].sum()

[1;36m462[0m

In [83]:
# Create a base map of Montreal
montreal_map = folium.Map(location=[45.5017, -73.5673], zoom_start=12)

# Add fire prediction markers to the map
for _, row in fire_prediction.iterrows():
    location = [row['grid_lat'], row['grid_long']]
    is_fire = row['predicted_fire']
    
    if is_fire:
        folium.CircleMarker(
            location=location,
            radius=5,
            color='red',
            fill=True,
            fill_color='red',
            fill_opacity=0.6
        ).add_to(montreal_map)
    else:
        folium.CircleMarker(
            location=location,
            radius=5,
            color='blue',
            fill=True,
            fill_color='blue',
            fill_opacity=0.6
        ).add_to(montreal_map)

# Show map
montreal_map
