# Imports

In [29]:
import pandas as pd
import numpy as np

In [30]:
import time
import os
import base64
import hashlib
import geopandas as gpd
import requests
import re
from dotenv import load_dotenv
from shapely.geometry import Polygon, MultiPolygon, GeometryCollection, Point

# Data Cleaning

### Mini data exploration

In [31]:
# Load the incidents CSV into a DataFrame (path is relative to the notebook's
# working directory).
df_incidents = pd.read_csv('Data/Incidents/donneesouvertes-interventions-sim.csv')

In [32]:
df_incidents.columns

Index(['INCIDENT_NBR', 'CREATION_DATE_TIME', 'INCIDENT_TYPE_DESC',
       'DESCRIPTION_GROUPE', 'CASERNE', 'NOM_VILLE', 'NOM_ARROND', 'DIVISION',
       'NOMBRE_UNITES', 'MTM8_X', 'MTM8_Y', 'LONGITUDE', 'LATITUDE'],
      dtype='object')

In [33]:
df_incidents.head()

Unnamed: 0,INCIDENT_NBR,CREATION_DATE_TIME,INCIDENT_TYPE_DESC,DESCRIPTION_GROUPE,CASERNE,NOM_VILLE,NOM_ARROND,DIVISION,NOMBRE_UNITES,MTM8_X,MTM8_Y,LONGITUDE,LATITUDE
0,38821,2015-04-21T14:55:09,Premier répondant,1-REPOND,28,Montréal,Anjou,9,1.0,301114.1,5051641.4,-73.547252,45.604665
1,39630,2015-04-24T05:02:05,Appel de Cie de détection,Alarmes-incendies,39,Montréal,Mercier-Hochelaga-Maisonneuve,7,4.0,301944.8,5049911.1,-73.536592,45.5891
2,39643,2015-04-24T07:03:39,Premier répondant,1-REPOND,21,Montréal,Saint-Léonard,8,1.0,296334.0,5048630.5,-73.608478,45.577531
3,39652,2015-04-24T07:45:01,Premier répondant,1-REPOND,33,Montréal,Le Sud-Ouest,5,1.0,297828.3,5035708.9,-73.589148,45.461274
4,39663,2015-04-24T08:10:13,Odeur suspecte - gaz,SANS FEU,14,Montréal,Rivière-des-Prairies-Pointe-aux-Trembles,9,2.0,298623.1,5054687.8,-73.579224,45.63206


In [34]:
df_incidents[['INCIDENT_TYPE_DESC']]


Unnamed: 0,INCIDENT_TYPE_DESC
0,Premier répondant
1,Appel de Cie de détection
2,Premier répondant
3,Premier répondant
4,Odeur suspecte - gaz
...,...
904135,Ac.véh./1R/s.v./V.R./29B/D
904136,Ac.véh./1R/s.v./pont/29B/D
904137,Ac.véh./1R/s.v./pont/29B/D
904138,Ac.véh./1R/s.v./V.R./29B/D


In [35]:
df_incidents.dtypes

INCIDENT_NBR            int64
CREATION_DATE_TIME     object
INCIDENT_TYPE_DESC     object
DESCRIPTION_GROUPE     object
CASERNE                 int64
NOM_VILLE              object
NOM_ARROND             object
DIVISION                int64
NOMBRE_UNITES         float64
MTM8_X                float64
MTM8_Y                float64
LONGITUDE             float64
LATITUDE              float64
dtype: object

### Add date columns

In [36]:
# Parse the timestamp strings (dtype `object` after read_csv) into datetimes.
creation_timestamps = pd.to_datetime(df_incidents['CREATION_DATE_TIME'])
df_incidents['CREATION_DATE_TIME'] = creation_timestamps

In [37]:
# Use the shorter name DATE for the parsed creation timestamp.
df_incidents = df_incidents.rename(columns={'CREATION_DATE_TIME': 'DATE'})

In [38]:
# Derive string-formatted day / month / year columns from the DATE timestamp.
for date_col, date_format in (('DAY', '%Y-%m-%d'), ('MONTH', '%Y-%m'), ('YEAR', '%Y')):
    df_incidents[date_col] = df_incidents['DATE'].dt.strftime(date_format)

In [39]:
# Discard the detailed incident type, the city / borough names and the MTM8
# X/Y coordinates; the remaining columns are kept as-is.
columns_to_drop = ['INCIDENT_TYPE_DESC', 'NOM_VILLE', 'NOM_ARROND', 'MTM8_X', 'MTM8_Y']
df_incidents = df_incidents.drop(columns=columns_to_drop)

In [40]:
df_incidents.dtypes

INCIDENT_NBR                   int64
DATE                  datetime64[ns]
DESCRIPTION_GROUPE            object
CASERNE                        int64
DIVISION                       int64
NOMBRE_UNITES                float64
LONGITUDE                    float64
LATITUDE                     float64
DAY                           object
MONTH                         object
YEAR                          object
dtype: object

### Add grid IDs for each incident

In [41]:
def add_grid_id_column(df, grid_id_col_name, lat_col_name, lon_col_name, grid_file_path):
    """Return a copy of ``df`` with an extra column holding each row's grid ID.

    The grid file is read with geopandas and reprojected to EPSG:4326 so that
    its geometries can be compared with latitude/longitude points. Every row
    of ``df`` is then looked up individually through ``_get_grid_id``.

    Args:
        df: DataFrame containing one latitude and one longitude column.
        grid_id_col_name: Name of the column to add.
        lat_col_name: Name of the latitude column in ``df``.
        lon_col_name: Name of the longitude column in ``df``.
        grid_file_path: Path of the grid file passed to ``gpd.read_file``.
            The file is expected to expose a ``grid_id`` attribute, which is
            what ``_get_grid_id`` reads.

    Returns:
        A copy of ``df`` with ``grid_id_col_name`` added; the value is None
        for rows that intersect no grid cell. ``df`` itself is not modified.

    NOTE(review): the lookup runs once per row via ``DataFrame.apply``; a
    vectorised spatial join may be considerably faster on large frames, but
    equivalence (first match wins) would need to be verified before switching.
    """
    # Read the grid and reproject it to lat/lon (WGS 84) in one go.
    cells = gpd.read_file(grid_file_path).to_crs("EPSG:4326")

    # Spatial index used to pre-filter candidate cells for each point.
    cell_index = cells.sindex

    # Work on a copy so the caller's frame is left untouched.
    result = df.copy()
    result[grid_id_col_name] = df.apply(
        lambda record: _get_grid_id(
            record, cell_index, lat_col_name, lon_col_name, cells),
        axis=1,
    )
    return result

def _get_grid_id(row, grid_index, lat_col_name, lon_col_name, grid_gdf):
    lat = row[lat_col_name]
    lon = row[lon_col_name]
    point = Point(lon, lat)
    possible_matches_index = list(grid_index.intersection(point.bounds))
    possible_matches = grid_gdf.iloc[possible_matches_index]
    intersecting_gdf = possible_matches[possible_matches.intersects(point)]
    if intersecting_gdf.empty:
        return None
    return intersecting_gdf.iloc[0]['grid_id']


In [42]:
def extract_number_from_filename(file_name):
    """Return the first run of digits found in ``file_name`` as a string.

    Args:
        file_name: File name to search, e.g. ``'square_grids1000.shp'``.

    Returns:
        The first contiguous digit sequence, e.g. ``'1000'``.

    Raises:
        ValueError: If ``file_name`` contains no digit.
    """
    match = re.search(r'\d+', file_name)
    if match is None:
        # Fail with an explicit message rather than an AttributeError on None.
        raise ValueError(f"No number found in file name: {file_name!r}")
    return match.group(0)

In [43]:
def get_grid_id_col_name(file_path):
    """Build the grid-ID column name from the number in a grid file's name.

    Args:
        file_path: Path of the grid file (string or path-like). Both ``/`` and
            ``\\`` are accepted as directory separators.

    Returns:
        ``'<number>_GRID_ID'``, where ``<number>`` is the first run of digits
        in the file's base name.
    """
    # os.path.basename() does not split on backslashes on POSIX systems, so a
    # Windows-style path would keep its directory part (and any digits in it).
    # Normalising the separators first isolates the file name on every platform.
    normalised_path = os.fspath(file_path).replace('\\', '/')
    file_name = os.path.basename(normalised_path)
    return extract_number_from_filename(file_name) + '_GRID_ID'

In [44]:
# Forward slashes work on every platform and avoid the invalid escape
# sequences ("\G", "\s") that a backslash-separated literal contains.
shapefile_path = 'Data/Generated_grids/square_grids1000.shp'
# Column name derived from the digits in the shapefile's name plus '_GRID_ID'.
grid_id_col_name = get_grid_id_col_name(shapefile_path)

In [45]:
# Tag every incident with the ID of the first grid cell its coordinates intersect.
df_incidents = add_grid_id_column(
    df=df_incidents,
    grid_id_col_name=grid_id_col_name,
    lat_col_name='LATITUDE',
    lon_col_name='LONGITUDE',
    grid_file_path=shapefile_path,
)


In [46]:
# Summary statistics for every column, with DATE summarised numerically.
# NOTE(review): `datetime_is_numeric` was removed in pandas 2.0 (datetimes are
# always treated as numeric there); drop the argument when running on
# pandas >= 2.0 — confirm the pinned pandas version first.
df_incidents.describe(include='all', datetime_is_numeric=True)


Unnamed: 0,INCIDENT_NBR,DATE,DESCRIPTION_GROUPE,CASERNE,DIVISION,NOMBRE_UNITES,LONGITUDE,LATITUDE,DAY,MONTH,YEAR,1000_GRID_ID
count,904140.0,904140,904002,904140.0,904140.0,903853.0,904140.0,904140.0,904140,904140,904140.0,903521
unique,,,7,,,,,,2955,98,9.0,568
top,,,1-REPOND,,,,,,2019-11-01,2018-01,2017.0,BSwXKT8NtyU=
freq,,,593925,,,,,,987,11734,122474.0,15349
mean,60237.405644,2018-11-23 19:15:34.593665280,,37.981816,5.037382,1.807277,-73.622829,45.525942,,,,
min,1.0,2015-01-01 00:03:22,,3.0,0.0,1.0,-73.983189,45.402657,,,,
25%,28982.0,2016-11-18 16:35:44.750000128,,20.0,3.0,1.0,-73.647114,45.486928,,,,
50%,59325.0,2018-09-26 12:21:09.500000,,37.0,5.0,1.0,-73.603653,45.52095,,,,
75%,89871.0,2020-12-17 09:09:07.750000128,,55.0,7.0,2.0,-73.568655,45.561843,,,,
max,500127.0,2023-02-02 23:56:25,,78.0,17.0,199.0,-73.479326,45.703664,,,,


### Handling missing values

In [48]:
# Drop every row that has a missing value in any column.
df_incidents = df_incidents.dropna()

In [49]:
df_incidents.shape

(903103, 12)

In [50]:
# Order incidents chronologically; the original row index is kept.
df_incidents = df_incidents.sort_values(['DATE'])

### Remove all rows associated with '2023-02'

In [54]:
# Keep every month except 2023-02. The recorded describe() output shows the
# latest incident on 2023-02-02, so that month appears to be only partially
# covered — presumably the reason for excluding it.
df_incidents = df_incidents[df_incidents['MONTH'] != '2023-02']

In [55]:
df_incidents.shape

(902471, 12)

In [56]:
df_incidents.head()

Unnamed: 0,INCIDENT_NBR,DATE,DESCRIPTION_GROUPE,CASERNE,DIVISION,NOMBRE_UNITES,LONGITUDE,LATITUDE,DAY,MONTH,YEAR,1000_GRID_ID
457987,1,2015-01-01 00:03:22,SANS FEU,26,11,1.0,-73.580599,45.535699,2015-01-01,2015-01,2015,u7VeXxK7i-k=
410791,2,2015-01-01 00:05:58,1-REPOND,25,10,2.0,-73.574265,45.494496,2015-01-01,2015-01,2015,ZxSg_e-wPKs=
684717,3,2015-01-01 00:08:34,Alarmes-incendies,64,7,3.0,-73.665783,45.443273,2015-01-01,2015-01,2015,fTHnnBAVd18=
205493,4,2015-01-01 00:11:28,1-REPOND,38,6,1.0,-73.489985,45.641295,2015-01-01,2015-01,2015,xuTgwZGOLGk=
69696,5,2015-01-01 00:14:06,SANS FEU,34,9,2.0,-73.61557,45.483623,2015-01-01,2015-01,2015,hiQKHukPWWY=


# Output file

In [57]:
# Persist the cleaned dataset.
# NOTE(review): the DataFrame index (original row positions) is written as an
# unnamed first column because `index=False` is not passed — confirm that
# downstream readers expect it before changing.
df_incidents.to_csv('Data/Processed_Datasets/Cleaned_datasets/df_incidents.csv')