# Imports

In [2]:
import pandas as pd
from datetime import datetime, timedelta

In [3]:
import time
import os
import base64
import hashlib
import geopandas as gpd
import requests
import re
from dotenv import load_dotenv
from shapely.geometry import Polygon, MultiPolygon, GeometryCollection, Point



import os
os.environ['USE_PYGEOS'] = '0'
import geopandas

In a future release, GeoPandas will switch to using Shapely by default. If you are using PyGEOS directly (calling PyGEOS functions on geometries from GeoPandas), this will then stop working and you are encouraged to migrate from PyGEOS to Shapely 2.0 (https://shapely.readthedocs.io/en/latest/migration_pygeos.html).
  import geopandas as gpd


In [4]:
import secrets

In [5]:
import numpy as np                           

# Data Cleaning

In [6]:
# Load the criminal-acts CSV into a DataFrame.
df_crimes = pd.read_csv(filepath_or_buffer='Data/Crimes/actes-criminels.csv')

In [7]:
# List the column names of the freshly loaded frame.
df_crimes.columns

Index(['CATEGORIE', 'DATE', 'QUART', 'PDQ', 'X', 'Y', 'LONGITUDE', 'LATITUDE'], dtype='object')

In [8]:
# Preview the first rows of the loaded frame.
df_crimes.head()

Unnamed: 0,CATEGORIE,DATE,QUART,PDQ,X,Y,LONGITUDE,LATITUDE
0,Vol de véhicule à moteur,2018-09-13,jour,30.0,294904.159001,5047549.0,-73.626778,45.56778
1,Vol de véhicule à moteur,2018-04-30,jour,30.0,294904.159001,5047549.0,-73.626778,45.56778
2,Vol de véhicule à moteur,2018-09-01,nuit,7.0,290274.565,5042150.0,-73.685928,45.519122
3,Méfait,2017-07-21,jour,21.0,,,,
4,Méfait,2017-07-29,jour,12.0,,,,


In [9]:
# Show the dtype pandas inferred for each column.
df_crimes.dtypes


CATEGORIE     object
DATE          object
QUART         object
PDQ          float64
X            float64
Y            float64
LONGITUDE    float64
LATITUDE     float64
dtype: object

In [10]:
# (rows, columns) of the loaded frame.
df_crimes.shape

(244412, 8)

### Add and modify temporal columns

In [11]:
# Parse the DATE strings into datetimes so the .dt accessor can be used on them.
df_crimes['DATE'] = pd.to_datetime(arg=df_crimes['DATE'])


In [12]:
# Derive string-formatted day / month / year columns from DATE in one step.
df_crimes = df_crimes.assign(
    DAY=lambda frame: frame['DATE'].dt.strftime('%Y-%m-%d'),
    MONTH=lambda frame: frame['DATE'].dt.strftime('%Y-%m'),
    YEAR=lambda frame: frame['DATE'].dt.strftime('%Y'),
)

### Drop columns

In [13]:
# Remove the PDQ, X and Y columns.
df_crimes = df_crimes.drop(columns=['PDQ', 'X', 'Y'])

In [14]:
# Re-check the dtypes after the date conversion and the column drop.
df_crimes.dtypes


CATEGORIE            object
DATE         datetime64[ns]
QUART                object
LONGITUDE           float64
LATITUDE            float64
DAY                  object
MONTH                object
YEAR                 object
dtype: object

In [15]:
# Summary statistics for every column (include='all' covers non-numeric ones too).
# With datetime_is_numeric=True the DATE column is summarised with
# mean/min/quantiles/max, as the rendered output shows.
# NOTE(review): `datetime_is_numeric` was removed in pandas 2.0 — confirm the
# pinned pandas version before re-running this cell.
df_crimes.describe(include='all', datetime_is_numeric=True)

Unnamed: 0,CATEGORIE,DATE,QUART,LONGITUDE,LATITUDE,DAY,MONTH,YEAR
count,244412,244412,244412,202967.0,202967.0,244412,244412,244412.0
unique,6,,3,,,2955,98,9.0
top,Vol dans / sur véhicule à moteur,,jour,,,2015-12-07,2015-10,2015.0
freq,72375,,124548,,,153,3316,35609.0
mean,,2018-11-21 14:35:14.763565312,,-73.615263,45.528878,,,
min,,2015-01-01 00:00:00,,-73.968954,45.402691,,,
25%,,2016-10-06 00:00:00,,-73.63955,45.491678,,,
50%,,2018-09-17 00:00:00,,-73.595557,45.525735,,,
75%,,2021-01-05 00:00:00,,-73.565752,45.564528,,,
max,,2023-02-02 00:00:00,,-73.479583,45.702351,,,


### Add grid IDs for each incident

In [16]:
def add_grid_id_column(df, grid_id_col_name, lat_col_name, lon_col_name, grid_file_path):
    """Return a copy of ``df`` with a column holding each row's grid-cell ID.

    Parameters
    ----------
    df : DataFrame with one latitude and one longitude column.
    grid_id_col_name : name of the column to create.
    lat_col_name, lon_col_name : names of the coordinate columns in ``df``.
    grid_file_path : path of the grid file, read with ``gpd.read_file``.

    Returns
    -------
    A copy of ``df`` with ``grid_id_col_name`` added; ``df`` itself is left
    untouched. Rows for which no cell is found get ``None``.
    """
    # Read the grid and reproject it to EPSG:4326 so it is expressed in the
    # same lon/lat coordinates as the points built from ``df``.
    cells = gpd.read_file(grid_file_path).to_crs("EPSG:4326")

    # Build the spatial index once; every row lookup reuses it.
    cell_index = cells.sindex

    # Resolve the grid ID row by row, then attach the result to a copy.
    result = df.copy()
    result[grid_id_col_name] = df.apply(
        lambda record: _get_grid_id(
            record, cell_index, lat_col_name, lon_col_name, cells),
        axis=1,
    )
    return result

def _get_grid_id(row, grid_index, lat_col_name, lon_col_name, grid_gdf):
    lat = row[lat_col_name]
    lon = row[lon_col_name]
    point = Point(lon, lat)
    possible_matches_index = list(grid_index.intersection(point.bounds))
    possible_matches = grid_gdf.iloc[possible_matches_index]
    intersecting_gdf = possible_matches[possible_matches.intersects(point)]
    if intersecting_gdf.empty:
        return None
    return intersecting_gdf.iloc[0]['grid_id']


In [17]:
def extract_number_from_filename(file_name):
    """Return the first run of digits found in ``file_name`` as a string.

    Raises
    ------
    ValueError
        If ``file_name`` contains no digit at all.
    """
    match = re.search(r'\d+', file_name)
    if match is None:
        # Fail with an explicit message instead of an AttributeError on None.
        raise ValueError(f"No number found in file name: {file_name!r}")
    # group(0) is already a str, so no conversion is needed.
    return match.group(0)

In [18]:
def get_grid_id_col_name(file_path):
    """Build the grid-ID column name ``'<number>_GRID_ID'`` from a grid file path.

    ``<number>`` is the first run of digits in the file's base name, as
    returned by ``extract_number_from_filename``.
    """
    # ntpath.basename splits on both '/' and '\\', so a Windows-style path is
    # reduced to its file name on POSIX too. os.path.basename would keep the
    # directories there, and a digit in a directory name would be picked up.
    import ntpath

    file_name = ntpath.basename(file_path)
    return extract_number_from_filename(file_name) + '_GRID_ID'

In [19]:
# Grid shapefile used to bin the incidents. Forward slashes resolve on every
# OS and avoid the invalid '\G' / '\s' escape sequences of the backslash form.
shapefile_path = 'Data/Generated_grids/square_grids1000.shp'
# Column name derived from the number embedded in the shapefile's name.
grid_id_col_name = get_grid_id_col_name(shapefile_path)

In [20]:
# Attach the grid-cell ID of every incident, located through its coordinates.
df_crimes = add_grid_id_column(
    df=df_crimes,
    grid_id_col_name=grid_id_col_name,
    lat_col_name='LATITUDE',
    lon_col_name='LONGITUDE',
    grid_file_path=shapefile_path,
)

In [21]:
# Re-run the summary to inspect the newly added grid-ID column.
# NOTE(review): `datetime_is_numeric` was removed in pandas 2.0 — confirm the
# pinned pandas version before re-running this cell.
df_crimes.describe(include='all', datetime_is_numeric=True)

Unnamed: 0,CATEGORIE,DATE,QUART,LONGITUDE,LATITUDE,DAY,MONTH,YEAR,1000_GRID_ID
count,244412,244412,244412,202967.0,202967.0,244412,244412,244412.0,202965
unique,6,,3,,,2955,98,9.0,540
top,Vol dans / sur véhicule à moteur,,jour,,,2015-12-07,2015-10,2015.0,BSwXKT8NtyU=
freq,72375,,124548,,,153,3316,35609.0,4427
mean,,2018-11-21 14:35:14.763565312,,-73.615263,45.528878,,,,
min,,2015-01-01 00:00:00,,-73.968954,45.402691,,,,
25%,,2016-10-06 00:00:00,,-73.63955,45.491678,,,,
50%,,2018-09-17 00:00:00,,-73.595557,45.525735,,,,
75%,,2021-01-05 00:00:00,,-73.565752,45.564528,,,,
max,,2023-02-02 00:00:00,,-73.479583,45.702351,,,,


### Handling missing values

In [22]:
# Discard every row that has a missing value in any column.
df_crimes = df_crimes.dropna(axis=0, how='any')

In [23]:
# (rows, columns) remaining after dropping rows with missing values.
df_crimes.shape

(202965, 9)

In [24]:
# Order the incidents chronologically by their 'YYYY-MM' month string.
df_crimes = df_crimes.sort_values(by='MONTH')

In [25]:
# Preview the frame after sorting by MONTH.
df_crimes.head()

Unnamed: 0,CATEGORIE,DATE,QUART,LONGITUDE,LATITUDE,DAY,MONTH,YEAR,1000_GRID_ID
13718,Introduction,2015-01-31,jour,-73.54593,45.592243,2015-01-31,2015-01,2015,16jKQF9hsj8=
17107,Méfait,2015-01-10,jour,-73.557136,45.513991,2015-01-10,2015-01,2015,f0Nq3PF9MoE=
27190,Méfait,2015-01-21,soir,-73.634641,45.48254,2015-01-21,2015-01,2015,drxXwx5Ulrk=
1975,Vol de véhicule à moteur,2015-01-14,jour,-73.582266,45.620038,2015-01-14,2015-01,2015,BDLTCeQ2O6w=
74962,Vol dans / sur véhicule à moteur,2015-01-16,soir,-73.560261,45.507695,2015-01-16,2015-01,2015,hoz0d7bgrqw=


### Remove all rows associated with '2023-02' (partial month: the data ends on 2023-02-02)

In [26]:
# Keep every row whose MONTH is not '2023-02'.
df_crimes = df_crimes.loc[df_crimes['MONTH'].ne('2023-02')]

### Add identifier for each row

In [27]:
# Add a unique identifier to each row.
# A 4-byte token has only 2**32 possible values, so drawing one independently
# per row (~200k rows) is expected to produce a few duplicates. Keep drawing
# until there is one distinct token per row; the 8-hex-character format stays.
unique_crime_ids = set()
while len(unique_crime_ids) < len(df_crimes):
    unique_crime_ids.add(secrets.token_hex(4))
df_crimes['CRIMES_ID'] = list(unique_crime_ids)

In [28]:
# Check the final set of columns before export.
df_crimes.columns

Index(['CATEGORIE', 'DATE', 'QUART', 'LONGITUDE', 'LATITUDE', 'DAY', 'MONTH',
       'YEAR', '1000_GRID_ID', 'CRIMES_ID'],
      dtype='object')

# Output files

In [29]:
# Persist the cleaned dataset as CSV.
df_crimes.to_csv(path_or_buf='Data/Processed_Datasets/Cleaned_datasets/df_crimes.csv')