# Toronto Criminal Data Record

In [1]:
import numpy as np
import pandas as pd
import geopandas as gpd
import seaborn as sns
import matplotlib.pyplot as plt
import folium
import branca.colormap as cm

%matplotlib inline
plt.style.use('ggplot')
sns.set_context("notebook")
import warnings
warnings.filterwarnings("ignore")



## 1. Read the file

In [2]:
# Read the CSV into a DataFrame
criminal_data_raw = pd.read_csv('Criminal Data and Disorderly Patrons Data\Assault_Open_Data.csv')
criminal_data_raw.head()

Unnamed: 0,X,Y,OBJECTID,EVENT_UNIQUE_ID,REPORT_DATE,OCC_DATE,REPORT_YEAR,REPORT_MONTH,REPORT_DAY,REPORT_DOY,...,UCR_CODE,UCR_EXT,OFFENCE,MCI_CATEGORY,HOOD_158,NEIGHBOURHOOD_158,HOOD_140,NEIGHBOURHOOD_140,LONG_WGS84,LAT_WGS84
0,-8837009.0,5414638.0,1,GO-20141265238,2014/01/01 05:00:00+00,2014/01/01 05:00:00+00,2014,January,1,1,...,1430,100,Assault,Assault,98,Rosedale-Moore Park,98,Rosedale-Moore Park (98),-79.384206,43.670798
1,-8832733.0,5419701.0,2,GO-20141259834,2014/01/01 05:00:00+00,2014/01/01 05:00:00+00,2014,January,1,1,...,1420,100,Assault With Weapon,Assault,55,Thorncliffe Park,55,Thorncliffe Park (55),-79.345795,43.703684
2,-8836444.0,5410819.0,3,GO-20141262027,2014/01/01 05:00:00+00,2014/01/01 05:00:00+00,2014,January,1,1,...,1430,100,Assault,Assault,166,St Lawrence-East Bayfront-The Islands,77,Waterfront Communities-The Island (77),-79.379131,43.645981
3,-8836897.0,5412101.0,4,GO-20141259951,2014/01/01 05:00:00+00,2014/01/01 05:00:00+00,2014,January,1,1,...,1460,100,Assault Peace Officer,Assault,170,Yonge-Bay Corridor,76,Bay Street Corridor (76),-79.3832,43.654313
4,-8851435.0,5422186.0,5,GO-20141261561,2014/01/01 05:00:00+00,2014/01/01 05:00:00+00,2014,January,1,1,...,1420,100,Assault With Weapon,Assault,154,Oakdale-Beverley Heights,26,Downsview-Roding-CFB (26),-79.513797,43.719824


In [3]:
#create a new data frame contains the useful columns below
#OCC_DATE
#LOCATION_TYPE
#PREMISES_TYPE
#OFFENCE
#NEIGHBOURHOOD_140
#LONG_WGS84
#LAT_WGS84


criminal_data_sorted = criminal_data_raw[['OCC_DATE','LOCATION_TYPE','PREMISES_TYPE',
                                          'OFFENCE','NEIGHBOURHOOD_140','LONG_WGS84','LAT_WGS84']]

criminal_data_sorted.head()

Unnamed: 0,OCC_DATE,LOCATION_TYPE,PREMISES_TYPE,OFFENCE,NEIGHBOURHOOD_140,LONG_WGS84,LAT_WGS84
0,2014/01/01 05:00:00+00,Ttc Subway Station,Transit,Assault,Rosedale-Moore Park (98),-79.384206,43.670798
1,2014/01/01 05:00:00+00,Bar / Restaurant,Commercial,Assault With Weapon,Thorncliffe Park (55),-79.345795,43.703684
2,2014/01/01 05:00:00+00,"Streets, Roads, Highways (Bicycle Path, Privat...",Outside,Assault,Waterfront Communities-The Island (77),-79.379131,43.645981
3,2014/01/01 05:00:00+00,"Streets, Roads, Highways (Bicycle Path, Privat...",Outside,Assault Peace Officer,Bay Street Corridor (76),-79.3832,43.654313
4,2014/01/01 05:00:00+00,"Commercial Dwelling Unit (Hotel, Motel, B & B,...",Commercial,Assault With Weapon,Downsview-Roding-CFB (26),-79.513797,43.719824


In [4]:
# Filter out data related to the subway stations
criminal_data_subway = criminal_data_sorted[criminal_data_sorted['LOCATION_TYPE'].str.contains('Subway', case=False, na=False)]
criminal_data_subway.head()

Unnamed: 0,OCC_DATE,LOCATION_TYPE,PREMISES_TYPE,OFFENCE,NEIGHBOURHOOD_140,LONG_WGS84,LAT_WGS84
0,2014/01/01 05:00:00+00,Ttc Subway Station,Transit,Assault,Rosedale-Moore Park (98),-79.384206,43.670798
17,2013/12/31 05:00:00+00,Ttc Subway Train,Transit,Assault,Bay Street Corridor (76),-79.386625,43.650847
43,2014/01/01 05:00:00+00,Ttc Subway Station,Transit,Assault,Annex (95),-79.411785,43.666614
170,2014/01/05 05:00:00+00,Ttc Subway Station,Transit,Assault Peace Officer,Church-Yonge Corridor (75),-79.380923,43.656323
203,2014/01/05 05:00:00+00,Ttc Subway Station,Transit,Assault Peace Officer,Church-Yonge Corridor (75),-79.380923,43.656323


In [5]:
#convert the occ_date column to datetime
criminal_data_subway['OCC_DATE'] = pd.to_datetime(criminal_data_subway['OCC_DATE'])

In [6]:
# make sure the occ date range match with the delay time
start_date = '2014/01/01 05:00:00+00'
end_date = '2023/09/30 05:00:00+00'

# Convert the start and end dates to datetime
start_date = pd.to_datetime(start_date)
end_date = pd.to_datetime(end_date)

# Filter out the date between the start date and the end date
criminal_data_subway = criminal_data_subway[(criminal_data_subway['OCC_DATE'] >= start_date) & (criminal_data_subway['OCC_DATE'] <= end_date)]
# Set the OOC_DATE as the index
criminal_data_subway = criminal_data_subway.set_index('OCC_DATE')
criminal_data_subway.head()


Unnamed: 0_level_0,LOCATION_TYPE,PREMISES_TYPE,OFFENCE,NEIGHBOURHOOD_140,LONG_WGS84,LAT_WGS84
OCC_DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2014-01-01 05:00:00+00:00,Ttc Subway Station,Transit,Assault,Rosedale-Moore Park (98),-79.384206,43.670798
2014-01-01 05:00:00+00:00,Ttc Subway Station,Transit,Assault,Annex (95),-79.411785,43.666614
2014-01-05 05:00:00+00:00,Ttc Subway Station,Transit,Assault Peace Officer,Church-Yonge Corridor (75),-79.380923,43.656323
2014-01-05 05:00:00+00:00,Ttc Subway Station,Transit,Assault Peace Officer,Church-Yonge Corridor (75),-79.380923,43.656323
2014-01-05 05:00:00+00:00,Ttc Subway Station,Transit,Assault,Dovercourt-Wallace Emerson-Junction (93),-79.435362,43.659878


In [7]:
# Filter out the location at TTC Subway Trains
filtered_df = criminal_data_subway[criminal_data_subway['LOCATION_TYPE'] != 'Ttc Subway Train']
filtered_df["LOCATION_TYPE"].unique()

array(['Ttc Subway Station', 'Ttc Subway Tunnel / Outdoor Tracks'],
      dtype=object)

In [8]:
# Group the data by the neighbourhood
first_occurrences = filtered_df.groupby('NEIGHBOURHOOD_140').first()[['LONG_WGS84', 'LAT_WGS84']]

for neighborhood in first_occurrences.index:
    longitude, latitude = first_occurrences.loc[neighborhood]
    filtered_df.loc[filtered_df['NEIGHBOURHOOD_140'] == neighborhood, 'LONG_WGS84'] = longitude
    filtered_df.loc[filtered_df['NEIGHBOURHOOD_140'] == neighborhood, 'LAT_WGS84'] = latitude

filtered_df.head(10)

Unnamed: 0_level_0,LOCATION_TYPE,PREMISES_TYPE,OFFENCE,NEIGHBOURHOOD_140,LONG_WGS84,LAT_WGS84
OCC_DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2014-01-01 05:00:00+00:00,Ttc Subway Station,Transit,Assault,Rosedale-Moore Park (98),-79.384206,43.670798
2014-01-01 05:00:00+00:00,Ttc Subway Station,Transit,Assault,Annex (95),-79.411785,43.666614
2014-01-05 05:00:00+00:00,Ttc Subway Station,Transit,Assault Peace Officer,Church-Yonge Corridor (75),-79.380923,43.656323
2014-01-05 05:00:00+00:00,Ttc Subway Station,Transit,Assault Peace Officer,Church-Yonge Corridor (75),-79.380923,43.656323
2014-01-05 05:00:00+00:00,Ttc Subway Station,Transit,Assault,Dovercourt-Wallace Emerson-Junction (93),-79.435362,43.659878
2014-01-07 05:00:00+00:00,Ttc Subway Station,Transit,Assault With Weapon,Newtonbrook West (36),-79.415998,43.781631
2014-01-07 05:00:00+00:00,Ttc Subway Station,Transit,Assault With Weapon,Newtonbrook West (36),-79.415998,43.781631
2014-01-07 05:00:00+00:00,Ttc Subway Station,Transit,Assault With Weapon,Newtonbrook West (36),-79.415998,43.781631
2014-01-07 05:00:00+00:00,Ttc Subway Station,Transit,Assault With Weapon,Newtonbrook West (36),-79.415998,43.781631
2014-01-07 05:00:00+00:00,Ttc Subway Station,Transit,Assault With Weapon,Newtonbrook West (36),-79.415998,43.781631


In [9]:
# Count how many occurance at each neightbourhood
counts = filtered_df['NEIGHBOURHOOD_140'].value_counts()

# Converting the Series to a DataFrame
station_counts_df = counts.reset_index()
station_counts_df.columns = ['Station', 'Count']

# This will print the count of each unique value in the NEIGHBOURHOOD_140 column
station_counts_df.head()

Unnamed: 0,Station,Count
0,Church-Yonge Corridor (75),531
1,Annex (95),432
2,Rosedale-Moore Park (98),276
3,Kennedy Park (124),205
4,Danforth (66),167


In [10]:
# Keep neighbourhood, longitude and latitude of the original dataframe
subset_df1 = filtered_df[['NEIGHBOURHOOD_140', 'LONG_WGS84', 'LAT_WGS84']]

# Rename the 'Station' column in df2 to 'NEIGHBOURHOOD_140' to match df1
station_counts_df.rename(columns={'Station': 'NEIGHBOURHOOD_140'}, inplace=True)

# Merge the DataFrames on the NEIGHBOURHOOD_140 column
merged_df = pd.merge(station_counts_df, subset_df1, on='NEIGHBOURHOOD_140', how='left')
merged_df = merged_df.drop_duplicates(subset='NEIGHBOURHOOD_140')
# Select the top 20 rows
merged_df = merged_df.head(20)

merged_df.head(15)

Unnamed: 0,NEIGHBOURHOOD_140,Count,LONG_WGS84,LAT_WGS84
0,Church-Yonge Corridor (75),531,-79.380923,43.656323
531,Annex (95),432,-79.411785,43.666614
963,Rosedale-Moore Park (98),276,-79.384206,43.670798
1239,Kennedy Park (124),205,-79.264551,43.732765
1444,Danforth (66),167,-79.345152,43.679673
1611,Yonge-Eglinton (100),166,-79.397849,43.704345
1777,Newtonbrook West (36),157,-79.415998,43.781631
1934,Dovercourt-Wallace Emerson-Junction (93),153,-79.435362,43.659878
2087,Waterfront Communities-The Island (77),142,-79.379131,43.645981
2229,High Park North (88),139,-79.460028,43.65534


## 2. Plot the map

In [11]:
# Set up the map
m = folium.Map(location=[43.656323, -79.380923], zoom_start=12, tiles='CartoDB positron', width='80%', height='100%')

# Create a color scale
color_scale = cm.linear.YlOrRd_09.scale(min(merged_df['Count']), max(merged_df['Count']))

# Add points
for _, row in merged_df.iterrows():
    # Scale the radius based on the Count value
    radius = row['Count'] / max(merged_df['Count']) * 10
    folium.CircleMarker(
        location=[row['LAT_WGS84'], row['LONG_WGS84']],
        radius=radius,
        color=color_scale(row['Count']),
        fill=True,
        fill_color=color_scale(row['Count']),
        fill_opacity=0.5
    ).add_to(m)

# Add color scale to map
color_scale.add_to(m)

# Display the map
m


## 3. Find the delay with Disorderly Patron

In [12]:
# Load a shapefile
gdf = gpd.read_file('Criminal Data and Disorderly Patrons Data\subway_stations.shp')

# Display the first few rows of the GeoDataFrame
gdf.head()

Unnamed: 0,STATION,LINE,PLATFORM_L,AVG_PASSEN,LINE2,PLATFORM_1,SUBWAY_TRA,ADDRESS,Opened,geometry
0,Kipling,Bloor-Danforth,1,53640,,,False,5247 Dundas St. West,1980,POINT (618101.613 4832636.300)
1,Islington,Bloor-Danforth,1,43090,,,False,3286 Bloor St. West,1968,POINT (618990.613 4833544.113)
2,Royal York,Bloor-Danforth,2,19440,,,False,3012 Bloor St. West,1968,POINT (620056.496 4833882.764)
3,Old Mill,Bloor-Danforth,2,5780,,,False,2672 Bloor St. West,1968,POINT (621361.678 4834111.901)
4,Jane,Bloor-Danforth,2,16730,,,False,2440 Bloor St. West,1968,POINT (622220.664 4834091.381)


In [13]:
# create a new dataframe with only station and geometry
selected_df = gdf[['STATION', 'geometry']]
selected_df.head()

Unnamed: 0,STATION,geometry
0,Kipling,POINT (618101.613 4832636.300)
1,Islington,POINT (618990.613 4833544.113)
2,Royal York,POINT (620056.496 4833882.764)
3,Old Mill,POINT (621361.678 4834111.901)
4,Jane,POINT (622220.664 4834091.381)


In [14]:
# Convert the station name to all upper cases
selected_df['STATION'] = selected_df['STATION'].str.upper()
selected_df.head()

Unnamed: 0,STATION,geometry
0,KIPLING,POINT (618101.613 4832636.300)
1,ISLINGTON,POINT (618990.613 4833544.113)
2,ROYAL YORK,POINT (620056.496 4833882.764)
3,OLD MILL,POINT (621361.678 4834111.901)
4,JANE,POINT (622220.664 4834091.381)


In [15]:
# Save the dataframe to a csv file
selected_df.to_csv('geometry.csv', index=False)

In [16]:
# Read the CSV file from the TTC Subway data into a DataFrame
delay = pd.read_csv('Output\TTC_Real_Delay_2014_to_2023.csv')
delay.head()

Unnamed: 0,Datetime,Day,Station,Code,Min Delay,Min Gap,Bound,Line,Vehicle,Code description,time_interval
0,2018-01-01 00:29:00,Monday,SHEPPARD WEST STATION,MUATC,10,15,N,YUS,5986,ATC Project,00:00 - 01:00
1,2018-01-01 01:39:00,Monday,MUSEUM STATION,MUO,6,11,S,YUS,5781,Miscellaneous Other,01:00 - 02:00
2,2018-01-01 02:09:00,Monday,KIPLING STATION,MUSAN,3,7,E,BD,5261,Unsanitary Vehicle,02:00 - 03:00
3,2018-01-01 02:42:00,Monday,COLLEGE STATION,SUDP,7,12,N,YUS,5696,Disorderly Patron,02:00 - 03:00
4,2018-01-01 03:06:00,Monday,WARDEN STATION,MUI,3,7,E,BD,5257,Injured or ill Customer (On Train) - Transported,03:00 - 04:00


In [17]:
# Filter out the disorderly patron cases
filtered_code = delay[delay['Code'] == 'SUDP']
filtered_code.head()

Unnamed: 0,Datetime,Day,Station,Code,Min Delay,Min Gap,Bound,Line,Vehicle,Code description,time_interval
3,2018-01-01 02:42:00,Monday,COLLEGE STATION,SUDP,7,12,N,YUS,5696,Disorderly Patron,02:00 - 03:00
15,2018-01-01 15:36:00,Monday,SHERBOURNE STATION,SUDP,5,10,E,BD,5269,Disorderly Patron,15:00 - 16:00
39,2018-01-02 09:18:00,Tuesday,FINCH WEST STATION,SUDP,3,7,S,YUS,6096,Disorderly Patron,09:00 - 10:00
41,2018-01-02 11:28:00,Tuesday,OLD MILL STATION,SUDP,13,16,E,BD,5358,Disorderly Patron,11:00 - 12:00
66,2018-01-03 12:56:00,Wednesday,ST GEORGE BD STATION,SUDP,7,10,E,BD,5265,Disorderly Patron,12:00 - 13:00


In [18]:
# Count the number of cases at each subway station
delay_stn = filtered_code['Station'].value_counts()

# Converting the Series to a DataFrame
delay_stn = delay_stn.reset_index()
delay_stn.columns = ['Station', 'Count']

delay_stn.head()

Unnamed: 0,Station,Count
0,BLOOR STATION,370
1,EGLINTON STATION,229
2,KENNEDY BD STATION,187
3,YONGE BD STATION,176
4,COXWELL STATION,164


In [19]:
# Delete " STATION" in the delay_stn data to match the index with the criminal data dataframe
delay_stn['Station'] = delay_stn['Station'].str.replace(" STATION", "", regex=False)

In [20]:
# Merge stations with corresponding station names
merged_df_delay = pd.merge(delay_stn, selected_df, left_on='Station', right_on='STATION', how='inner')
merged_df_delay.head()

Unnamed: 0,Station,Count,STATION,geometry
0,EGLINTON,229,EGLINTON,POINT (629021.977 4840533.356)
1,COXWELL,164,COXWELL,POINT (635157.330 4838173.966)
2,KIPLING,156,KIPLING,POINT (618101.613 4832636.300)
3,DAVISVILLE,150,DAVISVILLE,POINT (629140.910 4839583.709)
4,FINCH,139,FINCH,POINT (627505.402 4848639.783)


In [21]:
# Filter out geometry and count from the dataframe above
delay_df = merged_df_delay[['geometry','Count']]
delay_df.head()

Unnamed: 0,geometry,Count
0,POINT (629021.977 4840533.356),229
1,POINT (635157.330 4838173.966),164
2,POINT (618101.613 4832636.300),156
3,POINT (629140.910 4839583.709),150
4,POINT (627505.402 4848639.783),139


In [22]:
delay_df['geometry'].head()

0    POINT (629021.977 4840533.356)
1    POINT (635157.330 4838173.966)
2    POINT (618101.613 4832636.300)
3    POINT (629140.910 4839583.709)
4    POINT (627505.402 4848639.783)
Name: geometry, dtype: geometry

In [23]:
import pandas as pd
from pyproj import Proj, Transformer
from shapely.geometry import Point

# Toronto is at UTM zone 17 North
utm_zone = 17
is_south = False

# Create a Transformer object for UTM to WGS84 coordinate transformation
transformer = Transformer.from_proj(
    Proj(proj='utm', zone=utm_zone, ellps='WGS84', south=is_south),
    Proj(proj='latlong', datum='WGS84')
)

# Function to convert UTM to Lat/Lon
def utm_to_latlon(point):
    if point and isinstance(point, Point):
        lon, lat = transformer.transform(point.x, point.y)
        return lat, lon
    else:
        return None, None

# Apply the function to the 'geometry' column
delay_df[['Latitude', 'Longitude']] = delay_df['geometry'].apply(utm_to_latlon).apply(pd.Series)

# Display the DataFrame
print(delay_df)

                          geometry  Count   Latitude  Longitude
0   POINT (629021.977 4840533.356)    229  43.706609 -79.398608
1   POINT (635157.330 4838173.966)    164  43.684282 -79.323080
2   POINT (618101.613 4832636.300)    156  43.637345 -79.535833
3   POINT (629140.910 4839583.709)    150  43.698041 -79.397360
4   POINT (627505.402 4848639.783)    139  43.779832 -79.415500
5   POINT (628695.620 4842566.191)    136  43.724962 -79.402171
6   POINT (628695.620 4842566.191)      1  43.724962 -79.402171
7   POINT (632306.357 4837276.516)    134  43.676718 -79.358659
8   POINT (630307.607 4835962.279)    133  43.665242 -79.383762
9   POINT (630370.915 4835534.587)    132  43.661382 -79.383081
10  POINT (630555.233 4834978.387)    126  43.656343 -79.380930
11  POINT (627716.063 4838010.034)    126  43.684123 -79.415408
12  POINT (631521.278 4836950.424)    112  43.673922 -79.368474
13  POINT (630698.023 4834540.929)    110  43.652381 -79.379266
14  POINT (638642.813 4841232.462)    10

In [24]:
# Create a base map
y = folium.Map(location=[43.656323, -79.380923], zoom_start=12, tiles='CartoDB positron')

# Create a color scale
min_count = delay_df['Count'].min()
max_count = delay_df['Count'].max()
color_scale = cm.linear.Blues_09.scale(min_count, max_count)

# Add points
for _, row in delay_df.iterrows():
    # Scale the radius based on the Count value
    radius = row['Count'] / max_count * 20  # Adjust the multiplier as needed

    folium.CircleMarker(
        location=[row['Latitude'], row['Longitude']],
        radius=radius,
        color=color_scale(row['Count']),
        fill=True,
        fill_color=color_scale(row['Count']),
        fill_opacity=0.7
    ).add_to(y)

# Add color scale to map
color_scale.add_to(y)

# Display the map
y

In [25]:
# Create a base map
overlay_map = folium.Map(location=[43.656323, -79.380923], zoom_start=12, tiles='CartoDB positron', width='80%', height='100%')

# Create a color scale for the first map
min_count = delay_df['Count'].min()
max_count = delay_df['Count'].max()
color_scale_1 = cm.linear.Blues_09.scale(min_count, max_count)

# Add points for the first map
for _, row in delay_df.iterrows():
    # Scale the radius based on the Count value
    radius = row['Count'] / max_count * 10

    folium.CircleMarker(
        location=[row['Latitude'], row['Longitude']],
        radius=radius,
        color=color_scale_1(row['Count']),
        fill=True,
        fill_color=color_scale_1(row['Count']),
        fill_opacity=0.7
    ).add_to(overlay_map)

# Create a color scale for the second map
color_scale_2 = cm.linear.YlOrRd_09.scale(min(merged_df['Count']), max(merged_df['Count']))

# Add points for the second map
for _, row in merged_df.iterrows():
    # Scale the radius based on the Count value
    radius = row['Count'] / max(merged_df['Count']) * 10
    folium.CircleMarker(
        location=[row['LAT_WGS84'], row['LONG_WGS84']],
        radius=radius,
        color=color_scale_2(row['Count']),
        fill=True,
        fill_color=color_scale_2(row['Count']),
        fill_opacity=0.5
    ).add_to(overlay_map)

# Display the overlay map
overlay_map
