<a href="https://colab.research.google.com/github/caiogasparine/3253-Machine-Learning/blob/main/Toronto_Major_Crime_Indicators(MCI)_GroupXX.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Source of Data: Major Crime Indicators (MCI) Historical**

This dataset includes all Major Crime Indicators (MCI) occurrences by reported date and related offences from 2014 to June 30, 2022.

https://data.torontopolice.on.ca/pages/major-crime-indicators


# 1 - Import Libraries & Ingest Data

In [28]:
import numpy as np
import pandas as pd
import statsmodels.formula.api as sm
from pandas.plotting import scatter_matrix
from google.colab import drive
# Mount Google Drive at /content/drive so files under MyDrive can be read by path.
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [29]:
# Load the Major Crime Indicators CSV into a DataFrame; header=0 takes the
# column names from the first row of the file.
# The copy on Google Drive is tried first (Drive must be mounted at /content/drive).
# If that file is not present, fall back to a CSV of the same name in the
# current working directory instead of requiring a manual edit of this cell.
DRIVE_CSV_PATH = '/content/drive/MyDrive/Colab Notebooks/Major_Crime_Indicators.csv'
LOCAL_CSV_PATH = 'Major_Crime_Indicators.csv'

try:
    data = pd.read_csv(DRIVE_CSV_PATH, header=0)
except FileNotFoundError:
    data = pd.read_csv(LOCAL_CSV_PATH, header=0)

# 2 - Data Cleaning & Transformation

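Data profiling was done to check data quality and integrity. The contents of each field were analyzed for anomalies and null values. The only null values found are in the occurrence date-part columns (`occurrenceyear`, `occurrencemonth`, `occurrenceday`, `occurrencedayofyear`, `occurrencedayofweek`), as the missing-value check below shows.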

In [30]:
# Dimensions of the loaded frame as (rows, columns).
data.shape

(301233, 30)

In [31]:
# Column labels available in the loaded dataset.
data.columns

Index(['X', 'Y', 'Index_', 'event_unique_id', 'Division', 'occurrencedate',
       'reporteddate', 'location_type', 'premises_type', 'ucr_code', 'ucr_ext',
       'offence', 'reportedyear', 'reportedmonth', 'reportedday',
       'reporteddayofyear', 'reporteddayofweek', 'reportedhour',
       'occurrenceyear', 'occurrencemonth', 'occurrenceday',
       'occurrencedayofyear', 'occurrencedayofweek', 'occurrencehour',
       'mci_category', 'Hood_ID', 'Neighbourhood', 'Longitude', 'Latitude',
       'ObjectId'],
      dtype='object')

In [32]:
# Preview the first five records (five is the default row count for head()).
data.head()

Unnamed: 0,X,Y,Index_,event_unique_id,Division,occurrencedate,reporteddate,location_type,premises_type,ucr_code,...,occurrenceday,occurrencedayofyear,occurrencedayofweek,occurrencehour,mci_category,Hood_ID,Neighbourhood,Longitude,Latitude,ObjectId
0,-8850398.0,5429359.0,201,GO-20141273318,D31,2014/01/03 05:00:00+00,2014/01/03 05:00:00+00,"Apartment (Rooming House, Condo)",Apartment,1430,...,3.0,3.0,Friday,11,Assault,27,York University Heights,-79.504475,43.766371,1
1,-8818405.0,5436836.0,202,GO-20141274349,D42,2014/01/03 05:00:00+00,2014/01/03 05:00:00+00,"Single Home, House (Attach Garage, Cottage, Mo...",House,2120,...,3.0,3.0,Friday,14,Break and Enter,132,Malvern,-79.217083,43.814861,2
2,-8853791.0,5402071.0,203,GO-20141274052,D22,2014/01/03 05:00:00+00,2014/01/03 05:00:00+00,"Open Areas (Lakes, Parks, Rivers)",Outside,1430,...,3.0,3.0,Friday,13,Assault,19,Long Branch,-79.534957,43.589085,3
3,-8832822.0,5419628.0,204,GO-20141276966,D53,2014/01/03 05:00:00+00,2014/01/03 05:00:00+00,Other Commercial / Corporate Places (For Profi...,Commercial,2130,...,3.0,3.0,Friday,12,Theft Over,55,Thorncliffe Park,-79.346588,43.703212,4
4,-8853452.0,5410706.0,205,GO-20141274457,D22,2014/01/03 05:00:00+00,2014/01/03 05:00:00+00,Convenience Stores,Commercial,1610,...,3.0,3.0,Friday,14,Robbery,14,Islington-City Centre West,-79.531916,43.645247,5


In [33]:
# Number of records in each MCI category, most frequent first.
data['mci_category'].value_counts()

Assault            161833
Break and Enter     59440
Auto Theft          40794
Robbery             29230
Theft Over           9936
Name: mci_category, dtype: int64

In [34]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
data.describe()

Unnamed: 0,X,Y,Index_,ucr_code,ucr_ext,reportedyear,reportedday,reporteddayofyear,reportedhour,occurrenceyear,occurrenceday,occurrencedayofyear,occurrencehour,Longitude,Latitude,ObjectId
count,301233.0,301233.0,301233.0,301233.0,301233.0,301233.0,301233.0,301233.0,301233.0,301133.0,301133.0,301133.0,301233.0,301233.0,301233.0,301233.0
mean,-8724562.0,5350375.0,150617.0,1702.422928,146.565585,2017.876713,15.736659,180.765756,12.808786,2017.822852,15.456476,180.544766,12.601591,-78.374071,43.143874,150617.0
std,996674.6,611228.3,86958.621157,326.260102,51.988967,2.43694,8.766766,103.717225,6.502562,2.475716,8.914754,104.141362,7.243753,8.95328,4.928613,86958.621157
min,-8910331.0,0.0,1.0,1410.0,100.0,2014.0,1.0,1.0,0.0,2000.0,1.0,1.0,0.0,-80.042866,0.0,1.0
25%,-8846681.0,5412946.0,75309.0,1430.0,100.0,2016.0,8.0,92.0,8.0,2016.0,8.0,92.0,7.0,-79.471086,43.659806,75309.0
50%,-8838015.0,5419000.0,150617.0,1450.0,100.0,2018.0,16.0,178.0,13.0,2018.0,15.0,178.0,14.0,-79.393238,43.699132,150617.0
75%,-8829868.0,5426986.0,225925.0,2120.0,200.0,2020.0,23.0,270.0,18.0,2020.0,23.0,270.0,19.0,-79.320056,43.750978,225925.0
max,0.0,5517228.0,301233.0,2135.0,230.0,2022.0,31.0,366.0,23.0,2022.0,31.0,366.0,23.0,0.0,44.333691,301233.0


In [35]:
# Per-column missing-value flag: True means the column contains at least one
# missing entry, False means it contains none.
data.isna().any()

X                      False
Y                      False
Index_                 False
event_unique_id        False
Division               False
occurrencedate         False
reporteddate           False
location_type          False
premises_type          False
ucr_code               False
ucr_ext                False
offence                False
reportedyear           False
reportedmonth          False
reportedday            False
reporteddayofyear      False
reporteddayofweek      False
reportedhour           False
occurrenceyear          True
occurrencemonth         True
occurrenceday           True
occurrencedayofyear     True
occurrencedayofweek     True
occurrencehour         False
mci_category           False
Hood_ID                False
Neighbourhood          False
Longitude              False
Latitude               False
ObjectId               False
dtype: bool

In [36]:
# Show the category, offence and UCR extension columns side by side for every record.
data.loc[:, ['mci_category', 'offence', 'ucr_ext']]

Unnamed: 0,mci_category,offence,ucr_ext
0,Assault,Assault,100
1,Break and Enter,B&E,200
2,Assault,Assault,100
3,Theft Over,Theft Over,210
4,Robbery,Robbery - Business,210
...,...,...,...
301228,Auto Theft,Theft Of Motor Vehicle,210
301229,Auto Theft,Theft Of Motor Vehicle,210
301230,Auto Theft,Theft Of Motor Vehicle,210
301231,Auto Theft,Theft Of Motor Vehicle,210
