# **Toronto Crime Predictions**


---

In [1]:
# Dependencies
import requests
import json
import pprint
import pandas as pd
from sklearn.ensemble import RandomForestRegressor, HistGradientBoostingRegressor, ExtraTreesRegressor, AdaBoostRegressor, VotingRegressor
from sklearn.linear_model import ElasticNet, Lasso
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, make_scorer
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.multioutput import MultiOutputRegressor, RegressorChain
from sklearn.neighbors import KNeighborsRegressor, RadiusNeighborsRegressor

---

## **Retrieve Data from API**

In [2]:
# Initialize the base URL and the initial parameters
base_url = 'https://services.arcgis.com/S9th0jAJ7bqgIRjw/ArcGIS/rest/services/Major_Crime_Indicators_Open_Data/FeatureServer/0/query'
params = {
    'where': 'OCC_YEAR<=2024 AND OCC_YEAR>=2021',
    'outFields': '*',
    'resultRecordCount': 2000,
    'resultOffset': 0,
    'f': 'geojson'
}
# Create an empty list to store all the results
crime_json_array = []
# Start the loop to paginate through the results
while True:
    response = requests.get(base_url, params=params).json()
    # Add the current batch of features to the list
    crime_json_array.extend(response['features'])
    # Check if 'exceededTransferLimit' is True, which means there are more records
    if 'properties' in response and response['properties'].get('exceededTransferLimit', False):
        # Increment the offset to get the next batch of records
        params['resultOffset'] += params['resultRecordCount']
    else:
        # Exit the loop if there are no more records to retrieve
        break

In [3]:
# View the raw JSON data
crime_json_array

[{'type': 'Feature',
  'id': 246675,
  'geometry': {'type': 'Point',
   'coordinates': [-79.425761926, 43.6817690130001]},
  'properties': {'OBJECTID': 246675,
   'EVENT_UNIQUE_ID': 'GO-20213605',
   'REPORT_DATE': 1609477200000,
   'OCC_DATE': 1609477200000,
   'REPORT_YEAR': 2021,
   'REPORT_MONTH': 'January',
   'REPORT_DAY': 1,
   'REPORT_DOY': 1,
   'REPORT_DOW': 'Friday    ',
   'REPORT_HOUR': 16,
   'OCC_YEAR': 2021,
   'OCC_MONTH': 'January',
   'OCC_DAY': 1,
   'OCC_DOY': 1,
   'OCC_DOW': 'Friday    ',
   'OCC_HOUR': 16,
   'DIVISION': 'D13',
   'LOCATION_TYPE': 'Parking Lots (Apt., Commercial Or Non-Commercial)',
   'PREMISES_TYPE': 'Outside',
   'UCR_CODE': 2135,
   'UCR_EXT': 210,
   'OFFENCE': 'Theft Of Motor Vehicle',
   'MCI_CATEGORY': 'Auto Theft',
   'HOOD_158': '094',
   'NEIGHBOURHOOD_158': 'Wychwood (94)',
   'HOOD_140': '094',
   'NEIGHBOURHOOD_140': 'Wychwood (94)',
   'LONG_WGS84': -79.42576192637651,
   'LAT_WGS84': 43.68176901263976}},
 {'type': 'Feature',
  'i

In [4]:
# Using a list comprehension to apply the function for each entry in the intial crime JSON array.
crime_json_list = [crime['properties'] for crime in crime_json_array]
crime_json_list

[{'OBJECTID': 246675,
  'EVENT_UNIQUE_ID': 'GO-20213605',
  'REPORT_DATE': 1609477200000,
  'OCC_DATE': 1609477200000,
  'REPORT_YEAR': 2021,
  'REPORT_MONTH': 'January',
  'REPORT_DAY': 1,
  'REPORT_DOY': 1,
  'REPORT_DOW': 'Friday    ',
  'REPORT_HOUR': 16,
  'OCC_YEAR': 2021,
  'OCC_MONTH': 'January',
  'OCC_DAY': 1,
  'OCC_DOY': 1,
  'OCC_DOW': 'Friday    ',
  'OCC_HOUR': 16,
  'DIVISION': 'D13',
  'LOCATION_TYPE': 'Parking Lots (Apt., Commercial Or Non-Commercial)',
  'PREMISES_TYPE': 'Outside',
  'UCR_CODE': 2135,
  'UCR_EXT': 210,
  'OFFENCE': 'Theft Of Motor Vehicle',
  'MCI_CATEGORY': 'Auto Theft',
  'HOOD_158': '094',
  'NEIGHBOURHOOD_158': 'Wychwood (94)',
  'HOOD_140': '094',
  'NEIGHBOURHOOD_140': 'Wychwood (94)',
  'LONG_WGS84': -79.42576192637651,
  'LAT_WGS84': 43.68176901263976},
 {'OBJECTID': 246676,
  'EVENT_UNIQUE_ID': 'GO-20213400',
  'REPORT_DATE': 1609477200000,
  'OCC_DATE': 1609477200000,
  'REPORT_YEAR': 2021,
  'REPORT_MONTH': 'January',
  'REPORT_DAY': 1,
  

In [5]:
# Convert the JSON array into a Dataframe
crime_data_original = pd.DataFrame(crime_json_list)
crime_data_original

Unnamed: 0,OBJECTID,EVENT_UNIQUE_ID,REPORT_DATE,OCC_DATE,REPORT_YEAR,REPORT_MONTH,REPORT_DAY,REPORT_DOY,REPORT_DOW,REPORT_HOUR,...,UCR_CODE,UCR_EXT,OFFENCE,MCI_CATEGORY,HOOD_158,NEIGHBOURHOOD_158,HOOD_140,NEIGHBOURHOOD_140,LONG_WGS84,LAT_WGS84
0,246675,GO-20213605,1609477200000,1609477200000,2021,January,1,1,Friday,16,...,2135,210,Theft Of Motor Vehicle,Auto Theft,094,Wychwood (94),094,Wychwood (94),-79.425762,43.681769
1,246676,GO-20213400,1609477200000,1609477200000,2021,January,1,1,Friday,16,...,2135,210,Theft Of Motor Vehicle,Auto Theft,NSA,NSA,NSA,NSA,0.000000,0.000000
2,246677,GO-20211123,1609477200000,1609477200000,2021,January,1,1,Friday,7,...,2135,210,Theft Of Motor Vehicle,Auto Theft,031,Yorkdale-Glen Park (31),031,Yorkdale-Glen Park (31),-79.460110,43.721013
3,246678,GO-2021445,1609477200000,1609477200000,2021,January,1,1,Friday,1,...,2135,210,Theft Of Motor Vehicle,Auto Theft,151,Yonge-Doris (151),051,Willowdale East (51),-79.415293,43.778743
4,246679,GO-20213400,1609477200000,1609477200000,2021,January,1,1,Friday,16,...,2135,210,Theft Of Motor Vehicle,Auto Theft,NSA,NSA,NSA,NSA,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
147546,396731,GO-20241427047,1719723600000,1719637200000,2024,June,30,182,Sunday,16,...,1430,100,Assault,Assault,071,Cabbagetown-South St.James Town (71),071,Cabbagetown-South St.James Town (71),-79.373043,43.663195
147547,396732,GO-20241427869,1719723600000,1719723600000,2024,June,30,182,Sunday,18,...,2133,200,Theft Over - Shoplifting,Theft Over,027,York University Heights (27),027,York University Heights (27),-79.464942,43.759469
147548,396733,GO-20241423116,1719723600000,1719637200000,2024,June,30,182,Sunday,2,...,1450,120,Discharge Firearm With Intent,Assault,144,Morningside Heights (144),131,Rouge (131),-79.248477,43.837237
147549,396734,GO-20241426669,1719723600000,1718859600000,2024,June,30,182,Sunday,15,...,2132,200,Theft From Motor Vehicle Over,Theft Over,160,Mimico-Queensway (160),017,Mimico (includes Humber Bay Shores) (17),-79.521053,43.616490




---



## **Preprocess the Data**

In [6]:
# Get all the column names in crime_data_original
crime_data_original.columns

Index(['OBJECTID', 'EVENT_UNIQUE_ID', 'REPORT_DATE', 'OCC_DATE', 'REPORT_YEAR',
       'REPORT_MONTH', 'REPORT_DAY', 'REPORT_DOY', 'REPORT_DOW', 'REPORT_HOUR',
       'OCC_YEAR', 'OCC_MONTH', 'OCC_DAY', 'OCC_DOY', 'OCC_DOW', 'OCC_HOUR',
       'DIVISION', 'LOCATION_TYPE', 'PREMISES_TYPE', 'UCR_CODE', 'UCR_EXT',
       'OFFENCE', 'MCI_CATEGORY', 'HOOD_158', 'NEIGHBOURHOOD_158', 'HOOD_140',
       'NEIGHBOURHOOD_140', 'LONG_WGS84', 'LAT_WGS84'],
      dtype='object')

In [7]:
# Change the value of 'NSA' in the HOOD_158 column to 0 and then convert the column values to integers
crime_data_original['HOOD_158'] = crime_data_original['HOOD_158'].replace('NSA', 0)
crime_data_original['HOOD_158'] = crime_data_original['HOOD_158'].astype(int)
crime_data_original

Unnamed: 0,OBJECTID,EVENT_UNIQUE_ID,REPORT_DATE,OCC_DATE,REPORT_YEAR,REPORT_MONTH,REPORT_DAY,REPORT_DOY,REPORT_DOW,REPORT_HOUR,...,UCR_CODE,UCR_EXT,OFFENCE,MCI_CATEGORY,HOOD_158,NEIGHBOURHOOD_158,HOOD_140,NEIGHBOURHOOD_140,LONG_WGS84,LAT_WGS84
0,246675,GO-20213605,1609477200000,1609477200000,2021,January,1,1,Friday,16,...,2135,210,Theft Of Motor Vehicle,Auto Theft,94,Wychwood (94),094,Wychwood (94),-79.425762,43.681769
1,246676,GO-20213400,1609477200000,1609477200000,2021,January,1,1,Friday,16,...,2135,210,Theft Of Motor Vehicle,Auto Theft,0,NSA,NSA,NSA,0.000000,0.000000
2,246677,GO-20211123,1609477200000,1609477200000,2021,January,1,1,Friday,7,...,2135,210,Theft Of Motor Vehicle,Auto Theft,31,Yorkdale-Glen Park (31),031,Yorkdale-Glen Park (31),-79.460110,43.721013
3,246678,GO-2021445,1609477200000,1609477200000,2021,January,1,1,Friday,1,...,2135,210,Theft Of Motor Vehicle,Auto Theft,151,Yonge-Doris (151),051,Willowdale East (51),-79.415293,43.778743
4,246679,GO-20213400,1609477200000,1609477200000,2021,January,1,1,Friday,16,...,2135,210,Theft Of Motor Vehicle,Auto Theft,0,NSA,NSA,NSA,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
147546,396731,GO-20241427047,1719723600000,1719637200000,2024,June,30,182,Sunday,16,...,1430,100,Assault,Assault,71,Cabbagetown-South St.James Town (71),071,Cabbagetown-South St.James Town (71),-79.373043,43.663195
147547,396732,GO-20241427869,1719723600000,1719723600000,2024,June,30,182,Sunday,18,...,2133,200,Theft Over - Shoplifting,Theft Over,27,York University Heights (27),027,York University Heights (27),-79.464942,43.759469
147548,396733,GO-20241423116,1719723600000,1719637200000,2024,June,30,182,Sunday,2,...,1450,120,Discharge Firearm With Intent,Assault,144,Morningside Heights (144),131,Rouge (131),-79.248477,43.837237
147549,396734,GO-20241426669,1719723600000,1718859600000,2024,June,30,182,Sunday,15,...,2132,200,Theft From Motor Vehicle Over,Theft Over,160,Mimico-Queensway (160),017,Mimico (includes Humber Bay Shores) (17),-79.521053,43.616490


In [8]:
crime_data_original['OCC_MONTH']

Unnamed: 0,OCC_MONTH
0,January
1,January
2,January
3,January
4,January
...,...
147546,June
147547,June
147548,June
147549,June


In [9]:
# prompt: Convert the crime_data_original column OCC_MONTH's month names to integer form

month_mapping = {
    'January': 1,
    'February': 2,
    'March': 3,
    'April': 4,
    'May': 5,
    'June': 6,
    'July': 7,
    'August': 8,
    'September': 9,
    'October': 10,
    'November': 11,
    'December': 12
}

crime_data_original['OCC_MONTH'] = crime_data_original['OCC_MONTH'].map(month_mapping)
crime_data_original['OCC_MONTH']


Unnamed: 0,OCC_MONTH
0,1
1,1
2,1
3,1
4,1
...,...
147546,6
147547,6
147548,6
147549,6


In [10]:
# Collect only the necessary columns for the analysis
crime_pd = crime_data_original[['EVENT_UNIQUE_ID', 'NEIGHBOURHOOD_158', 'HOOD_158', 'LAT_WGS84', 'LONG_WGS84', 'PREMISES_TYPE', 'OCC_DATE', 'OCC_YEAR', 'OCC_MONTH', 'OCC_DAY', 'OCC_HOUR', 'MCI_CATEGORY']]
crime_pd


Unnamed: 0,EVENT_UNIQUE_ID,NEIGHBOURHOOD_158,HOOD_158,LAT_WGS84,LONG_WGS84,PREMISES_TYPE,OCC_DATE,OCC_YEAR,OCC_MONTH,OCC_DAY,OCC_HOUR,MCI_CATEGORY
0,GO-20213605,Wychwood (94),94,43.681769,-79.425762,Outside,1609477200000,2021,1,1,16,Auto Theft
1,GO-20213400,NSA,0,0.000000,0.000000,Commercial,1609477200000,2021,1,1,4,Auto Theft
2,GO-20211123,Yorkdale-Glen Park (31),31,43.721013,-79.460110,Other,1609477200000,2021,1,1,4,Auto Theft
3,GO-2021445,Yonge-Doris (151),151,43.778743,-79.415293,Other,1609477200000,2021,1,1,1,Auto Theft
4,GO-20213400,NSA,0,0.000000,0.000000,Commercial,1609477200000,2021,1,1,4,Auto Theft
...,...,...,...,...,...,...,...,...,...,...,...,...
147546,GO-20241427047,Cabbagetown-South St.James Town (71),71,43.663195,-79.373043,Apartment,1719637200000,2024,6,29,23,Assault
147547,GO-20241427869,York University Heights (27),27,43.759469,-79.464942,Commercial,1719723600000,2024,6,30,18,Theft Over
147548,GO-20241423116,Morningside Heights (144),144,43.837237,-79.248477,Outside,1719637200000,2024,6,29,21,Assault
147549,GO-20241426669,Mimico-Queensway (160),160,43.616490,-79.521053,Outside,1718859600000,2024,6,20,13,Theft Over


In [11]:
# Encode the MCI_CATEGORY column using get_dummies
crime_category_encoded = pd.get_dummies(crime_pd['MCI_CATEGORY']).astype(int)
crime_category_encoded


Unnamed: 0,Assault,Auto Theft,Break and Enter,Robbery,Theft Over
0,0,1,0,0,0
1,0,1,0,0,0
2,0,1,0,0,0
3,0,1,0,0,0
4,0,1,0,0,0
...,...,...,...,...,...
147546,1,0,0,0,0
147547,0,0,0,0,1
147548,1,0,0,0,0
147549,0,0,0,0,1


In [12]:
# Add the encoded columns back into the datframe
crime_pd = pd.concat([crime_pd, crime_category_encoded], axis=1)
crime_pd_encoded = crime_pd.drop(['MCI_CATEGORY'], axis=1) # Drop the original MCI_CATEGORY column
crime_pd_encoded

Unnamed: 0,EVENT_UNIQUE_ID,NEIGHBOURHOOD_158,HOOD_158,LAT_WGS84,LONG_WGS84,PREMISES_TYPE,OCC_DATE,OCC_YEAR,OCC_MONTH,OCC_DAY,OCC_HOUR,Assault,Auto Theft,Break and Enter,Robbery,Theft Over
0,GO-20213605,Wychwood (94),94,43.681769,-79.425762,Outside,1609477200000,2021,1,1,16,0,1,0,0,0
1,GO-20213400,NSA,0,0.000000,0.000000,Commercial,1609477200000,2021,1,1,4,0,1,0,0,0
2,GO-20211123,Yorkdale-Glen Park (31),31,43.721013,-79.460110,Other,1609477200000,2021,1,1,4,0,1,0,0,0
3,GO-2021445,Yonge-Doris (151),151,43.778743,-79.415293,Other,1609477200000,2021,1,1,1,0,1,0,0,0
4,GO-20213400,NSA,0,0.000000,0.000000,Commercial,1609477200000,2021,1,1,4,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
147546,GO-20241427047,Cabbagetown-South St.James Town (71),71,43.663195,-79.373043,Apartment,1719637200000,2024,6,29,23,1,0,0,0,0
147547,GO-20241427869,York University Heights (27),27,43.759469,-79.464942,Commercial,1719723600000,2024,6,30,18,0,0,0,0,1
147548,GO-20241423116,Morningside Heights (144),144,43.837237,-79.248477,Outside,1719637200000,2024,6,29,21,1,0,0,0,0
147549,GO-20241426669,Mimico-Queensway (160),160,43.616490,-79.521053,Outside,1718859600000,2024,6,20,13,0,0,0,0,1


In [13]:
# Group the values by EVENT_UNIQUE_ID and get the first value for all the columns, except for Assault, Auto Theft, Break and Enter, Robbery, Theft Over
crime_first_group = crime_pd_encoded.groupby('EVENT_UNIQUE_ID')[["NEIGHBOURHOOD_158", "HOOD_158", "LAT_WGS84", "LONG_WGS84", "PREMISES_TYPE", "OCC_DATE", "OCC_YEAR", "OCC_MONTH", "OCC_DAY", "OCC_HOUR"]].first()
crime_first_group

Unnamed: 0_level_0,NEIGHBOURHOOD_158,HOOD_158,LAT_WGS84,LONG_WGS84,PREMISES_TYPE,OCC_DATE,OCC_YEAR,OCC_MONTH,OCC_DAY,OCC_HOUR
EVENT_UNIQUE_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
GO-20211000033,West Queen West (162),162,43.646286,-79.408568,Commercial,1622264400000,2021,5,29,21
GO-2021100004,Morningside Heights (144),144,43.807252,-79.162903,Outside,1610773200000,2021,1,16,17
GO-20211000054,Moss Park (73),73,43.657067,-79.374531,Apartment,1622264400000,2021,5,29,22
GO-20211000193,Fort York-Liberty Village (163),163,43.636618,-79.399704,Apartment,1622264400000,2021,5,29,23
GO-20211000248,Eglinton East (138),138,43.737099,-79.246230,Outside,1622264400000,2021,5,29,21
...,...,...,...,...,...,...,...,...,...,...
GO-20249997,Junction-Wallace Emerson (171),171,43.668917,-79.442637,Outside,1704085200000,2024,1,1,18
GO-202499972,Edenbridge-Humber Valley (9),9,43.672705,-79.522472,House,1705208400000,2024,1,14,3
GO-2024999786,Flemingdon Park (44),44,43.718727,-79.334948,Apartment,1714539600000,2024,5,1,0
GO-2024999795,Oakridge (121),121,43.691225,-79.288346,Commercial,1715230800000,2024,5,9,13


In [14]:
# Group the MCI_CATEGORY values by EVENT_UNIQUE_ID and get the maximum value.
crime_max_group = crime_pd_encoded.groupby('EVENT_UNIQUE_ID')[["Assault", "Auto Theft", "Break and Enter", "Robbery", "Theft Over"]].max()
crime_max_group

Unnamed: 0_level_0,Assault,Auto Theft,Break and Enter,Robbery,Theft Over
EVENT_UNIQUE_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
GO-20211000033,0,0,1,0,0
GO-2021100004,0,1,0,0,0
GO-20211000054,1,0,0,0,0
GO-20211000193,1,0,0,0,0
GO-20211000248,1,0,0,0,0
...,...,...,...,...,...
GO-20249997,0,1,0,0,0
GO-202499972,0,1,0,0,0
GO-2024999786,1,0,0,0,0
GO-2024999795,1,0,0,0,0


In [15]:
# Concatenate the two groups together into one Dataframe and reset the index so that EVENT_UNIQUE_ID is not the index.
crime_groups_joined = pd.concat([crime_first_group, crime_max_group], axis=1)
crime_groups_joined = crime_groups_joined.reset_index()
crime_groups_joined

Unnamed: 0,EVENT_UNIQUE_ID,NEIGHBOURHOOD_158,HOOD_158,LAT_WGS84,LONG_WGS84,PREMISES_TYPE,OCC_DATE,OCC_YEAR,OCC_MONTH,OCC_DAY,OCC_HOUR,Assault,Auto Theft,Break and Enter,Robbery,Theft Over
0,GO-20211000033,West Queen West (162),162,43.646286,-79.408568,Commercial,1622264400000,2021,5,29,21,0,0,1,0,0
1,GO-2021100004,Morningside Heights (144),144,43.807252,-79.162903,Outside,1610773200000,2021,1,16,17,0,1,0,0,0
2,GO-20211000054,Moss Park (73),73,43.657067,-79.374531,Apartment,1622264400000,2021,5,29,22,1,0,0,0,0
3,GO-20211000193,Fort York-Liberty Village (163),163,43.636618,-79.399704,Apartment,1622264400000,2021,5,29,23,1,0,0,0,0
4,GO-20211000248,Eglinton East (138),138,43.737099,-79.246230,Outside,1622264400000,2021,5,29,21,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
129212,GO-20249997,Junction-Wallace Emerson (171),171,43.668917,-79.442637,Outside,1704085200000,2024,1,1,18,0,1,0,0,0
129213,GO-202499972,Edenbridge-Humber Valley (9),9,43.672705,-79.522472,House,1705208400000,2024,1,14,3,0,1,0,0,0
129214,GO-2024999786,Flemingdon Park (44),44,43.718727,-79.334948,Apartment,1714539600000,2024,5,1,0,1,0,0,0,0
129215,GO-2024999795,Oakridge (121),121,43.691225,-79.288346,Commercial,1715230800000,2024,5,9,13,1,0,0,0,0


---

## **Create the Model**

### Creating the Testing and Training Datasets - First Approach

In [16]:
# Create a Total_Count column in the crime_groups_joined dataframe
crime_groups_joined['Total_Count'] = crime_groups_joined.iloc[:, 11:].sum(axis=1)
crime_groups_joined

Unnamed: 0,EVENT_UNIQUE_ID,NEIGHBOURHOOD_158,HOOD_158,LAT_WGS84,LONG_WGS84,PREMISES_TYPE,OCC_DATE,OCC_YEAR,OCC_MONTH,OCC_DAY,OCC_HOUR,Assault,Auto Theft,Break and Enter,Robbery,Theft Over,Total_Count
0,GO-20211000033,West Queen West (162),162,43.646286,-79.408568,Commercial,1622264400000,2021,5,29,21,0,0,1,0,0,1
1,GO-2021100004,Morningside Heights (144),144,43.807252,-79.162903,Outside,1610773200000,2021,1,16,17,0,1,0,0,0,1
2,GO-20211000054,Moss Park (73),73,43.657067,-79.374531,Apartment,1622264400000,2021,5,29,22,1,0,0,0,0,1
3,GO-20211000193,Fort York-Liberty Village (163),163,43.636618,-79.399704,Apartment,1622264400000,2021,5,29,23,1,0,0,0,0,1
4,GO-20211000248,Eglinton East (138),138,43.737099,-79.246230,Outside,1622264400000,2021,5,29,21,1,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
129212,GO-20249997,Junction-Wallace Emerson (171),171,43.668917,-79.442637,Outside,1704085200000,2024,1,1,18,0,1,0,0,0,1
129213,GO-202499972,Edenbridge-Humber Valley (9),9,43.672705,-79.522472,House,1705208400000,2024,1,14,3,0,1,0,0,0,1
129214,GO-2024999786,Flemingdon Park (44),44,43.718727,-79.334948,Apartment,1714539600000,2024,5,1,0,1,0,0,0,0,1
129215,GO-2024999795,Oakridge (121),121,43.691225,-79.288346,Commercial,1715230800000,2024,5,9,13,1,0,0,0,0,1


In [17]:
# Group the crime_groups_joined dataframe by neighbourhood, occ_year, occ_month,
# get the count of the rows aggregated to each group, and place those values in the Total_Count column.
crime_totals_by_month_year_hood = crime_groups_joined.groupby(['HOOD_158', 'OCC_YEAR', 'OCC_MONTH'])[['Total_Count']].count()
crime_totals_by_month_year_hood = crime_totals_by_month_year_hood.reset_index()
crime_totals_by_month_year_hood

Unnamed: 0,HOOD_158,OCC_YEAR,OCC_MONTH,Total_Count
0,0,2021,1,44
1,0,2021,2,34
2,0,2021,3,45
3,0,2021,4,26
4,0,2021,5,37
...,...,...,...,...
6672,174,2024,2,15
6673,174,2024,3,7
6674,174,2024,4,17
6675,174,2024,5,12


In [18]:
# Group the crime_groups_joined dataframe by neighbourhood, occ_year, occ_month and get the sum of assaults, auto theft, break and enter, robbery, and theft over.
crime_types_by_month_year_hood = crime_groups_joined.groupby(['HOOD_158', 'OCC_YEAR', 'OCC_MONTH'])[["Assault", "Auto Theft", "Break and Enter", "Robbery", "Theft Over"]].sum()
crime_types_by_month_year_hood = crime_types_by_month_year_hood.reset_index()
# Concatenate crime_totals_by_month_year_hood and crime_types_by_month_year_hood intot a single dataframe
crime_by_month_year_hood = pd.concat([crime_types_by_month_year_hood, crime_totals_by_month_year_hood['Total_Count']], axis=1)
crime_by_month_year_hood.sort_values(by=['Total_Count'])

Unnamed: 0,HOOD_158,OCC_YEAR,OCC_MONTH,Assault,Auto Theft,Break and Enter,Robbery,Theft Over,Total_Count
1895,49,2021,6,1,0,1,0,0,1
2229,58,2021,4,0,0,1,0,0,1
1909,49,2022,8,0,1,0,0,0,1
5208,140,2021,2,1,0,0,0,0,1
1890,49,2021,1,0,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...
77,1,2023,12,30,45,28,4,13,118
68,1,2023,3,18,83,12,3,3,119
65,1,2022,12,24,67,15,4,10,120
71,1,2023,6,24,77,22,4,5,131


In [19]:
# Show all the rows where HOOD_158 has a value of 0
crime_by_month_year_hood.loc[crime_by_month_year_hood['HOOD_158'] == 0, :]

Unnamed: 0,HOOD_158,OCC_YEAR,OCC_MONTH,Assault,Auto Theft,Break and Enter,Robbery,Theft Over,Total_Count
0,0,2021,1,23,2,10,9,1,44
1,0,2021,2,24,1,8,1,0,34
2,0,2021,3,27,7,5,5,3,45
3,0,2021,4,16,1,3,3,3,26
4,0,2021,5,30,3,3,1,1,37
5,0,2021,6,22,3,3,2,3,32
6,0,2021,7,29,2,3,6,2,42
7,0,2021,8,48,6,2,7,3,66
8,0,2021,9,27,8,6,3,3,47
9,0,2021,10,39,9,15,2,2,65


In [20]:
# Filter out the rows where the column HOOD_158 has a value of 0
crime_by_month_year_hood = crime_by_month_year_hood.loc[crime_by_month_year_hood['HOOD_158'] != 0, :]
crime_by_month_year_hood = crime_by_month_year_hood.reset_index(drop=True)
crime_by_month_year_hood

Unnamed: 0,HOOD_158,OCC_YEAR,OCC_MONTH,Assault,Auto Theft,Break and Enter,Robbery,Theft Over,Total_Count
0,1,2021,1,18,35,7,1,3,62
1,1,2021,2,17,17,5,1,3,43
2,1,2021,3,15,20,8,6,6,54
3,1,2021,4,11,31,4,2,4,52
4,1,2021,5,18,26,9,5,4,62
...,...,...,...,...,...,...,...,...,...
6630,174,2024,2,9,0,5,1,1,15
6631,174,2024,3,6,1,0,0,0,7
6632,174,2024,4,12,2,2,0,1,17
6633,174,2024,5,8,1,2,0,1,12


In [21]:
# Have the training data be all the data in crime_by_month_year_hood for the years 2021 to 2023
train_data = crime_by_month_year_hood.loc[crime_by_month_year_hood['OCC_YEAR'].isin([2021, 2022, 2023]), :]

# Have the testing data be all the data in crime_by_month_year_hood for the year 2024
test_data = crime_by_month_year_hood.loc[crime_by_month_year_hood['OCC_YEAR'] == 2024, :]

In [22]:
# Split the training data into the features and targets
X_train = train_data[['HOOD_158', 'OCC_YEAR', 'OCC_MONTH']]  # Features

X_test = test_data[['HOOD_158', 'OCC_YEAR', 'OCC_MONTH']]  # Features

y_train = train_data[['Assault', 'Auto Theft', 'Break and Enter', 'Robbery', 'Theft Over', 'Total_Count']]  # Target

y_test = test_data[['Assault', 'Auto Theft', 'Break and Enter', 'Robbery', 'Theft Over', 'Total_Count']]  # Target

### Testing Different Models - First Approach

In [23]:
# Initialize the Random Forest Regressor
r_model = RandomForestRegressor()

# Train the model on the training data
r_model.fit(X_train, y_train)

In [24]:
# Initialize the Histogram-Based Gradient Boosting Regressor and wrap it with MultiOutputRegressor
h_model = MultiOutputRegressor(HistGradientBoostingRegressor())

# Train the model on the training data
h_model.fit(X_train, y_train)

In [25]:
# Initialize the Lasso Regressor
l_model = Lasso()

# Train the model on the training data
l_model.fit(X_train, y_train)

In [26]:
# Initialize the Extra-Trees Regressor
e_model = ExtraTreesRegressor()

# Train the model on the training data
e_model.fit(X_train, y_train)

In [27]:
# Initialize the K-Nearest Neighbors Regressor
k_model = KNeighborsRegressor()

# Train the model on the training data
k_model.fit(X_train, y_train)

In [28]:
# Initialize the Elastic Net Regressor
e_n_model = ElasticNet()

# Train the model on the training data
e_n_model.fit(X_train, y_train)

In [29]:
# Initialize the Radius Neighbors Regressor
r_n_model = RadiusNeighborsRegressor()

# Train the model on the training data
r_n_model.fit(X_train, y_train)

In [30]:
# Make predictions on the test set with the Random Forest Regressor
y_r_pred = r_model.predict(X_test)

# Make predictions on the test set with the Histogram-Based Gradient Boosting Regressor
y_h_pred = h_model.predict(X_test)

# Make predictions on the test set with the Lasso Regressor
y_l_pred = l_model.predict(X_test)

# Make predictions on the test set with the Extra-Trees Regressor
y_e_pred = e_model.predict(X_test)

# Make predictions on the test set with the K-Nearest Neighbors Regressor
y_k_pred = k_model.predict(X_test)

# Make predictions on the test set with the Elastic Net Regressor
y_e_n_pred = e_n_model.predict(X_test)

# Make predictions on the test set with the Radius Neighbors Regressor
y_r_n_pred = r_n_model.predict(X_test)

# Print the predicted crime counts for the test set for all the models
print("Random Forest Predicted crime counts for (test set):", y_r_pred)
print("Histogram-Based Gradient Boosting Predicted crime counts for (test set):", y_h_pred)
print("Lasso Regressor Predicted crime counts for (test set):", y_l_pred)
print("Extra-Trees Regressor Predicted crime counts for (test set):", y_e_pred)
print("K-Nearest Neighbors Regressor Predicted crime counts for (test set):", y_k_pred)
print("Elastic Net Regressor Predicted crime counts for (test set):", y_e_n_pred)
print("Radius Neighbors Regressor Predicted crime counts for (test set):", y_r_n_pred)

Random Forest Predicted crime counts for (test set): [[1.7500e+01 8.3520e+01 1.3900e+01 4.2300e+00 6.5200e+00 1.2546e+02]
 [1.5890e+01 7.0540e+01 1.4360e+01 7.1500e+00 6.7700e+00 1.1407e+02]
 [1.7390e+01 7.6240e+01 1.3070e+01 4.8700e+00 4.5600e+00 1.1577e+02]
 ...
 [5.2800e+00 2.8000e+00 2.7900e+00 1.1000e-01 8.2000e-01 1.1800e+01]
 [7.3100e+00 1.3500e+00 2.6200e+00 2.5000e-01 3.2000e-01 1.1820e+01]
 [4.7700e+00 1.1800e+00 1.6500e+00 1.4500e+00 7.7000e-01 9.8200e+00]]
Histogram-Based Gradient Boosting Predicted crime counts for (test set): [[ 19.67589275  62.11731357  13.8018587    3.50618882   5.30514193
  108.43872484]
 [ 19.03243738  59.71585395  13.80908441   4.10071414   5.2917468
  105.83106772]
 [ 20.74704133  60.72346006  14.00794312   4.29920655   5.30692221
  107.92372245]
 ...
 [  7.17071503   1.45463046   3.01718845   0.47609182   0.56415242
   12.66759225]
 [  7.77662045   1.74450785   3.11291836   0.49039185   0.69702188
   13.50335636]
 [  7.67900526   1.59320963   2.246

  multiarray.copyto(res, fill_value, casting='unsafe')


In [31]:
# Calculate the Mean Squared Error (MSE), Mean Absolute Error (MAE), and R-squared (R^2) for Random Forests.
r_mse = mean_squared_error(y_test, y_r_pred)
r_r2 = r2_score(y_test, y_r_pred)
r_mae = mean_absolute_error(y_test, y_r_pred)

print(f"Mean Squared Error: {r_mse}")
print(f"R-squared: {r_r2}")
print(f"mean absolute error: {r_mae}")

Mean Squared Error: 16.320249947257384
R-squared: 0.42257437260500347
mean absolute error: 2.5092281997187054


In [32]:
# Calculate the Mean Squared Error (MSE), Mean Absolute Error (MAE), and R-squared (R^2) for Histogram-Based Gradient Boosting.
h_mse = mean_squared_error(y_test, y_h_pred)
h_r2 = r2_score(y_test, y_h_pred)
h_mae = mean_absolute_error(y_test, y_h_pred)

print(f"Mean Squared Error: {h_mse}")
print(f"R-squared: {h_r2}")
print(f"mean absolute error: {h_mae}")

Mean Squared Error: 13.53697402119441
R-squared: 0.5401264890283979
mean absolute error: 2.3624557106149555


In [33]:
# Calculate the Mean Squared Error (MSE), Mean Absolute Error (MAE), and R-squared (R^2) for Lasso Regressor
l_mse = mean_squared_error(y_test, y_l_pred)
l_r2 = r2_score(y_test, y_l_pred)
l_mae = mean_absolute_error(y_test, y_l_pred)

print(f"Mean Squared Error: {l_mse}")
print(f"R-squared: {l_r2}")
print(f"mean absolute error: {l_mae}")

Mean Squared Error: 56.06504159690181
R-squared: 0.0008593089048466821
mean absolute error: 4.053649104634542


In [34]:
# Calculate the Mean Squared Error (MSE), Mean Absolute Error (MAE), and R-squared (R^2) for Extra-Trees Regressor
e_mse = mean_squared_error(y_test, y_e_pred)
e_r2 = r2_score(y_test, y_e_pred)
e_mae = mean_absolute_error(y_test, y_e_pred)

print(f"Mean Squared Error: {e_mse}")
print(f"R-squared: {e_r2}")
print(f"mean absolute error: {e_mae}")

Mean Squared Error: 21.00927246835443
R-squared: 0.26335932380394184
mean absolute error: 2.83704641350211


In [35]:
# Calculate the Mean Squared Error (MSE), Mean Absolute Error (MAE), and R-squared (R^2) for K-Nearest Neighbors Regressor
k_mse = mean_squared_error(y_test, y_k_pred)
k_r2 = r2_score(y_test, y_k_pred)
k_mae = mean_absolute_error(y_test, y_k_pred)

print(f"Mean Squared Error: {k_mse}")
print(f"R-squared: {k_r2}")
print(f"mean absolute error: {k_mae}")

Mean Squared Error: 22.812032348804497
R-squared: 0.3917381351580209
mean absolute error: 2.9212025316455694


In [36]:
# Calculate the Mean Squared Error (MSE), Mean Absolute Error (MAE), and R-squared (R^2) for Elastic Net Regressor
e_n_mse = mean_squared_error(y_test, y_e_n_pred)
e_n_r2 = r2_score(y_test, y_e_n_pred)
e_n_mae = mean_absolute_error(y_test, y_e_n_pred)

print(f"Mean Squared Error: {e_n_mse}")
print(f"R-squared: {e_n_r2}")
print(f"mean absolute error: {e_n_mae}")

Mean Squared Error: 55.83116626511688
R-squared: 0.0018104521500861837
mean absolute error: 4.0478073837331445


In [37]:
# Calculate the Mean Squared Error (MSE), Mean Absolute Error (MAE), and R-squared (R^2) for Radius Neighbors Regressor
r_n_mse = mean_squared_error(y_test, y_r_n_pred)
r_n_r2 = r2_score(y_test, y_r_n_pred)
r_n_mae = mean_absolute_error(y_test, y_r_n_pred)

print(f"Mean Squared Error: {r_n_mse}")
print(f"R-squared: {r_n_r2}")
print(f"mean absolute error: {r_n_mae}")

Mean Squared Error: 8.973691110784241e+34
R-squared: -1.48166511902087e+34
mean absolute error: 9729295397526136.0


### Optimizing the models - First Approach

In [38]:
# Create scorers for R-squared and MAE
r2_scorer = make_scorer(r2_score)
mae_scorer = make_scorer(mean_absolute_error, greater_is_better=False)  # Minimize MAE

##### **Perform Hyper-Parameter Tuning on the Models**

Hyper-Parameter Tuning was performed on the following models, since they yielded the highest R-squared scores:


*   Random Forest Regressor
*   Histogram-Based Gradient Boosting
*   K-Nearest Neighbors Regressor



###### Iterations for Random Forest Regressor

In [39]:
# Define the hyperparameter search space for Random Forest Regressor
param_dist = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Create the model for Random Forest Regressor
rf_model = RandomForestRegressor(random_state=1)

# Create RandomizedSearchCV object (similar to HGB)
random_search_reg = RandomizedSearchCV(
    estimator=rf_model,
    param_distributions=param_dist,
    n_iter=10,
    scoring=make_scorer(mean_squared_error, greater_is_better=False),
    cv=5,
    random_state=1,
    n_jobs=-1
)

# Perform the hyperparameter search
random_search_reg.fit(X_train, y_train)

# Get the best hyperparameters and model
best_params = random_search_reg.best_params_
print(f"Best Hyperparameters: {best_params}")

# Get the best model
best_rf_model = random_search_reg.best_estimator_

# Make predictions on the test set with the Random Forest Regressor
y_r_pred = best_rf_model.predict(X_test)

# Calculate the Mean Squared Error (MSE), Mean Absolute Error (MAE), and R-squared (R^2) for Random Forests.
r_mse = mean_squared_error(y_test, y_r_pred)
r_r2 = r2_score(y_test, y_r_pred)
r_mae = mean_absolute_error(y_test, y_r_pred)

print(f"Mean Squared Error: {r_mse}")
print(f"R-squared: {r_r2}")
print(f"mean absolute error: {r_mae}")

Best Hyperparameters: {'n_estimators': 100, 'min_samples_split': 10, 'min_samples_leaf': 2, 'max_depth': 10}
Mean Squared Error: 17.928958393105038
R-squared: 0.49663590673300156
mean absolute error: 2.6323390583657864


In [40]:
# Define the hyperparameter search space for Random Forest Regressor
param_dist_reg = {
    'n_estimators': [50, 100, 150],
    'max_depth': [5, 10, 15],
    'min_samples_split': [5, 10, 15],
    'min_samples_leaf': [2, 4, 6]
}

# Create the model for Random Forest Regressor
rf_model = RandomForestRegressor(random_state=1)

# Create RandomizedSearchCV object (similar to HGB)
random_search_reg = RandomizedSearchCV(
    estimator=rf_model,
    param_distributions=param_dist_reg,
    n_iter=10,
    scoring={'r2': r2_scorer, 'mae': mae_scorer},  # Multiple scorers
    refit='r2',  # Refit on R-squared
    cv=5,
    random_state=1,
    n_jobs=-1
)

# Perform the hyperparameter search
random_search_reg.fit(X_train, y_train)

# Get the best hyperparameters and model
best_params = random_search_reg.best_params_
print(f"Best Hyperparameters: {best_params}")

# Get the best model
best_rf_model = random_search_reg.best_estimator_

# Make predictions on the test set with the Random Forest Regressor
y_r_pred = best_rf_model.predict(X_test)

# Calculate the Mean Squared Error (MSE), Mean Absolute Error (MAE), and R-squared (R^2) for Random Forests.
r_mse = mean_squared_error(y_test, y_r_pred)
r_r2 = r2_score(y_test, y_r_pred)
r_mae = mean_absolute_error(y_test, y_r_pred)

print(f"Mean Squared Error: {r_mse}")
print(f"R-squared: {r_r2}")
print(f"mean absolute error: {r_mae}")

Best Hyperparameters: {'n_estimators': 100, 'min_samples_split': 10, 'min_samples_leaf': 2, 'max_depth': 10}
Mean Squared Error: 17.928958393105038
R-squared: 0.49663590673300156
mean absolute error: 2.6323390583657864


In [41]:
# Define the hyperparameter search space for Random Forest Regressor
param_dist_reg = {
    'n_estimators': [50, 100, 150],
    'max_depth': [5, 10, 15],
    'min_samples_split': [5, 10, 15],
    'min_samples_leaf': [2, 4, 6]
}

# Create the model for Random Forest Regressor
rf_model = RandomForestRegressor(random_state=1)

# Create RandomizedSearchCV object (similar to HGB)
random_search_reg = RandomizedSearchCV(
    estimator=rf_model,
    param_distributions=param_dist_reg,
    n_iter=20,
    scoring={'r2': r2_scorer, 'mae': mae_scorer},  # Multiple scorers
    refit='r2',  # Refit on R-squared
    cv=5,
    random_state=1,
    n_jobs=-1
)

# Perform the hyperparameter search
random_search_reg.fit(X_train, y_train)

# Get the best hyperparameters and model
best_params = random_search_reg.best_params_
print(f"Best Hyperparameters: {best_params}")

# Get the best model
best_rf_model = random_search_reg.best_estimator_

# Make predictions on the test set with the Random Forest Regressor
y_r_pred = best_rf_model.predict(X_test)

# Calculate the Mean Squared Error (MSE), Mean Absolute Error (MAE), and R-squared (R^2) for Random Forests.
r_mse = mean_squared_error(y_test, y_r_pred)
r_r2 = r2_score(y_test, y_r_pred)
r_mae = mean_absolute_error(y_test, y_r_pred)

print(f"Mean Squared Error: {r_mse}")
print(f"R-squared: {r_r2}")
print(f"mean absolute error: {r_mae}")

Best Hyperparameters: {'n_estimators': 100, 'min_samples_split': 15, 'min_samples_leaf': 2, 'max_depth': 10}
Mean Squared Error: 17.900895599495165
R-squared: 0.49718000005206253
mean absolute error: 2.6397685170607654


In [42]:
# Define the hyperparameter search space for Random Forest Regressor
param_dist_reg = {
    'n_estimators': [50, 100, 150],
    'max_depth': [5, 10, 15],
    'min_samples_split': [10, 15, 20],
    'min_samples_leaf': [2, 4, 6]
}

# Create the model for Random Forest Regressor
rf_model = RandomForestRegressor(random_state=1)

# Create RandomizedSearchCV object (similar to HGB)
random_search_reg = RandomizedSearchCV(
    estimator=rf_model,
    param_distributions=param_dist_reg,
    n_iter=20,
    scoring={'r2': r2_scorer, 'mae': mae_scorer},  # Multiple scorers
    refit='r2',  # Refit on R-squared
    cv=15,
    random_state=1,
    n_jobs=-1
)

# Perform the hyperparameter search
random_search_reg.fit(X_train, y_train)

# Get the best hyperparameters and model
best_params = random_search_reg.best_params_
print(f"Best Hyperparameters: {best_params}")

# Get the best model
best_rf_model = random_search_reg.best_estimator_

# Make predictions on the test set with the Random Forest Regressor
y_r_pred = best_rf_model.predict(X_test)

# Calculate the Mean Squared Error (MSE), Mean Absolute Error (MAE), and R-squared (R^2) for Random Forests.
r_mse = mean_squared_error(y_test, y_r_pred)
r_r2 = r2_score(y_test, y_r_pred)
r_mae = mean_absolute_error(y_test, y_r_pred)

print(f"Mean Squared Error: {r_mse}")
print(f"R-squared: {r_r2}")
print(f"mean absolute error: {r_mae}")

Best Hyperparameters: {'n_estimators': 100, 'min_samples_split': 10, 'min_samples_leaf': 4, 'max_depth': 5}
Mean Squared Error: 37.38108382013141
R-squared: 0.24787742552140726
mean absolute error: 3.5582910596792368


In [43]:
# Define the hyperparameter search space for Random Forest Regressor
param_dist_reg = {
    'n_estimators': [50, 100, 150],
    'max_depth': [5, 10, 15],
    'min_samples_split': [10, 15, 20],
    'min_samples_leaf': [2, 4, 6]
}

# Create the model for Random Forest Regressor
rf_model = RandomForestRegressor(random_state=1)

# Create RandomizedSearchCV object (similar to HGB)
random_search_reg = RandomizedSearchCV(
    estimator=rf_model,
    param_distributions=param_dist_reg,
    n_iter=25,
    scoring={'r2': r2_scorer, 'mae': mae_scorer},  # Multiple scorers
    refit='r2',  # Refit on R-squared
    cv=5,
    random_state=1,
    n_jobs=-1
)

# Perform the hyperparameter search
random_search_reg.fit(X_train, y_train)

# Get the best hyperparameters and model
best_params = random_search_reg.best_params_
print(f"Best Hyperparameters: {best_params}")

# Get the best model
best_rf_model = random_search_reg.best_estimator_

# Make predictions on the test set with the Random Forest Regressor
y_r_pred = best_rf_model.predict(X_test)

# Calculate the Mean Squared Error (MSE), Mean Absolute Error (MAE), and R-squared (R^2) for Random Forests.
r_mse = mean_squared_error(y_test, y_r_pred)
r_r2 = r2_score(y_test, y_r_pred)
r_mae = mean_absolute_error(y_test, y_r_pred)

print(f"Mean Squared Error: {r_mse}")
print(f"R-squared: {r_r2}")
print(f"mean absolute error: {r_mae}")

  _data = np.array(data, dtype=dtype, copy=copy,


Best Hyperparameters: {'n_estimators': 100, 'min_samples_split': 15, 'min_samples_leaf': 2, 'max_depth': 10}
Mean Squared Error: 17.900895599495165
R-squared: 0.49718000005206253
mean absolute error: 2.6397685170607654


In [44]:
# Define the hyperparameter search space for Random Forest Regressor
param_dist_reg = {
    'n_estimators': [50, 100, 150],
    'max_depth': [5, 10, 15],
    'min_samples_split': [10, 15, 20],
    'min_samples_leaf': [2, 4, 6]
}

# Create the model for Random Forest Regressor
rf_model = RandomForestRegressor(random_state=1)

# Create RandomizedSearchCV object (similar to HGB)
random_search_reg = RandomizedSearchCV(
    estimator=rf_model,
    param_distributions=param_dist_reg,
    n_iter=25,
    scoring={'r2': r2_scorer, 'mae': mae_scorer},  # Multiple scorers
    refit='r2',  # Refit on R-squared
    cv=5,
    random_state=1,
    n_jobs=-1
)

# Perform the hyperparameter search
random_search_reg.fit(X_train, y_train)

# Get the best hyperparameters and model
best_params = random_search_reg.best_params_
print(f"Best Hyperparameters: {best_params}")

# Get the best model
best_rf_model = random_search_reg.best_estimator_

# Make predictions on the test set with the Random Forest Regressor
y_r_pred = best_rf_model.predict(X_test)

# Calculate the Mean Squared Error (MSE), Mean Absolute Error (MAE), and R-squared (R^2) for Random Forests.
r_mse = mean_squared_error(y_test, y_r_pred)
r_r2 = r2_score(y_test, y_r_pred)
r_mae = mean_absolute_error(y_test, y_r_pred)

print(f"Mean Squared Error: {r_mse}")
print(f"R-squared: {r_r2}")
print(f"mean absolute error: {r_mae}")

Best Hyperparameters: {'n_estimators': 100, 'min_samples_split': 15, 'min_samples_leaf': 2, 'max_depth': 10}
Mean Squared Error: 17.900895599495165
R-squared: 0.49718000005206253
mean absolute error: 2.6397685170607654


In [45]:
# Define the hyperparameter search space for Random Forest Regressor
param_dist_reg = {
    'n_estimators': [75, 100, 200],
    'max_depth': [8, 10, 20],
    'min_samples_split': [10, 15, 30],
    'min_samples_leaf': [2, 4, 6]
}

# Create the model for Random Forest Regressor
rf_model = RandomForestRegressor(random_state=1)

# Create RandomizedSearchCV object (similar to HGB)
random_search_reg = RandomizedSearchCV(
    estimator=rf_model,
    param_distributions=param_dist_reg,
    n_iter=25,
    scoring={'r2': r2_scorer, 'mae': mae_scorer},  # Multiple scorers
    refit='r2',  # Refit on R-squared
    cv=5,
    random_state=1,
    n_jobs=-1
)

# Perform the hyperparameter search
random_search_reg.fit(X_train, y_train)

# Get the best hyperparameters and model
best_params = random_search_reg.best_params_
print(f"Best Hyperparameters: {best_params}")

# Get the best model
best_rf_model = random_search_reg.best_estimator_

# Make predictions on the test set with the Random Forest Regressor
y_r_pred = best_rf_model.predict(X_test)

# Calculate the Mean Squared Error (MSE), Mean Absolute Error (MAE), and R-squared (R^2) for Random Forests.
r_mse = mean_squared_error(y_test, y_r_pred)
r_r2 = r2_score(y_test, y_r_pred)
r_mae = mean_absolute_error(y_test, y_r_pred)

print(f"Mean Squared Error: {r_mse}")
print(f"R-squared: {r_r2}")
print(f"mean absolute error: {r_mae}")

Best Hyperparameters: {'n_estimators': 100, 'min_samples_split': 15, 'min_samples_leaf': 2, 'max_depth': 10}
Mean Squared Error: 17.900895599495165
R-squared: 0.49718000005206253
mean absolute error: 2.6397685170607654


In [46]:
# Define the hyperparameter search space for Random Forest Regressor
param_dist_reg = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30],
    'min_samples_split': [15, 35, 55],
    'min_samples_leaf': [2, 4, 6]
}

# Create the model for Random Forest Regressor
rf_model = RandomForestRegressor(random_state=1)

# Create RandomizedSearchCV object (similar to HGB)
random_search_reg = RandomizedSearchCV(
    estimator=rf_model,
    param_distributions=param_dist_reg,
    n_iter=25,
    scoring={'r2': r2_scorer, 'mae': mae_scorer},  # Multiple scorers
    refit='r2',  # Refit on R-squared
    cv=5,
    random_state=1,
    n_jobs=-1
)

# Perform the hyperparameter search
random_search_reg.fit(X_train, y_train)

# Get the best hyperparameters and model
best_params = random_search_reg.best_params_
print(f"Best Hyperparameters: {best_params}")

# Get the best model
best_rf_model = random_search_reg.best_estimator_

# Make predictions on the test set with the Random Forest Regressor
y_r_pred = best_rf_model.predict(X_test)

# Calculate the Mean Squared Error (MSE), Mean Absolute Error (MAE), and R-squared (R^2) for Random Forests.
r_mse = mean_squared_error(y_test, y_r_pred)
r_r2 = r2_score(y_test, y_r_pred)
r_mae = mean_absolute_error(y_test, y_r_pred)

print(f"Mean Squared Error: {r_mse}")
print(f"R-squared: {r_r2}")
print(f"mean absolute error: {r_mae}")

Best Hyperparameters: {'n_estimators': 300, 'min_samples_split': 15, 'min_samples_leaf': 2, 'max_depth': 10}
Mean Squared Error: 18.358640232112
R-squared: 0.49228192721420205
mean absolute error: 2.66464858496764


In [47]:
# Define the hyperparameter search space for Random Forest Regressor
param_dist_reg = {
    'n_estimators': [100, 150, 250],
    'max_depth': [10, 20, 30],
    'min_samples_split': [15, 35, 55],
    'min_samples_leaf': [2, 4, 6]
}

# Create the model for Random Forest Regressor
rf_model = RandomForestRegressor(random_state=1)

# Create RandomizedSearchCV object (similar to HGB)
random_search_reg = RandomizedSearchCV(
    estimator=rf_model,
    param_distributions=param_dist_reg,
    n_iter=25,
    scoring={'r2': r2_scorer, 'mae': mae_scorer},  # Multiple scorers
    refit='r2',  # Refit on R-squared
    cv=5,
    random_state=1,
    n_jobs=-1
)

# Perform the hyperparameter search
random_search_reg.fit(X_train, y_train)

# Get the best hyperparameters and model
best_params = random_search_reg.best_params_
print(f"Best Hyperparameters: {best_params}")

# Get the best model
best_rf_model = random_search_reg.best_estimator_

# Make predictions on the test set with the Random Forest Regressor
y_r_pred = best_rf_model.predict(X_test)

# Calculate the Mean Squared Error (MSE), Mean Absolute Error (MAE), and R-squared (R^2) for Random Forests.
r_mse = mean_squared_error(y_test, y_r_pred)
r_r2 = r2_score(y_test, y_r_pred)
r_mae = mean_absolute_error(y_test, y_r_pred)

print(f"Mean Squared Error: {r_mse}")
print(f"R-squared: {r_r2}")
print(f"mean absolute error: {r_mae}")

Best Hyperparameters: {'n_estimators': 250, 'min_samples_split': 15, 'min_samples_leaf': 2, 'max_depth': 10}
Mean Squared Error: 18.348343979326263
R-squared: 0.49257004515939246
mean absolute error: 2.6625904431614718


In [48]:
# Define the hyperparameter search space for Random Forest Regressor
param_dist_reg = {
    'n_estimators': [150, 250, 350, 450],
    'max_depth': [10, 20, 30, 40],
    'min_samples_split': [15, 35, 55, 75],
    'min_samples_leaf': [2, 4, 6, 8]
}

# Create the model for Random Forest Regressor
rf_model = RandomForestRegressor(random_state=1)

# Create RandomizedSearchCV object (similar to HGB)
random_search_reg = RandomizedSearchCV(
    estimator=rf_model,
    param_distributions=param_dist_reg,
    n_iter=25,
    scoring={'r2': r2_scorer, 'mae': mae_scorer},  # Multiple scorers
    refit='r2',  # Refit on R-squared
    cv=5,
    random_state=1,
    n_jobs=-1
)

# Perform the hyperparameter search
random_search_reg.fit(X_train, y_train)

# Get the best hyperparameters and model
best_params = random_search_reg.best_params_
print(f"Best Hyperparameters: {best_params}")

# Get the best model
best_rf_model = random_search_reg.best_estimator_

# Make predictions on the test set with the Random Forest Regressor
y_r_pred = best_rf_model.predict(X_test)

# Calculate the Mean Squared Error (MSE), Mean Absolute Error (MAE), and R-squared (R^2) for Random Forests.
r_mse = mean_squared_error(y_test, y_r_pred)
r_r2 = r2_score(y_test, y_r_pred)
r_mae = mean_absolute_error(y_test, y_r_pred)

print(f"Mean Squared Error: {r_mse}")
print(f"R-squared: {r_r2}")
print(f"mean absolute error: {r_mae}")

Best Hyperparameters: {'n_estimators': 450, 'min_samples_split': 15, 'min_samples_leaf': 8, 'max_depth': 10}
Mean Squared Error: 18.777789215683814
R-squared: 0.485768406447904
mean absolute error: 2.6973421196246625


In [49]:
# Define the hyperparameter search space for Random Forest Regressor
param_dist_reg = {
    'n_estimators': [100, 200, 300, 400],
    'max_depth': [10, 20, 30, 40],
    'min_samples_split': [15, 35, 55, 75],
    'min_samples_leaf': [1, 2, 4, 6]
}

# Create the model for Random Forest Regressor
rf_model = RandomForestRegressor(random_state=1)

# Create RandomizedSearchCV object (similar to HGB)
random_search_reg = RandomizedSearchCV(
    estimator=rf_model,
    param_distributions=param_dist_reg,
    n_iter=25,
    scoring={'r2': r2_scorer, 'mae': mae_scorer},  # Multiple scorers
    refit='r2',  # Refit on R-squared
    cv=5,
    random_state=1,
    n_jobs=-1
)

# Perform the hyperparameter search
random_search_reg.fit(X_train, y_train)

# Get the best hyperparameters and model
best_params = random_search_reg.best_params_
print(f"Best Hyperparameters: {best_params}")

# Get the best model
best_rf_model = random_search_reg.best_estimator_

# Make predictions on the test set with the Random Forest Regressor
y_r_pred = best_rf_model.predict(X_test)

# Calculate the Mean Squared Error (MSE), Mean Absolute Error (MAE), and R-squared (R^2) for Random Forests.
r_mse = mean_squared_error(y_test, y_r_pred)
r_r2 = r2_score(y_test, y_r_pred)
r_mae = mean_absolute_error(y_test, y_r_pred)

print(f"Mean Squared Error: {r_mse}")
print(f"R-squared: {r_r2}")
print(f"mean absolute error: {r_mae}")

Best Hyperparameters: {'n_estimators': 400, 'min_samples_split': 55, 'min_samples_leaf': 2, 'max_depth': 10}
Mean Squared Error: 23.92497566088106
R-squared: 0.4239082506998882
mean absolute error: 2.9652493531921866


In [50]:
# Define the hyperparameter search space for Random Forest Regressor
param_dist_reg = {
    'n_estimators': [50, 100, 150],
    'max_depth': [5, 10, 15],
    'min_samples_split': [5, 10, 15],
    'min_samples_leaf': [2, 4, 6]
}

# Create the model for Random Forest Regressor
rf_model = RandomForestRegressor(random_state=1)

# Create RandomizedSearchCV object
random_search_reg = RandomizedSearchCV(
    estimator=rf_model,
    param_distributions=param_dist_reg,
    n_iter=20,
    scoring=r2_scorer,  # Use R-squared as the primary scoring metric
    cv=5,
    random_state=1,
    n_jobs=-1
)

# Perform the hyperparameter search
random_search_reg.fit(X_train, y_train)

# Get the best hyperparameters and model
best_params = random_search_reg.best_params_
print(f"Best Hyperparameters: {best_params}")

# Get the best model
best_rf_model = random_search_reg.best_estimator_

# Make predictions on the test set with the Random Forest Regressor
y_r_pred = best_rf_model.predict(X_test)

# Calculate the Mean Squared Error (MSE), Mean Absolute Error (MAE), and R-squared (R^2) for Random Forests.
r_mse = mean_squared_error(y_test, y_r_pred)
r_r2 = r2_score(y_test, y_r_pred)
r_mae = mean_absolute_error(y_test, y_r_pred)

print(f"Mean Squared Error: {r_mse}")
print(f"R-squared: {r_r2}")
print(f"mean absolute error: {r_mae}")

Best Hyperparameters: {'n_estimators': 100, 'min_samples_split': 15, 'min_samples_leaf': 2, 'max_depth': 10}
Mean Squared Error: 17.900895599495165
R-squared: 0.49718000005206253
mean absolute error: 2.6397685170607654


###### Iterations for Histogram-Based Gradient Boosting

In [51]:
# Define the hyperparameter search space for Histogram-Based Gradient Boosting Regressor wrapped with MultiOutputRegressor
param_dist_hist = {
    'estimator__learning_rate': [0.01, 0.1, 0.2],
    'estimator__max_depth': [3, 5, 7],
    'estimator__l2_regularization': [0.0, 0.1, 0.2]
}

# Create the Histogram-Based Gradient Boosting Regressor and wrap it with MultiOutputRegressor
hgb_model = MultiOutputRegressor(HistGradientBoostingRegressor(random_state=1))

# Create RandomizedSearchCV object with multiple scoring metrics
random_search_hist = RandomizedSearchCV(
    estimator=hgb_model,
    param_distributions=param_dist_hist,
    n_iter=10,
    scoring={'r2': r2_scorer, 'mae': mae_scorer},  # Multiple scorers
    refit='r2',  # Refit on R-squared
    cv=5,
    random_state=1,
    n_jobs=-1
)

# Perform the hyperparameter search
random_search_hist.fit(X_train, y_train)

# Get the best hyperparameters and model
best_params = random_search_hist.best_params_
print(f"Best Hyperparameters: {best_params}")

# Get the best model
best_hgb_model = random_search_hist.best_estimator_

# Make predictions on the test set with the Histogram-Based Gradient Boosting Regressor
y_h_pred = best_hgb_model.predict(X_test)

# Calculate the Mean Squared Error (MSE), Mean Absolute Error (MAE), and R-squared (R^2) for Histogram-Based Gradient Boosting.
h_mse = mean_squared_error(y_test, y_h_pred)
h_r2 = r2_score(y_test, y_h_pred)
h_mae = mean_absolute_error(y_test, y_h_pred)

print(f"Mean Squared Error: {h_mse}")
print(f"R-squared: {h_r2}")
print(f"mean absolute error: {h_mae}")

Best Hyperparameters: {'estimator__max_depth': 7, 'estimator__learning_rate': 0.01, 'estimator__l2_regularization': 0.2}
Mean Squared Error: 32.18282924168508
R-squared: 0.3320401942652549
mean absolute error: 3.2649741520630413


In [52]:
# Define the hyperparameter search space for Histogram-Based Gradient Boosting Regressor wrapped with MultiOutputRegressor
param_dist_hist = {
    'estimator__learning_rate': [0.01, 0.1, 0.2],
    'estimator__max_depth': [None, 3, 5],
    'estimator__min_samples_leaf': [10, 20, 40],
    'estimator__l2_regularization': [0.0, 0.1, 0.2]
}

# Create the Histogram-Based Gradient Boosting Regressor and wrap it with MultiOutputRegressor
hgb_model = MultiOutputRegressor(HistGradientBoostingRegressor(random_state=1))

# Create RandomizedSearchCV object with multiple scoring metrics
random_search_hist = RandomizedSearchCV(
    estimator=hgb_model,
    param_distributions=param_dist_hist,
    n_iter=20,
    scoring=r2_scorer,  # Use R-squared as the primary scoring metric
    cv=5,
    random_state=1,
    n_jobs=-1
)

# Perform the hyperparameter search
random_search_hist.fit(X_train, y_train)

# Get the best hyperparameters and model
best_params = random_search_hist.best_params_
print(f"Best Hyperparameters: {best_params}")

# Get the best model
best_hgb_model = random_search_hist.best_estimator_

# Make predictions on the test set with the Histogram-Based Gradient Boosting Regressor
y_h_pred = best_hgb_model.predict(X_test)

# Calculate the Mean Squared Error (MSE), Mean Absolute Error (MAE), and R-squared (R^2) for Histogram-Based Gradient Boosting.
h_mse = mean_squared_error(y_test, y_h_pred)
h_r2 = r2_score(y_test, y_h_pred)
h_mae = mean_absolute_error(y_test, y_h_pred)

print(f"Mean Squared Error: {h_mse}")
print(f"R-squared: {h_r2}")
print(f"mean absolute error: {h_mae}")

Best Hyperparameters: {'estimator__min_samples_leaf': 40, 'estimator__max_depth': None, 'estimator__learning_rate': 0.01, 'estimator__l2_regularization': 0.0}
Mean Squared Error: 34.768348922898916
R-squared: 0.28056428760433166
mean absolute error: 3.322235227775795


In [53]:
# Define the hyperparameter search space for Histogram-Based Gradient Boosting Regressor wrapped with MultiOutputRegressor
param_dist_hist = {
    'estimator__learning_rate': [0.01, 0.1, 0.2],
    'estimator__max_depth': [None, 3, 5],
    'estimator__min_samples_leaf': [10, 20, 40],
    'estimator__l2_regularization': [0.0, 0.1, 0.2]
}

# Create the Histogram-Based Gradient Boosting Regressor and wrap it with MultiOutputRegressor
hgb_model = MultiOutputRegressor(HistGradientBoostingRegressor(random_state=1))

# Create RandomizedSearchCV object with multiple scoring metrics
random_search_hist = RandomizedSearchCV(
    estimator=hgb_model,
    param_distributions=param_dist_hist,
    n_iter=10,
    scoring=r2_scorer,  # Use R-squared as the primary scoring metric
    cv=5,
    random_state=1,
    n_jobs=-1
)

# Perform the hyperparameter search
random_search_hist.fit(X_train, y_train)

# Get the best hyperparameters and model
best_params = random_search_hist.best_params_
print(f"Best Hyperparameters: {best_params}")

# Get the best model
best_hgb_model = random_search_hist.best_estimator_

# Make predictions on the test set with the Histogram-Based Gradient Boosting Regressor
y_h_pred = best_hgb_model.predict(X_test)

# Calculate the Mean Squared Error (MSE), Mean Absolute Error (MAE), and R-squared (R^2) for Histogram-Based Gradient Boosting.
h_mse = mean_squared_error(y_test, y_h_pred)
h_r2 = r2_score(y_test, y_h_pred)
h_mae = mean_absolute_error(y_test, y_h_pred)

print(f"Mean Squared Error: {h_mse}")
print(f"R-squared: {h_r2}")
print(f"mean absolute error: {h_mae}")

Best Hyperparameters: {'estimator__min_samples_leaf': 10, 'estimator__max_depth': None, 'estimator__learning_rate': 0.01, 'estimator__l2_regularization': 0.1}
Mean Squared Error: 27.707811341396553
R-squared: 0.39456052080811127
mean absolute error: 3.123993487397762


###### Iterations for K-Nearest Neighbors Regressor

In [54]:
# Define the hyperparameter search space for K-Nearest Neighbors Regressor
param_dist_knn = {
    'n_neighbors': [3, 5, 7, 9],
    'weights': ['uniform', 'distance'],
    'p': [1, 2]  # 1 for Manhattan distance, 2 for Euclidean distance
}

# Creat the model for K-Nearest Neighbors Regressor
knn_model = KNeighborsRegressor()

# Create RandomizedSearchCV object
random_search_knn = RandomizedSearchCV(
    estimator=knn_model,
    param_distributions=param_dist_knn,
    n_iter=10,
    scoring=r2_scorer,  # Use R-squared as the primary scoring metric
    cv=5,
    random_state=1,
    n_jobs=-1
)

# Perform the hyperparameter search
random_search_knn.fit(X_train, y_train)

# Get the best hyperparameters and model
best_params = random_search_knn.best_params_
print(f"Best Hyperparameters: {best_params}")

# Get the best model
best_knn_model = random_search_knn.best_estimator_

# Make predictions on the test set with the K-Nearest Neighbors Regressor
y_k_pred = best_knn_model.predict(X_test)

# Calculate the Mean Squared Error (MSE), Mean Absolute Error (MAE), and R-squared (R^2) for K-Nearest Neighbors Regressor.
k_mse = mean_squared_error(y_test, y_k_pred)
k_r2 = r2_score(y_test, y_k_pred)
k_mae = mean_absolute_error(y_test, y_k_pred)

print(f"Mean Squared Error: {k_mse}")
print(f"R-squared: {k_r2}")
print(f"mean absolute error: {k_mae}")

Best Hyperparameters: {'weights': 'distance', 'p': 1, 'n_neighbors': 9}
Mean Squared Error: 18.85456904070352
R-squared: 0.46409840345829956
mean absolute error: 2.643735813566883


In [55]:
# Define the hyperparameter search space for K-Nearest Neighbors Regressor
param_dist_knn = {
    'n_neighbors': [3, 5, 7, 9],
    'weights': ['uniform', 'distance'],
    'p': [1, 2]  # 1 for Manhattan distance, 2 for Euclidean distance
}

# Creat the model for K-Nearest Neighbors Regressor
knn_model = KNeighborsRegressor()

# Create RandomizedSearchCV object
random_search_knn = RandomizedSearchCV(
    estimator=knn_model,
    param_distributions=param_dist_knn,
    n_iter=20,
    scoring=r2_scorer,  # Use R-squared as the primary scoring metric
    cv=5,
    random_state=1,
    n_jobs=-1
)

# Perform the hyperparameter search
random_search_knn.fit(X_train, y_train)

# Get the best hyperparameters and model
best_params = random_search_knn.best_params_
print(f"Best Hyperparameters: {best_params}")

# Get the best model
best_knn_model = random_search_knn.best_estimator_

# Make predictions on the test set with the K-Nearest Neighbors Regressor
y_k_pred = best_knn_model.predict(X_test)

# Calculate the Mean Squared Error (MSE), Mean Absolute Error (MAE), and R-squared (R^2) for K-Nearest Neighbors Regressor.
k_mse = mean_squared_error(y_test, y_k_pred)
k_r2 = r2_score(y_test, y_k_pred)
k_mae = mean_absolute_error(y_test, y_k_pred)

print(f"Mean Squared Error: {k_mse}")
print(f"R-squared: {k_r2}")
print(f"mean absolute error: {k_mae}")



Best Hyperparameters: {'weights': 'distance', 'p': 1, 'n_neighbors': 9}
Mean Squared Error: 18.85456904070352
R-squared: 0.46409840345829956
mean absolute error: 2.643735813566883


In [56]:
# Define the hyperparameter search space for K-Nearest Neighbors Regressor
param_dist_knn = {
    'n_neighbors': [7, 9, 11, 13],
    'weights': ['uniform', 'distance'],
    'p': [1, 2]  # 1 for Manhattan distance, 2 for Euclidean distance
}

# Creat the model for K-Nearest Neighbors Regressor
knn_model = KNeighborsRegressor()

# Create RandomizedSearchCV object
random_search_knn = RandomizedSearchCV(
    estimator=knn_model,
    param_distributions=param_dist_knn,
    n_iter=16,
    scoring=r2_scorer,  # Use R-squared as the primary scoring metric
    cv=5,
    random_state=1,
    n_jobs=-1
)

# Perform the hyperparameter search
random_search_knn.fit(X_train, y_train)

# Get the best hyperparameters and model
best_params = random_search_knn.best_params_
print(f"Best Hyperparameters: {best_params}")

# Get the best model
best_knn_model = random_search_knn.best_estimator_

# Make predictions on the test set with the K-Nearest Neighbors Regressor
y_k_pred = best_knn_model.predict(X_test)

# Calculate the Mean Squared Error (MSE), Mean Absolute Error (MAE), and R-squared (R^2) for K-Nearest Neighbors Regressor.
k_mse = mean_squared_error(y_test, y_k_pred)
k_r2 = r2_score(y_test, y_k_pred)
k_mae = mean_absolute_error(y_test, y_k_pred)

print(f"Mean Squared Error: {k_mse}")
print(f"R-squared: {k_r2}")
print(f"mean absolute error: {k_mae}")

Best Hyperparameters: {'weights': 'distance', 'p': 1, 'n_neighbors': 11}
Mean Squared Error: 19.612298984089254
R-squared: 0.46018526487952816
mean absolute error: 2.677782519371975


##### **Apply Boosting Algorithms**

In [57]:
# prompt: create an AdaBoost regressor with the base estimator being a Histogram-Based Gradient Boosting wrapped with MultiOutputRegressor with sci-kit learn

# Create the base estimator (Histogram-Based Gradient Boosting)
# Removed MultiOutputRegressor wrapper from the base estimator
base_estimator = HistGradientBoostingRegressor(random_state=1)

# Create the AdaBoostRegressor with the base estimator
ada_model = AdaBoostRegressor(estimator=base_estimator, random_state=1)

# Create the MultiOutputRegressor using the AdaBoostRegressor as the estimator
# This allows to handle multiple targets
multi_output_ada_model = MultiOutputRegressor(ada_model)

# Train the MultiOutputRegressor model
multi_output_ada_model.fit(X_train, y_train)

In [58]:
# Make predictions with the AdaBoostRegressor
y_ada_pred = multi_output_ada_model.predict(X_test)

# Print the predicted crime counts for the test set
print("AdaBoostRegressor Predicted crime counts for (test set):", y_ada_pred)

AdaBoostRegressor Predicted crime counts for (test set): [[ 20.66629907  87.17995984  14.41725162   4.78233754   6.05781044
  124.56540556]
 [ 19.38154797  66.29556166  14.76898081   8.0926096    6.51077534
  116.5561135 ]
 [ 23.20181503  79.18185071  14.65249415   4.65244405   4.52227489
  123.05092625]
 ...
 [  6.92463358   2.09002904   2.95467312   0.83790368   0.99710242
   13.44938033]
 [  7.78974773   2.80687418   3.02420033   0.75488071   1.08868655
   15.64016799]
 [  7.64313498   3.0028289    2.97980655   0.66599503   0.5909388
   14.50965179]]


In [59]:
# Evaluate the AdaBoostRegressor model
ada_mse = mean_squared_error(y_test, y_ada_pred)
ada_r2 = r2_score(y_test, y_ada_pred)
ada_mae = mean_absolute_error(y_test, y_ada_pred)

print(f"AdaBoostRegressor Mean Squared Error: {ada_mse}")
print(f"AdaBoostRegressor R-squared: {ada_r2}")
print(f"AdaBoostRegressor Mean Absolute Error: {ada_mae}")

AdaBoostRegressor Mean Squared Error: 15.514865328785016
AdaBoostRegressor R-squared: 0.4603984456235
AdaBoostRegressor Mean Absolute Error: 2.5524177817460036


In [60]:
# prompt: Create a Regression Chain model with the base_estimator being the best_rf_model Random Forests Regressor created in previous cells  with sci-kit learn

# Create a Regression Chain model with the best_rf_model as the base estimator
regression_chain_rf_model = RegressorChain(base_estimator=best_rf_model)

# Train the MultiOutputRegressor model
regression_chain_rf_model.fit(X_train, y_train)

In [61]:
# Make predictions using the Regression Chain model
y_chain_rf_pred = regression_chain_rf_model.predict(X_test)

# Print the predictions from the Regression Chain model
print("Random Forest Regression Chain Predicted crime counts for (test set):", y_chain_rf_pred)

Random Forest Regression Chain Predicted crime counts for (test set): [[ 19.28641161  63.35615897  11.58277805   4.23079919   5.15441394
  107.17621358]
 [ 19.41042559  62.90486529  11.58277805   4.12595792   5.09692296
  105.10204778]
 [ 20.40146526  62.72135652  12.43378031   4.24440268   5.07319568
  105.10204778]
 ...
 [  6.77701568   1.71043656   3.99087977   0.88878896   0.55660771
   14.7467921 ]
 [  7.31416377   1.55417242   3.63113415   0.96714881   0.55283734
   14.7467921 ]
 [  6.52442348   1.65951764   3.6284666    0.98267161   0.55530954
   14.7467921 ]]


In [62]:
# Evaluate the Regression Chain model
chain_rf_mse = mean_squared_error(y_test, y_chain_rf_pred)
chain_rf_r2 = r2_score(y_test, y_chain_rf_pred)
chain_rf_mae = mean_absolute_error(y_test, y_chain_rf_pred)

print(f"Regression Chain Model Mean Squared Error: {chain_rf_mse}")
print(f"Regression Chain Model R-squared: {chain_rf_r2}")
print(f"Regression Chain Model Mean Absolute Error: {chain_rf_mae}")

Regression Chain Model Mean Squared Error: 14.907590528079728
Regression Chain Model R-squared: 0.5211560076522394
Regression Chain Model Mean Absolute Error: 2.4530789200162197




---



### Creating the Testing and Training Datasets - Second Approach

In [63]:
# Split the training data into the features and targets
X_train = train_data[['HOOD_158', 'OCC_YEAR', 'OCC_MONTH']]  # Features

X_test = test_data[['HOOD_158', 'OCC_YEAR', 'OCC_MONTH']]  # Features

y_train = train_data[['Assault', 'Auto Theft', 'Break and Enter', 'Robbery', 'Theft Over', 'Total_Count']]  # Target

y_test = test_data[['Assault', 'Auto Theft', 'Break and Enter', 'Robbery', 'Theft Over', 'Total_Count']]  # Target

In [64]:
# Split the training data into the features and targets, but this time make a seperate dataframe for each target column
X_train = train_data[['HOOD_158', 'OCC_YEAR', 'OCC_MONTH']]  # Features

X_test = test_data[['HOOD_158', 'OCC_YEAR', 'OCC_MONTH']]  # Features

y_train_A = train_data[['Assault']]  # Assault Train Target
y_test_A = test_data[['Assault']]  # Assault Test Target

y_train_AT = train_data[['Auto Theft']]  # Auto Theft Train Target
y_test_AT = test_data[['Auto Theft']]  # Auto Theft Test Target

y_train_BE = train_data[['Break and Enter']]  # Break and Enter Train Target
y_test_BE = test_data[['Break and Enter']]  # Break and Enter Test Target

y_train_R = train_data[['Robbery']]  # Robbery Train Target
y_test_R = test_data[['Robbery']]  # Robbery Test Target

y_train_TO = train_data[['Theft Over']]  # Theft Over Train Target
y_test_TO = test_data[['Theft Over']]  # Theft Over Test Target

y_train_TC = train_data[['Total_Count']]  # Total Count Train Target
y_test_TC = test_data[['Total_Count']]  # Total Count Test Target

### Testing Different Models - Second Approach

#### Predicting Total Counts

In [65]:
# Initialize the Random Forest Regressor for the Total Count Targets
r_model_TC = RandomForestRegressor()

# Train the model on the training data
r_model_TC.fit(X_train, y_train_TC)

  return fit_method(estimator, *args, **kwargs)


In [66]:
# Initialize the Histogram-Based Gradient Boosting Regressor for the Total Count Targets
h_model_TC = HistGradientBoostingRegressor()

# Train the model on the training data
h_model_TC.fit(X_train, y_train_TC)

  y = column_or_1d(y, warn=True)


In [67]:
# Initialize the Lasso Regressor for the Total Count Targets
l_model_TC = Lasso()

# Train the model on the training data
l_model_TC.fit(X_train, y_train_TC)

In [68]:
# Initialize the Extra-Trees Regressor for the Total Count Targets
e_model_TC = ExtraTreesRegressor()

# Train the model on the training data
e_model_TC.fit(X_train, y_train_TC)

  return fit_method(estimator, *args, **kwargs)


In [69]:
# Initialize the K-Nearest Neighbors Regressor for the Total Count Targets
k_model_TC = KNeighborsRegressor()

# Train the model on the training data
k_model_TC.fit(X_train, y_train_TC)

In [70]:
# Initialize the Elastic Net Regressor for the Total Count Targets
e_n_model_TC = ElasticNet()

# Train the model on the training data
e_n_model_TC.fit(X_train, y_train_TC)

In [71]:
# Initialize the Radius Neighbors Regressor for the Total Count Targets
r_n_model_TC = RadiusNeighborsRegressor()

# Train the model on the training data
r_n_model_TC.fit(X_train, y_train_TC)

In [72]:
# Make predictions on the test set with the Random Forest Regressor for the Total Count Targets
y_r_TC_pred = r_model_TC.predict(X_test)

# Make predictions on the test set with the Histogram-Based Gradient Boosting Regressor for the Total Count Targets
y_h_TC_pred = h_model_TC.predict(X_test)

# Make predictions on the test set with the Lasso Regressor for the Total Count Targets
y_l_TC_pred = l_model_TC.predict(X_test)

# Make predictions on the test set with the Extra-Trees Regressor for the Total Count Targets
y_e_TC_pred = e_model_TC.predict(X_test)

# Make predictions on the test set with the K-Nearest Neighbors Regressor for the Total Count Targets
y_k_TC_pred = k_model_TC.predict(X_test)

# Make predictions on the test set with the Elastic Net Regressor for the Total Count Targets
y_e_n_TC_pred = e_n_model_TC.predict(X_test)

# Make predictions on the test set with the Radius Neighbors Regressor for the Total Count Targets
y_r_n_TC_pred = r_n_model_TC.predict(X_test)

# Print the predicted crime counts for the test set for all the models for the Total Count Targets
print("Random Forest Predicted crime counts for (test set):", y_r_TC_pred)
print("Histogram-Based Gradient Boosting Predicted crime counts for (test set):", y_h_TC_pred)
print("Lasso Regressor Predicted crime counts for (test set):", y_l_TC_pred)
print("Extra-Trees Regressor Predicted crime counts for (test set):", y_e_TC_pred)
print("K-Nearest Neighbors Regressor Predicted crime counts for (test set):", y_k_TC_pred)
print("Elastic Net Regressor Predicted crime counts for (test set):", y_e_n_TC_pred)
print("Radius Neighbors Regressor Predicted crime counts for (test set):", y_r_n_TC_pred)

Random Forest Predicted crime counts for (test set): [122.11 114.2  115.32 112.76 111.41 122.37  32.56  31.76  31.26  36.3
  40.55  44.06  14.14   7.89  11.22  14.76  14.44   9.45  19.51  13.66
  14.61  14.28  13.17  11.18  12.14   9.5   11.37  12.14   9.57   9.9
  17.88  18.33  26.6   24.5   27.    22.91  23.41  20.7   29.91  18.4
  18.28  22.25   8.09   4.83   5.09   8.01   7.82   9.57  16.24  11.02
  10.29  13.05  18.28  20.15  16.63  12.37  12.82  14.49  17.42  18.65
  20.33  13.13  15.05  12.29  14.37  16.97   9.09   6.23   9.79   6.68
   9.05   9.16  11.75   9.62  10.54   9.12  10.35   7.6   14.51  15.12
  14.54  15.3   19.12  18.41  30.42  31.14  30.78  33.43  38.4   33.28
  21.76  13.2   15.99  15.01  20.09  23.27  10.58  10.25  10.3   10.23
  12.82  12.54   8.98   8.18   7.4    6.92  12.23   9.29  26.23  21.83
  28.24  23.17  27.96  34.46  23.79  15.78  21.41  25.17  28.33  20.39
  17.77  16.66  14.56  20.31  17.26  18.89  23.89  25.35  25.19  24.21
  27.8   23.85  29.52  27.9

  multiarray.copyto(res, fill_value, casting='unsafe')


In [73]:
# Calculate the Mean Squared Error (MSE), Mean Absolute Error (MAE), and R-squared (R^2) for Random Forests Total Count.
r_TC_mse = mean_squared_error(y_test_TC, y_r_TC_pred)
r_TC_r2 = r2_score(y_test_TC, y_r_TC_pred)
r_TC_mae = mean_absolute_error(y_test_TC, y_r_TC_pred)

print(f"Mean Squared Error: {r_TC_mse}")
print(f"R-squared: {r_TC_r2}")
print(f"mean absolute error: {r_TC_mae}")

Mean Squared Error: 47.736269092827
R-squared: 0.7795259054055894
mean absolute error: 5.1626898734177225


In [74]:
# Calculate the Mean Squared Error (MSE), Mean Absolute Error (MAE), and R-squared (R^2) for Histogram-Based Gradient Boosting Total Count.
h_TC_mse = mean_squared_error(y_test_TC, y_h_TC_pred)
h_TC_r2 = r2_score(y_test_TC, y_h_TC_pred)
h_TC_mae = mean_absolute_error(y_test_TC, y_h_TC_pred)

print(f"Mean Squared Error: {h_TC_mse}")
print(f"R-squared: {h_TC_r2}")
print(f"mean absolute error: {h_TC_mae}")

Mean Squared Error: 42.3805269624916
R-squared: 0.8042618644469353
mean absolute error: 5.0338833270972545


In [75]:
# Calculate the Mean Squared Error (MSE), Mean Absolute Error (MAE), and R-squared (R^2) for Lasso Regressor Total Count.
l_TC_mse = mean_squared_error(y_test_TC, y_l_TC_pred)
l_TC_r2 = r2_score(y_test_TC, y_l_TC_pred)
l_TC_mae = mean_absolute_error(y_test_TC, y_l_TC_pred)

print(f"Mean Squared Error: {l_TC_mse}")
print(f"R-squared: {l_TC_r2}")
print(f"mean absolute error: {l_TC_mae}")

Mean Squared Error: 215.0452817500199
R-squared: 0.006794735079959646
mean absolute error: 10.677136243599007


In [76]:
# Calculate the Mean Squared Error (MSE), Mean Absolute Error (MAE), and R-squared (R^2) for Extra-Trees Regressor Total Count.
e_TC_mse = mean_squared_error(y_test_TC, y_e_TC_pred)
e_TC_r2 = r2_score(y_test_TC, y_e_TC_pred)
e_TC_mae = mean_absolute_error(y_test_TC, y_e_TC_pred)

print(f"Mean Squared Error: {e_TC_mse}")
print(f"R-squared: {e_TC_r2}")
print(f"mean absolute error: {e_TC_mae}")

Mean Squared Error: 61.83626058368495
R-squared: 0.7144038731057731
mean absolute error: 5.868597046413503


In [77]:
# Calculate the Mean Squared Error (MSE), Mean Absolute Error (MAE), and R-squared (R^2) for K-Nearest Neighbors Regressor Total Count.
k_TC_mse = mean_squared_error(y_test_TC, y_k_TC_pred)
k_TC_r2 = r2_score(y_test_TC, y_k_TC_pred)
k_TC_mae = mean_absolute_error(y_test_TC, y_k_TC_pred)

print(f"Mean Squared Error: {k_TC_mse}")
print(f"R-squared: {k_TC_r2}")
print(f"mean absolute error: {k_TC_mae}")

Mean Squared Error: 78.54392405063292
R-squared: 0.6372380818601189
mean absolute error: 6.74535864978903


In [78]:
# Calculate the Mean Squared Error (MSE), Mean Absolute Error (MAE), and R-squared (R^2) for Elastic Net Regressor Total Count.
e_n_TC_mse = mean_squared_error(y_test_TC, y_e_n_TC_pred)
e_n_TC_r2 = r2_score(y_test_TC, y_e_n_TC_pred)
e_n_TC_mae = mean_absolute_error(y_test_TC, y_e_n_TC_pred)

print(f"Mean Squared Error: {e_n_TC_mse}")
print(f"R-squared: {e_n_TC_r2}")
print(f"mean absolute error: {e_n_TC_mae}")

Mean Squared Error: 214.33255721979504
R-squared: 0.010086515072044833
mean absolute error: 10.465525613654595


In [79]:
# Calculate the Mean Squared Error (MSE), Mean Absolute Error (MAE), and R-squared (R^2) for Radius Neighbors Regressor Total Count.
r_n_TC_mse = mean_squared_error(y_test_TC, y_r_n_TC_pred)
r_n_TC_r2 = r2_score(y_test_TC, y_r_n_TC_pred)
r_n_TC_mae = mean_absolute_error(y_test_TC, y_r_n_TC_pred)

print(f"Mean Squared Error: {r_n_TC_mse}")
print(f"R-squared: {r_n_TC_r2}")
print(f"mean absolute error: {r_n_TC_mae}")

Mean Squared Error: 8.973691110784241e+34
R-squared: -4.144576986049706e+32
mean absolute error: 9729295397526140.0


#### Predicting Assault Counts

In [80]:
# Initialize the Random Forest Regressor for the Assault Targets.
r_model_A = RandomForestRegressor()

# Train the model on the training data.
r_model_A.fit(X_train, y_train_A)

  return fit_method(estimator, *args, **kwargs)


In [81]:
# Initialize the Histogram-Based Gradient Boosting Regressor for the Assault Targets.
h_model_A = HistGradientBoostingRegressor()

# Train the model on the training data.
h_model_A.fit(X_train, y_train_A)

  y = column_or_1d(y, warn=True)


In [82]:
# Make predictions on the test set with the Random Forest Regressor for the Assault Targets.
y_r_A_pred = r_model_A.predict(X_test)

# Make predictions on the test set with the Histogram-Based Gradient Boosting Regressor for the Assault Targets.
y_h_A_pred = h_model_A.predict(X_test)

In [83]:
# Calculate the Mean Squared Error (MSE), Mean Absolute Error (MAE), and R-squared (R^2) for Random Forests Assault Count.
r_A_mse = mean_squared_error(y_test_A, y_r_A_pred)
r_A_r2 = r2_score(y_test_A, y_r_A_pred)
r_A_mae = mean_absolute_error(y_test_A, y_r_A_pred)

print(f"Mean Squared Error: {r_A_mse}")
print(f"R-squared: {r_A_r2}")
print(f"mean absolute error: {r_A_mae}")

Mean Squared Error: 19.148338607594937
R-squared: 0.7588411055033697
mean absolute error: 3.149915611814346


In [84]:
# Calculate the Mean Squared Error (MSE), Mean Absolute Error (MAE), and R-squared (R^2) for Histogram-Based Gradient Boosting Assault Count.
h_A_mse = mean_squared_error(y_test_A, y_h_A_pred)
h_A_r2 = r2_score(y_test_A, y_h_A_pred)
h_A_mae = mean_absolute_error(y_test_A, y_h_A_pred)

print(f"Mean Squared Error: {h_A_mse}")
print(f"R-squared: {h_A_r2}")
print(f"mean absolute error: {h_A_mae}")

Mean Squared Error: 16.26862998275755
R-squared: 0.7951088654730386
mean absolute error: 3.003620298129256


#### Predicting Auto Theft Counts

In [85]:
# Initialize the Random Forest Regressor for the Auto Theft Targets
r_model_AT = RandomForestRegressor()

# Train the model on the training data
r_model_AT.fit(X_train, y_train_AT)

  return fit_method(estimator, *args, **kwargs)


In [86]:
# Initialize the Histogram-Based Gradient Boosting Regressor for the Auto Theft Targets
h_model_AT = HistGradientBoostingRegressor()

# Train the model on the training data
h_model_AT.fit(X_train, y_train_AT)

  y = column_or_1d(y, warn=True)


In [87]:
# Make predictions on the test set with the Random Forest Regressor for the Auto Theft Targets
y_r_AT_pred = r_model_AT.predict(X_test)

# Make predictions on the test set with the Histogram-Based Gradient Boosting Regressor for the Auto Theft Targets
y_h_AT_pred = h_model_AT.predict(X_test)

In [88]:
# Calculate the Mean Squared Error (MSE), Mean Absolute Error (MAE), and R-squared (R^2) for Random Forests Auto Theft Count
r_AT_mse = mean_squared_error(y_test_AT, y_r_AT_pred)
r_AT_r2 = r2_score(y_test_AT, y_r_AT_pred)
r_AT_mae = mean_absolute_error(y_test_AT, y_r_AT_pred)

print(f"Mean Squared Error: {r_AT_mse}")
print(f"R-squared: {r_AT_r2}")
print(f"mean absolute error: {r_AT_mae}")

Mean Squared Error: 18.80362151898734
R-squared: 0.2978844502973613
mean absolute error: 2.807426160337552


In [89]:
# Calculate the Mean Squared Error (MSE), Mean Absolute Error (MAE), and R-squared (R^2) for Histogram-Based Gradient Boosting Auto Theft Count
h_AT_mse = mean_squared_error(y_test_AT, y_h_AT_pred)
h_AT_r2 = r2_score(y_test_AT, y_h_AT_pred)
h_AT_mae = mean_absolute_error(y_test_AT, y_h_AT_pred)

print(f"Mean Squared Error: {h_AT_mse}")
print(f"R-squared: {h_AT_r2}")
print(f"mean absolute error: {h_AT_mae}")

Mean Squared Error: 12.029894217961466
R-squared: 0.5508112209565743
mean absolute error: 2.4878698854824157


#### Predicting Break and Enter Counts

In [90]:
# Initialize the Random Forest Regressor for the Break and Enter Targets
r_model_BE = RandomForestRegressor()

# Train the model on the training data
r_model_BE.fit(X_train, y_train_BE)

  return fit_method(estimator, *args, **kwargs)


In [91]:
# Initialize the Histogram-Based Gradient Boosting Regressor for the Break and Enter Targets
h_model_BE = HistGradientBoostingRegressor()

# Train the model on the training data
h_model_BE.fit(X_train, y_train_BE)

  y = column_or_1d(y, warn=True)


In [92]:
# Make predictions on the test set with the Random Forest Regressor for the Break and Enter Targets
y_r_BE_pred = r_model_BE.predict(X_test)

# Make predictions on the test set with the Histogram-Based Gradient Boosting Regressor for the Break and Enter Targets
y_h_BE_pred = h_model_BE.predict(X_test)

In [93]:
# Calculate the Mean Squared Error (MSE), Mean Absolute Error (MAE), and R-squared (R^2) for Random Forests Break and Enter Count
r_BE_mse = mean_squared_error(y_test_BE, y_r_BE_pred)
r_BE_r2 = r2_score(y_test_BE, y_r_BE_pred)
r_BE_mae = mean_absolute_error(y_test_BE, y_r_BE_pred)

print(f"Mean Squared Error: {r_BE_mse}")
print(f"R-squared: {r_BE_r2}")
print(f"mean absolute error: {r_BE_mae}")

Mean Squared Error: 8.349552109704641
R-squared: 0.28991457625399264
mean absolute error: 2.056940928270042


In [94]:
# Calculate the Mean Squared Error (MSE), Mean Absolute Error (MAE), and R-squared (R^2) for Histogram-Based Gradient Boosting Break and Enter Count
h_BE_mse = mean_squared_error(y_test_BE, y_h_BE_pred)
h_BE_r2 = r2_score(y_test_BE, y_h_BE_pred)
h_BE_mae = mean_absolute_error(y_test_BE, y_h_BE_pred)

print(f"Mean Squared Error: {h_BE_mse}")
print(f"R-squared: {h_BE_r2}")
print(f"mean absolute error: {h_BE_mae}")

Mean Squared Error: 7.185335567450399
R-squared: 0.38892506039455554
mean absolute error: 1.9032188779372148


#### Predicting Robbery Counts

In [95]:
# Initialize the Random Forest Regressor for the Robbery Targets
r_model_R = RandomForestRegressor()

# Train the model on the training data
r_model_R.fit(X_train, y_train_R)

  return fit_method(estimator, *args, **kwargs)


In [96]:
# Initialize the Histogram-Based Gradient Boosting Regressor for the Robbery Targets
h_model_R = HistGradientBoostingRegressor()

# Train the model on the training data
h_model_R.fit(X_train, y_train_R)

  y = column_or_1d(y, warn=True)


In [97]:
# Make predictions on the test set with the Random Forest Regressor for the Robbery Targets
y_r_R_pred = r_model_R.predict(X_test)

# Make predictions on the test set with the Histogram-Based Gradient Boosting Regressor for the Robbery Targets
y_h_R_pred = h_model_R.predict(X_test)

In [98]:
# Calculate the Mean Squared Error (MSE), Mean Absolute Error (MAE), and R-squared (R^2) for Random Forests Robbery Count.
r_R_mse = mean_squared_error(y_test_R, y_r_R_pred)
r_R_r2 = r2_score(y_test_R, y_r_R_pred)
r_R_mae = mean_absolute_error(y_test_R, y_r_R_pred)

print(f"Mean Squared Error: {r_R_mse}")
print(f"R-squared: {r_R_r2}")
print(f"mean absolute error: {r_R_mae}")

Mean Squared Error: 2.5982489451476796
R-squared: 0.20832223994271581
mean absolute error: 1.113333333333333


In [99]:
# Calculate the Mean Squared Error (MSE), Mean Absolute Error (MAE), and R-squared (R^2) for Histogram-Based Gradient Boosting Robbery Count.
h_R_mse = mean_squared_error(y_test_R, y_h_R_pred)
h_R_r2 = r2_score(y_test_R, y_h_R_pred)
h_R_mae = mean_absolute_error(y_test_R, y_h_R_pred)

print(f"Mean Squared Error: {h_R_mse}")
print(f"R-squared: {h_R_r2}")
print(f"mean absolute error: {h_R_mae}")

Mean Squared Error: 2.218185363946034
R-squared: 0.3241263414730897
mean absolute error: 0.9848720183410007


#### Predicting Theft Over Counts

In [100]:
# Initialize the Random Forest Regressor for the Theft Over Targets.
r_model_TO = RandomForestRegressor()

# Train the model on the training data.
r_model_TO.fit(X_train, y_train_TO)

  return fit_method(estimator, *args, **kwargs)


In [101]:
# Initialize the Histogram-Based Gradient Boosting Regressor for the Theft Over Targets.
h_model_TO = HistGradientBoostingRegressor()

# Train the model on the training data.
h_model_TO.fit(X_train, y_train_TO)

  y = column_or_1d(y, warn=True)


In [102]:
# Make predictions on the test set with the Random Forest Regressor for the Theft Over Targets.
y_r_TO_pred = r_model_TO.predict(X_test)

# Make predictions on the test set with the Histogram-Based Gradient Boosting Regressor for the Theft Over Targets.
y_h_TO_pred = h_model_TO.predict(X_test)

In [103]:
# Calculate the Mean Squared Error (MSE), Mean Absolute Error (MAE), and R-squared (R^2) for Random Forests Theft Over Count
r_TO_mse = mean_squared_error(y_test_TO, y_r_TO_pred)
r_TO_r2 = r2_score(y_test_TO, y_r_TO_pred)
r_TO_mae = mean_absolute_error(y_test_TO, y_r_TO_pred)

print(f"Mean Squared Error: {r_TO_mse}")
print(f"R-squared: {r_TO_r2}")
print(f"mean absolute error: {r_TO_mae}")

Mean Squared Error: 1.4462132911392405
R-squared: 0.2098192952975252
mean absolute error: 0.829029535864979


In [104]:
# Calculate the Mean Squared Error (MSE), Mean Absolute Error (MAE), and R-squared (R^2) for Histogram-Based Gradient Boosting Theft Over Count
h_TO_mse = mean_squared_error(y_test_TO, y_h_TO_pred)
h_TO_r2 = r2_score(y_test_TO, y_h_TO_pred)
h_TO_mae = mean_absolute_error(y_test_TO, y_h_TO_pred)

print(f"Mean Squared Error: {h_TO_mse}")
print(f"R-squared: {h_TO_r2}")
print(f"mean absolute error: {h_TO_mae}")

Mean Squared Error: 1.139272032559403
R-squared: 0.3775255814261935
mean absolute error: 0.7612698567025898


### Optimizing the models - Second Approach

#### Auto Theft Models

##### Apply Regression Chain Boosting Algorithm

In [105]:
# prompt: Create a Regression Chain model with the base_estimator being a Random Forest Regressor, and then train and test the models with the Auto Theft target data with sci-kit learn

# Create a Regression Chain model with Random Forest Regressor as the base estimator
chain_model = RegressorChain(base_estimator=RandomForestRegressor())

# Train the model on the training data for the Auto Theft target
chain_model.fit(X_train, y_train_AT)

# Make predictions on the test set
y_chain_AT_pred = chain_model.predict(X_test)

# Evaluate the model
chain_AT_mse = mean_squared_error(y_test_AT, y_chain_AT_pred)
chain_AT_r2 = r2_score(y_test_AT, y_chain_AT_pred)
chain_AT_mae = mean_absolute_error(y_test_AT, y_chain_AT_pred)

print(f"Regression Chain Random Forest Regressor (Auto Theft) Mean Squared Error: {chain_AT_mse}")
print(f"Regression Chain Random Forest Regressor (Auto Theft) R-squared: {chain_AT_r2}")
print(f"Regression Chain Random Forest Regressor (Auto Theft) Mean Absolute Error: {chain_AT_mae}")

Regression Chain Random Forest Regressor (Auto Theft) Mean Squared Error: 18.42441223628692
Regression Chain Random Forest Regressor (Auto Theft) R-squared: 0.3120438894089549
Regression Chain Random Forest Regressor (Auto Theft) Mean Absolute Error: 2.808438818565401


##### Perform Hyper-Parameter Tuning

In [106]:
# prompt: Perform hyperparameter tuning with sci-kit learn on a Regression Chain model with the base_estimator being a Random Forest Regressor, using the Auto Theft target data with sci-kit learn for training and testing

# Define the parameter grid for hyperparameter tuning
param_grid = {
    'base_estimator__n_estimators': [50, 100, 150],
    'base_estimator__max_depth': [None, 5, 10],
    'base_estimator__min_samples_split': [2, 5, 10],
    'base_estimator__min_samples_leaf': [1, 2, 4]
}

# Create the Regression Chain model with RandomForestRegressor as the base estimator
chain_model = RegressorChain(base_estimator=RandomForestRegressor())

# Create a RandomizedSearchCV object to perform hyperparameter tuning
random_search = RandomizedSearchCV(
    estimator=chain_model,
    param_distributions=param_grid,
    n_iter=20,
    scoring=r2_scorer,  # Use R-Squared for optimization
    cv=5,
    verbose=2,
    random_state=42,
    n_jobs=-1
)

# Fit the RandomizedSearchCV object to the training data for Auto Theft target
random_search.fit(X_train, y_train_AT)

# Get the best estimator and its parameters
best_chain_model = random_search.best_estimator_
best_params = random_search.best_params_

print("Best Parameters:", best_params)

# Make predictions using the best model
y_chain_AT_pred = best_chain_model.predict(X_test)

# Evaluate the best model
chain_AT_mse = mean_squared_error(y_test_AT, y_chain_AT_pred)
chain_AT_r2 = r2_score(y_test_AT, y_chain_AT_pred)
chain_AT_mae = mean_absolute_error(y_test_AT, y_chain_AT_pred)

print(f"Regression Chain Random Forest Regressor (Auto Theft) Mean Squared Error: {chain_AT_mse}")
print(f"Regression Chain Random Forest Regressor (Auto Theft) R-squared: {chain_AT_r2}")
print(f"Regression Chain Random Forest Regressor (Auto Theft) Mean Absolute Error: {chain_AT_mae}")

Fitting 5 folds for each of 20 candidates, totalling 100 fits
Best Parameters: {'base_estimator__n_estimators': 100, 'base_estimator__min_samples_split': 2, 'base_estimator__min_samples_leaf': 1, 'base_estimator__max_depth': 5}
Regression Chain Random Forest Regressor (Auto Theft) Mean Squared Error: 17.48539971878824
Regression Chain Random Forest Regressor (Auto Theft) R-squared: 0.34710603364726056
Regression Chain Random Forest Regressor (Auto Theft) Mean Absolute Error: 2.9645043606439847


##### Apply ADA Boosting Algorithm

In [107]:
# prompt: Create an AdaBoost regressor with the base estimator being a Histogram-Based Gradient Boosting Regressor, and then train and test the models with the Auto Theft target data with sci-kit learn

# Initialize the AdaBoost regressor with HistGradientBoostingRegressor as the base estimator
ada_model_AT = AdaBoostRegressor(estimator=HistGradientBoostingRegressor(), n_estimators=50, random_state=42)

# Train the AdaBoost model on the training data for Auto Theft
ada_model_AT.fit(X_train, y_train_AT)

# Make predictions on the test set with the AdaBoost model for Auto Theft
y_ada_AT_pred = ada_model_AT.predict(X_test)

# Calculate the Mean Squared Error (MSE), Mean Absolute Error (MAE), and R-squared (R^2) for AdaBoost Auto Theft Count
ada_AT_mse = mean_squared_error(y_test_AT, y_ada_AT_pred)
ada_AT_r2 = r2_score(y_test_AT, y_ada_AT_pred)
ada_AT_mae = mean_absolute_error(y_test_AT, y_ada_AT_pred)

print(f"AdaBoost Mean Squared Error: {ada_AT_mse}")
print(f"AdaBoost R-squared: {ada_AT_r2}")
print(f"AdaBoost mean absolute error: {ada_AT_mae}")

  y = column_or_1d(y, warn=True)


AdaBoost Mean Squared Error: 18.659485596532548
AdaBoost R-squared: 0.30326639612753314
AdaBoost mean absolute error: 2.8809512187634048


##### Perform Hyper-Parameter Tuning

In [108]:
# prompt: Perform hyperparameter tuning with sci-kit learn on an AdaBoost regressor with the base estimator being a Histogram-Based Gradient Boosting Regressor,  using the Auto Theft target data with sci-kit learn for training and testing

# Define the parameter grid for hyperparameter tuning
param_grid_ada = {
    'n_estimators': [50, 100, 150],
    'learning_rate': [0.01, 0.1, 0.5],
    'loss': ['linear', 'square', 'exponential'],
    'estimator__learning_rate': [0.01, 0.1, 0.2],
    'estimator__max_depth': [None, 3, 5],
    'estimator__min_samples_leaf': [10, 20, 40],
    'estimator__l2_regularization': [0.0, 0.1, 0.2]
}

# Create the AdaBoost model with HistGradientBoostingRegressor as the base estimator
ada_model_AT = AdaBoostRegressor(estimator=HistGradientBoostingRegressor())

# Create a RandomizedSearchCV object to perform hyperparameter tuning
random_search_ada = RandomizedSearchCV(
    estimator=ada_model_AT,
    param_distributions=param_grid_ada,
    n_iter=20,
    scoring='neg_mean_squared_error',  # Use negative MSE for optimization
    cv=5,
    verbose=2,
    random_state=42,
    n_jobs=-1
)

# Fit the RandomizedSearchCV object to the training data for Auto Theft target
random_search_ada.fit(X_train, y_train_AT)

# Get the best estimator and its parameters
best_ada_model_AT = random_search_ada.best_estimator_
best_params_ada = random_search_ada.best_params_

print("Best Parameters:", best_params_ada)

# Make predictions using the best model
y_ada_AT_pred = best_ada_model_AT.predict(X_test)

# Evaluate the best model
ada_AT_mse = mean_squared_error(y_test_AT, y_ada_AT_pred)
ada_AT_r2 = r2_score(y_test_AT, y_ada_AT_pred)
ada_AT_mae = mean_absolute_error(y_test_AT, y_ada_AT_pred)

print(f"AdaBoost Mean Squared Error: {ada_AT_mse}")
print(f"AdaBoost R-squared: {ada_AT_r2}")
print(f"AdaBoost mean absolute error: {ada_AT_mae}")

Fitting 5 folds for each of 20 candidates, totalling 100 fits


  y = column_or_1d(y, warn=True)


Best Parameters: {'n_estimators': 50, 'loss': 'exponential', 'learning_rate': 0.01, 'estimator__min_samples_leaf': 10, 'estimator__max_depth': 3, 'estimator__learning_rate': 0.1, 'estimator__l2_regularization': 0.0}
AdaBoost Mean Squared Error: 13.446577414823562
AdaBoost R-squared: 0.49791315020382454
AdaBoost mean absolute error: 2.6967522187991984


#### Total Count Models

##### Apply Regression Chain Boosting Algorithm

In [109]:
# prompt: Create a Regression Chain model with the base_estimator being a Random Forest Regressor, and then train and test the models with the Total Count target data with sci-kit learn

# Create a Regression Chain model with Random Forest Regressor as the base estimator
chain_model_TC = RegressorChain(base_estimator=RandomForestRegressor())

# Train the model on the training data for the Total Count target
chain_model_TC.fit(X_train, y_train_TC)

# Make predictions on the test set
y_chain_TC_pred_0 = chain_model_TC.predict(X_test)

# Evaluate the model
chain_TC_mse = mean_squared_error(y_test_TC, y_chain_TC_pred_0)
chain_TC_r2 = r2_score(y_test_TC, y_chain_TC_pred_0)
chain_TC_mae = mean_absolute_error(y_test_TC, y_chain_TC_pred_0)

print(f"Regression Chain Random Forest Regressor (Total Count) Mean Squared Error: {chain_TC_mse}")
print(f"Regression Chain Random Forest Regressor (Total Count) R-squared: {chain_TC_r2}")
print(f"Regression Chain Random Forest Regressor (Total Count) Mean Absolute Error: {chain_TC_mae}")

Regression Chain Random Forest Regressor (Total Count) Mean Squared Error: 47.93471529535864
Regression Chain Random Forest Regressor (Total Count) R-squared: 0.7786093644261555
Regression Chain Random Forest Regressor (Total Count) Mean Absolute Error: 5.159208860759494


##### Perform Hyper-Parameter Tuning

In [110]:
# Define the parameter grid for hyperparameter tuning
param_grid = {
    'base_estimator__n_estimators': [50, 100, 150],
    'base_estimator__max_depth': [None, 5, 10],
    'base_estimator__min_samples_split': [2, 5, 10],
    'base_estimator__min_samples_leaf': [1, 2, 4]
}

# Create the Regression Chain model with RandomForestRegressor as the base estimator
chain_model_TC = RegressorChain(base_estimator=RandomForestRegressor())

# Create a RandomizedSearchCV object to perform hyperparameter tuning
random_search = RandomizedSearchCV(
    estimator=chain_model_TC,
    param_distributions=param_grid,
    n_iter=20,
    scoring='neg_mean_squared_error',  # Use negative MSE for optimization
    cv=5,
    verbose=2,
    random_state=42,
    n_jobs=-1
)

# Fit the RandomizedSearchCV object to the training data for Total Count target
random_search.fit(X_train, y_train_TC)

# Get the best estimator and its parameters
best_chain_model_TC = random_search.best_estimator_
best_params = random_search.best_params_

print("Best Parameters:", best_params)

# Make predictions using the best model
y_chain_TC_pred_1 = best_chain_model_TC.predict(X_test)

# Evaluate the best model
chain_TC_mse = mean_squared_error(y_test_TC, y_chain_TC_pred_1)
chain_TC_r2 = r2_score(y_test_TC, y_chain_TC_pred_1)
chain_TC_mae = mean_absolute_error(y_test_TC, y_chain_TC_pred_1)

print(f"Regression Chain Random Forest Regressor (Total Count) Mean Squared Error: {chain_TC_mse}")
print(f"Regression Chain Random Forest Regressor (Total Count) R-squared: {chain_TC_r2}")
print(f"Regression Chain Random Forest Regressor (Total Count) Mean Absolute Error: {chain_TC_mae}")

Fitting 5 folds for each of 20 candidates, totalling 100 fits
Best Parameters: {'base_estimator__n_estimators': 100, 'base_estimator__min_samples_split': 2, 'base_estimator__min_samples_leaf': 1, 'base_estimator__max_depth': 10}
Regression Chain Random Forest Regressor (Total Count) Mean Squared Error: 73.47493318728533
Regression Chain Random Forest Regressor (Total Count) R-squared: 0.6606496553312393
Regression Chain Random Forest Regressor (Total Count) Mean Absolute Error: 6.447729487449837


In [111]:
# Define the parameter grid for hyperparameter tuning
param_grid = {
    'base_estimator__n_estimators': [50, 100, 150],
    'base_estimator__max_depth': [None, 5, 10],
    'base_estimator__min_samples_split': [2, 5, 10],
    'base_estimator__min_samples_leaf': [1, 2, 4]
}

# Create the Regression Chain model with RandomForestRegressor as the base estimator
chain_model_TC = RegressorChain(base_estimator=RandomForestRegressor())

# Create a RandomizedSearchCV object to perform hyperparameter tuning
random_search = RandomizedSearchCV(
    estimator=chain_model_TC,
    param_distributions=param_grid,
    n_iter=20,
    scoring=r2_scorer,  # Use R-Squared for optimization
    cv=5,
    verbose=2,
    random_state=42,
    n_jobs=-1
)

# Fit the RandomizedSearchCV object to the training data for Total Count target
random_search.fit(X_train, y_train_TC)

# Get the best estimator and its parameters
best_chain_model_TC = random_search.best_estimator_
best_params = random_search.best_params_

print("Best Parameters:", best_params)

# Make predictions using the best model
y_chain_TC_pred_2 = best_chain_model_TC.predict(X_test)

# Evaluate the best model
chain_TC_mse = mean_squared_error(y_test_TC, y_chain_TC_pred_2)
chain_TC_r2 = r2_score(y_test_TC, y_chain_TC_pred_2)
chain_TC_mae = mean_absolute_error(y_test_TC, y_chain_TC_pred_2)

print(f"Regression Chain Random Forest Regressor (Total Count) Mean Squared Error: {chain_TC_mse}")
print(f"Regression Chain Random Forest Regressor (Total Count) R-squared: {chain_TC_r2}")
print(f"Regression Chain Random Forest Regressor (Total Count) Mean Absolute Error: {chain_TC_mae}")

Fitting 5 folds for each of 20 candidates, totalling 100 fits
Best Parameters: {'base_estimator__n_estimators': 100, 'base_estimator__min_samples_split': 10, 'base_estimator__min_samples_leaf': 1, 'base_estimator__max_depth': 10}
Regression Chain Random Forest Regressor (Total Count) Mean Squared Error: 67.93023502764888
Regression Chain Random Forest Regressor (Total Count) R-squared: 0.6862583241647413
Regression Chain Random Forest Regressor (Total Count) Mean Absolute Error: 6.238498027911412


In [112]:
# Define the parameter grid for hyperparameter tuning
param_grid = {
    'base_estimator__n_estimators': [50, 100, 150],
    'base_estimator__max_depth': [None, 5, 10],
    'base_estimator__min_samples_split': [2, 5, 10],
    'base_estimator__min_samples_leaf': [1, 2, 4]
}

# Create the Regression Chain model with RandomForestRegressor as the base estimator
chain_model_TC = RegressorChain(base_estimator=RandomForestRegressor())

# Create a RandomizedSearchCV object to perform hyperparameter tuning
random_search = RandomizedSearchCV(
    estimator=chain_model_TC,
    param_distributions=param_grid,
    n_iter=81,
    scoring='neg_mean_squared_error',  # Use negative MSE for optimization
    cv=5,
    verbose=2,
    random_state=42,
    n_jobs=-1
)

# Fit the RandomizedSearchCV object to the training data for Total Count target
random_search.fit(X_train, y_train_TC)

# Get the best estimator and its parameters
best_chain_model_TC = random_search.best_estimator_
best_params = random_search.best_params_

print("Best Parameters:", best_params)

# Make predictions using the best model
y_chain_TC_pred_3 = best_chain_model_TC.predict(X_test)

# Evaluate the best model
chain_TC_mse = mean_squared_error(y_test_TC, y_chain_TC_pred_3)
chain_TC_r2 = r2_score(y_test_TC, y_chain_TC_pred_3)
chain_TC_mae = mean_absolute_error(y_test_TC, y_chain_TC_pred_3)

print(f"Regression Chain Random Forest Regressor (Total Count) Mean Squared Error: {chain_TC_mse}")
print(f"Regression Chain Random Forest Regressor (Total Count) R-squared: {chain_TC_r2}")
print(f"Regression Chain Random Forest Regressor (Total Count) Mean Absolute Error: {chain_TC_mae}")

Fitting 5 folds for each of 81 candidates, totalling 405 fits
Best Parameters: {'base_estimator__n_estimators': 100, 'base_estimator__min_samples_split': 5, 'base_estimator__min_samples_leaf': 1, 'base_estimator__max_depth': 10}
Regression Chain Random Forest Regressor (Total Count) Mean Squared Error: 70.90202843448286
Regression Chain Random Forest Regressor (Total Count) R-squared: 0.6725328388441507
Regression Chain Random Forest Regressor (Total Count) Mean Absolute Error: 6.353469254462679


In [113]:
# Define the parameter grid for hyperparameter tuning
param_grid = {
    'base_estimator__n_estimators': [50, 100, 150],
    'base_estimator__max_depth': [None, 5, 10],
    'base_estimator__min_samples_split': [2, 5, 10],
    'base_estimator__min_samples_leaf': [1, 2, 4]
}

# Create the Regression Chain model with RandomForestRegressor as the base estimator
chain_model_TC = RegressorChain(base_estimator=RandomForestRegressor())

# Create a RandomizedSearchCV object to perform hyperparameter tuning
random_search = RandomizedSearchCV(
    estimator=chain_model_TC,
    param_distributions=param_grid,
    n_iter=81,
    scoring=r2_scorer,  # Use R-Squared for optimization
    cv=5,
    verbose=2,
    random_state=42,
    n_jobs=-1
)

# Fit the RandomizedSearchCV object to the training data for Total Count target
random_search.fit(X_train, y_train_TC)

# Get the best estimator and its parameters
best_chain_model_TC = random_search.best_estimator_
best_params = random_search.best_params_

print("Best Parameters:", best_params)

# Make predictions using the best model
y_chain_TC_pred_4 = best_chain_model_TC.predict(X_test)

# Evaluate the best model
chain_TC_mse = mean_squared_error(y_test_TC, y_chain_TC_pred_4)
chain_TC_r2 = r2_score(y_test_TC, y_chain_TC_pred_4)
chain_TC_mae = mean_absolute_error(y_test_TC, y_chain_TC_pred_4)

print(f"Regression Chain Random Forest Regressor (Total Count) Mean Squared Error: {chain_TC_mse}")
print(f"Regression Chain Random Forest Regressor (Total Count) R-squared: {chain_TC_r2}")
print(f"Regression Chain Random Forest Regressor (Total Count) Mean Absolute Error: {chain_TC_mae}")

Fitting 5 folds for each of 81 candidates, totalling 405 fits
Best Parameters: {'base_estimator__n_estimators': 50, 'base_estimator__min_samples_split': 5, 'base_estimator__min_samples_leaf': 1, 'base_estimator__max_depth': 10}
Regression Chain Random Forest Regressor (Total Count) Mean Squared Error: 76.33942276403245
Regression Chain Random Forest Regressor (Total Count) R-squared: 0.6474197620465263
Regression Chain Random Forest Regressor (Total Count) Mean Absolute Error: 6.521182737051699


##### Apply ADA Boosting Algorithm

In [114]:
# prompt: Create an AdaBoost regressor with the base estimator being a Histogram-Based Gradient Boosting Regressor, and then train and test the models with the Total Count target data with sci-kit learn

# Initialize the AdaBoost regressor with HistGradientBoostingRegressor as the base estimator
ada_model_TC = AdaBoostRegressor(estimator=HistGradientBoostingRegressor(), n_estimators=50, random_state=42)

# Train the AdaBoost model on the training data for Total Count
ada_model_TC.fit(X_train, y_train_TC)

# Make predictions on the test set with the AdaBoost model for Total Count
y_ada_TC_pred = ada_model_TC.predict(X_test)

# Calculate the Mean Squared Error (MSE), Mean Absolute Error (MAE), and R-squared (R^2) for AdaBoost Total Count
ada_TC_mse = mean_squared_error(y_test_TC, y_ada_TC_pred)
ada_TC_r2 = r2_score(y_test_TC, y_ada_TC_pred)
ada_TC_mae = mean_absolute_error(y_test_TC, y_ada_TC_pred)

print(f"AdaBoost Mean Squared Error: {ada_TC_mse}")
print(f"AdaBoost R-squared: {ada_TC_r2}")
print(f"AdaBoost mean absolute error: {ada_TC_mae}")

  y = column_or_1d(y, warn=True)


AdaBoost Mean Squared Error: 45.44923474580936
AdaBoost R-squared: 0.7900887716820576
AdaBoost mean absolute error: 5.161318291385233


##### Perform Hyper-Parameter Tuning

Attempts were made to perform hyper-parameter tuning on ADA Boosting Histogram-based Gradient Boosting Regressor, but it took way too much time to run them and they did not yield siginficant R-Squared scores.

##### Perform Hyper-Parameter Tuning on Random Forest Regressor

In [117]:
# prompt: Perform Hyper-Parameter Tuning on Random Forest Regressor with r-squared being the performance score, and then train and test the model with the Total Count target data with sci-kit learn

# Define the parameter grid for hyperparameter tuning
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Create a Random Forest Regressor model
rf_model_TC = RandomForestRegressor()

# Create a RandomizedSearchCV object to perform hyperparameter tuning
random_search = RandomizedSearchCV(
    estimator=rf_model_TC,
    param_distributions=param_grid,
    n_iter=20,  # Number of parameter settings that are sampled
    scoring=r2_scorer,
    cv=5,  # Number of cross-validation folds
    verbose=2,  # Controls the verbosity: the higher, the more messages
    random_state=42,
    n_jobs=-1  # Use all available processors
)

# Fit the RandomizedSearchCV object to the training data for Total Count target
random_search.fit(X_train, y_train_TC)

# Get the best estimator and its parameters
best_rf_model_TC = random_search.best_estimator_
best_params = random_search.best_params_

print("Best Parameters:", best_params)

# Make predictions using the best model
y_rf_TC_pred = best_rf_model_TC.predict(X_test)

# Evaluate the best model
rf_TC_mse = mean_squared_error(y_test_TC, y_rf_TC_pred)
rf_TC_r2 = r2_score(y_test_TC, y_rf_TC_pred)
rf_TC_mae = mean_absolute_error(y_test_TC, y_rf_TC_pred)

print(f"Random Forest Regressor (Total Count) Mean Squared Error: {rf_TC_mse}")
print(f"Random Forest Regressor (Total Count) R-squared: {rf_TC_r2}")
print(f"Random Forest Regressor (Total Count) Mean Absolute Error: {rf_TC_mae}")

Fitting 5 folds for each of 20 candidates, totalling 100 fits


  return fit_method(estimator, *args, **kwargs)


Best Parameters: {'n_estimators': 100, 'min_samples_split': 10, 'min_samples_leaf': 1, 'max_depth': 10}
Random Forest Regressor (Total Count) Mean Squared Error: 70.19476576714632
Random Forest Regressor (Total Count) R-squared: 0.6757993927492796
Random Forest Regressor (Total Count) Mean Absolute Error: 6.318822765814394


In [118]:
# Define the parameter grid for hyperparameter tuning
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [5, 10, 15],
    'min_samples_split': [5, 10, 15],
    'min_samples_leaf': [2, 4, 6]
}

# Create a Random Forest Regressor model
rf_model_TC = RandomForestRegressor()

# Create a RandomizedSearchCV object to perform hyperparameter tuning
random_search = RandomizedSearchCV(
    estimator=rf_model_TC,
    param_distributions=param_grid,
    n_iter=20,  # Number of parameter settings that are sampled
    scoring=r2_scorer,
    cv=5,  # Number of cross-validation folds
    verbose=2,  # Controls the verbosity: the higher, the more messages
    random_state=42,
    n_jobs=-1  # Use all available processors
)

# Fit the RandomizedSearchCV object to the training data for Total Count target
random_search.fit(X_train, y_train_TC)

# Get the best estimator and its parameters
best_rf_model_TC = random_search.best_estimator_
best_params = random_search.best_params_

print("Best Parameters:", best_params)

# Make predictions using the best model
y_rf_TC_pred = best_rf_model_TC.predict(X_test)

# Evaluate the best model
rf_TC_mse = mean_squared_error(y_test_TC, y_rf_TC_pred)
rf_TC_r2 = r2_score(y_test_TC, y_rf_TC_pred)
rf_TC_mae = mean_absolute_error(y_test_TC, y_rf_TC_pred)

print(f"Random Forest Regressor (Total Count) Mean Squared Error: {rf_TC_mse}")
print(f"Random Forest Regressor (Total Count) R-squared: {rf_TC_r2}")
print(f"Random Forest Regressor (Total Count) Mean Absolute Error: {rf_TC_mae}")

Fitting 5 folds for each of 20 candidates, totalling 100 fits


  return fit_method(estimator, *args, **kwargs)


Best Parameters: {'n_estimators': 100, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_depth': 10}
Random Forest Regressor (Total Count) Mean Squared Error: 72.90391467688298
Random Forest Regressor (Total Count) R-squared: 0.6632869537936056
Random Forest Regressor (Total Count) Mean Absolute Error: 6.394313560588884


In [119]:
# Define the parameter grid for hyperparameter tuning
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [5, 10, 15],
    'min_samples_split': [5, 10, 15],
    'min_samples_leaf': [2, 4, 6]
}

# Create a Random Forest Regressor model
rf_model_TC = RandomForestRegressor()

# Create a RandomizedSearchCV object to perform hyperparameter tuning
random_search = RandomizedSearchCV(
    estimator=rf_model_TC,
    param_distributions=param_grid,
    n_iter=20,  # Number of parameter settings that are sampled
    scoring=r2_scorer,
    cv=5,  # Number of cross-validation folds
    verbose=2,  # Controls the verbosity: the higher, the more messages
    random_state=1,
    n_jobs=-1  # Use all available processors
)

# Fit the RandomizedSearchCV object to the training data for Total Count target
random_search.fit(X_train, y_train_TC)

# Get the best estimator and its parameters
best_rf_model_TC = random_search.best_estimator_
best_params = random_search.best_params_

print("Best Parameters:", best_params)

# Make predictions using the best model
y_rf_TC_pred = best_rf_model_TC.predict(X_test)

# Evaluate the best model
rf_TC_mse = mean_squared_error(y_test_TC, y_rf_TC_pred)
rf_TC_r2 = r2_score(y_test_TC, y_rf_TC_pred)
rf_TC_mae = mean_absolute_error(y_test_TC, y_rf_TC_pred)

print(f"Random Forest Regressor (Total Count) Mean Squared Error: {rf_TC_mse}")
print(f"Random Forest Regressor (Total Count) R-squared: {rf_TC_r2}")
print(f"Random Forest Regressor (Total Count) Mean Absolute Error: {rf_TC_mae}")

Fitting 5 folds for each of 20 candidates, totalling 100 fits


  return fit_method(estimator, *args, **kwargs)


Best Parameters: {'n_estimators': 100, 'min_samples_split': 15, 'min_samples_leaf': 2, 'max_depth': 10}
Random Forest Regressor (Total Count) Mean Squared Error: 67.94504319972252
Random Forest Regressor (Total Count) R-squared: 0.6861899313390644
Random Forest Regressor (Total Count) Mean Absolute Error: 6.2394998734025


##### Perform Hyper-Parameter Tuning on Histogram-Based Gradient Boosting Regressor

In [120]:
# prompt: Perform Hyper-Parameter Tuning on Histogram-Based Gradient Boosting Regressor with r-squared being the performance score, and then train and test the model with the Total Count target data with sci-kit learn

# Define the parameter grid for hyperparameter tuning
param_grid_hgbr_TC = {
    'learning_rate': [0.01, 0.1, 0.2],
    'max_iter': [100, 200, 300],
    'max_depth': [None, 5, 10],
    'min_samples_leaf': [10, 20, 30],
    'l2_regularization': [0.0, 0.1, 0.2]
}

# Create a HistGradientBoostingRegressor model
hgbr_model_TC = HistGradientBoostingRegressor()

# Create a RandomizedSearchCV object to perform hyperparameter tuning
random_search_hgbr_TC = RandomizedSearchCV(
    estimator=hgbr_model_TC,
    param_distributions=param_grid_hgbr_TC,
    n_iter=20,  # Number of parameter settings that are sampled
    scoring='r2',  # Use R-squared for optimization
    cv=5,  # Number of cross-validation folds
    verbose=2,  # Controls the verbosity: the higher, the more messages
    random_state=42,
    n_jobs=-1  # Use all available processors
)

# Fit the RandomizedSearchCV object to the training data for Total Count target
random_search_hgbr_TC.fit(X_train, y_train_TC)

# Get the best estimator and its parameters
best_hgbr_model_TC = random_search_hgbr_TC.best_estimator_
best_params_hgbr_TC = random_search_hgbr_TC.best_params_

print("Best Parameters:", best_params_hgbr_TC)

# Make predictions using the best model
y_hgbr_TC_pred = best_hgbr_model_TC.predict(X_test)

# Evaluate the best model
hgbr_TC_mse = mean_squared_error(y_test_TC, y_hgbr_TC_pred)
hgbr_TC_r2 = r2_score(y_test_TC, y_hgbr_TC_pred)
hgbr_TC_mae = mean_absolute_error(y_test_TC, y_hgbr_TC_pred)

print(f"HistGradientBoostingRegressor (Total Count) Mean Squared Error: {hgbr_TC_mse}")
print(f"HistGradientBoostingRegressor (Total Count) R-squared: {hgbr_TC_r2}")
print(f"HistGradientBoostingRegressor (Total Count) Mean Absolute Error: {hgbr_TC_mae}")

Fitting 5 folds for each of 20 candidates, totalling 100 fits


  y = column_or_1d(y, warn=True)


Best Parameters: {'min_samples_leaf': 10, 'max_iter': 300, 'max_depth': 10, 'learning_rate': 0.01, 'l2_regularization': 0.0}
HistGradientBoostingRegressor (Total Count) Mean Squared Error: 68.39689337906064
HistGradientBoostingRegressor (Total Count) R-squared: 0.68410302213826
HistGradientBoostingRegressor (Total Count) Mean Absolute Error: 6.424647018374788


##### Create a Voting Regressor Ensemble Learning Model with the initial Random Forest and Histogram-Based Gradient Boosting Models

In [121]:
# prompt: Create a Voting Regressor with r_model_TC and h_model_TC, and then train and test it with Total Count target data

# Create a Voting Regressor with rf_model_TC and h_model_TC
voting_model_TC_fitted = VotingRegressor(estimators=[('rf', r_model_TC), ('hgbr', h_model_TC)])

# Train the Voting Regressor on the training data
voting_model_TC_fitted.fit(X_train, y_train_TC)

# Make predictions on the test data
y_voting_TC_fitted_pred = voting_model_TC_fitted.predict(X_test)

# Evaluate the Voting Regressor
voting_TC_fitted_mse = mean_squared_error(y_test_TC, y_voting_TC_fitted_pred)
voting_TC_fitted_r2 = r2_score(y_test_TC, y_voting_TC_fitted_pred)
voting_TC_fitted_mae = mean_absolute_error(y_test_TC, y_voting_TC_fitted_pred)

print(f"Voting Regressor (Total Count) Mean Squared Error: {voting_TC_fitted_mse}")
print(f"Voting Regressor (Total Count) R-squared: {voting_TC_fitted_r2}")
print(f"Voting Regressor (Total Count) Mean Absolute Error: {voting_TC_fitted_mae}")

  y = column_or_1d(y, warn=True)


Voting Regressor (Total Count) Mean Squared Error: 41.32425336335093
Voting Regressor (Total Count) R-squared: 0.8091403555783152
Voting Regressor (Total Count) Mean Absolute Error: 4.84929300439395


In [122]:
# prompt: Create a Voting Regressor with Random Forest and Histogram-Based Gradient Boosting Regressors, and then train and test it with Total Count target data

# Initialize a Random Forest Regressor
rf_model_TC = RandomForestRegressor()

# Initialize a Histogram-Based Gradient Boosting Regressor
hgbr_model_TC = HistGradientBoostingRegressor()

# Create a Voting Regressor with the Random Forest and Histogram-Based Gradient Boosting models
voting_model_TC_unfitted = VotingRegressor(estimators=[('rf', rf_model_TC), ('hgbr', hgbr_model_TC)])

# Train the Voting Regressor on the training data
voting_model_TC_unfitted.fit(X_train, y_train_TC)

# Make predictions on the test data
y_voting_TC_unfitted_pred = voting_model_TC_unfitted.predict(X_test)

# Evaluate the Voting Regressor
voting_TC_unfitted_mse = mean_squared_error(y_test_TC, y_voting_TC_unfitted_pred)
voting_TC_unfitted_r2 = r2_score(y_test_TC, y_voting_TC_unfitted_pred)
voting_TC_unfitted_mae = mean_absolute_error(y_test_TC, y_voting_TC_unfitted_pred)

print(f"Voting Regressor (Total Count) Mean Squared Error: {voting_TC_unfitted_mse}")
print(f"Voting Regressor (Total Count) R-squared: {voting_TC_unfitted_r2}")
print(f"Voting Regressor (Total Count) Mean Absolute Error: {voting_TC_unfitted_mae}")

  y = column_or_1d(y, warn=True)


Voting Regressor (Total Count) Mean Squared Error: 41.07981269486411
Voting Regressor (Total Count) R-squared: 0.8102693259836455
Voting Regressor (Total Count) Mean Absolute Error: 4.8375101381712025


In [123]:
# prompt: perform hyper-parameter tuning on voting_model_TC_fitted with the Voting Regressor hyperparameters, not the base estimators' hyperparameters and r-squared being the performance score; and then train and test the model with the Total Count target data with sci-kit learn

# Define the parameter grid for hyperparameter tuning of VotingRegressor
param_grid_voting = {
    'weights': [[1, 1], [2, 1], [1, 2]],
    'n_jobs': [-1]
}

# Create a Voting Regressor model
voting_model_TC = VotingRegressor(estimators=[('rf', r_model_TC), ('hgbr', h_model_TC)])

# Create a GridSearchCV object to perform hyperparameter tuning
r2_scorer = make_scorer(r2_score)
grid_search_voting_TC = GridSearchCV(
    estimator=voting_model_TC,
    param_grid=param_grid_voting,
    scoring=r2_scorer,
    cv=5,
    verbose=2,
    n_jobs=-1
)


# Fit the GridSearchCV object to the training data for Total Count target
grid_search_voting_TC.fit(X_train, y_train_TC)

# Get the best estimator and its parameters
best_voting_model_TC = grid_search_voting_TC.best_estimator_
best_params_voting_TC = grid_search_voting_TC.best_params_

print("Best Parameters for Voting Regressor:", best_params_voting_TC)

# Make predictions using the best model
y_voting_TC_pred = best_voting_model_TC.predict(X_test)

# Evaluate the best model
voting_TC_mse = mean_squared_error(y_test_TC, y_voting_TC_pred)
voting_TC_r2 = r2_score(y_test_TC, y_voting_TC_pred)
voting_TC_mae = mean_absolute_error(y_test_TC, y_voting_TC_pred)

print(f"Voting Regressor (Total Count) Mean Squared Error: {voting_TC_mse}")
print(f"Voting Regressor (Total Count) R-squared: {voting_TC_r2}")
print(f"Voting Regressor (Total Count) Mean Absolute Error: {voting_TC_mae}")


Fitting 5 folds for each of 3 candidates, totalling 15 fits


  y = column_or_1d(y, warn=True)


Best Parameters for Voting Regressor: {'n_jobs': -1, 'weights': [1, 2]}
Voting Regressor (Total Count) Mean Squared Error: 40.61612797868213
Voting Regressor (Total Count) R-squared: 0.8124108940181873
Voting Regressor (Total Count) Mean Absolute Error: 4.85271977817272




---



In [134]:
# prompt: Make a dataframe with just the unique values in 'NEIGHBOURHOOD_158'  and 'HOOD_158' columns of the crime_groups_joined dataframe

# Assuming 'crime_groups_joined' is your DataFrame
unique_neighborhood_df = crime_groups_joined[['NEIGHBOURHOOD_158', 'HOOD_158']].drop_duplicates()

unique_neighborhood_df

Unnamed: 0,NEIGHBOURHOOD_158,HOOD_158
0,West Queen West (162),162
1,Morningside Heights (144),144
2,Moss Park (73),73
3,Fort York-Liberty Village (163),163
4,Eglinton East (138),138
...,...,...
1091,Broadview North (57),57
1337,Guildwood (140),140
1369,Lambton Baby Point (114),114
1412,Bayview Woods-Steeles (49),49


In [135]:
# prompt: Perform a left outer merge on a copy of crime_by_month_year_hood dataframe with unique_neighborhood_df on HOOD_158 column

# Create a copy of the crime_by_month_year_hood dataframe
crime_by_month_year_hood_copy = crime_by_month_year_hood.copy()

# Perform a left outer merge
crime_by_M_Y_H = pd.merge(crime_by_month_year_hood_copy, unique_neighborhood_df, on='HOOD_158', how='left')

# Now merged_df contains the 'NEIGHBOURHOOD_158' column from unique_neighborhood_df
crime_by_M_Y_H

Unnamed: 0,HOOD_158,OCC_YEAR,OCC_MONTH,Assault,Auto Theft,Break and Enter,Robbery,Theft Over,Total_Count,NEIGHBOURHOOD_158
0,1,2021,1,18,35,7,1,3,62,West Humber-Clairville (1)
1,1,2021,2,17,17,5,1,3,43,West Humber-Clairville (1)
2,1,2021,3,15,20,8,6,6,54,West Humber-Clairville (1)
3,1,2021,4,11,31,4,2,4,52,West Humber-Clairville (1)
4,1,2021,5,18,26,9,5,4,62,West Humber-Clairville (1)
...,...,...,...,...,...,...,...,...,...,...
6630,174,2024,2,9,0,5,1,1,15,South Eglinton-Davisville (174)
6631,174,2024,3,6,1,0,0,0,7,South Eglinton-Davisville (174)
6632,174,2024,4,12,2,2,0,1,17,South Eglinton-Davisville (174)
6633,174,2024,5,8,1,2,0,1,12,South Eglinton-Davisville (174)


In [140]:
# prompt: Move 'NEIGHBOURHOOD_158' in front of all the other columns in the crime_by_M_Y_H dataframe

# Move 'NEIGHBOURHOOD_158' to the front of the DataFrame
cols = crime_by_M_Y_H.columns.tolist()
cols.insert(0, cols.pop(cols.index('NEIGHBOURHOOD_158')))
crime_by_M_Y_H = crime_by_M_Y_H[cols]
crime_by_M_Y_H

Unnamed: 0,NEIGHBOURHOOD_158,HOOD_158,OCC_YEAR,OCC_MONTH,Assault,Auto Theft,Break and Enter,Robbery,Theft Over,Total_Count
0,West Humber-Clairville (1),1,2021,1,18,35,7,1,3,62
1,West Humber-Clairville (1),1,2021,2,17,17,5,1,3,43
2,West Humber-Clairville (1),1,2021,3,15,20,8,6,6,54
3,West Humber-Clairville (1),1,2021,4,11,31,4,2,4,52
4,West Humber-Clairville (1),1,2021,5,18,26,9,5,4,62
...,...,...,...,...,...,...,...,...,...,...
6630,South Eglinton-Davisville (174),174,2024,2,9,0,5,1,1,15
6631,South Eglinton-Davisville (174),174,2024,3,6,1,0,0,0,7
6632,South Eglinton-Davisville (174),174,2024,4,12,2,2,0,1,17
6633,South Eglinton-Davisville (174),174,2024,5,8,1,2,0,1,12




---



## Results

In [148]:
# prompt: Create a dataframe with just the data for the year 2024 and concatenate y_voting_TC_pred values onto it in a column named Predicted_Total_Count

predicted_crime_2024 = crime_by_M_Y_H.loc[crime_by_M_Y_H['OCC_YEAR'] == 2024, :]

# Concatenate the predictions onto the DataFrame
predicted_crime_2024['Predicted_Total_Count'] = y_voting_TC_pred

# Round the values in Predicted_Total_Count to zero decimal places and convert the column to integers
predicted_crime_2024['Predicted_Total_Count'] = predicted_crime_2024['Predicted_Total_Count'].round(0).astype(int)

# Drop the Assault, Auto Theft, Break and Enter, Robbery, Theft Over columns from the Dataframe
predicted_crime_2024 = predicted_crime_2024.drop(columns=['Assault', 'Auto Theft', 'Break and Enter', 'Robbery', 'Theft Over'])

# Now df_2024 contains the data for 2024 with a new column 'Predicted_Total_Count'
predicted_crime_2024

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  predicted_crime_2024['Predicted_Total_Count'] = y_voting_TC_pred
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  predicted_crime_2024['Predicted_Total_Count'] = predicted_crime_2024['Predicted_Total_Count'].round(0).astype(int)


Unnamed: 0,NEIGHBOURHOOD_158,HOOD_158,OCC_YEAR,OCC_MONTH,Total_Count,Predicted_Total_Count
36,West Humber-Clairville (1),1,2024,1,110,114
37,West Humber-Clairville (1),1,2024,2,101,109
38,West Humber-Clairville (1),1,2024,3,79,111
39,West Humber-Clairville (1),1,2024,4,93,110
40,West Humber-Clairville (1),1,2024,5,104,110
...,...,...,...,...,...,...
6630,South Eglinton-Davisville (174),174,2024,2,15,11
6631,South Eglinton-Davisville (174),174,2024,3,7,14
6632,South Eglinton-Davisville (174),174,2024,4,17,13
6633,South Eglinton-Davisville (174),174,2024,5,12,13


In [144]:
# prompt: Show all the unique values in 'OCC_MONTH' column of predicted_crime_2024

# All the months for which there is crime data for the year 2024
unique_occ_month = predicted_crime_2024['OCC_MONTH'].unique()

unique_occ_month

array([1, 2, 3, 4, 5, 6])

In [146]:
# prompt: Make a copy of crime_by_M_Y_H with just the four starting columns. Change the value 2021 in the 'OCC_YEAR' column to 2025, the value 2022 to 2026, and 2023 to 2027. Change the value 1 in the 'OCC_MONTH to 7, 2 to 8, 3 to 9, 4 to 10, 5 to 11, and 6 to 12, whenever the value in the 'OCC_YEAR' column is 2024. Order the dataframe by the 'HOOD_158' and 'OCC_YEAR' columns.

crime_by_M_Y_H_copy = crime_by_M_Y_H.iloc[:, :4].copy()

# Replace values in 'OCC_YEAR' column
crime_by_M_Y_H_copy.loc[crime_by_M_Y_H_copy['OCC_YEAR'] == 2021, 'OCC_YEAR'] = 2025
crime_by_M_Y_H_copy.loc[crime_by_M_Y_H_copy['OCC_YEAR'] == 2022, 'OCC_YEAR'] = 2026
crime_by_M_Y_H_copy.loc[crime_by_M_Y_H_copy['OCC_YEAR'] == 2023, 'OCC_YEAR'] = 2027

# Replace values in 'OCC_MONTH' column for year 2024
crime_by_M_Y_H_copy.loc[(crime_by_M_Y_H_copy['OCC_YEAR'] == 2024) & (crime_by_M_Y_H_copy['OCC_MONTH'] == 1), 'OCC_MONTH'] = 7
crime_by_M_Y_H_copy.loc[(crime_by_M_Y_H_copy['OCC_YEAR'] == 2024) & (crime_by_M_Y_H_copy['OCC_MONTH'] == 2), 'OCC_MONTH'] = 8
crime_by_M_Y_H_copy.loc[(crime_by_M_Y_H_copy['OCC_YEAR'] == 2024) & (crime_by_M_Y_H_copy['OCC_MONTH'] == 3), 'OCC_MONTH'] = 9
crime_by_M_Y_H_copy.loc[(crime_by_M_Y_H_copy['OCC_YEAR'] == 2024) & (crime_by_M_Y_H_copy['OCC_MONTH'] == 4), 'OCC_MONTH'] = 10
crime_by_M_Y_H_copy.loc[(crime_by_M_Y_H_copy['OCC_YEAR'] == 2024) & (crime_by_M_Y_H_copy['OCC_MONTH'] == 5), 'OCC_MONTH'] = 11
crime_by_M_Y_H_copy.loc[(crime_by_M_Y_H_copy['OCC_YEAR'] == 2024) & (crime_by_M_Y_H_copy['OCC_MONTH'] == 6), 'OCC_MONTH'] = 12

# Order the DataFrame
future_crime_predictions = crime_by_M_Y_H_copy.sort_values(['HOOD_158', 'OCC_YEAR'])

# Now future_crime_predictions contains the modified DataFrame with index values reset as well
future_crime_predictions = future_crime_predictions.reset_index(drop=True)
future_crime_predictions

Unnamed: 0,NEIGHBOURHOOD_158,HOOD_158,OCC_YEAR,OCC_MONTH
0,West Humber-Clairville (1),1,2024,7
1,West Humber-Clairville (1),1,2024,8
2,West Humber-Clairville (1),1,2024,9
3,West Humber-Clairville (1),1,2024,10
4,West Humber-Clairville (1),1,2024,11
...,...,...,...,...
6630,South Eglinton-Davisville (174),174,2027,8
6631,South Eglinton-Davisville (174),174,2027,9
6632,South Eglinton-Davisville (174),174,2027,10
6633,South Eglinton-Davisville (174),174,2027,11


In [149]:
# prompt: Make a copy of the future_crime_predictions with just the 'HOOD_158',  'OCC_YEAR', 'OCC_MONTH' columns, and use that dataframe to make new predictions with the best_voting_model_TC and then concatenate those predictions onto future_crime_predictions under the column Total_Counts

# Create a copy of future_crime_predictions with only the feature columns
future_crime_predictions_subset = future_crime_predictions[['HOOD_158', 'OCC_YEAR', 'OCC_MONTH']].copy()

# Make predictions using the best_voting_model_TC
new_predictions = best_voting_model_TC.predict(future_crime_predictions_subset)

# Concatenate the new predictions onto the future_crime_predictions DataFrame
future_crime_predictions['Total_Counts'] = new_predictions

# Round the values in the Total_Counts to zero decimal places and convert the values to integers
future_crime_predictions['Total_Counts'] = future_crime_predictions['Total_Counts'].round(0).astype(int)

# Display the updated future_crime_predictions DataFrame
future_crime_predictions

Unnamed: 0,NEIGHBOURHOOD_158,HOOD_158,OCC_YEAR,OCC_MONTH,Total_Counts
0,West Humber-Clairville (1),1,2024,7,110
1,West Humber-Clairville (1),1,2024,8,111
2,West Humber-Clairville (1),1,2024,9,109
3,West Humber-Clairville (1),1,2024,10,110
4,West Humber-Clairville (1),1,2024,11,107
...,...,...,...,...,...
6630,South Eglinton-Davisville (174),174,2027,8,12
6631,South Eglinton-Davisville (174),174,2027,9,12
6632,South Eglinton-Davisville (174),174,2027,10,14
6633,South Eglinton-Davisville (174),174,2027,11,14
