<a href="https://colab.research.google.com/github/aadhityasw/Minority-Report/blob/main/Crime-Analysis/Boston_Crime_Prediction_and_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Boston Crime Prediction Analysis

## 1. Import files and libraries

Here, we load the dataset and have a quick look at its features.

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# Mount the drive
from google.colab import drive
drive.mount('/content/drive')

# Copy the file from your drive to "/content"
%cp "/content/drive/My Drive/crime.csv" "/content/crime.csv"

Mounted at /content/drive


In [3]:
df = pd.read_csv('crime.csv', encoding='latin-1')
df.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,INCIDENT_NUMBER,OFFENSE_CODE,OFFENSE_CODE_GROUP,OFFENSE_DESCRIPTION,DISTRICT,REPORTING_AREA,SHOOTING,OCCURRED_ON_DATE,YEAR,MONTH,DAY_OF_WEEK,HOUR,UCR_PART,STREET,Lat,Long,Location
0,I182080058,2403,Disorderly Conduct,DISTURBING THE PEACE,E18,495,,2018-10-03 20:13:00,2018,10,Wednesday,20,Part Two,ARLINGTON ST,42.262608,-71.121186,"(42.26260773, -71.12118637)"
1,I182080053,3201,Property Lost,PROPERTY - LOST,D14,795,,2018-08-30 20:00:00,2018,8,Thursday,20,Part Three,ALLSTON ST,42.352111,-71.135311,"(42.35211146, -71.13531147)"
2,I182080052,2647,Other,THREATS TO DO BODILY HARM,B2,329,,2018-10-03 19:20:00,2018,10,Wednesday,19,Part Two,DEVON ST,42.308126,-71.07693,"(42.30812619, -71.07692974)"
3,I182080051,413,Aggravated Assault,ASSAULT - AGGRAVATED - BATTERY,A1,92,,2018-10-03 20:00:00,2018,10,Wednesday,20,Part One,CAMBRIDGE ST,42.359454,-71.059648,"(42.35945371, -71.05964817)"
4,I182080050,3122,Aircraft,AIRCRAFT INCIDENTS,A7,36,,2018-10-03 20:49:00,2018,10,Wednesday,20,Part Three,PRESCOTT ST,42.375258,-71.024663,"(42.37525782, -71.02466343)"


In [4]:
# To learn more about the dataset imported
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 327820 entries, 0 to 327819
Data columns (total 17 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   INCIDENT_NUMBER      327820 non-null  object 
 1   OFFENSE_CODE         327820 non-null  int64  
 2   OFFENSE_CODE_GROUP   327820 non-null  object 
 3   OFFENSE_DESCRIPTION  327820 non-null  object 
 4   DISTRICT             326046 non-null  object 
 5   REPORTING_AREA       327820 non-null  object 
 6   SHOOTING             1055 non-null    object 
 7   OCCURRED_ON_DATE     327820 non-null  object 
 8   YEAR                 327820 non-null  int64  
 9   MONTH                327820 non-null  int64  
 10  DAY_OF_WEEK          327820 non-null  object 
 11  HOUR                 327820 non-null  int64  
 12  UCR_PART             327727 non-null  object 
 13  STREET               316843 non-null  object 
 14  Lat                  307188 non-null  float64
 15  Long             

## 2. Pre Processing of dataset

In this section, we create a copy of the dataset, perform the required pre-processing tasks to get the dataset ready for training.

### 2.1 Make a copy of the dataset

In this section, we create a copy of the dataset and use this to perform the pre-processing and then the Modelling

In [68]:
# Create a model

df_model = df.copy()

### 2.2 Minimize the dataset by Prioritization of crimes

We choose the top `15` categories of crimes and minimize the dataset based on these. This is done so as to simplify the task and to deliver a model as fast as possible, and this model once successful can be scaled later for all categories of crimes.

In [69]:
# We extract the top 15 crimes by their count of occurance

df_model['OFFENSE_CODE_GROUP'].value_counts().head(15)

Motor Vehicle Accident Response    38134
Larceny                            26670
Medical Assistance                 24226
Investigate Person                 19176
Other                              18612
Drug Violation                     17037
Simple Assault                     16263
Vandalism                          15810
Verbal Disputes                    13478
Towed                              11632
Investigate Property               11443
Larceny From Motor Vehicle         11120
Property Lost                      10077
Warrant Arrests                     8579
Aggravated Assault                  8033
Name: OFFENSE_CODE_GROUP, dtype: int64

In [70]:
# This is the list of all the categories we are going to choose

choosen_offense_code_groups = (
    'Motor Vehicle Accident Response',
    'Larceny',
    'Medical Assistance',
    'Investigate Person',
    'Other',
    'Drug Violation',
    'Simple Assault',
    'Vandalism',
    'Verbal Disputes',
    'Towed',
    'Investigate Property',
    'Larceny From Motor Vehicle'
)

In [71]:
# Create a minimised dataframe to store the values of crimes belonging to only these categories

df_model = df.loc[df['OFFENSE_CODE_GROUP'].isin(choosen_offense_code_groups)]

In [72]:
# Print this Miniatured dataset to understand its size

df_model.head()

Unnamed: 0,INCIDENT_NUMBER,OFFENSE_CODE,OFFENSE_CODE_GROUP,OFFENSE_DESCRIPTION,DISTRICT,REPORTING_AREA,SHOOTING,OCCURRED_ON_DATE,YEAR,MONTH,DAY_OF_WEEK,HOUR,UCR_PART,STREET,Lat,Long,Location
2,I182080052,2647,Other,THREATS TO DO BODILY HARM,B2,329.0,,2018-10-03 19:20:00,2018,10,Wednesday,19,Part Two,DEVON ST,42.308126,-71.07693,"(42.30812619, -71.07692974)"
5,I182080049,1402,Vandalism,VANDALISM,C11,351.0,,2018-10-02 20:40:00,2018,10,Tuesday,20,Part Two,DORCHESTER AVE,42.299197,-71.06047,"(42.29919694, -71.06046974)"
6,I182080048,3803,Motor Vehicle Accident Response,M/V ACCIDENT - PERSONAL INJURY,,,,2018-10-03 20:16:00,2018,10,Wednesday,20,Part Three,,42.320734,-71.056764,"(42.32073413, -71.05676415)"
7,I182080047,3301,Verbal Disputes,VERBAL DISPUTE,B2,603.0,,2018-10-03 19:32:00,2018,10,Wednesday,19,Part Three,TREMONT ST,42.333807,-71.103778,"(42.33380683, -71.10377843)"
8,I182080045,802,Simple Assault,ASSAULT SIMPLE - BATTERY,E18,543.0,,2018-10-03 19:27:51,2018,10,Wednesday,19,Part Two,AVILA RD,42.256145,-71.128025,"(42.25614494, -71.12802506)"


### 2.3 Converting Date of Occurance

In this section, we will take the date in string, convert it into a python-datetime object, adn use it to extract the month, year, day of week and hour from this timestamp. Note that this has already been done.

In [73]:
# Convert the date in string into a pandas timestamp

df_model['OCCURRED_ON_DATE'] = pd.to_datetime(df_model['OCCURRED_ON_DATE'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


### 2.4 Choose Necessary Columns

In this section, we trim the columns of the dataset and extract only the necessary and useful columns from the dataset which might help in the Machine Learning Predictions

In [74]:
# This is the list of columns that we choose

choosen_columns = [
    'DISTRICT','REPORTING_AREA','MONTH','DAY_OF_WEEK', 'HOUR',
    'UCR_PART', 'Lat','Long', 'OFFENSE_CODE_GROUP', 'OFFENSE_CODE'
]

In [75]:
# We trim the dataset with this choosen set of columns

df_model = df_model[choosen_columns]

In [76]:
df_model.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 223601 entries, 2 to 327813
Data columns (total 10 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   DISTRICT            222350 non-null  object 
 1   REPORTING_AREA      223601 non-null  object 
 2   MONTH               223601 non-null  int64  
 3   DAY_OF_WEEK         223601 non-null  object 
 4   HOUR                223601 non-null  int64  
 5   UCR_PART            223601 non-null  object 
 6   Lat                 208742 non-null  float64
 7   Long                208742 non-null  float64
 8   OFFENSE_CODE_GROUP  223601 non-null  object 
 9   OFFENSE_CODE        223601 non-null  int64  
dtypes: float64(2), int64(3), object(5)
memory usage: 18.8+ MB


### 2.5  Map the offences to Day-time and Night-time

We use the points taken out from the timestamp above which are the month and the hour of occurance and use these to map the crime to whether it was committed in day time or during the night.

This can give insights to whether a trend exists corresponding to crimes and the time of day.

We consider the daylight savings schedules to map the day and night for every month of the year into being `day` or `night`.

In [77]:
# Create new columns which have 0 or 1 values denoting whether it was day or night
# These are one hot encoded values

df_model['Day'] = 0
df_model['Night'] = 0

In [78]:
# Map the daytime for each month based on the boston schedules of daylight savings, etc

# Determining Daytime for January
df_model['Day'].loc[(df_model['MONTH'] == 1) & (df_model['HOUR'] >= 6) & (df_model['HOUR'] <= 18)] = 1

# Determining Daytime for Feburary
df_model['Day'].loc[(df_model['MONTH'] == 2) & (df_model['HOUR'] >= 6) & (df_model['HOUR'] <= 19)] = 1

# Determining Daytime for March
df_model['Day'].loc[(df_model['MONTH'] == 3) & (df_model['HOUR'] >= 6) & (df_model['HOUR'] <= 20)] = 1

# Determining Daytime for April
df_model['Day'].loc[(df_model['MONTH'] == 4) & (df_model['HOUR'] >= 5) & (df_model['HOUR'] <= 20)] = 1

# Determining Daytime for May
df_model['Day'].loc[(df_model['MONTH'] == 5) & (df_model['HOUR'] >= 5) & (df_model['HOUR'] <= 21)] = 1

# Determining Daytime for June
df_model['Day'].loc[(df_model['MONTH'] == 6) & (df_model['HOUR'] >= 4) & (df_model['HOUR'] <= 21)] = 1

# Determining Daytime July
df_model['Day'].loc[(df_model['MONTH'] == 7) & (df_model['HOUR'] >= 5) & (df_model['HOUR'] <= 21)] = 1

# Determining Daytime for August
df_model['Day'].loc[(df_model['MONTH'] == 8) & (df_model['HOUR'] >= 5) & (df_model['HOUR'] <= 21)] = 1

# Determining Daytime for September
df_model['Day'].loc[(df_model['MONTH'] == 9) & (df_model['HOUR'] >= 6) & (df_model['HOUR'] <= 20)] = 1

# Determining Daytime for October
df_model['Day'].loc[(df_model['MONTH'] == 10) & (df_model['HOUR'] >= 6) & (df_model['HOUR'] <= 19)] = 1

# Determining Daytime for November
df_model['Day'].loc[(df_model['MONTH'] == 11) & (df_model['HOUR'] >= 6) & (df_model['HOUR'] <= 17)] = 1

# Determining Daytime for December
df_model['Day'].loc[(df_model['MONTH'] == 12) & (df_model['HOUR'] >= 7) & (df_model['HOUR'] <= 17)] = 1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, val

In [79]:
# Map the night time of the crimes
# We use the fact that : If a record (crime row in dataset) has occured during the day, then it does not occur during the night and vice versa

df_model['Night'].loc[df_model['Day']==0]=1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


### 2.6 Map the District values

We convert the district values which are its codes into numerical values as it better represents the districts during the training phase.

In [80]:
# Find all the unique values of district codes

df_model['DISTRICT'].unique()

array(['B2', 'C11', nan, 'E18', 'D4', 'D14', 'B3', 'C6', 'E13', 'A1',
       'A7', 'A15', 'E5'], dtype=object)

In [81]:
# Map these district codes to numerical values

df_model['DISTRICT'] = df_model['DISTRICT'].map({
    'B3':1, 
    'E18':2, 
    'B2':3, 
    'E5':4, 
    'C6':5, 
    'D14':6, 
    'E13':7, 
    'C11':8, 
    'D4':9, 
    'A7':10, 
    'A1':11, 
    'A15':12
})

### 2.7 Map the "Reporting Area" to numerical values

We perform the same process as done for "District" values so as to keep these suitable for training phase

In [82]:
# To know the number of unique values of "Reporting areas"

len(df_model['REPORTING_AREA'].unique())

880

In [83]:
# We map these 880 values of "Reporting areas" into numerical values

df_model['REPORTING_AREA'] = pd.to_numeric(df_model['REPORTING_AREA'], errors='coerce')

### 2.8 Map the "Day of Week" into Numerical Values

As done for the above columns, we map the days of the week into numerical values from `1` to `7`

In [84]:
# Map the days of the week into numerical values

df_model['DAY_OF_WEEK'] = df_model['DAY_OF_WEEK'].map({
    'Monday':1,
    'Tuesday':2,
    'Wednesday':3,
    'Thursday':4,
    'Friday':5,
    'Saturday':6,
    'Sunday':7
})

### 2.9 Map the "Offence Group" into Numerical Values

In [85]:
# Create a map for the offence groups

offence_code_groups_mapping = {
    'Motor Vehicle Accident Response':1, 
    'Larceny':2, 
    'Medical Assistance':3,
    'Investigate Person':4, 
    'Other':5, 
    'Drug Violation':6, 
    'Simple Assault':7,
    'Vandalism':8, 
    'Verbal Disputes':9, 
    'Towed':10, 
    'Investigate Property':11,
    'Larceny From Motor Vehicle':12
}

In [86]:
# Map these values

df_model['OFFENSE_CODE_GROUP'] = df_model['OFFENSE_CODE_GROUP'].map(offence_code_groups_mapping)

### 2.10 Map `OFFENCE_CODE` Into Numerical Values

In this section, we make sure that all the values in this column are numbers.
Almost all the values here are numbers already, so this section is just a precaution.

In [96]:
# View the list of all offense codes

df_model['OFFENSE_CODE'].value_counts()

3006    19360
3115    19176
3831    16730
1402    15542
802     15199
        ...  
637         1
624         1
627         1
634         1
1864        1
Name: OFFENSE_CODE, Length: 93, dtype: int64

In [88]:
# Map the values of the "Offence Codes"

df_model['OFFENSE_CODE'] = pd.to_numeric(df_model['OFFENSE_CODE'], errors='coerce')

### 2.11 Map the `UCR Part` into Numerical Values

The `UCR_PART` is for the Uniform Crime Reporting Format which denotes the seriousness of the crime.

In [89]:
# View the unique values for this parameter

df_model['UCR_PART'].unique()

array(['Part Two', 'Part Three', 'Part One', 'Other'], dtype=object)

In [90]:
# Create a map for these values

ucr_part_map = {
    'Part Three':3, 
    'Part One':1, 
    'Part Two':2, 
    #'Other':4
}

In [91]:
# Map these values into numbers

df_model['UCR_PART'] = df_model['UCR_PART'].map(ucr_part_map)

In [92]:
df_model['UCR_PART'].unique()

array([ 2.,  3.,  1., nan])

### 2.12 Pre-Processed Dataset

In [93]:
# View the final model which will be used for training

df_model

Unnamed: 0,DISTRICT,REPORTING_AREA,MONTH,DAY_OF_WEEK,HOUR,UCR_PART,Lat,Long,OFFENSE_CODE_GROUP,OFFENSE_CODE,Day,Night
2,3.0,329.0,10,3,19,2.0,42.308126,-71.076930,5,2647,1,0
5,8.0,351.0,10,2,20,2.0,42.299197,-71.060470,8,1402,0,1
6,,,10,3,20,3.0,42.320734,-71.056764,1,3803,0,1
7,3.0,603.0,10,3,19,3.0,42.333807,-71.103778,9,3301,1,0
8,2.0,543.0,10,3,19,2.0,42.256145,-71.128025,7,802,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...
327797,3.0,318.0,10,5,21,2.0,42.311277,-71.089093,7,802,0,1
327799,9.0,285.0,6,7,17,1.0,42.336951,-71.085748,2,629,1,0
327811,11.0,111.0,8,3,12,2.0,42.352312,-71.063705,6,1848,1,0
327812,11.0,111.0,8,3,12,2.0,42.352312,-71.063705,6,1849,1,0


In [94]:
# There are a few NULL values in these columns
# Each Method we take up in section 5 will have their own ways of handling these null values, so this is not touched upon here

df_model.isnull().sum()

DISTRICT               1251
REPORTING_AREA        15385
MONTH                     0
DAY_OF_WEEK               0
HOUR                      0
UCR_PART                 26
Lat                   14859
Long                  14859
OFFENSE_CODE_GROUP        0
OFFENSE_CODE              0
Day                       0
Night                     0
dtype: int64

## 3. Algorithms Design

In this section, we define the Classification Algorithms that we will be using to perform the training

In [42]:
# Import the classifiers used for creating the models
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.tree import ExtraTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.semi_supervised import LabelSpreading
from sklearn.svm import LinearSVC
from sklearn.neighbors.nearest_centroid import NearestCentroid
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier

# Import the metrics libraries for comparinf the results from these algorithms
from sklearn.metrics import f1_score



### 3.1 Results Evaluation

We will define a function which takes in the result of a particlar algorithm and then uses it to find the scores based on the specfied evaluation metrics.

In [43]:
def fun_results(result):
    print('mean: ' + str(result.mean()))
    print('max: ' + str(result.max()))
    print('min: ' + str(result.min()))
    return result

# Create a class for each type of ML classification we will do
# Each model will use this object to put in their scores into this
# We can later take all the scores for a category of classification to print the final result

### 3.2 Decision Trees

In [44]:
def DecisionTreeClassifier_Model(X_train, Y_train, X_test, Y_test, result_obj=None):

    # Create a Decision Tree Classification Model and fit it on the training data
    dec_tree_clf = DecisionTreeClassifier().fit(X_train, Y_train)
    Y_pred = dec_tree_clf.predict(X_test)

    dec_tree_score = f1_score(Y_test, Y_pred, average=None)
    return fun_results(dec_tree_score)

### 3.3 Random Forest Classifier

In [45]:
def RandomForestClassifier_Model(X_train, Y_train, X_test, Y_test, result_obj=None):

    # Create a Random Forest Classification Model and fit it on the training data
    random_forest_clf = RandomForestClassifier().fit(X_train, Y_train)
    Y_pred = random_forest_clf.predict(X_test)

    rfc_score = f1_score(Y_test, Y_pred, average=None)
    return fun_results(rfc_score)

### 3.4 Extra Tree Classifier

**Extra Tree Classifier** is a type of ensemble learning technique which aggregates the results of multiple de-correlated decision trees collected in a “forest” to output it’s classification result. 

In [46]:
def ExtraTreeClassifier_Model(X_train, Y_train, X_test, Y_test, result_obj=None):

    # Create a Extra Tree Classification Model and fit it on the training data
    ext_tree_clf = ExtraTreeClassifier().fit(X_train, Y_train)
    Y_pred = ext_tree_clf.predict(X_test)

    ext_tree_score = f1_score(Y_test, Y_pred, average=None)
    return fun_results(ext_tree_score)

### 3.5 K-Nearest Neighbor Classifier

In [47]:
def KNearestNeighborsClassifier_Model(X_train, Y_train, X_test, Y_test, result_obj=None):

    # Create a K - Nearest Neighbor Classification Model and fit it on the training data
    k_neighbor_clf = KNeighborsClassifier().fit(X_train, Y_train) 
    Y_pred = k_neighbor_clf.predict(X_test)

    neigh_score = f1_score(Y_test, Y_pred, average=None)
    return fun_results(neigh_score)

### 3.6 Bernoulli Naive Bayes Classifier

In [48]:
def BernoulliNB_Model(X_train, Y_train, X_test, Y_test, result_obj=None):

    # Create a Bernoulli Naive Bayes Classification Model and fit it on the training data
    bernoulli_clf = BernoulliNB().fit(X_train, Y_train)
    Y_pred = bernoulli_clf.predict(X_test)

    bernoulli_score = f1_score(Y_test, Y_pred, average=None)
    return fun_results(bernoulli_score)

### 3.7 Gaussian Naive Bayes Classifier

In [49]:
def GaussianNB_Model(X_train, Y_train, X_test, Y_test, result_obj=None):

    # Create a Gaussian Naive Bayes Classification Model and fit it on the training data
    gauss_clf = GaussianNB().fit(X_train, Y_train)
    Y_pred = gauss_clf.predict(X_test)

    gauss_score = f1_score(Y_test, Y_pred, average=None)
    return fun_results(gauss_score)

### 3.8 Light GBM Classifier

**Light GBM** is a gradient boosting framework that uses tree based learning algorithm. (Light Gradient Boosting Machine).
Light GBM grows tree vertically while other algorithm grows trees horizontally meaning that Light GBM grows tree leaf-wise while other algorithm grows level-wise. It will choose the leaf with max delta loss to grow. 

In [50]:
def LGBM_Model(X_train, Y_train, X_test, Y_test, result_obj=None):

    # Create a Light GBM Classification Model and fit it on the training data
    lgbm_clf = LGBMClassifier().fit(X_train, Y_train)
    Y_pred = lgbm_clf.predict(X_test)

    clf_score = f1_score(Y_test, Y_pred, average=None)
    return fun_results(clf_score)

## 4. Modelling Scenarios

In this section, we take up different scenarios in which ML based Pedictions can be useful, and what they mean for us, and also how the different classification models perform for these scenarios.

### 4.1 Prediction of Future Crime Offence Categories

In this section, we will explore the possibility of being able to predict the "Offence Group" of the crime that might happen in the future based on parameters like `date`, `time`, `location`, etc

In [51]:
# Create a copy of the dataset for this modelling

df_model_1 = df_model.copy()

In [52]:
# Handle the NULL values by replacing them to zeros

df_model_1.fillna(0, inplace = True)

In [53]:
# Partition the columns into input and output variables for making predictions
# X_1 => Input set of parameters
# Y_1 => Output variable

X_1 = df_model_1[['DISTRICT','REPORTING_AREA','MONTH','DAY_OF_WEEK','HOUR','Lat','Long','Day','Night']]
Y_1 = df_model_1['OFFENSE_CODE_GROUP']

In [54]:
# Split the dataframe into random train and test subsets

X_train_1, X_test_1, Y_train_1, Y_test_1 = train_test_split(
    X_1,
    Y_1, 
    test_size = 0.1,
    random_state=42
)

print(f"Train Set Shape : ({X_train_1.shape}, {Y_train_1.shape})")
print(f"Test Set Shape : ({X_test_1.shape}, {Y_test_1.shape})")

Train Set Shape : ((201240, 9), (201240,))
Test Set Shape : ((22361, 9), (22361,))


In [55]:
# Modelling based on different algorithms

DecisionTreeClassifier_Model(X_train_1, Y_train_1, X_test_1, Y_test_1)
RandomForestClassifier_Model(X_train_1, Y_train_1, X_test_1, Y_test_1)
ExtraTreeClassifier_Model(X_train_1, Y_train_1, X_test_1, Y_test_1)
KNearestNeighborsClassifier_Model(X_train_1, Y_train_1, X_test_1, Y_test_1)
BernoulliNB_Model(X_train_1, Y_train_1, X_test_1, Y_test_1)
GaussianNB_Model(X_train_1, Y_train_1, X_test_1, Y_test_1)
LGBM_Model(X_train_1, Y_train_1, X_test_1, Y_test_1)

mean: 0.20494950403481257
max: 0.4809775062482643
min: 0.10065645514223195
mean: 0.23522790351160838
max: 0.5267349260523322
min: 0.09523809523809523
mean: 0.18928894904594343
max: 0.4768842813112917
min: 0.08896000000000001
mean: 0.17734560867895388
max: 0.3580034423407917
min: 0.06781829814459374
mean: 0.04918694379846069
max: 0.2836893591535236
min: 0.0
mean: 0.06197873550466903
max: 0.2630730659025788
min: 0.0
mean: 0.20067793617060695
max: 0.40472489897419955
min: 0.018007202881152463


array([0.36109618, 0.4047249 , 0.17317073, 0.07883462, 0.09892086,
       0.36897404, 0.09596929, 0.0180072 , 0.16586804, 0.3457424 ,
       0.21297602, 0.08385093])

### 4.2 Prediction of Location of Future Crimes

Use the `date` and the `offence type` to predict where or in which district a particular type of crime can occur.

In [97]:
# Create a copy of the dataset for this modelling

df_model_2 = df_model.copy()

In [98]:
# Handle the NULL values by dropping records that contain them

df_model_2 = df_model_2.dropna()

In [99]:
# Partition the columns into input and output variables for making predictions
# X_2 => Input set of parameters
# Y_2 => Output variable

X_2 = df_model_2[['OFFENSE_CODE', 'DISTRICT','MONTH','DAY_OF_WEEK','HOUR','Day','Night']]
Y_2 = df_model_2['DISTRICT']

In [100]:
# Split the dataframe into random train and test subsets

X_train_2, X_test_2, Y_train_2, Y_test_2 = train_test_split(
    X_2,
    Y_2, 
    test_size = 0.1,
    random_state=42
)

print(f"Train Set Shape : ({X_train_2.shape}, {Y_train_2.shape})")
print(f"Test Set Shape : ({X_test_2.shape}, {Y_test_2.shape})")

Train Set Shape : ((186048, 7), (186048,))
Test Set Shape : ((20672, 7), (20672,))


In [101]:
# Modelling based on different algorithms

DecisionTreeClassifier_Model(X_train_2, Y_train_2, X_test_2, Y_test_2)
RandomForestClassifier_Model(X_train_2, Y_train_2, X_test_2, Y_test_2)
ExtraTreeClassifier_Model(X_train_2, Y_train_2, X_test_2, Y_test_2)
KNearestNeighborsClassifier_Model(X_train_2, Y_train_2, X_test_2, Y_test_2)
BernoulliNB_Model(X_train_2, Y_train_2, X_test_2, Y_test_2)
GaussianNB_Model(X_train_2, Y_train_2, X_test_2, Y_test_2)
LGBM_Model(X_train_2, Y_train_2, X_test_2, Y_test_2)

mean: 1.0
max: 1.0
min: 1.0
mean: 1.0
max: 1.0
min: 1.0
mean: 0.9117899683452905
max: 0.9702497285559175
min: 0.855131964809384
mean: 0.7700538359868779
max: 0.9306139613120269
min: 0.4846526655896608
mean: 0.027088428590105615
max: 0.2591089584151503
min: 0.0
mean: 1.0
max: 1.0
min: 1.0
mean: 1.0
max: 1.0
min: 1.0


array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])

### 4.3 Prediction of the Seriousness of Crimes in the near Future

This is based on the `UCR` parameter which stands for `Uniform Crime Reporting`

This section deals with the prediction of the seriousness of the crime that might occur at a particular time and location.

The `UCR_PART` has categories like part 1-3, with one being the most serious types of crimes.

In [63]:
# Create a copy of the dataset for this modelling

df_model_3 = df_model.copy()

In [64]:
# Handle the NULL values by dropping records that contain them

df_model_3 = df_model_3.dropna()

In [65]:
# Partition the columns into input and output variables for making predictions
# X_3 => Input set of parameters
# Y_3 => Output variable

X_3 = df_model_3[['DISTRICT','REPORTING_AREA', 'MONTH','DAY_OF_WEEK','HOUR','Lat','Long']]
Y_3 = df_model_3['UCR_PART']

In [66]:
# Split the dataframe into random train and test subsets

X_train_3, X_test_3, Y_train_3, Y_test_3 = train_test_split(
    X_3,
    Y_3, 
    test_size = 0.1,
    random_state=42
)

print(f"Train Set Shape : ({X_train_3.shape}, {Y_train_3.shape})")
print(f"Test Set Shape : ({X_test_3.shape}, {Y_test_3.shape})")

Train Set Shape : ((186048, 7), (186048,))
Test Set Shape : ((20672, 7), (20672,))


In [67]:
# Modelling based on different algorithms

DecisionTreeClassifier_Model(X_train_3, Y_train_3, X_test_3, Y_test_3)
RandomForestClassifier_Model(X_train_3, Y_train_3, X_test_3, Y_test_3)
ExtraTreeClassifier_Model(X_train_3, Y_train_3, X_test_3, Y_test_3)
KNearestNeighborsClassifier_Model(X_train_3, Y_train_3, X_test_3, Y_test_3)
BernoulliNB_Model(X_train_3, Y_train_3, X_test_3, Y_test_3)
GaussianNB_Model(X_train_3, Y_train_3, X_test_3, Y_test_3)
LGBM_Model(X_train_3, Y_train_3, X_test_3, Y_test_3)

mean: 0.45453698233938233
max: 0.5943256303526147
min: 0.3468071099407505
mean: 0.49145448541656034
max: 0.6921983511351175
min: 0.3623822341857335
mean: 0.4430550063354736
max: 0.5909974861262628
min: 0.32272606735613896
mean: 0.4427394059264255
max: 0.6010874034815188
min: 0.3340653307713423
mean: 0.22873682167592144
max: 0.685886840432295
min: 0.0
mean: 0.22873682167592144
max: 0.685886840432295
min: 0.0
mean: 0.41560243592475254
max: 0.7024633657799487
min: 0.2478837650031586


array([0.29646018, 0.24788377, 0.70246337])

# Partially corrected until here

## Classification based on time only

Use the `date` and the `offence type` to predict where or in which `district` a particular type of crime can occur.

In [None]:
df_model_2 = df[['OFFENSE_CODE', 'DISTRICT','MONTH','DAY_OF_WEEK','HOUR','Day','Night']]
df_model_2['OFFENSE_CODE'] = pd.to_numeric(df_model_2['OFFENSE_CODE'], errors='coerce')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [None]:
# DISTRICT

df_model_2['DISTRICT'] = df_model_2['DISTRICT'].map({
    'B3':1, 
    'E18':2, 
    'B2':3, 
    'E5':4, 
    'C6':5, 
    'D14':6, 
    'E13':7, 
    'C11':8, 
    'D4':9, 
    'A7':10, 
    'A1':11, 
    'A15':12
})

df_model_2['DISTRICT'].unique()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app


array([ 2.,  6.,  3., 11., 10.,  8., nan,  9.,  7.,  1.,  5., 12.,  4.])

In [None]:
# DAY_OF_WEEK

df_model_2['DAY_OF_WEEK'] = df_model_2['DAY_OF_WEEK'].map({
    'Tuesday':2, 
    'Saturday':6, 
    'Monday':1, 
    'Sunday':7, 
    'Thursday':4, 
    'Wednesday':3,
    'Friday':5
})

df_model_2['DAY_OF_WEEK'].unique()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.


array([3, 4, 2, 1, 6, 7, 5])

In [None]:
df_model_2.isnull().sum()

OFFENSE_CODE       0
DISTRICT        1774
MONTH              0
DAY_OF_WEEK        0
HOUR               0
Day                0
Night              0
dtype: int64

In [None]:
df_model_2 = df_model_2.dropna()
df_model_2['DISTRICT'].unique()

array([ 2.,  6.,  3., 11., 10.,  8.,  9.,  7.,  1.,  5., 12.,  4.])

In [None]:
df_model_2.shape

(326046, 7)

In [None]:
x = df_model_2[['OFFENSE_CODE','MONTH','DAY_OF_WEEK','HOUR','Day','Night']]
y = df_model_2['DISTRICT']

In [None]:
# Split dataframe into random train and test subsets

X_train, X_test, Y_train, Y_test = train_test_split(
    x,
    y, 
    test_size = 0.1,
    random_state=42
)

print(X_train.shape, Y_train.shape)
print(X_test.shape, Y_test.shape)

(293441, 6) (293441,)
(32605, 6) (32605,)


In [None]:
fun_DecisionTreeClassifier(X_train, Y_train)

mean: 0.1134380676293032
max: 0.21250719040184074
min: 0.013683010262257697


array([0.18710518, 0.09007634, 0.21250719, 0.05863454, 0.09761533,
       0.0870542 , 0.05537245, 0.15197495, 0.20222521, 0.03539823,
       0.16961018, 0.01368301])

In [None]:
fun_BernoulliNB(X_train, Y_train)

mean: 0.029193253496147903
max: 0.27446228415631624
min: 0.0


array([0.        , 0.        , 0.27446228, 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.07585676, 0.        ])

In [None]:
fun_ExtraTreeClassifier(X_train, Y_train)

mean: 0.111408207693842
max: 0.20839433870180574
min: 0.01408450704225352


array([0.18944223, 0.08879816, 0.20839434, 0.05383688, 0.09774604,
       0.08338028, 0.05836967, 0.14926096, 0.20039053, 0.03247774,
       0.16071714, 0.01408451])

In [None]:
fun_KNeighborsClassifier(X_train, Y_train)

mean: 0.10796506324051282
max: 0.2128002579397066
min: 0.007653061224489796


array([0.19255863, 0.08250825, 0.21280026, 0.04460303, 0.0862069 ,
       0.08146766, 0.04778157, 0.14998135, 0.19602063, 0.02480271,
       0.16919671, 0.00765306])

In [None]:
fun_GaussianNB(X_train, Y_train)

mean: 0.05671163456222778
max: 0.2624587528555715
min: 0.0


array([0.        , 0.        , 0.26245875, 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.23499748, 0.        ,
       0.18308338, 0.        ])

In [None]:
fun_RandomForestClassifier(X_train, Y_train)

mean: 0.11720923196523449
max: 0.2147693609818028
min: 0.01874414245548266


array([0.17482609, 0.07060334, 0.20725481, 0.06521739, 0.08941394,
       0.08150818, 0.06983328, 0.16835592, 0.21476936, 0.04867257,
       0.19731176, 0.01874414])

In [None]:
fun_LGBMClassifier(X_train, Y_train)

mean: 0.10939254551439898
max: 0.2820928237872868
min: 0.002962962962962963


array([0.16477173, 0.00766284, 0.28142721, 0.02236198, 0.0501764 ,
       0.03815728, 0.00560224, 0.17945142, 0.28209282, 0.01025641,
       0.26778724, 0.00296296])