<a href="https://www.kaggle.com/code/aronpollner/predicting-age-and-race-of-offender-nd-2021?scriptVersionId=114908006" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# FBI crime report analysis
##### The objective of this notebook is to use **Machine Learning** models to predict crime data. I chose to predict the age (numerical) and race (categorical) of the offenders based on data from the victims (age, race, ethnicity, sex, etc), the relationship between victim and offender, crime location, crime type, etc. I'll be using North Dakota 2021 data.

![S](https://www.fbi.gov/image-repository/nibrs-logo-1.jpg/@@images/image/large)


   ![](https://media0.giphy.com/media/rCqHtYuB0a9re731gG/giphy.gif)

#### Library Imports

In [None]:
import itertools
import numpy as np
import pandas as pd
import os
import glob
import re
from tqdm import tqdm
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt
import ipywidgets as widgets
import seaborn as sns
from pandas.tseries.holiday import USFederalHolidayCalendar
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
# modeling
from sklearn.ensemble import GradientBoostingRegressor, AdaBoostRegressor,RandomForestRegressor,\
ExtraTreesRegressor,BaggingRegressor,RandomForestClassifier,ExtraTreesClassifier,GradientBoostingClassifier,\
     AdaBoostClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from xgboost import XGBRegressor
# data evaluation
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay,\
     mean_squared_error,r2_score,mean_absolute_error,accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
import warnings
warnings.simplefilter(action="ignore", category=FutureWarning)
pd.options.mode.chained_assignment = None 
np.random.seed(31415)

#### Data Loading

In [None]:
state = 'ND-'
year = '2021'

# Directory that holds every NIBRS file of the chosen state and year
file_path = rf'/kaggle/input/us-fbi-nibrs-crime-data-2021-all-states/{state}{year}/'
# list all the files from the directory
file_list = os.listdir(file_path)
# list only csv files: compare the real file extension instead of searching for
# the substring "csv" anywhere in the name, so that stripping the extension
# below always yields the intended dataframe name
csv_list = [file for file in file_list
            if os.path.splitext(file)[1].lower() == '.csv']
# dictionary of dataframes corresponding to each csv file, where the key is the
# lower-cased file name without extension and the value is the actual data frame
df_dict = {}
for file in csv_list:
    frame = pd.read_csv(os.path.join(file_path, file))
    # Since we chose the state and year, columns whose name contains
    # 'data_year' or 'state' don't give any extra information so we drop them
    redundant_columns = frame.filter(regex='data_year|state').columns
    df_dict[os.path.splitext(file)[0].lower()] = frame.drop(columns=redundant_columns)
# Now we create a module-level variable for every dictionary key and bind it
# to the corresponding dataframe. globals() is used because writing to the
# mapping returned by locals() is only reliable at module level.
globals().update(df_dict)

##### Merging the dataframes
###### For this project I chose to use the offenders data and ignore the arrestees since the former has more data, although caution should be taken because data collected for offenders who were not arrested might be less reliable

In [None]:
# Attach the age, race and ethnicity lookup tables to the offender and victim
# tables. The age merge is an outer join; the race and ethnicity merges use
# the pandas default (inner) join.
nibrs_offender = (
    nibrs_offender
    .merge(nibrs_age, how='outer', on='age_id')
    .merge(ref_race, on='race_id')
    .merge(nibrs_ethnicity, on='ethnicity_id')
)
nibrs_victim = (
    nibrs_victim
    .merge(nibrs_age, how='outer', on='age_id')
    .merge(ref_race, on='race_id')
    .merge(nibrs_ethnicity, on='ethnicity_id')
)
# Pair every victim with every offender of the same incident; columns present
# in both tables get the '_victim' / '_offender' suffixes.
df_victim_offender = nibrs_victim.merge(
    nibrs_offender,
    how='outer',
    on='incident_id',
    suffixes=('_victim', '_offender'),
)

# Build the wide table: start from the victim/offense links and outer-join
# every other table one after the other, in a fixed order.
df_combined = (
    nibrs_victim_offense
    .merge(nibrs_offense, how='outer', on='offense_id')
    .merge(nibrs_suspect_using, how='outer', on='offense_id')
    .merge(nibrs_bias_motivation, how='outer', on='offense_id')
    .merge(df_victim_offender, how='outer',
           left_on=['incident_id', 'victim_id'],
           right_on=['incident_id', 'victim_id'])
    .merge(nibrs_incident, how='outer', on='incident_id')
    .merge(nibrs_property, how='outer', on='incident_id')
    .merge(nibrs_property_desc, how='outer', on='property_id')
    .merge(nibrs_victim_offender_rel, how='outer',
           left_on=['victim_id', 'offender_id'],
           right_on=['victim_id', 'offender_id'])
    .merge(nibrs_criminal_act, how='outer', on='offense_id')
    .merge(nibrs_weapon, how='outer', on='offense_id')
    .merge(nibrs_victim_injury, how='outer', on='victim_id')
    .merge(agencies, how='outer', on='agency_id')
    .merge(nibrs_offense_type, how='outer', on='offense_code')
    .merge(nibrs_prop_desc_type, how='outer', on='prop_desc_id')
    .merge(nibrs_location_type, how='outer', on='location_id')
    .merge(nibrs_bias_list, how='outer', on='bias_id')
    .merge(nibrs_weapon_type, how='outer', on='weapon_id')
    .merge(nibrs_relationship, how='outer', on='relationship_id')
    .merge(nibrs_victim_circumstances, how='outer', on='victim_id')
    .merge(nibrs_circumstances, how='outer', on='circumstances_id')
    .merge(nibrs_criminal_act_type, how='outer', on='criminal_act_id')
    .merge(nibrs_activity_type, how='outer', on='activity_type_id')
    .merge(nibrs_victim_type, how='outer', on='victim_type_id')
    .merge(nibrs_prop_loss_type, how='outer', on='prop_loss_id')
    .merge(nibrs_injury, how='outer', on='injury_id')
    # The last lookup uses the default inner join, so only rows with a
    # matching suspect_using_id are kept.
    .merge(nibrs_using_list, on='suspect_using_id')
)

#### Data preprocessing, cleaning and feature engineering
<a id='Data preprocessing, cleaning and feature engineering'></a>

In [None]:
# Work on an independent (deep) copy so the merged dataframe stays untouched
df_cleaned = df_combined.copy(deep=True)

In [None]:
# Lift the column display limit so the complete dataframe is shown
pd.options.display.max_columns = None
df_cleaned

In [None]:
# Parse the incident date column into datetime values
# NOTE(review): the format string expects year/month/day separated by
# slashes - confirm it matches the raw csv values.
df_cleaned["incident_date"] = pd.to_datetime(
    arg=df_cleaned.incident_date, format="%Y/%m/%d")

In [None]:
# Human behaviour follows the calendar, so we derive extra features from the
# incident date that could help the predictions: day of the week, weekend
# flag, US federal holiday flag, month, day of the month and ISO week of year.
incident_dt = df_cleaned['incident_date'].dt
df_cleaned['incident_day'] = incident_dt.dayofweek
# dayofweek counts from Monday = 0, so values above 4 are Saturday and Sunday
df_cleaned['incident_isweekend'] = df_cleaned['incident_day'].gt(4)
cal = USFederalHolidayCalendar()
holidays = cal.holidays()
df_cleaned['incident_is_holiday'] = df_cleaned['incident_date'].isin(holidays)
df_cleaned['incident_month'] = incident_dt.month
df_cleaned['incident_dayofmonth'] = incident_dt.day
df_cleaned['incident_weekofyear'] = incident_dt.isocalendar().week

###### Each NIBRS offense belongs to one of three categories: Crimes Against Persons, Crimes Against Property, and Crimes Against Society ([See this document](Crimes_Against_Persons_Property_and_Society.pdf)). Therefore we split the data into these categories. In this project we will focus on Crimes Against Persons.

In [None]:
# One dataframe per NIBRS offense category, taken from a single groupby
grouped = df_cleaned.groupby(df_cleaned['crime_against'])
df_person, df_society, df_property = (
    grouped.get_group(category)
    for category in ("Person", "Society", "Property")
)

###### Let's plot the crime against persons along time

###### First we plot the number of incidents per day along the year. In the background we added green bands representing weekends and red bands representing holidays. Here we can see that there is a long up-and-down movement a few times a year, short weekly movements 52 times a year, and perhaps others.

In [None]:
# Sorted unique incident dates that fall on a weekend / on a federal holiday
weekends = sorted(df_person.loc[df_person['incident_isweekend'] == True, 'incident_date'].unique())
holidays = sorted(df_person.loc[df_person['incident_is_holiday'] == True, 'incident_date'].unique())
# Number of distinct incidents per day
df_person.groupby('incident_date')['incident_id'].nunique().plot(
    title='incident_date' + ' time series', figsize=(12, 6))
one_day = np.timedelta64(1, 'D')
# Shade every weekend day on its own. Pairing consecutive list entries as
# (Saturday, Sunday) would shift all later bands as soon as one weekend day
# has no recorded incident.
for day in weekends:
    plt.axvspan(day, day + one_day, facecolor='green', edgecolor='none', alpha=.2)
# Holidays are shaded in red, one band per day
for day in holidays:
    plt.axvspan(day, day + one_day, facecolor='red', edgecolor='none', alpha=.5)

###### It can be seen that in general there is more crime during the weekend

In [None]:
# Incidents per day of the week, one palette shade per day
p1 = sns.cubehelix_palette(n_colors=7)
plt.figure(figsize=(13, 8))
sns.countplot(data=df_person, x="incident_day", palette=p1)

###### Here we see something strange: at 12am there are many more crimes. This might be because crimes without a recorded time are recorded at 12am by default. We'll have to take care of this anomaly.

In [None]:
# Incidents per hour of the day, one palette shade per hour
p2 = sns.cubehelix_palette(n_colors=24)
plt.figure(figsize=(13, 8))
sns.countplot(data=df_person, x="incident_hour", color='grey', palette=p2)

###### It seems that there's a trend for crimes to go down along the month, although not all months have 31 days.

In [None]:
# Incidents per day of the month, one palette shade per possible day
p3 = sns.cubehelix_palette(n_colors=31)
plt.figure(figsize=(13, 8))
sns.countplot(data=df_person, x="incident_dayofmonth", color='grey', palette=p3)

###### Let's remove the columns that are empty or almost empty (fewer than two non-null values)


In [None]:
# Keep only the columns that hold at least two non-null values
for df in (df_person, df_property, df_society):
    df.dropna(axis='columns', thresh=2, inplace=True)

In [None]:
# Frequency of each victim/offender relationship in crimes against persons
df_person['relationship_name'].value_counts()

###### Remove all columns where there is a class that represents more than 90% of the data

In [None]:
# Drop every column whose most frequent value (missing values count as a
# value of their own) makes up at least 90% of the rows
for df in [df_person, df_property, df_society]:
    very_imbalanced_columns = [
        col for col in tqdm(df.columns)
        if df[col].value_counts(normalize=True, dropna=False).max() >= 0.90
    ]
    df.drop(columns=very_imbalanced_columns, inplace=True)

###### Remove all columns where there is more than 90% null values

In [None]:
for df in [df_person, df_property, df_society]:
    # Fraction of missing values in each column
    null_share = df.isna().sum() / df.shape[0]
    # Labels of the columns with more than 90% null values
    mostly_null = null_share[null_share > 0.90].index
    df.drop(columns=mostly_null, inplace=True)

###### Let's remove columns that seem irrelevant for our goal

In [None]:
# Columns judged irrelevant for predicting the offender's age and race
irrelevant_columns = [
    'num_premises_entered',
    'age_range_low_num_victim',
    'notes_victim',
    'population_group_desc',
    'county_name',
    'age_range_low_num_offender',
    'notes_offender',
    'submission_date',
    'cleared_except_date',
    'offense_category_name',
    'incident_status',
    'did',
    'suspected_drug_name',
    'drug_measure_name',
    'date_recovered',
    'ori',
    'legacy_ori',
    'ucr_agency_name',
    'ncic_agency_name',
    'pub_agency_unit',
    'prop_loss_desc',
    'suburban_area_flag',
    'parent_pop_group_desc',
    'mip_flag',
    'pe_reported_flag',
    'nibrs_start_date',
    'nibrs_leoka_start_date',
    'nibrs_ct_start_date',
    'msa_name',
    'ct_flag',
    'age_range_high_num',
    'est_drug_qty',
    'male_officer',
    'male_civilian',
    'criminal_act_desc',
    'male_officer+male_civilian',
    'female_officer',
    'female_civilian',
    'female_officer+female_civilian',
    'officer_rate',
    'hc_flag',
    'shr_flag',
    'employee_rate',
]
for df in [df_person, df_property, df_society]:
    # errors='ignore': columns already removed by earlier steps are skipped
    df.drop(columns=irrelevant_columns, errors='ignore', inplace=True)

###### Looking at the nibrs_age csv we see that a minority of age entries don't correspond to a specific age. We'll remove these for simplicity

In [None]:
# age_id values that we exclude because they are not a specific age
non_specific_age_ids = [1, 2, 3, 103, 104]
# Keep a row only when neither the victim nor the offender has such an id
has_non_specific_age = (
    df_person['age_id_victim'].isin(non_specific_age_ids)
    | df_person['age_id_offender'].isin(non_specific_age_ids)
)
df_person = df_person.loc[~has_non_specific_age]

###### Let's remove columns that don't give valuable information like id and code columns

In [None]:
for df in [df_person, df_property, df_society]:
    # Every column whose name contains '_id'
    id_columns = list(df.filter(regex='_id'))
    df.drop(columns=id_columns, errors='ignore', inplace=True)
    # Every column whose name contains '_code', except resident_status_code,
    # which is kept
    code_columns = [name for name in df.filter(regex='_code')
                    if name != 'resident_status_code']
    df.drop(columns=code_columns, errors='ignore', inplace=True)

###### Let's transform age to numeric

In [None]:
# Convert both age columns to a numeric dtype
for age_column in ("age_num_victim", "age_num_offender"):
    df_person[age_column] = pd.to_numeric(df_person[age_column])

In [None]:
# Overlay the offender and victim age distributions on the same axes
for age_column, legend_label in (('age_num_offender', 'Age Offender'),
                                 ('age_num_victim', 'Age Victim')):
    df_person[age_column].hist(alpha=.5, label=legend_label)
plt.title('Histogram Age Victim vs Offender')
plt.legend()

#### Predictions

###### We'll choose the relevant inputs and we'll predict two outputs, one numerical (age of the offender) and one categorical (race of the offender)

In [None]:
# Predictor columns: victim, incident, agency and relationship attributes
input_columns = [
    'victim_seq_num',
    'age_num_victim',
    'resident_status_code',
    'race_desc_victim',
    'ethnicity_name_victim',
    'incident_hour',
    'pub_agency_name',
    'offense_name',
    'location_name',
    'weapon_name',
    'population',
    'injury_name',
    'incident_day',
    'incident_isweekend',
    'incident_month',
    'incident_dayofmonth',
    'incident_weekofyear',
    'relationship_name',
]
inputFeature = df_person[input_columns]
# Targets are kept as single-column dataframes
numerical_output = df_person[['age_num_offender']]      # regression target
categorial_output = df_person[['race_desc_offender']]   # classification target

##### Categorical predictions

###### We split the data into 90/10 train/test and shuffle

In [None]:
# 90/10 split, stratified on the offender race so both parts keep the same
# class proportions
X_train, X_test, y_train, y_test = train_test_split(
    inputFeature,
    categorial_output,
    stratify=categorial_output,
    test_size=0.1,
    random_state=42,
)

In [None]:
# Columns routed to the numeric preprocessing pipeline
numerical_features = [
    'population',
    'victim_seq_num',
    'age_num_victim',
    'incident_hour',
    'incident_month',
    'incident_day',
    'incident_dayofmonth',
    'incident_weekofyear',
]
# Columns routed to the categorical preprocessing pipeline
categorical_features = [
    'resident_status_code',
    'race_desc_victim',
    'ethnicity_name_victim',
    'pub_agency_name',
    'offense_name',
    'location_name',
    'weapon_name',
    'injury_name',
    'relationship_name',
    'incident_isweekend',
]

###### We create a pipeline for imputing missing values, scaling the numerical features and do one hot encoding for categorical ones

In [None]:
# Numeric columns: fill gaps with the median, then standardise
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)

# Categorical columns: fill gaps with the literal 'missing', then one-hot
# encode; categories unseen during fit are ignored at transform time
categorical_steps = [
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

In [None]:
# Send each group of columns through its own transformer
column_transformers = [
    ('num', numeric_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_transformers)

###### We see below that all the predictors perform better than predicting the most common value, with **Random Forest Classifier** and **Extra Trees Classifier** being the best ones. We see that both classifiers do a similarly good job even with imbalanced classes, with a little bit of misclassification between White, African American and American Indian or Alaska Native. We show the accuracy along with the confusion matrix and the distribution plot for each model.

In [None]:
# Candidate models for predicting the offender's race
classifiers = [
    RandomForestClassifier(),
    ExtraTreesClassifier(),
    KNeighborsClassifier(3),
    DecisionTreeClassifier(),
    AdaBoostClassifier(),
    GradientBoostingClassifier(),
    SVC(kernel="rbf", C=0.025, probability=True),
    ]
# Race labels ordered from most to least frequent; this fixes the row/column
# order of every confusion matrix
race_list = df_person.race_desc_offender.value_counts().index.tolist()
for classifier in classifiers:
    pipe = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', classifier)])
    pipe.fit(X_train, y_train.values.ravel())
    # Predict once and reuse the result for the score and both plots
    y_pred = pipe.predict(X_test)
    model_name = type(classifier).__name__
    print(model_name)
    print("model Accuracy score: %.3f" % accuracy_score(y_test, y_pred))
    # Confusion matrix with raw counts (not normalized)
    confusion_matrix_race = confusion_matrix(y_test, y_pred, labels=race_list)
    disp = ConfusionMatrixDisplay(confusion_matrix=confusion_matrix_race,
                                  display_labels=race_list)
    a4_dims = (10, 6)
    fig, ax = plt.subplots(1, 2, figsize=a4_dims)
    disp.plot(xticks_rotation='vertical', ax=ax[0])
    # Distribution of the predicted classes. The vector is passed as x=
    # because recent seaborn versions treat the first positional argument
    # as `data`.
    sns.countplot(x=y_pred, ax=ax[1]).set(title=model_name)
    ax[1].tick_params(axis='x', rotation=90)
    fig.subplots_adjust(wspace=.3)
    plt.show()



###### We get the base case, which is always predicting the most common value, and see it is the worst

In [None]:
# Baseline prediction: the most frequent offender race in df_person
race_count = df_person.race_desc_offender.value_counts()
race_base = race_count.idxmax()

In [None]:
# Accuracy obtained when every test row is assigned the most common race
baseline_predictions = [race_base] * len(y_test)
print("model Accuracy score: %.3f" % accuracy_score(y_test, baseline_predictions))

##### Numerical Predictions

###### We split the data into 90/10 train/test and shuffle

In [None]:
# 90/10 split for the age regression (no stratification for a numeric target)
X_train, X_test, y_train, y_test = train_test_split(
    inputFeature,
    numerical_output,
    test_size=0.1,
    random_state=42,
)

###### We compare the R2 score, RMSE and density plots of the predicting models with the test. It seems that **Random Forest Regressor** and **Extra Trees Regressor** are the best ones

In [None]:
# Candidate models for predicting the offender's age
regressors = [
    GradientBoostingRegressor(),
    AdaBoostRegressor(),
    ExtraTreesRegressor(),
    BaggingRegressor(),
    RandomForestRegressor(),
    XGBRegressor(objective="reg:squarederror", random_state=42, verbosity=0),
]

for regressor in regressors:
    pipe = Pipeline(steps=[('preprocessor', preprocessor),
                           ('regressor', regressor)])
    pipe.fit(X_train, y_train.values.ravel())
    # Predict once and reuse the result for both metrics and the density plot
    y_pred = pipe.predict(X_test)
    model_name = type(regressor).__name__
    print(model_name)
    print("model R2 score: %.3f" % r2_score(y_test, y_pred))
    # RMSE as the square root of the MSE: the `squared=False` argument of
    # mean_squared_error is deprecated and absent from recent scikit-learn
    print("model RMSE: %.3f" % np.sqrt(mean_squared_error(y_test.values.ravel(), y_pred)))
    sns.kdeplot(y_pred, label=model_name)
# Density of the true test ages, for comparison with every model
sns.kdeplot(y_test.values.ravel(), label='Test')
plt.legend()

###### We get the base case, which is always predicting the mean, and see it is the worst

In [None]:
# Baseline prediction: the mean offender age of the training targets only,
# so that no information from the test rows enters the baseline
age_base = y_train['age_num_offender'].mean()

In [None]:
# Metrics obtained when every test row is assigned the baseline age
baseline_predictions = [age_base] * len(y_test)
print("base model R2 score: %.3f" % r2_score(y_test, baseline_predictions))
# RMSE as the square root of the MSE: the `squared=False` argument of
# mean_squared_error is deprecated and absent from recent scikit-learn
print("base model RMSE: %.3f" % np.sqrt(mean_squared_error(y_test.values.ravel(), baseline_predictions)))