In [19]:
# ==========================================
# CrimeHotspotSim - Data Preprocessing Notebook
# ==========================================
# Goal:
# Explore, clean, and prepare crime and contextual datasets
# for spatio-temporal prediction and hotspot simulation.
# ==========================================

# --- 1. Import Libraries ---
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import geopandas as gpd
import os

# Set visualization style
sns.set(style="whitegrid", palette="muted")

In [20]:
# --- 2. Load Dataset ---
# Path to raw dataset
crime_path = "../data/raw/Crime_Data_from_2020_to_Present.csv" # Name of the Dataset (You can name it whatever you want.)

# Load data
df = pd.read_csv(crime_path)

# Basic inspection
print("Dataset shape:", df.shape)
df.head()

Dataset shape: (1004991, 28)


Unnamed: 0,DR_NO,Date Rptd,DATE OCC,TIME OCC,AREA,AREA NAME,Rpt Dist No,Part 1-2,Crm Cd,Crm Cd Desc,...,Status,Status Desc,Crm Cd 1,Crm Cd 2,Crm Cd 3,Crm Cd 4,LOCATION,Cross Street,LAT,LON
0,211507896,04/11/2021 12:00:00 AM,11/07/2020 12:00:00 AM,845,15,N Hollywood,1502,2,354,THEFT OF IDENTITY,...,IC,Invest Cont,354.0,,,,7800 BEEMAN AV,,34.2124,-118.4092
1,201516622,10/21/2020 12:00:00 AM,10/18/2020 12:00:00 AM,1845,15,N Hollywood,1521,1,230,"ASSAULT WITH DEADLY WEAPON, AGGRAVATED ASSAULT",...,IC,Invest Cont,230.0,,,,ATOLL AV,N GAULT,34.1993,-118.4203
2,240913563,12/10/2024 12:00:00 AM,10/30/2020 12:00:00 AM,1240,9,Van Nuys,933,2,354,THEFT OF IDENTITY,...,IC,Invest Cont,354.0,,,,14600 SYLVAN ST,,34.1847,-118.4509
3,210704711,12/24/2020 12:00:00 AM,12/24/2020 12:00:00 AM,1310,7,Wilshire,782,1,331,THEFT FROM MOTOR VEHICLE - GRAND ($950.01 AND ...,...,IC,Invest Cont,331.0,,,,6000 COMEY AV,,34.0339,-118.3747
4,201418201,10/03/2020 12:00:00 AM,09/29/2020 12:00:00 AM,1830,14,Pacific,1454,1,420,THEFT FROM MOTOR VEHICLE - PETTY ($950 & UNDER),...,IC,Invest Cont,420.0,,,,4700 LA VILLA MARINA,,33.9813,-118.435


In [21]:
# --- 3. Initial Exploration ---
# View dataset info
df.info()

# Check missing values
df.isna().sum()

# Quick statistics
df.describe(include='all')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1004991 entries, 0 to 1004990
Data columns (total 28 columns):
 #   Column          Non-Null Count    Dtype  
---  ------          --------------    -----  
 0   DR_NO           1004991 non-null  int64  
 1   Date Rptd       1004991 non-null  object 
 2   DATE OCC        1004991 non-null  object 
 3   TIME OCC        1004991 non-null  int64  
 4   AREA            1004991 non-null  int64  
 5   AREA NAME       1004991 non-null  object 
 6   Rpt Dist No     1004991 non-null  int64  
 7   Part 1-2        1004991 non-null  int64  
 8   Crm Cd          1004991 non-null  int64  
 9   Crm Cd Desc     1004991 non-null  object 
 10  Mocodes         853372 non-null   object 
 11  Vict Age        1004991 non-null  int64  
 12  Vict Sex        860347 non-null   object 
 13  Vict Descent    860335 non-null   object 
 14  Premis Cd       1004975 non-null  float64
 15  Premis Desc     1004403 non-null  object 
 16  Weapon Used Cd  327247 non-null   fl

Unnamed: 0,DR_NO,Date Rptd,DATE OCC,TIME OCC,AREA,AREA NAME,Rpt Dist No,Part 1-2,Crm Cd,Crm Cd Desc,...,Status,Status Desc,Crm Cd 1,Crm Cd 2,Crm Cd 3,Crm Cd 4,LOCATION,Cross Street,LAT,LON
count,1004991.0,1004991,1004991,1004991.0,1004991.0,1004991,1004991.0,1004991.0,1004991.0,1004991,...,1004990,1004991,1004980.0,69160.0,2314.0,64.0,1004991,154236,1004991.0,1004991.0
unique,,1896,1879,,,21,,,,140,...,6,6,,,,,66566,10413,,
top,,02/02/2023 12:00:00 AM,01/01/2020 12:00:00 AM,,,Central,,,,VEHICLE - STOLEN,...,IC,Invest Cont,,,,,800 N ALAMEDA ST,BROADWAY,,
freq,,929,1164,,,69670,,,,115190,...,802862,802862,,,,,2598,2486,,
mean,220221500.0,,,1339.9,10.69174,,1115.633,1.400348,500.1568,,...,,,499.9174,958.101258,984.01599,991.21875,,,33.99821,-118.0909
std,13197180.0,,,651.0613,6.110255,,611.1605,0.4899691,205.2731,,...,,,205.0736,110.354348,52.350982,27.06985,,,1.610713,5.582386
min,817.0,,,1.0,1.0,,101.0,1.0,110.0,,...,,,110.0,210.0,310.0,821.0,,,0.0,-118.6676
25%,210616900.0,,,900.0,5.0,,587.0,1.0,331.0,,...,,,331.0,998.0,998.0,998.0,,,34.0147,-118.4305
50%,220915900.0,,,1420.0,11.0,,1139.0,1.0,442.0,,...,,,442.0,998.0,998.0,998.0,,,34.0589,-118.3225
75%,231110300.0,,,1900.0,16.0,,1613.0,2.0,626.0,,...,,,626.0,998.0,998.0,998.0,,,34.1649,-118.2739


In [23]:
# --- 4. Handle Missing or Invalid Data ---
# Drop rows with missing coordinates
df = df.dropna(subset=['LAT', 'LON'])

# Drop duplicates if any
df = df.drop_duplicates()

# Example: Fill missing categorical data with mode
if 'Crime_Type' in df.columns:
    df['Crime_Type'].fillna(df['Crime_Type'].mode()[0], inplace=True)

# Confirm cleaning
df.isna().sum()

DR_NO                   0
Date Rptd               0
DATE OCC                0
TIME OCC                0
AREA                    0
AREA NAME               0
Rpt Dist No             0
Part 1-2                0
Crm Cd                  0
Crm Cd Desc             0
Mocodes            151619
Vict Age                0
Vict Sex           144644
Vict Descent       144656
Premis Cd              16
Premis Desc           588
Weapon Used Cd     677744
Weapon Desc        677744
Status                  1
Status Desc             0
Crm Cd 1               11
Crm Cd 2           935831
Crm Cd 3          1002677
Crm Cd 4          1004927
LOCATION                0
Cross Street       850755
LAT                     0
LON                     0
dtype: int64