In [14]:
# Imports & Paths
import pandas as pd
import numpy as np
from pathlib import Path
import os

# print("Current Working Directory:", os.getcwd())

BASE_DIR = Path(os.getcwd()).resolve().parent
RAW_DIR = BASE_DIR / "data" / "raw"

CRIME_DIR = RAW_DIR / "crime_data"
EDU_DIR = RAW_DIR / "education_data"

# print("BASE_DIR:", BASE_DIR)
# print("RAW_DIR:", RAW_DIR)
# print("CRIME_DIR:", CRIME_DIR)
# print("EDU_DIR:", EDU_DIR)

In [15]:
# Load Crime Data
crime_path = CRIME_DIR / "estimated_crimes_1979_2024.csv"

# print("Reading crime file from:", crime_path)

crime_df = pd.read_csv(crime_path)

print("FBI Crime Data Loaded Successfully!")
print("Shape:", crime_df.shape)

crime_df.head()

FBI Crime Data Loaded Successfully!
Shape: (2388, 15)


Unnamed: 0,year,state_abbr,state_name,population,violent_crime,homicide,rape_legacy,rape_revised,robbery,aggravated_assault,property_crime,burglary,larceny,motor_vehicle_theft,caveats
0,1979,,,220099000,1208030,21460,76390.0,,480700,629480,11041500,3327700,6601000,1112800,
1,1979,AK,Alaska,406000,1994,54,292.0,,445,1203,23193,5616,15076,2501,
2,1979,AL,Alabama,3769000,15578,496,1037.0,,4127,9918,144372,48517,83791,12064,
3,1979,AR,Arkansas,2180000,7984,198,595.0,,1626,5565,70949,21457,45267,4225,
4,1979,AZ,Arizona,2450000,14528,219,1120.0,,4305,8884,177977,48916,116976,12085,


In [16]:
# Crime Data Structure Profiling
print("=== Crime Data Columns ===")
print(list(crime_df.columns))

print("\n=== Year Range ===")
print(crime_df["year"].min(), "→", crime_df["year"].max())

print("\n=== Number of Unique States ===")
print(crime_df["state_name"].nunique())

print("\n=== First 20 Unique States ===")
print(crime_df["state_name"].dropna().unique()[:20])

print("\n=== Missing State Name Count ===")
print(crime_df["state_name"].isna().sum())

=== Crime Data Columns ===
['year', 'state_abbr', 'state_name', 'population', 'violent_crime', 'homicide', 'rape_legacy', 'rape_revised', 'robbery', 'aggravated_assault', 'property_crime', 'burglary', 'larceny', 'motor_vehicle_theft', 'caveats']

=== Year Range ===
1979 → 2024

=== Number of Unique States ===
103

=== First 20 Unique States ===
['Alaska' 'Alabama' 'Arkansas' 'Arizona' 'California' 'Colorado'
 'Connecticut' 'District of Columbia' 'Delaware' 'Florida' 'Georgia'
 'Hawaii' 'Iowa' 'Idaho' 'Illinois' 'Indiana' 'Kansas' 'Kentucky'
 'Louisiana' 'Massachusetts']

=== Missing State Name Count ===
38


### Crime Data Profiling Summary

- The FBI crime dataset contains **15 columns**, including year, state identifiers, population, and major crime categories (violent crime, homicide, robbery, burglary, larceny, etc.).
- The dataset spans a broad temporal range from **1979 to 2024**, which fully covers the analysis period (2015–2024).
- There are **103 unique values** in `state_name`, exceeding the expected 51 (50 states + DC). This indicates the presence of territories, duplicated entries, or summary rows.
- The first 20 state names appear valid and correctly formatted.
- There are **38 rows with missing `state_name`**, which likely correspond to improperly reported or aggregate entries and will need to be removed during cleaning.

In [17]:
# Missing Values Overview
print("=== Missing Value Count by Column (Top 20) ===")
missing = crime_df.isna().sum().sort_values(ascending=False)
print(missing.head(20))

print("\n=== Percentage Missing by Column ===")
missing_pct = (crime_df.isna().mean() * 100).round(2)
print(missing_pct.sort_values(ascending=False).head(20))

=== Missing Value Count by Column (Top 20) ===
caveats                2316
rape_revised           1768
rape_legacy             412
state_abbr               42
state_name               38
year                      0
population                0
violent_crime             0
homicide                  0
robbery                   0
aggravated_assault        0
property_crime            0
burglary                  0
larceny                   0
motor_vehicle_theft       0
dtype: int64

=== Percentage Missing by Column ===
caveats                96.98
rape_revised           74.04
rape_legacy            17.25
state_abbr              1.76
state_name              1.59
year                    0.00
population              0.00
violent_crime           0.00
homicide                0.00
robbery                 0.00
aggravated_assault      0.00
property_crime          0.00
burglary                0.00
larceny                 0.00
motor_vehicle_theft     0.00
dtype: float64


### Crime Data Missing Value Summary

- The `caveats` column has **96.98% missing values**, indicating it is not useful for analysis and should be removed.
- `rape_revised` shows **74.04% missing**, and `rape_legacy` has **17.25% missing**, reflecting changes in FBI reporting definitions. These fields are inconsistent across years and will not be used as primary variables.
- `state_abbr` (1.76% missing) and `state_name` (1.59% missing) contain a small number of missing entries, likely corresponding to summary or invalid rows that should be removed.
- Core analytical columns (`violent_crime`, `property_crime`, `homicide`, `robbery`, `burglary`, `larceny`, `motor_vehicle_theft`) have **0% missing values**, which supports reliable downstream analysis.


### Crime Data Cleaning Plan

Based on the profiling results, the following cleaning steps will be applied to the FBI crime dataset:

1. **Remove the `caveats` column**  
   - Contains ~97% missing values and provides no analytical value.

2. **Ignore `rape_legacy` and `rape_revised` fields**  
   - Missing values occur due to changes in FBI reporting definitions.
   - Analysis will rely on `violent_crime` instead of individual rape metrics.

3. **Remove rows with missing `state_name`**  
   - 38 rows correspond to summary or invalid entries and should be dropped.

4. **Filter the dataset to the analysis period (2015–2024)**  
   - Ensures temporal consistency with the education data.

5. **Standardize column names**  
   - Example: `state_name` → `state`, `year` → `year` (clean lowercase naming).

6. **Remove non-state territories**  
   - Exclude entries for territories such as Guam or Puerto Rico to keep only the 50 states + DC.



In [18]:
crime_clean = crime_df.copy()

print("Initial shape:", crime_clean.shape)
crime_clean.head()

Initial shape: (2388, 15)


Unnamed: 0,year,state_abbr,state_name,population,violent_crime,homicide,rape_legacy,rape_revised,robbery,aggravated_assault,property_crime,burglary,larceny,motor_vehicle_theft,caveats
0,1979,,,220099000,1208030,21460,76390.0,,480700,629480,11041500,3327700,6601000,1112800,
1,1979,AK,Alaska,406000,1994,54,292.0,,445,1203,23193,5616,15076,2501,
2,1979,AL,Alabama,3769000,15578,496,1037.0,,4127,9918,144372,48517,83791,12064,
3,1979,AR,Arkansas,2180000,7984,198,595.0,,1626,5565,70949,21457,45267,4225,
4,1979,AZ,Arizona,2450000,14528,219,1120.0,,4305,8884,177977,48916,116976,12085,


In [19]:
# Drop highly missing column 'caveats'

if "caveats" in crime_clean.columns:
    crime_clean = crime_clean.drop(columns=["caveats"])
    print("Dropped column: 'caveats'")

print("Shape after dropping caveats:", crime_clean.shape)
crime_clean.head()

Dropped column: 'caveats'
Shape after dropping caveats: (2388, 14)


Unnamed: 0,year,state_abbr,state_name,population,violent_crime,homicide,rape_legacy,rape_revised,robbery,aggravated_assault,property_crime,burglary,larceny,motor_vehicle_theft
0,1979,,,220099000,1208030,21460,76390.0,,480700,629480,11041500,3327700,6601000,1112800
1,1979,AK,Alaska,406000,1994,54,292.0,,445,1203,23193,5616,15076,2501
2,1979,AL,Alabama,3769000,15578,496,1037.0,,4127,9918,144372,48517,83791,12064
3,1979,AR,Arkansas,2180000,7984,198,595.0,,1626,5565,70949,21457,45267,4225
4,1979,AZ,Arizona,2450000,14528,219,1120.0,,4305,8884,177977,48916,116976,12085


In [20]:
# Drop rows where state_name is missing

missing_state_before = crime_clean['state_name'].isna().sum()
print(f"Missing state_name before cleaning: {missing_state_before}")

crime_clean = crime_clean.dropna(subset=['state_name'])

missing_state_after = crime_clean['state_name'].isna().sum()
print(f"Missing state_name after cleaning: {missing_state_after}")

print("Shape after dropping missing state_name rows:", crime_clean.shape)
crime_clean.head()

Missing state_name before cleaning: 38
Missing state_name after cleaning: 0
Shape after dropping missing state_name rows: (2350, 14)


Unnamed: 0,year,state_abbr,state_name,population,violent_crime,homicide,rape_legacy,rape_revised,robbery,aggravated_assault,property_crime,burglary,larceny,motor_vehicle_theft
1,1979,AK,Alaska,406000,1994,54,292.0,,445,1203,23193,5616,15076,2501
2,1979,AL,Alabama,3769000,15578,496,1037.0,,4127,9918,144372,48517,83791,12064
3,1979,AR,Arkansas,2180000,7984,198,595.0,,1626,5565,70949,21457,45267,4225
4,1979,AZ,Arizona,2450000,14528,219,1120.0,,4305,8884,177977,48916,116976,12085
5,1979,CA,California,22696000,184087,2952,12239.0,,75767,93129,1511021,496310,847148,167563


In [21]:
# Keep only 50 states + DC

valid_states = [
    "Alabama","Alaska","Arizona","Arkansas","California","Colorado","Connecticut",
    "Delaware","District of Columbia","Florida","Georgia","Hawaii","Idaho","Illinois",
    "Indiana","Iowa","Kansas","Kentucky","Louisiana","Maine","Maryland","Massachusetts",
    "Michigan","Minnesota","Mississippi","Missouri","Montana","Nebraska","Nevada",
    "New Hampshire","New Jersey","New Mexico","New York","North Carolina","North Dakota",
    "Ohio","Oklahoma","Oregon","Pennsylvania","Rhode Island","South Carolina",
    "South Dakota","Tennessee","Texas","Utah","Vermont","Virginia","Washington",
    "West Virginia","Wisconsin","Wyoming"
]

before = crime_clean.shape[0]

crime_clean = crime_clean[crime_clean["state_name"].isin(valid_states)]

after = crime_clean.shape[0]

print(f"Rows before filtering states: {before}")
print(f"Rows after filtering states: {after}")
print(f"Removed rows: {before - after}")

crime_clean.head()

Rows before filtering states: 2350
Rows after filtering states: 2193
Removed rows: 157


Unnamed: 0,year,state_abbr,state_name,population,violent_crime,homicide,rape_legacy,rape_revised,robbery,aggravated_assault,property_crime,burglary,larceny,motor_vehicle_theft
1,1979,AK,Alaska,406000,1994,54,292.0,,445,1203,23193,5616,15076,2501
2,1979,AL,Alabama,3769000,15578,496,1037.0,,4127,9918,144372,48517,83791,12064
3,1979,AR,Arkansas,2180000,7984,198,595.0,,1626,5565,70949,21457,45267,4225
4,1979,AZ,Arizona,2450000,14528,219,1120.0,,4305,8884,177977,48916,116976,12085
5,1979,CA,California,22696000,184087,2952,12239.0,,75767,93129,1511021,496310,847148,167563


In [22]:
# Filter years between 2015 and 2024
before = crime_clean.shape[0]

crime_clean = crime_clean[(crime_clean['year'] >= 2015) & (crime_clean['year'] <= 2024)]

after = crime_clean.shape[0]

print(f"Rows before year filtering: {before}")
print(f"Rows after year filtering:  {after}")
print(f"Removed: {before - after}")

Rows before year filtering: 2193
Rows after year filtering:  357
Removed: 1836


In [23]:
# Rename columns for consistency
crime_clean = crime_clean.rename(columns={
    "state_name": "state",
    "state_abbr": "state_abbr",
    "violent_crime": "violent_crime",
})

# Drop unreliable rape columns
crime_clean = crime_clean.drop(columns=["rape_legacy", "rape_revised"], errors="ignore")

print("Shape after column rename & dropping rape columns:", crime_clean.shape)
crime_clean.head()

Shape after column rename & dropping rape columns: (357, 12)


Unnamed: 0,year,state_abbr,state,population,violent_crime,homicide,robbery,aggravated_assault,property_crime,burglary,larceny,motor_vehicle_theft
1873,2015,AK,Alaska,737709,5391,59,761,3671,20806,3511,15249,2046
1874,2015,AL,Alabama,4853875,22957,348,4612,15960,144785,35265,99182,10338
1875,2015,AR,Arkansas,2977853,15769,189,2117,11455,97391,22851,68720,5820
1876,2015,AZ,Arizona,6817565,27968,306,6360,18193,207184,38010,152388,16786
1877,2015,CA,California,38993940,166883,1861,52862,99349,1024914,197404,656517,170993


In [24]:
numeric_cols = [
    "population",
    "violent_crime",
    "homicide",
    "robbery",
    "aggravated_assault",
    "property_crime",
    "burglary",
    "larceny",
    "motor_vehicle_theft"
]

for col in numeric_cols:
    crime_clean[col] = pd.to_numeric(crime_clean[col], errors="coerce")

crime_clean[numeric_cols].dtypes

# Create per-capita crime metrics (per 100k population)
crime_clean["violent_crime_rate"] = (
    crime_clean["violent_crime"] / crime_clean["population"] * 100000
)

crime_clean["property_crime_rate"] = (
    crime_clean["property_crime"] / crime_clean["population"] * 100000
)

crime_clean["homicide_rate"] = (
    crime_clean["homicide"] / crime_clean["population"] * 100000
)

crime_clean["robbery_rate"] = (
    crime_clean["robbery"] / crime_clean["population"] * 100000
)

crime_clean["aggravated_assault_rate"] = (
    crime_clean["aggravated_assault"] / crime_clean["population"] * 100000
)

print("Shape after adding rate columns:", crime_clean.shape)
crime_clean.head()

Shape after adding rate columns: (357, 17)


Unnamed: 0,year,state_abbr,state,population,violent_crime,homicide,robbery,aggravated_assault,property_crime,burglary,larceny,motor_vehicle_theft,violent_crime_rate,property_crime_rate,homicide_rate,robbery_rate,aggravated_assault_rate
1873,2015,AK,Alaska,737709,5391,59,761,3671,20806,3511,15249,2046,730.775956,2820.353283,7.997734,103.157207,497.621691
1874,2015,AL,Alabama,4853875,22957,348,4612,15960,144785,35265,99182,10338,472.962324,2982.874507,7.169529,95.016868,328.809456
1875,2015,AR,Arkansas,2977853,15769,189,2117,11455,97391,22851,68720,5820,529.542593,3270.510667,6.346855,71.091488,384.673119
1876,2015,AZ,Arizona,6817565,27968,306,6360,18193,207184,38010,152388,16786,410.234446,3038.973592,4.488406,93.288439,266.854808
1877,2015,CA,California,38993940,166883,1861,52862,99349,1024914,197404,656517,170993,427.971628,2628.393027,4.772536,135.564654,254.780615


In [25]:
from pathlib import Path

cleaned_dir = BASE_DIR / "data" / "cleaned"
cleaned_dir.mkdir(parents=True, exist_ok=True)

crime_output_path = cleaned_dir / "crime_cleaned.csv"

crime_clean.to_csv(crime_output_path, index=False)

print("Crime cleaned dataset saved to:")
print(crime_output_path)

Crime cleaned dataset saved to:
/Users/lyt/Desktop/is477-course_project-deepsick/data/cleaned/crime_cleaned.csv
