In [None]:
import pandas as pd

In [None]:
# Data Reading

# Both tables live in the same workbook. Passing a list of sheet names makes
# read_excel open the file once and return a {sheet_name: DataFrame} dict,
# instead of parsing the workbook twice.
CRIME_XLSX_PATH = "../data/landing/crime/crime_by_lga_2025.xlsx"
crime_sheets = pd.read_excel(CRIME_XLSX_PATH, sheet_name=["Table 01", "Table 02"])

# Total Crime Stats
df1 = crime_sheets["Table 01"]

# Individual Crime Stats
df2 = crime_sheets["Table 02"]


In [None]:
# Row counts of the two raw sheets, displayed as a (df1_rows, df2_rows) tuple
(len(df1), len(df2))

In [None]:
# Standardize column names: trimmed, lower-case, snake_case, no commas or parentheses
df1.columns = (
    df1.columns.str.strip()
              .str.lower()
              .str.replace(" ", "_", regex=False)
              .str.replace(",", "", regex=False)   # literal comma; no regex needed
              .str.replace(r"[()]", "", regex=True)
)

# Strip whitespace from LGA names first, so a padded label such as " Total "
# is still caught by the totals filter below.
df1["local_government_area"] = df1["local_government_area"].str.strip()

# Remove totals. The result must be assigned back to df1 -- otherwise the
# "Total" rows stay in the frame and skew the mean and rank computed from it.
# .copy() makes the filtered frame independent of the raw sheet so the column
# assignment below does not trigger SettingWithCopyWarning.
df1 = df1[df1["local_government_area"].str.lower() != "total"].copy()

# Create population estimate by inverting rate = incidents / population * 100000.
# A zero rate gives NaN (0/0) or inf rather than a usable estimate.
df1["population_est"] = (df1["incidents_recorded"] / df1["rate_per_100000_population"]) * 100000

# Select useful columns (for merging later). The explicit copy keeps later
# cells free to add columns to df1 without chained-assignment warnings.
df1 = df1[[
    "year",
    "local_government_area",
    "incidents_recorded",
    "rate_per_100000_population",
    "population_est"
]].copy()

print(df1.head())

In [None]:
# Feature Engineering DF1

# Relative CI baseline: unweighted mean of the per-row rates.
# NOTE(review): the mean and the rank below are taken over every row of df1;
# if df1 holds several years per LGA, confirm that pooling years is intended.
state_avg = df1["rate_per_100000_population"].mean()

# .assign returns a new frame instead of writing into df1, which may be a
# column subset of another frame (that pattern raises SettingWithCopyWarning).
# It also makes this cell safe to re-run.
df1 = df1.assign(
    # Crime Density: incidents per estimated resident
    crime_per_person=df1["incidents_recorded"] / df1["population_est"],
    # Relative CI: 1.0 means "equal to the average rate"
    crime_index=df1["rate_per_100000_population"] / state_avg,
    # Rank Crime: rank 1 is the highest rate; ties share their average rank
    crime_rank=df1["rate_per_100000_population"].rank(ascending=False),
)

print(df1.head(10))


In [None]:
# Standardize column names: trimmed, lower-case, snake_case, no commas or parentheses
df2.columns = (
    df2.columns.str.strip()
               .str.lower()
               .str.replace(" ", "_", regex=False)
               .str.replace(",", "", regex=False)   # literal comma; no regex needed
               .str.replace(r"[()]", "", regex=True)
)

# Clean LGA names before filtering, so a padded label such as " Total " is
# still caught by the totals filter below.
df2["local_government_area"] = df2["local_government_area"].str.strip()

# Drop totals. .copy() gives an independent frame so the column assignments
# below do not write into a slice of the raw sheet (SettingWithCopyWarning).
df2 = df2[df2["local_government_area"].str.lower() != "total"].copy()

# Population estimate by inverting rate = incidents / population * 100000.
# A zero rate gives NaN (0/0) or inf rather than a usable estimate.
df2["population_est"] = (df2["incidents_recorded"] / df2["lga_rate_per_100000_population"]) * 100000

# Crime share of each offence type within LGA (per year): the row's incidents
# divided by the total incidents of its (year, LGA) group.
group_total = df2.groupby(
    ["year", "local_government_area"]
)["incidents_recorded"].transform("sum")
df2["crime_share"] = df2["incidents_recorded"] / group_total

# Add a high-level grouping: violent vs property.
# A row is "violent" when its lower-cased offence_division contains any of the
# keywords; every other row -- including rows with a missing offence_division,
# which would otherwise raise TypeError in a plain `k in x` test -- falls into
# the default "property" bucket.
# NOTE(review): these keywords look like subdivision-level labels; confirm that
# offence_division really contains them, otherwise every row is "property".
violent_keywords = ["assault", "homicide", "sexual", "robbery", "abduction"]
is_violent = df2["offence_division"].str.lower().str.contains(
    "|".join(violent_keywords), regex=True, na=False
)
df2["crime_type"] = is_violent.map({True: "violent", False: "property"})

# Keep useful columns for analysis (explicit copy so df2_clean can be modified
# later without chained-assignment warnings).
df2_clean = df2[[
    "year",
    "local_government_area",
    "offence_division",
    "offence_subdivision",
    "offence_subgroup",
    "incidents_recorded",
    "lga_rate_per_100000_population",
    "population_est",
    "crime_share",
    "crime_type"
]].copy()

print(df2_clean.head())

In [None]:
# Feature Engineering DF2

# Violent vs Property Ratios
# Total incidents per LGA for each crime type, then violent / property.
# The two Series are aligned on LGA; an LGA missing from either side yields
# NaN, which fillna turns into 0.
# NOTE: the name `property` shadows the Python builtin in this notebook; it is
# kept as-is because other cells may refer to it.
violent_mask = df2["crime_type"] == "violent"
property_mask = df2["crime_type"] == "property"

violent = (
    df2.loc[violent_mask]
       .groupby("local_government_area")["incidents_recorded"]
       .sum()
)
property = (
    df2.loc[property_mask]
       .groupby("local_government_area")["incidents_recorded"]
       .sum()
)
ratio = violent.div(property).fillna(0)

# Top Crime Type
# Largest crime_share value observed for each LGA, as a two-column frame.
top_share = (
    df2.groupby(["local_government_area"])["crime_share"]
       .max()
       .reset_index(name="top_crime_share")
)



In [None]:
# Preview
print(df2.head(10))

# Group summary: total incidents per LGA, split by high-level crime type
violent_rows = df2.loc[df2["crime_type"].eq("violent")]
property_rows = df2.loc[df2["crime_type"].eq("property")]

violent_summary = violent_rows.groupby("local_government_area")["incidents_recorded"].sum()
property_summary = property_rows.groupby("local_government_area")["incidents_recorded"].sum()

# Each heading is followed by the first rows of its summary on the next line.
print("Violent crimes by LGA:", violent_summary.head(), sep="\n")
print("Property crimes by LGA:", property_summary.head(), sep="\n")