In [13]:
import pandas as pd

In [14]:
# Data Reading

# Total Crime Stats
df1 = pd.read_excel("../data/landing/crime/crime_by_lga_2025.xlsx", sheet_name="Table 01")

# Individual Crime Stats
df2 = pd.read_excel("../data/landing/crime/crime_by_lga_2025.xlsx", sheet_name="Table 02")


In [15]:
len(df1),len(df2),

(870, 50294)

In [17]:
# Standardize column names 
df1.columns = (
    df1.columns.str.strip()
              .str.lower()
              .str.replace(" ", "_")
              .str.replace(",", "", regex=True)   # remove commas
              .str.replace(r"[()]", "", regex=True)
)

# Remove totals
df = df1[df1["local_government_area"].str.lower() != "total"]

# Strip whitespace from LGA names
df1["local_government_area"] = df1["local_government_area"].str.strip()

# Create population estimate 
df1["population_est"] = (df1["incidents_recorded"] / df1["rate_per_100000_population"]) * 100000

# Select useful columns (for merging later)
df1 = df1[[
    "year", 
    "local_government_area", 
    "incidents_recorded", 
    "rate_per_100000_population", 
    "population_est"
]]

print(df1.head())

   year local_government_area  incidents_recorded  rate_per_100000_population  \
0  2025               Banyule                8086                 6082.275801   
1  2025              Brimbank               14369                 7214.154149   
2  2025               Darebin               15001                 9271.641976   
3  2025           Hobsons Bay                6289                 6491.469850   
4  2025                  Hume               17321                 6180.821302   

   population_est  
0    132943.65900  
1    199177.88980  
2    161794.42690  
3     96880.98605  
4    280237.83820  


In [28]:
# Feature Engineering DF1

# Crime Density
df1["crime_per_person"] = df1["incidents_recorded"] / df1["population_est"]

# Relative CI
state_avg = df1["rate_per_100000_population"].mean()
df1["crime_index"] = df1["rate_per_100000_population"] / state_avg

# Rank Crime
df1["crime_rank"] = df1["rate_per_100000_population"].rank(ascending=False)

print(df1.head(10))


   year local_government_area  incidents_recorded  rate_per_100000_population  \
0  2025               Banyule                8086                 6082.275801   
1  2025              Brimbank               14369                 7214.154149   
2  2025               Darebin               15001                 9271.641976   
3  2025           Hobsons Bay                6289                 6491.469850   
4  2025                  Hume               17321                 6180.821302   
5  2025           Maribyrnong                9817                10299.008117   
6  2025             Melbourne               34620                17792.112910   
7  2025                Melton               11912                 5103.404254   
8  2025             Merri-bek               12563                 6654.135828   
9  2025         Moonee Valley                8049                 6137.601063   

   population_est  crime_per_person  crime_index  crime_rank  
0    132943.65900          0.060823     1.079

In [None]:
# Standardize column names
df2.columns = (
    df2.columns.str.strip()
               .str.lower()
               .str.replace(" ", "_")
               .str.replace(",", "", regex=True)
               .str.replace(r"[()]", "", regex=True)
)

# Drop totals
df2 = df2[df2["local_government_area"].str.lower() != "total"]

# Clean LGA names
df2["local_government_area"] = df2["local_government_area"].str.strip()

# Population estimate
df2["population_est"] = (df2["incidents_recorded"] / df2["lga_rate_per_100000_population"]) * 100000

# Crime share of each offence type within LGA
df2["crime_share"] = df2["incidents_recorded"] / df2.groupby(
    ["year", "local_government_area"]
)["incidents_recorded"].transform("sum")

# Add a high-level grouping: violent vs property 
violent_keywords = ["assault", "homicide", "sexual", "robbery", "abduction"]
df2["crime_type"] = df2["offence_division"].str.lower().apply(
    lambda x: "violent" if any(k in x for k in violent_keywords) else "property"
)

# Keep useful columns for analysis
df2_clean = df2[[
    "year", 
    "local_government_area", 
    "offence_division", 
    "offence_subdivision", 
    "offence_subgroup", 
    "incidents_recorded", 
    "lga_rate_per_100000_population", 
    "population_est", 
    "crime_share", 
    "crime_type"
]]

print(df2_clean.head())

   year local_government_area             offence_division  \
0  2025              Ballarat  A Crimes against the person   
1  2025              Ballarat  A Crimes against the person   
2  2025              Ballarat  A Crimes against the person   
3  2025              Ballarat  A Crimes against the person   
4  2025              Ballarat  A Crimes against the person   

                 offence_subdivision  \
0  A10 Homicide and related offences   
1   A20 Assault and related offences   
2   A20 Assault and related offences   
3   A20 Assault and related offences   
4   A20 Assault and related offences   

                                    offence_subgroup  incidents_recorded  \
0                  A10 Homicide and related offences                   8   
1                            A211 FV Serious assault                 221   
2                        A212 Non-FV Serious assault                 184   
3  A22 Assault police, emergency services or othe...                  72   
4     

In [24]:
# Feature Engineering DF2

# Violent vs Property Ratios
violent = df2[df2["crime_type"]=="violent"].groupby("local_government_area")["incidents_recorded"].sum()
property = df2[df2["crime_type"]=="property"].groupby("local_government_area")["incidents_recorded"].sum()
ratio = (violent / property).fillna(0)

# Top Crime Type
top_share = df2.groupby(["local_government_area"])["crime_share"].max().reset_index(name="top_crime_share")



In [26]:
# Preview
print(df2.head(10))

# Group summary
violent_summary = df2[df2["crime_type"]=="violent"].groupby("local_government_area")["incidents_recorded"].sum()
property_summary = df2[df2["crime_type"]=="property"].groupby("local_government_area")["incidents_recorded"].sum()

print("Violent crimes by LGA:")
print(violent_summary.head())

print("Property crimes by LGA:")
print(property_summary.head())

   year year_ending police_service_area local_government_area  \
0  2025       March            Ballarat              Ballarat   
1  2025       March            Ballarat              Ballarat   
2  2025       March            Ballarat              Ballarat   
3  2025       March            Ballarat              Ballarat   
4  2025       March            Ballarat              Ballarat   
5  2025       March            Ballarat              Ballarat   
6  2025       March            Ballarat              Ballarat   
7  2025       March            Ballarat              Ballarat   
8  2025       March            Ballarat              Ballarat   
9  2025       March            Ballarat              Ballarat   

              offence_division                 offence_subdivision  \
0  A Crimes against the person   A10 Homicide and related offences   
1  A Crimes against the person    A20 Assault and related offences   
2  A Crimes against the person    A20 Assault and related offences   
3  A