# Crime Data Preprocess

Preprocess the raw crime data into a curated dataset.

# Import packages

In [15]:
import pandas as pd
import numpy as np
import re

# Read in the data


In [16]:
# Load the raw crime extract and preview its first rows.
raw_crime_path = "../../data/raw/crime.csv"
df = pd.read_csv(raw_crime_path)
df.head()

Unnamed: 0.1,Unnamed: 0,Year,Year ending,Postcode,Suburb/Town Name,Offence Division,Offence Count
0,0,2023,March,3691,Dederang,A Crimes against the person,1
1,1,2023,March,3691,Dederang,A Crimes against the person,1
2,2,2023,March,3691,Dederang,B Property and deception offences,2
3,3,2023,March,3691,Dederang,B Property and deception offences,1
4,4,2023,March,3691,Dederang,D Public order and security offences,1


In [17]:
# Remove the stray 'Unnamed: 0' column (presumably an index written out by an
# earlier to_csv call -- its values mirror the row index in the preview above).
# It is dropped by name: the previous version checked for that name but removed
# whichever column happened to be first, which would discard real data if the
# column order ever changed. errors='ignore' keeps this cell a harmless no-op
# when the column is absent, e.g. when the cell is run a second time.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')
df.head()

Unnamed: 0,Year,Year ending,Postcode,Suburb/Town Name,Offence Division,Offence Count
0,2023,March,3691,Dederang,A Crimes against the person,1
1,2023,March,3691,Dederang,A Crimes against the person,1
2,2023,March,3691,Dederang,B Property and deception offences,2
3,2023,March,3691,Dederang,B Property and deception offences,1
4,2023,March,3691,Dederang,D Public order and security offences,1


# Map year-ending month names to month numbers

In [18]:
# Month names in calendar order; a name's position in this list is its month number.
month_names = [
    "January", "February", "March", "April", "May", "June",
    "July", "August", "September", "October", "November", "December",
]

# Lookup from month name to a zero-padded two-digit string, e.g. "March" -> "03".
month_to_number = {
    name: f"{position:02d}" for position, name in enumerate(month_names, start=1)
}

# Replace each month name in 'Year ending' with its number string; values that
# are not keys of the lookup come back as NaN from Series.map.
df['Year ending'] = df['Year ending'].map(month_to_number)

df.head()

Unnamed: 0,Year,Year ending,Postcode,Suburb/Town Name,Offence Division,Offence Count
0,2023,3,3691,Dederang,A Crimes against the person,1
1,2023,3,3691,Dederang,A Crimes against the person,1
2,2023,3,3691,Dederang,B Property and deception offences,2
3,2023,3,3691,Dederang,B Property and deception offences,1
4,2023,3,3691,Dederang,D Public order and security offences,1


# Merge year ending and year

In [19]:
# Build a combined "<month>/<year>" label (e.g. "03/2023") from the month
# string and the year, then swap it in for the two source columns.
year_ending_label = df['Year ending'] + '/' + df['Year'].astype(str)
df = (
    df
    .assign(**{'Year Ending': year_ending_label})
    .drop(columns=['Year', 'Year ending'])
)

df.head()

Unnamed: 0,Postcode,Suburb/Town Name,Offence Division,Offence Count,Year Ending
0,3691,Dederang,A Crimes against the person,1,03/2023
1,3691,Dederang,A Crimes against the person,1,03/2023
2,3691,Dederang,B Property and deception offences,2,03/2023
3,3691,Dederang,B Property and deception offences,1,03/2023
4,3691,Dederang,D Public order and security offences,1,03/2023


In [20]:
# Column layout for the rest of the notebook: period first, then location,
# then the offence details.
desired_order = [
    'Year Ending',
    'Postcode',
    'Suburb/Town Name',
    'Offence Division',
    'Offence Count',
]

df = df.loc[:, desired_order]

# Preview the frame to confirm the new column order.
df.head()

Unnamed: 0,Year Ending,Postcode,Suburb/Town Name,Offence Division,Offence Count
0,03/2023,3691,Dederang,A Crimes against the person,1
1,03/2023,3691,Dederang,A Crimes against the person,1
2,03/2023,3691,Dederang,B Property and deception offences,2
3,03/2023,3691,Dederang,B Property and deception offences,1
4,03/2023,3691,Dederang,D Public order and security offences,1


In [21]:
# Normalise the headers: lower-case every label and turn spaces into underscores.
df = df.rename(columns=lambda label: label.lower().replace(" ", "_"))

df.head()

Unnamed: 0,year_ending,postcode,suburb/town_name,offence_division,offence_count
0,03/2023,3691,Dederang,A Crimes against the person,1
1,03/2023,3691,Dederang,A Crimes against the person,1
2,03/2023,3691,Dederang,B Property and deception offences,2
3,03/2023,3691,Dederang,B Property and deception offences,1
4,03/2023,3691,Dederang,D Public order and security offences,1


# Group by offence division

In [22]:
# Total offence count per offence division across the whole dataset.
grouped_data = (
    df
    .groupby('offence_division', as_index=False)['offence_count']
    .sum()
)

grouped_data

Unnamed: 0,offence_division,offence_count
0,A Crimes against the person,774173
1,B Property and deception offences,2824773
2,C Drug offences,307426
3,D Public order and security offences,339078
4,E Justice procedures offences,702987
5,F Other offences,62762


In [23]:
# Offence divisions expected in the data (letter prefix + description).
# NOTE(review): this list is hard-coded; a division that appears in `df` but is
# missing here is silently dropped by the left merge below.
all_offence_divisions = [
    "A Crimes against the person",
    "B Property and deception offences",
    "C Drug offences",
    "D Public order and security offences",
    "E Justice procedures offences",
    "F Other offences"
]

# Keys that identify one cell of the suburb x period x division grid.
key_columns = ['suburb/town_name', 'year_ending', 'offence_division']

# Every combination of suburb/town_name, year_ending and offence_division,
# including combinations that never occur in the raw data.
multi_index = pd.MultiIndex.from_product(
    [df['suburb/town_name'].unique(), df['year_ending'].unique(), all_offence_divisions],
    names=key_columns
)

# Observed totals for the combinations that do occur.
grouped_data = df.groupby(key_columns).agg({'offence_count': 'sum'}).reset_index()

# Left-merge the observed totals onto the full grid so absent combinations are kept.
all_combinations_df = pd.DataFrame(index=multi_index).reset_index()
merged_data = pd.merge(
    all_combinations_df,
    grouped_data,
    on=key_columns,
    how='left'
)

# Combinations with no recorded offences come out of the merge as NaN; count
# them as 0. The result is assigned back to the column instead of calling
# fillna(inplace=True) on the selected column: that chained in-place form
# operates on an intermediate object, is flagged by recent pandas releases and
# has no effect on the parent frame under Copy-on-Write.
merged_data['offence_count'] = merged_data['offence_count'].fillna(0)

# Add the postcode column.
# NOTE(review): this assumes one postcode per suburb/town name. A name that
# occurs under several postcodes keeps only the first postcode seen here, while
# its counts above are summed across all of them -- confirm this is intended.
postcode_map = df.drop_duplicates(subset='suburb/town_name').set_index('suburb/town_name')['postcode'].to_dict()
merged_data['postcode'] = merged_data['suburb/town_name'].map(postcode_map)

# Rearrange the columns. .copy() makes `data` an independent frame so that
# adding columns to it later does not raise SettingWithCopyWarning.
data = merged_data[['year_ending', 'postcode', 'suburb/town_name', 'offence_division', 'offence_count']].copy()

data


Unnamed: 0,year_ending,postcode,suburb/town_name,offence_division,offence_count
0,03/2023,3691,Dederang,A Crimes against the person,2.0
1,03/2023,3691,Dederang,B Property and deception offences,3.0
2,03/2023,3691,Dederang,C Drug offences,0.0
3,03/2023,3691,Dederang,D Public order and security offences,1.0
4,03/2023,3691,Dederang,E Justice procedures offences,0.0
...,...,...,...,...,...
170995,03/2014,3315,Tarrenlea,B Property and deception offences,1.0
170996,03/2014,3315,Tarrenlea,C Drug offences,0.0
170997,03/2014,3315,Tarrenlea,D Public order and security offences,0.0
170998,03/2014,3315,Tarrenlea,E Justice procedures offences,0.0


## Pivot offence_division into columns

In [24]:
# Derive a one-letter code from the first word character of 'offence_division'
# (e.g. "A Crimes against the person" -> "A").
# assign() returns a new frame, so nothing is written onto a slice of another
# DataFrame -- the direct column assignment used previously triggered
# SettingWithCopyWarning. expand=False makes str.extract return a Series
# rather than a one-column DataFrame.
data = data.assign(
    offence_code=data['offence_division'].str.extract(r'(\w)', expand=False)
)

# Reshape to one row per (year_ending, postcode, suburb/town_name) with one
# column per offence code, summing the counts that land in each cell.
final_data = data.pivot_table(index=['year_ending', 'postcode', 'suburb/town_name'],
                          columns='offence_code',
                          values='offence_count',
                          aggfunc='sum').reset_index()

# Prefix the pivoted code columns with 'offence_' (offence_A, offence_B, ...).
final_data.columns = ['year_ending', 'postcode', 'suburb/town_name'] + [f'offence_{col}' for col in final_data.columns[3:]]

# Any cell left empty by the pivot is treated as zero offences. Reassigning the
# result avoids inplace=True.
final_data = final_data.fillna(0)
final_data.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['offence_code'] = data['offence_division'].str.extract(r'(\w)')


Unnamed: 0,year_ending,postcode,suburb/town_name,offence_A,offence_B,offence_C,offence_D,offence_E,offence_F
0,03/2014,3000,Carlton,232.0,1207.0,123.0,125.0,70.0,4.0
1,03/2014,3000,Melbourne,2238.0,11516.0,1296.0,4108.0,2369.0,75.0
2,03/2014,3002,East Melbourne,123.0,855.0,32.0,190.0,24.0,4.0
3,03/2014,3003,West Melbourne,68.0,410.0,44.0,131.0,24.0,56.0
4,03/2014,3004,Windsor,95.0,566.0,37.0,120.0,24.0,2.0


## Only include data from 2015 onward

In [25]:
# The year is the part of 'year_ending' after the '/'; store it as an integer
# helper column on final_data.
year_part = final_data['year_ending'].str.split('/').str[1]
final_data['year'] = year_part.astype(int)

# Keep rows from 2015 onward and leave the helper column out of the result.
is_2015_or_later = final_data['year'] >= 2015
final_final = final_data.loc[is_2015_or_later].drop(columns=['year'])
final_final.head()

Unnamed: 0,year_ending,postcode,suburb/town_name,offence_A,offence_B,offence_C,offence_D,offence_E,offence_F
2850,03/2015,3000,Carlton,183.0,1347.0,113.0,125.0,62.0,1.0
2851,03/2015,3000,Melbourne,2526.0,11125.0,1150.0,3701.0,3222.0,51.0
2852,03/2015,3002,East Melbourne,106.0,480.0,53.0,147.0,28.0,12.0
2853,03/2015,3003,West Melbourne,80.0,384.0,51.0,105.0,34.0,6.0
2854,03/2015,3004,Windsor,79.0,593.0,55.0,101.0,26.0,3.0


## Change suburb/town_name to lowercase

In [26]:
# Lower-case every suburb/town name; the column keeps its position in the frame.
suburb_lower = final_final['suburb/town_name'].str.lower()
final_final = final_final.assign(**{'suburb/town_name': suburb_lower})
final_final.head()

Unnamed: 0,year_ending,postcode,suburb/town_name,offence_A,offence_B,offence_C,offence_D,offence_E,offence_F
2850,03/2015,3000,carlton,183.0,1347.0,113.0,125.0,62.0,1.0
2851,03/2015,3000,melbourne,2526.0,11125.0,1150.0,3701.0,3222.0,51.0
2852,03/2015,3002,east melbourne,106.0,480.0,53.0,147.0,28.0,12.0
2853,03/2015,3003,west melbourne,80.0,384.0,51.0,105.0,34.0,6.0
2854,03/2015,3004,windsor,79.0,593.0,55.0,101.0,26.0,3.0


## Save the data

In [27]:
# Write the curated table to disk without the DataFrame index.
curated_crime_path = '../../data/curated/curated_crime.csv'
final_final.to_csv(curated_crime_path, index=False)