In [None]:
%pip install mlxtend
import pandas as pd
import numpy as np
import mlxtend
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

In [2]:
# Read the mining dataset produced by the ETL step
df = pd.read_csv('fatalities_mining_dataset.csv')

# fatality_id (one per victim) and crash_id (one per crash) are pure
# identifiers, so they carry no pattern information for association rule
# mining and are dropped before any further processing.
id_columns = ['fatality_id', 'crash_id']
new_df = df.drop(columns=id_columns)

  df = pd.read_csv('fatalities_mining_dataset.csv')


### Data Binning

In [3]:
# Define speed limit categories based on common speed zones in Australia
def bin_speed(speed):
    """Map a numeric speed limit to a categorical speed level.

    Parameters
    ----------
    speed : int or float
        Speed limit value; may be missing (NaN or None).

    Returns
    -------
    str or float
        One of 'Very Low', 'Low', 'Medium', 'High' or 'Very High'.
        Returns NaN when the speed limit is missing.
    """
    # Every comparison against NaN evaluates to False, so without this guard
    # a missing speed limit would fall through all branches and be silently
    # labelled 'Very High'. Returning NaN keeps the value marked as missing
    # so it can be excluded by a later dropna().
    if pd.isna(speed):
        return np.nan

    if speed <= 30:
        return 'Very Low'    # School zones, residential areas
    elif speed <= 50:
        return 'Low'         # Urban areas, local streets
    elif speed <= 70:
        return 'Medium'      # Suburban roads
    elif speed <= 90:
        return 'High'        # Rural roads
    else:
        return 'Very High'   # Highways, freeways

# Derive the categorical speed_level feature from the numeric speed_limit
new_df['speed_level'] = new_df['speed_limit'].map(bin_speed)

In this project, we selected a meaningful subset of attributes from the preprocessed dataset to perform association rule mining. The goal is to identify interpretable and relevant patterns that can help understand the relationship between contextual factors and the type of road user involved in fatal crashes.

### Selection Criteria
The selected attributes satisfy the following conditions:

Categorical Nature: Association rule mining requires attributes to be categorical. All selected features are either originally categorical or have been derived from continuous variables via binning (e.g., speed_limit → speed_level).

Interpretability: Each attribute is human-understandable and can be directly used in explaining patterns or policy recommendations (e.g., "female drivers are more vulnerable at night").

Relevance to Road User Risk: Chosen features are strongly related to road context, temporal conditions, demographics, and vehicle/environment characteristics.

In [4]:
# Attributes whose values become the items of each mining transaction
selected_columns = [
    # Demographics
    'gender',
    'age_group',
    # Location
    'state',
    # Temporal context
    'day_of_week',
    'time_of_day',
    'season',
    # Road classification
    'national_road_type',
    # Holiday periods
    'christmas_period',
    'easter_period',
    # NOTE(review): presumably a banded share of population aged 65+ — confirm against the ETL
    'abs_pct_65_plus_group_2023',
    # Binned speed limit produced by bin_speed
    'speed_level',
    # Target variable for RHS of rules
    'road_user',
]

# Take an independent copy of just the selected attributes, so later
# cleaning steps do not touch new_df
sub_data = new_df.loc[:, selected_columns].copy()

In [5]:
# Show the value distribution of every selected attribute, counting NaN as
# its own category so missing data is visible before cleaning
separator = '-' * 40
for column_name in sub_data.columns:
    column_counts = sub_data[column_name].value_counts(dropna=False)
    print(column_name)
    print(column_counts)
    print(separator)

gender
gender
Male      40808
Female    16001
NaN          31
Name: count, dtype: int64
----------------------------------------
age_group
age_group
40_to_64       14637
17_to_25       14537
26_to_39       13235
75_or_older     5611
65_to_74        4442
0_to_16         4279
NaN               99
Name: count, dtype: int64
----------------------------------------
state
state
NSW    17326
VIC    12423
QLD    11438
WA      6842
SA      4853
NT      1772
TAS     1677
ACT      509
Name: count, dtype: int64
----------------------------------------
day_of_week
day_of_week
Weekday    33492
Weekend    23336
NaN           12
Name: count, dtype: int64
----------------------------------------
time_of_day
time_of_day
Day      32376
Night    24421
NaN         43
Name: count, dtype: int64
----------------------------------------
season
season
Spring    14417
Autumn    14341
Summer    14044
Winter    14038
Name: count, dtype: int64
----------------------------------------
national_road_type
national_roa

In [6]:
# Check number of missing values in each column.
# The counts are printed explicitly: a notebook only displays a bare
# expression when it is the last statement of the cell, so without print()
# this check would be computed and silently discarded.
print(sub_data.isnull().sum())

# Remove rows with any missing values
# This ensures complete cases for pattern mining
rows_before = len(sub_data)
sub_data = sub_data.dropna()
print(f"Dropped {rows_before - len(sub_data)} of {rows_before} rows containing missing values")

### Data transformation

In [7]:
# Convert all values to string type for consistent encoding
sub_data_str = sub_data.astype(str)

# Transform data into the transaction format required by Apriori:
# each row becomes one transaction made of "column=value" items.
# itertuples(index=False, name=None) yields plain tuples in column order,
# which avoids the per-row Series that iterrows() has to construct and is
# considerably faster on a dataset of this size.
column_names = list(sub_data_str.columns)
transactions = [
    [f"{name}={value}" for name, value in zip(column_names, record)]
    for record in sub_data_str.itertuples(index=False, name=None)
]

# Encode transactions into binary format (one boolean column per item)
te = TransactionEncoder()
te_ary = te.fit(transactions).transform(transactions)
encoded_df = pd.DataFrame(te_ary, columns=te.columns_)

In [8]:
# List the encoded item columns that belong to the road_user attribute
[item_name for item_name in encoded_df.columns if 'road_user=' in item_name]

['road_user=Driver',
 'road_user=Motorcycle pillion passenger',
 'road_user=Motorcycle rider',
 'road_user=Passenger',
 'road_user=Pedal cyclist',
 'road_user=Pedestrian']

### Implementation

In [15]:
# Mining thresholds
MIN_SUPPORT = 0.2      # an itemset must occur in at least 20% of transactions
MIN_CONFIDENCE = 0.3   # a rule must have at least 30% confidence

# Generate frequent itemsets using Apriori algorithm
frequent_itemsets = apriori(encoded_df, min_support=MIN_SUPPORT, use_colnames=True)

# Generate association rules and calculate metrics
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=MIN_CONFIDENCE)

# Analyze rule distribution
print(f"Total number of rules generated: {len(rules)}")

# Keep only the rules whose consequent contains a road_user item. The count
# is printed under its own label so it cannot be mistaken for the overall
# total printed above.
road_user_rules = rules[rules['consequents'].apply(lambda x: any('road_user=' in item for item in x))]
print(f"Number of rules with road_user in the consequent: {len(road_user_rules)}")

Total number of rules generated: 4184
Total number of rules generated: 312


In [16]:
# Rank the road_user rules from strongest to weakest: lift is the primary
# key and confidence breaks ties, both in descending order
top_k_rules = road_user_rules.sort_values(by=['lift', 'confidence'], ascending=False)

# Write the three highest-ranked rules, restricted to the rule sides and
# their key metrics, to a CSV file for detailed analysis
export_columns = ['antecedents', 'consequents', 'support', 'confidence', 'lift']
top_three = top_k_rules[export_columns].head(3)
top_three.to_csv("top_k_rules.csv", index=False)