In [54]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
from plotly.subplots import make_subplots
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, roc_auc_score
import os
import gc

In [2]:
# Plot configuration: default figure size, font size and axes background,
# set through the rcParams object imported from pylab.
# NOTE(review): every figure in the visible cells is built with plotly
# (px / go) - confirm these rcParams settings still have any effect.

from pylab import rcParams

rcParams['figure.figsize'] = 15, 10
rcParams['font.size'] = 20
rcParams['axes.facecolor'] = 'white'
# RGB colour string; not referenced by any of the visible cells.
plots_rgb_blue = 'rgb(31,119,180)'

%matplotlib inline

In [3]:
# Load the Auckland meshblock -> higher-geography lookup (path is relative to the working directory).
meshblock_auckland_full_df = pd.read_csv('meshblock-higher-geographies-2024-auckland.csv')

In [4]:
# Preview the first rows to check the load and see which columns are available.
meshblock_auckland_full_df.head()

Unnamed: 0,WKT,MB2024_V1_00,SA12023_V1_00,SA22023_V1_00,SA22023_V1_00_NAME,SA22023_V1_00_NAME_ASCII,SA32023_V1_00,SA32023_V1_00_NAME,SA32023_V1_00_NAME_ASCII,UR2023_V1_00,...,FUA2023_V1_00_NAME_ASCII,IFUA2023_V1_00,IFUA2023_V1_00_NAME,TFUA2023_V1_00,TFUA2023_V1_00_NAME,LANDWATER,LANDWATER_NAME,LAND_AREA_SQ_KM,AREA_SQ_KM,Shape_Length
0,"POLYGON ((1753128.8397 5915046.9774,1753132.89...",4019145,7037220,135500,Ōwairaka West,Owairaka West,51440,Mount Albert,Mount Albert,1108,...,Auckland,101,Urban core,1,Metropolitan area,12,Mainland,0.053445,0.053445,1183.791512
1,"POLYGON ((1758471.9046 5920096.1957,1758519.93...",4001328,7037219,136400,Parnell West,Parnell West,51200,Parnell,Parnell,1108,...,Auckland,101,Urban core,1,Metropolitan area,12,Mainland,0.009572,0.009572,421.238251
2,"POLYGON ((1772737.6177 5879642.14,1772741.0269...",827902,7032417,169901,Tuakau Rural,Tuakau Rural,52450,Tuakau,Tuakau,1164,...,Auckland,201,Hinterland,1,Metropolitan area,12,Mainland,1.450875,1.450875,5144.014389
3,"POLYGON ((1772799.1844 5892796.4598,1772800.50...",4014920,7032175,164302,Drury East,Drury East,52250,Drury,Drury,1108,...,Auckland,101,Urban core,1,Metropolitan area,12,Mainland,0.261285,0.261285,3082.177087
4,"POLYGON ((1773448.2967 5893337.3844,1773557.31...",4004236,7010152,164302,Drury East,Drury East,52250,Drury,Drury,1108,...,Auckland,101,Urban core,1,Metropolitan area,12,Mainland,0.272358,0.272358,2963.151998


In [5]:
# Keep only the meshblock id plus its SA2 code and ASCII name; these are the
# columns joined onto the crash records.
meshblock_auckland_df = meshblock_auckland_full_df.loc[
    :, ['MB2024_V1_00', 'SA22023_V1_00', 'SA22023_V1_00_NAME_ASCII']
]

In [6]:
# Load the Crash Analysis System (CAS) records (path is relative to the working directory).
cas_df = pd.read_csv('Crash_Analysis_System_(CAS)_data.csv')

In [7]:
# Attach the SA2 code and name to every crash via its meshblock id.
# how='left' keeps crashes whose meshblock is absent from the Auckland lookup;
# their SA2 columns come out as NaN.
# validate='many_to_one' makes pandas raise MergeError if a meshblock id occurs
# more than once in the lookup, which would otherwise silently duplicate crashes.
cas_mesh_df = pd.merge(
    cas_df,
    meshblock_auckland_df,
    left_on='meshblockId',
    right_on='MB2024_V1_00',
    how='left',
    validate='many_to_one',
)

In [8]:
# List the columns available after the merge.
cas_mesh_df.columns

Index(['X', 'Y', 'OBJECTID', 'advisorySpeed', 'areaUnitID', 'bicycle',
       'bridge', 'bus', 'carStationWagon', 'cliffBank',
       'crashDirectionDescription', 'crashFinancialYear', 'crashLocation1',
       'crashLocation2', 'crashRoadSideRoad', 'crashSeverity',
       'crashSHDescription', 'crashYear', 'debris', 'directionRoleDescription',
       'ditch', 'fatalCount', 'fence', 'flatHill', 'guardRail', 'holiday',
       'houseOrBuilding', 'intersection', 'kerb', 'light', 'meshblockId',
       'minorInjuryCount', 'moped', 'motorcycle', 'NumberOfLanes',
       'objectThrownOrDropped', 'otherObject', 'otherVehicleType', 'overBank',
       'parkedVehicle', 'pedestrian', 'phoneBoxEtc', 'postOrPole', 'region',
       'roadCharacter', 'roadLane', 'roadSurface', 'roadworks', 'schoolBus',
       'seriousInjuryCount', 'slipOrFlood', 'speedLimit', 'strayAnimal',
       'streetLight', 'suv', 'taxi', 'temporarySpeedLimit', 'tlaId', 'tlaName',
       'trafficControl', 'trafficIsland', 'trafficSi

In [9]:
# Percentage of crashes with no SA2 code (SA22023_V1_00) in each crash financial year

filtered_df = cas_mesh_df[['OBJECTID','SA22023_V1_00','crashFinancialYear']]

# The mean of the boolean "is missing" mask within a year is the missing
# fraction; computed with a vectorised groupby mean instead of a per-group lambda.
nan_percentage = (
    filtered_df['SA22023_V1_00']
    .isna()
    .groupby(filtered_df['crashFinancialYear'])
    .mean()
    .mul(100)
    .reset_index(name='NaN_Percentage_SA22023_V1_00')
)

fig = px.line(
    nan_percentage,
    x='crashFinancialYear',
    y='NaN_Percentage_SA22023_V1_00',
    # The title names the column that is actually measured (the SA2 code).
    title='Percentage of NaN Values in SA22023_V1_00 by Crash Financial Year',
    labels={'crashFinancialYear': 'Crash Financial Year', 'NaN_Percentage_SA22023_V1_00': 'Percentage of NaN Values'},
)
# Start the y axis at 0 and leave 5 percentage points of headroom above the maximum.
fig.update_layout(yaxis=dict(range=[0, nan_percentage['NaN_Percentage_SA22023_V1_00'].max() + 5]))
fig.show()

In [10]:
# Percentage of crashes without an SA2 name, computed per meshblock id.
filtered_block_df = cas_mesh_df[['OBJECTID','SA22023_V1_00_NAME_ASCII','meshblockId']]
block_nan_percentage = (
    filtered_block_df['SA22023_V1_00_NAME_ASCII']
    .isna()
    .groupby(filtered_block_df['meshblockId'])
    .mean()
    .mul(100)
    .reset_index(name='NaN_Percentage_SA22023_V1_00_NAME_ASCII')
)

In [11]:
# Preview the per-meshblock missing percentages.
block_nan_percentage.head()

Unnamed: 0,meshblockId,NaN_Percentage_SA22023_V1_00_NAME_ASCII
0,134100,100.0
1,135100,100.0
2,135201,100.0
3,135300,100.0
4,135400,100.0


In [12]:
# Crash coordinates together with the (possibly missing) SA2 code.
filtered_xy_df = cas_mesh_df.loc[:, ['X', 'Y', 'SA22023_V1_00']]

In [13]:
# Display the coordinate / SA2 frame (the rendered output is truncated by pandas).
filtered_xy_df

Unnamed: 0,X,Y,SA22023_V1_00
0,1.756461e+06,5.936053e+06,117900.0
1,1.772256e+06,5.896393e+06,163200.0
2,1.766897e+06,5.907471e+06,155701.0
3,1.754057e+06,5.936391e+06,
4,1.772574e+06,5.901427e+06,162300.0
...,...,...,...
285341,1.743947e+06,5.924009e+06,
285342,1.768891e+06,5.904246e+06,158502.0
285343,1.757918e+06,5.914599e+06,140800.0
285344,1.758255e+06,5.918060e+06,


In [14]:
# Number of crashes that have no SA2 code after the meshblock merge.
filtered_xy_df['SA22023_V1_00'].isna().sum()

99106

In [15]:
# Number of distinct SA2 codes present (nunique ignores NaN by default).
filtered_xy_df['SA22023_V1_00'].nunique()

593

In [16]:
# Number of distinct X coordinate values.
filtered_xy_df['X'].nunique()

80519

In [17]:
# Number of distinct Y coordinate values.
filtered_xy_df['Y'].nunique()

88946

In [18]:
# Rows that already carry an SA2 code become the labelled training pool;
# rows without one are the ones whose SA2 code has to be predicted.
has_sa2_code = cas_mesh_df['SA22023_V1_00'].notna()
train_df = cas_mesh_df[has_sa2_code]
predict_df = cas_mesh_df[~has_sa2_code]

In [19]:
# Model inputs are the crash coordinates; the label to learn is the SA2 code.
X = train_df.loc[:, ['X', 'Y']]
y = train_df.loc[:, 'SA22023_V1_00']

In [20]:
# Hold out 20% of the labelled crashes for evaluation; the fixed random_state
# makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

kNN with uniform weightings and k neighbours between 1 and 15

In [21]:
# Sweep the neighbour count from 1 to k for a uniformly weighted kNN and record
# the misclassification rate (1 - accuracy) on the training and test splits.
k = 15
neighbour_counts = range(1, k + 1)
training_error = pd.Series(index=neighbour_counts, dtype=float)
test_error = pd.Series(index=neighbour_counts, dtype=float)

for i in neighbour_counts:
    knn = KNeighborsClassifier(n_neighbors=i, weights='uniform').fit(X_train, y_train)
    training_error[i] = 1.0 - knn.score(X_train, y_train)
    test_error[i] = 1.0 - knn.score(X_test, y_test)

# One line per split, plotted against the neighbour count.
trace1 = go.Scatter(
    x=training_error.index,
    y=training_error,
    mode='lines+markers',
    name='Training Error',
)
trace2 = go.Scatter(
    x=test_error.index,
    y=test_error,
    mode='lines+markers',
    name='Test Error',
)
fig = go.Figure(data=[trace1, trace2])

# Fix both axes so this chart is directly comparable with other sweeps.
fig.update_layout(
    title='Generalisation, Convergence and Overfitting - uniform',
    xaxis_title='Classifier Complexity (k)',
    yaxis_title='Error Rate / Misclassification Rate',
    xaxis=dict(range=[1, k], tickmode='linear'),
    yaxis=dict(range=[0, 1]),
)
fig.show()

kNN with distance weightings and k neighbours between 1 and 15

In [22]:
# Sweep the neighbour count from 1 to k for a distance-weighted kNN and record
# the misclassification rate (1 - accuracy) on the training and test splits.
# NOTE(review): with distance weighting each training point is its own
# zero-distance neighbour, so the training error is presumably close to 0 for
# every k and only the test curve is informative - confirm on the chart.
k = 15
neighbour_counts = range(1, k + 1)
training_error = pd.Series(index=neighbour_counts, dtype=float)
test_error = pd.Series(index=neighbour_counts, dtype=float)

for i in neighbour_counts:
    knn = KNeighborsClassifier(n_neighbors=i, weights='distance').fit(X_train, y_train)
    training_error[i] = 1.0 - knn.score(X_train, y_train)
    test_error[i] = 1.0 - knn.score(X_test, y_test)

# One line per split, plotted against the neighbour count.
trace1 = go.Scatter(
    x=training_error.index,
    y=training_error,
    mode='lines+markers',
    name='Training Error',
)
trace2 = go.Scatter(
    x=test_error.index,
    y=test_error,
    mode='lines+markers',
    name='Test Error',
)
fig = go.Figure(data=[trace1, trace2])

# Fix both axes so this chart is directly comparable with other sweeps.
fig.update_layout(
    title='Generalisation, Convergence and Overfitting - distance',
    xaxis_title='Classifier Complexity (k)',
    yaxis_title='Error Rate / Misclassification Rate',
    xaxis=dict(range=[1, k], tickmode='linear'),
    yaxis=dict(range=[0, 1]),
)
fig.show()

Train a kNN classifier with distance weightings and k neighbours = 10

In [23]:
# Evaluate a distance-weighted kNN (k = 10) on the held-out split, then refit it
# on every labelled row so the model left in `knn` uses all available SA2 labels.
knn = KNeighborsClassifier(n_neighbors=10, weights='distance')

# Fit on the training split only, so the test rows stay unseen for the evaluation.
knn.fit(X_train, y_train)

# Accuracy on the held-out test split.
test_score = knn.score(X_test, y_test)
print(f"Test set score: {test_score}")

# Per-class precision / recall / F1 on the held-out split. zero_division=0
# reports 0 instead of warning when a class has no predicted samples.
y_pred = knn.predict(X_test)
print(classification_report(y_test, y_pred, zero_division=0))

# Refit on all labelled crashes (X, y). The scores printed above remain the
# hold-out estimate; the model used afterwards to fill in missing SA2 codes no
# longer discards the 20% of labelled points that were held out for testing.
knn.fit(X, y)

Test set score: 0.9580380154639175
              precision    recall  f1-score   support

    110700.0       1.00      1.00      1.00        45
    110800.0       1.00      1.00      1.00        40
    110900.0       0.94      0.95      0.95        66
    111100.0       0.88      1.00      0.94        22
    111200.0       0.99      0.97      0.98        99
    111300.0       0.96      1.00      0.98        26
    111400.0       1.00      0.85      0.92        33
    111601.0       1.00      0.90      0.95        21
    111602.0       0.95      1.00      0.98        20
    111700.0       0.93      1.00      0.97        14
    111901.0       1.00      1.00      1.00         1
    112101.0       0.94      0.96      0.95        67
    112200.0       1.00      1.00      1.00         2
    112301.0       0.93      0.88      0.90        32
    112401.0       1.00      1.00      1.00        47
    112500.0       0.86      0.86      0.86        28
    112600.0       0.89      0.98      0.94   

In [24]:
# Predict an SA2 code for every crash that has none, using the same two
# coordinate features the classifier was fitted on.
X_predict = predict_df.loc[:, ['X', 'Y']]
predictions = knn.predict(X_predict)

In [25]:
# Build the filled-in SA2 column: keep the original code where it exists and
# take the kNN prediction elsewhere. The predictions are aligned on the index
# of predict_df, i.e. the rows whose SA2 code is missing.
imputed_sa2 = pd.Series(predictions, index=predict_df.index)
cas_mesh_df['SA22023_V1_00_Filled'] = cas_mesh_df['SA22023_V1_00'].fillna(imputed_sa2)

In [26]:
# The filled column no longer contains NaN, so it can be stored as an integer
# code; it is used as an integer merge key afterwards.
cas_mesh_df['SA22023_V1_00_Filled'] = cas_mesh_df['SA22023_V1_00_Filled'].astype(np.int64)

In [50]:
# Lookup of SA2 / SA3 codes and names. The columns are selected by name instead
# of by position (iloc[:, 3:9]) so the cell keeps working if the CSV's column
# order changes; these are the same six columns the positional slice returned.
meshblock_auckland_SA2_level_df = meshblock_auckland_full_df[[
    'SA22023_V1_00',
    'SA22023_V1_00_NAME',
    'SA22023_V1_00_NAME_ASCII',
    'SA32023_V1_00',
    'SA32023_V1_00_NAME',
    'SA32023_V1_00_NAME_ASCII',
]]

In [51]:
# Display the SA2 / SA3 lookup; at this point it still has one row per meshblock, so SA2 rows repeat.
meshblock_auckland_SA2_level_df

Unnamed: 0,SA22023_V1_00,SA22023_V1_00_NAME,SA22023_V1_00_NAME_ASCII,SA32023_V1_00,SA32023_V1_00_NAME,SA32023_V1_00_NAME_ASCII
0,135500,Ōwairaka West,Owairaka West,51440,Mount Albert,Mount Albert
1,136400,Parnell West,Parnell West,51200,Parnell,Parnell
2,169901,Tuakau Rural,Tuakau Rural,52450,Tuakau,Tuakau
3,164302,Drury East,Drury East,52250,Drury,Drury
4,164302,Drury East,Drury East,52250,Drury,Drury
...,...,...,...,...,...,...
14611,110800,Kaipara Hills,Kaipara Hills,50410,Rodney East,Rodney East
14612,110800,Kaipara Hills,Kaipara Hills,50410,Rodney East,Rodney East
14613,110800,Kaipara Hills,Kaipara Hills,50410,Rodney East,Rodney East
14614,110800,Kaipara Hills,Kaipara Hills,50410,Rodney East,Rodney East


In [52]:
# Collapse the per-meshblock rows to distinct SA2 / SA3 combinations (rows identical in every column are dropped).
meshblock_auckland_SA2_level_df = meshblock_auckland_SA2_level_df.drop_duplicates()

In [56]:
# Attach the SA2 / SA3 names to every crash using the filled-in SA2 code.
# Both frames contain SA22023_V1_00 and SA22023_V1_00_NAME_ASCII, so pandas
# suffixes them with _x (crash side) and _y (lookup side).
# validate='many_to_one' makes pandas raise MergeError if an SA2 code still maps
# to more than one lookup row, which would otherwise silently duplicate crashes.
cas_mesh_merged_df = pd.merge(
    cas_mesh_df,
    meshblock_auckland_SA2_level_df,
    left_on='SA22023_V1_00_Filled',
    right_on='SA22023_V1_00',
    how='left',
    validate='many_to_one',
)


In [62]:
# Count crashes whose filled-in SA2 code found no name in the lookup.
cas_mesh_merged_df['SA22023_V1_00_NAME_ASCII_y'].isna().sum()

0

In [63]:
# Remove the crash-side SA2 name (_x) and the lookup-side SA2 code (_y), both
# left over from the suffixes added by the merge.
cas_mesh_merged_df = cas_mesh_merged_df.drop(
    columns=[
        'SA22023_V1_00_NAME_ASCII_x',
        'SA22023_V1_00_y',
    ]
)

In [65]:
# Give the two remaining suffixed columns descriptive names: the crash-side SA2
# code is kept as the "original" value and the lookup-side name loses its suffix.
cas_mesh_merged_df = cas_mesh_merged_df.rename(
    columns={
        'SA22023_V1_00_x': 'SA22023_V1_00_original',
        'SA22023_V1_00_NAME_ASCII_y': 'SA22023_V1_00_NAME_ASCII',
    }
)

In [67]:
# Write the enriched crash data to the working directory; index=False leaves the row index out of the file.
cas_mesh_merged_df.to_csv('cas_merged_with_SA2_data.csv', index=False)

In [68]:
# Number of distinct SA2 names in the merged crash data.
cas_mesh_merged_df['SA22023_V1_00_NAME_ASCII'].nunique()

593

In [69]:
# Number of distinct SA3 names in the merged crash data.
cas_mesh_merged_df['SA32023_V1_00_NAME_ASCII'].nunique()

192