In [1]:
import geopandas as gpd
import pandas as pd
import numpy as np
import os

# Absolute location of the raw traffic-count GeoJSON
fp = r"C:\Users\renhu\OneDrive\Desktop\205.2 DATA\rawdata\Average_Daily_Traffic_Counts.geojson"

# Load the raw layer and report its size and column names
traffic = gpd.read_file(fp)
print("origional shape:", traffic.shape)
print("Columns:", list(traffic.columns))

# Columns carried forward into the cleaned dataset
keep_cols = [
    "road_id", "road_name", "start_name", "end_name",
    "location", "latest", "count_date", "peak_hour",
    "adt", "peaktraffic",
    "pccar", "pclcv", "pcmcv", "pchcvi", "pchcvii", "pcbus", "pcheavy",
    "geometry",
]

# Select only the wanted columns that actually exist in the file, so a
# missing column does not raise; .copy() detaches the result from the raw frame
present_cols = [name for name in keep_cols if name in traffic.columns]
traffic = traffic[present_cols].copy()


origional shape: (13093, 23)
Columns: ['OBJECTID', 'carr_way_no', 'road_id', 'road_name', 'start_name', 'end_name', 'location', 'latest', 'count_date', 'peak_hour', 'count_duration', 'adt', 'peaktraffic', 'pccar', 'pclcv', 'pcmcv', 'pchcvi', 'pchcvii', 'pcbus', 'pcheavy', 'NZTMX', 'NZTMY', 'geometry']


In [2]:
# Missing values per column, largest count first
missing_counts = traffic.isna().sum()
missing_counts.sort_values(ascending=False)

peaktraffic    3251
peak_hour      3217
pcbus          1182
pchcvii        1174
pcheavy        1076
pccar          1076
pclcv          1076
pcmcv          1076
pchcvi         1076
location        298
adt             119
end_name         68
start_name       55
road_name         1
road_id           0
count_date        0
latest            0
geometry          0
dtype: int64

In [3]:
# Geometry sanity check. Only the last expression of a cell is displayed, so
# the validity count is printed explicitly instead of being silently discarded.
valid_geoms = traffic.geometry.is_valid.sum()
print("Valid geometries:", valid_geoms, "of", len(traffic))

# Number of rows whose geometry is not null (shown as the cell result)
traffic.geometry.notnull().sum()

13093

In [4]:
# Inspect the adt column. Only the last expression of a cell is displayed, so
# the intermediate probes are printed rather than evaluated and thrown away.
adt_col = traffic["adt"]
print("dtype:", adt_col.dtype)
print("first 10 values:", adt_col.head(10).tolist())
print("first 10 distinct values:", adt_col.unique()[:10])  # or .sample(10)

# Number of rows with no adt value (shown as the cell result)
adt_col.isna().sum()

119

In [5]:
# A row is kept only when both the traffic count and the geometry are present
required_cols = ["adt", "geometry"]
traffic = traffic.dropna(axis=0, how="any", subset=required_cols)
print("Clean shape:", traffic.shape)

Clean shape: (12974, 18)


In [6]:
# Impute missing vehicle-category percentages with each column's own median
pct_cols = ["pccar", "pclcv", "pcmcv", "pchcvi", "pchcvii", "pcbus", "pcheavy"]
traffic[pct_cols] = traffic[pct_cols].fillna(traffic[pct_cols].median())

# Fill missing location names with a placeholder
traffic["location"] = traffic["location"].fillna("unknown")

# Drop peak_hour (too many missing). errors="ignore" keeps the cell re-runnable:
# without it a second execution raises KeyError because the column is gone.
traffic = traffic.drop(columns=["peak_hour"], errors="ignore")

# Impute peaktraffic with its median
traffic["peaktraffic"] = traffic["peaktraffic"].fillna(traffic["peaktraffic"].median())

# Keep only the rows flagged as the latest count, when such a flag is present
if "latest" in traffic.columns:
    print("Latest column unique values:", traffic["latest"].unique())
    # The flag is compared case-insensitively against both "Y" and "Yes";
    # testing for "Y" alone never matches a column that stores "Yes".
    latest_flag = traffic["latest"].astype(str).str.strip().str.lower()
    is_latest = latest_flag.isin({"y", "yes"})
    # Filter only when at least one row carries a recognised flag, so an
    # unexpected encoding cannot silently empty the dataset
    if is_latest.any():
        traffic = traffic.loc[is_latest]

# Renumber rows 0..n-1 after the row drops (reassignment instead of inplace)
traffic = traffic.reset_index(drop=True)

# Save the cleaned layer as GeoJSON
# NOTE(review): the text normalisation and numeric coercion in the cells that
# follow run after this save, so they are not reflected in the written file —
# confirm whether the export should happen after those steps instead.
processed_folder = r"C:\Users\renhu\OneDrive\Desktop\205.2 DATA\processed"
os.makedirs(processed_folder, exist_ok=True)
output_file = os.path.join(processed_folder, "Average_Daily_Traffic_Clean.geojson")
traffic.to_file(output_file, driver="GeoJSON")

print("Final cleaned dataset saved!")
print("Shape:", traffic.shape)


Latest column unique values: ['Yes']
Final cleaned dataset saved!
Shape: (12974, 17)


In [7]:
# Normalise free-text columns: trim surrounding whitespace and lower-case.
text_cols = ["road_name", "start_name", "end_name", "location"]
for col in text_cols:
    if col in traffic.columns:
        # Remember which entries were missing before stringification
        was_missing = traffic[col].isna()
        normalised = traffic[col].astype(str).str.strip().str.lower()
        # astype(str) turns NaN into "nan" and None into "None" (-> "none").
        # Replacing only "nan" leaves missing names as the literal text "none",
        # so missing entries are restored from the original null mask; literal
        # "nan" strings are still treated as missing, as before.
        traffic[col] = normalised.mask(was_missing | normalised.eq("nan"), np.nan)

In [8]:
num_cols = ["adt", "peaktraffic", "pccar", "pclcv", "pcmcv", "pchcvi", "pchcvii", "pcbus", "pcheavy"]

# Work only with the numeric columns that exist. The same list is used for the
# coercion and the imputation, so an absent column cannot raise KeyError in the
# final fill step (previously only the loop was guarded).
present_num_cols = [col for col in num_cols if col in traffic.columns]

for col in present_num_cols:
    # Convert to numeric; values that cannot be parsed become NaN
    traffic[col] = pd.to_numeric(traffic[col], errors="coerce")

# Impute any remaining missing values with each column's median
traffic[present_num_cols] = traffic[present_num_cols].fillna(traffic[present_num_cols].median())
        