In [108]:
from pathlib import Path

import numpy as np
import pandas as pd

Data reading

In [109]:
# Read the raw stops file of every transport mode. The targets on the left and
# the paths below are listed in the same order, so each frame receives its own file.
metro_bus, metro_tram, metro_train, regional_train, regional_bus = map(
    pd.read_csv,
    (
        "data/raw/transport/metro_bus_stops.txt",
        "data/raw/transport/metro_tram_stops.txt",
        "data/raw/transport/metro_train_stops.txt",
        "data/raw/transport/regional_train_stops.txt",
        "data/raw/transport/regional_bus_stops.txt",
    ),
)

In [110]:
# Row count of each dataset as loaded: metro bus, metro tram, metro train, regional train, regional bus.
tuple(len(stops) for stops in (metro_bus, metro_tram, metro_train, regional_train, regional_bus))

(18781, 1634, 1058, 220, 6851)

Feature selection

In [111]:
def feature_selection(df):
    """
    Reduce a stops dataset to the columns used by the rest of the notebook.

    Returns a DataFrame holding only stop_name, stop_lat and stop_lon, in that
    order; every other column of `df` is left out of the result.
    """
    wanted_columns = ['stop_name', 'stop_lat', 'stop_lon']
    return df[wanted_columns]

In [112]:
# Trim every dataset down to the name and coordinate columns, keeping the same variable names.
metro_bus, metro_tram, metro_train, regional_train, regional_bus = map(
    feature_selection,
    (metro_bus, metro_tram, metro_train, regional_train, regional_bus),
)

In [113]:
# Column selection must not drop rows, so these counts should match the raw counts.
tuple(map(len, (metro_bus, metro_tram, metro_train, regional_train, regional_bus)))

(18781, 1634, 1058, 220, 6851)

Data processing

In [114]:
# For the train datasets, keep only rows whose name contains "Station".
# Non-station rows include taxi zones, bus replacement stops, etc.
# na=False counts a missing stop_name as "not a station"; without it a NaN in the
# mask makes the boolean indexing raise. regex=False matches "Station" literally.
metro_train = metro_train[metro_train['stop_name'].str.contains("Station", na=False, regex=False)]
regional_train = regional_train[regional_train['stop_name'].str.contains("Station", na=False, regex=False)]

Feature engineering

Only the stops.txt data is required, i.e., the location of each tram, bus, train, etc. stop.

This is because, to answer our question, one of the covariates will be the number of public transport stops within X km of a property.

In [115]:
def feature_engineering(df):
    """
    Collapse a stops dataset to a single row per stop name.

    Some datasets (train in particular) list several locations for one stop,
    e.g. Flinders Street Station appears once per platform, each platform with
    its own lat/long. Rows sharing a stop_name are merged into one row whose
    stop_lat and stop_lon are the mean of the group's coordinates.

    Returns a new DataFrame with the columns stop_name, stop_lat and stop_lon.
    """
    coordinate_columns = ['stop_lat', 'stop_lon']
    mean_position = df.groupby('stop_name')[coordinate_columns].mean()
    # stop_name is the group index at this point; turn it back into a regular column.
    return mean_position.reset_index()

In [116]:
# Deduplicate every dataset to one averaged location per stop name, keeping the same variable names.
metro_bus, metro_tram, metro_train, regional_train, regional_bus = (
    feature_engineering(stops)
    for stops in (metro_bus, metro_tram, metro_train, regional_train, regional_bus)
)

In [117]:
# Number of distinct stop names left in each dataset after merging duplicates.
tuple(len(stops) for stops in (metro_bus, metro_tram, metro_train, regional_train, regional_bus))

(12831, 999, 444, 219, 5387)

Combine into 1 dataset for output

In [118]:
# Tag each dataset with its transport mode so rows stay distinguishable once stacked.
# The column is set on the per-mode frames themselves, so they keep carrying StopType.
metro_bus["StopType"] = "Metro Bus"
metro_tram["StopType"] = "Metro Tram"
metro_train["StopType"] = "Metro Train"
regional_train["StopType"] = "Regional Train"
regional_bus["StopType"] = "Regional Bus"

# Stack all modes into one frame and switch to the output column names.
# rename() returns a new frame here instead of mutating in place.
transport_stops = pd.concat(
    [metro_bus, metro_tram, metro_train, regional_train, regional_bus],
    ignore_index=True,
).rename(columns={"stop_name": "StopName", "stop_lat": "Latitude", "stop_lon": "Longitude"})

# Fix the column order of the exported file.
transport_stops = transport_stops[["StopName", "StopType", "Latitude", "Longitude"]]

# to_csv does not create missing folders, so make sure the output directory exists
# before writing (e.g. on a fresh checkout without data/processed/transport).
output_path = Path("data/processed/transport/transport_stops.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
transport_stops.to_csv(output_path, index=False)