In [5]:
import pyspark, json
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    col,
    sum as spark_sum,
    date_format,
    year,
    min as spark_min,
    max as spark_max,
    countDistinct,
    date_format,
)
from config.config import Config

# Initialise config
config = Config()

# Pandas display all data
pd.set_option("display.max_columns", None)

In [6]:
# Initialise Spark
spark = SparkSession.builder.getOrCreate()

In [7]:
# Profiling report
report = {}

# Load data
train_patron = spark.read.csv(f"{config.BRONZE_DATA_PATH}/train_patrons.csv", header=True, inferSchema=True)

In [8]:
# Quick schema and row count check

train_patron.printSchema()
train_patron.show(5, truncate=False)
total_rows = train_patron.count()

# Add schema and row count to report
report["schema"] = str(train_patron.schema.json())
report["total_rows"] = total_rows

print(f"Total rows: {total_rows:,}")

root
 |-- Business_Date: date (nullable = true)
 |-- Day_of_Week: string (nullable = true)
 |-- Day_Type: string (nullable = true)
 |-- Mode: string (nullable = true)
 |-- Train_Number: string (nullable = true)
 |-- Line_Name: string (nullable = true)
 |-- Group: string (nullable = true)
 |-- Direction: string (nullable = true)
 |-- Origin_Station: string (nullable = true)
 |-- Destination_Station: string (nullable = true)
 |-- Station_Name: string (nullable = true)
 |-- Station_Latitude: double (nullable = true)
 |-- Station_Longitude: double (nullable = true)
 |-- Station_Chainage: integer (nullable = true)
 |-- Stop_Sequence_Number: integer (nullable = true)
 |-- Arrival_Time_Scheduled: timestamp (nullable = true)
 |-- Departure_Time_Scheduled: timestamp (nullable = true)
 |-- Passenger_Boardings: integer (nullable = true)
 |-- Passenger_Alightings: integer (nullable = true)
 |-- Passenger_Arrival_Load: integer (nullable = true)
 |-- Passenger_Departure_Load: integer (nullable = tru

In [9]:
# Column lists

key_cols = ["Business_Date", "Train_Number", "Station_Name", "Arrival_Time_Scheduled"]
numeric_cols = [
    "Passenger_Boardings", "Passenger_Alightings",
    "Passenger_Arrival_Load", "Passenger_Departure_Load",
    "Station_Latitude", "Station_Longitude",
    "Station_Chainage", "Stop_Sequence_Number"
]
cat_cols = [
    "Day_of_Week", "Day_Type", "Mode", "Line_Name", "Group", "Direction"
]

In [10]:
# Null value profiling

agg_exprs = []

for c in train_patron.columns:
    agg_exprs.append(spark_sum(col(c).isNull().cast("int")).alias(f"{c}__nulls"))

# Min/Max for numeric columns
for c in numeric_cols:
    agg_exprs.append(spark_min(col(c)).alias(f"{c}__min"))
    agg_exprs.append(spark_max(col(c)).alias(f"{c}__max"))

# Aggregate null counts and min/max values
agg_results = train_patron.agg(*agg_exprs).collect()[0].asDict()

# Store null counts
report["null_counts"] = {
    c: agg_results[f"{c}__nulls"] for c in train_patron.columns
}

# Store numeric min/max
report["numeric_range"] = {
    c: {
        "min": agg_results[f"{c}__min"],
        "max": agg_results[f"{c}__max"]
    }
    for c in numeric_cols
}

# Display null counts and numeric ranges
print("Null counts per column")
nulls_df = pd.DataFrame.from_dict(report["null_counts"], orient='index', columns=['null_count'])
display(nulls_df)

print("\nNumeric columns min/max")
numeric_stats_df = pd.DataFrame.from_dict(report["numeric_range"], orient='index')
display(numeric_stats_df)

Null counts per column


Unnamed: 0,null_count
Business_Date,0
Day_of_Week,0
Day_Type,0
Mode,0
Train_Number,0
Line_Name,0
Group,51248
Direction,0
Origin_Station,0
Destination_Station,0



Numeric columns min/max


Unnamed: 0,min,max
Passenger_Boardings,-10.0,1890.0
Passenger_Alightings,-20.0,2020.0
Passenger_Arrival_Load,-20.0,2380.0
Passenger_Departure_Load,-20.0,2380.0
Station_Latitude,-38.386392,-35.341114
Station_Longitude,142.475545,147.627614
Station_Chainage,0.0,346299.0
Stop_Sequence_Number,1.0,31.0


In [11]:
# Duplicate check

total_rows = train_patron.count()
distinct_rows = train_patron.distinct().count()
duplicate_rows = total_rows - distinct_rows

# Store duplicate row count
report["duplicate_rows"] = duplicate_rows

print(f"Duplicates: {duplicate_rows}")

Duplicates: 0


In [12]:
# Categorical value sanity

report["categorical_values"] = {
    c: [r[0] for r in train_patron.select(c).distinct().collect()]
    for c in cat_cols
}

cat_values_series = pd.Series(report["categorical_values"])
print(cat_values_series.to_frame(name="Distinct Values").to_string())

                                                                                                                                                                                                                                                                                       Distinct Values
Day_of_Week                                                                                                                                                                                                                           [Wednesday, Tuesday, Friday, Thursday, Saturday, Monday, Sunday]
Day_Type                                                                                                                                                                                                                            [Normal Weekday, School Holiday, Saturday, Public Holiday, Sunday]
Mode                                                                                                               

In [13]:
# Temporal consistency

mismatch = train_patron.filter(
    date_format(col("Business_Date"), "EEEE") != col("Day_of_Week")
).count()

report["day_of_week_mismatch"] = mismatch

print(f"Day of week mismatch: {mismatch}")

Day of week mismatch: 0


In [14]:
# Geospatial bounds check
# We’ll check if stations fall roughly within Victoria’s bounds:

out_of_bounds = train_patron.filter(
    ( (col("Station_Latitude") < -39) | (col("Station_Latitude") > -33) |
      (col("Station_Longitude") < 140) | (col("Station_Longitude") > 150) )
).count()

report["stations_out_of_bounds"] = out_of_bounds

print(f"Stations out of bounds: {out_of_bounds}")

Stations out of bounds: 0


In [15]:
# Time sanity
# Checking for suspicious years in Arrival_Time_Scheduled vs Business_Date.

time_mismatch = train_patron.filter(
    year(col("Arrival_Time_Scheduled")) != year(col("Business_Date"))
).count()

report["year_mismatch_in_times"] = time_mismatch

print(f"Year mismatch in arrival time: {time_mismatch}")

Year mismatch in arrival time: 15612868


In [16]:
# Save report

json_path = f"{config.BRONZE_DATA_PATH}/profiling_report.json"
with open(json_path, "w") as f:
    json.dump(report, f, indent=2)

# Stop Spark session
spark.stop()