In [1]:
import pandas as pd
from delta import configure_spark_with_delta_pip
from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    col,
    lag,
    coalesce,
    when,
    count,
    lit,
    sum as spark_sum,
    min as spark_min,
)
from pyspark.sql.window import Window
from config.config import Config

pd.options.display.max_columns = None

In [2]:
# Initialise configurations
config = Config()

# Initialise Spark session
builder = (
    SparkSession.builder.appName("ProcessedData")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config(
        "spark.sql.catalog.spark_catalog",
        "org.apache.spark.sql.delta.catalog.DeltaCatalog",
    )
    .config("spark.jars.packages", "io.delta:delta-spark_2.12:3.0.0")
    .config("spark.driver.memory", "4g")
    .config("spark.executor.memory", "4g")
    .config("spark.memory.fraction", "0.6")
    .config("spark.memory.storageFraction", "0.5")
)

# Configure Spark with Delta Lake
spark = configure_spark_with_delta_pip(builder).getOrCreate()

In [3]:
# Load data
train_patron = spark.read.format("delta").load(f"{config.SILVER_DATA_PATH}/train_patrons")

In [4]:
# Peek at the data
train_patron.printSchema()
# display(train_patron.limit(5).toPandas())
train_patron.show(5, truncate=False)

# Quick row count
print(f"Row count: {train_patron.count()}")

root
 |-- Business_Date: date (nullable = true)
 |-- Day_of_Week: string (nullable = true)
 |-- Day_Type: string (nullable = true)
 |-- Mode: string (nullable = true)
 |-- Train_Number: string (nullable = true)
 |-- Line_Name: string (nullable = true)
 |-- Group: string (nullable = true)
 |-- Direction: string (nullable = true)
 |-- Origin_Station: string (nullable = true)
 |-- Destination_Station: string (nullable = true)
 |-- Station_Name: string (nullable = true)
 |-- Station_Latitude: double (nullable = true)
 |-- Station_Longitude: double (nullable = true)
 |-- Station_Chainage: integer (nullable = true)
 |-- Stop_Sequence_Number: integer (nullable = true)
 |-- Arrival_Time_Scheduled: timestamp (nullable = true)
 |-- Departure_Time_Scheduled: timestamp (nullable = true)
 |-- Passenger_Boardings: integer (nullable = true)
 |-- Passenger_Alightings: integer (nullable = true)
 |-- Passenger_Arrival_Load: integer (nullable = true)
 |-- Passenger_Departure_Load: integer (nullable = tru

In [8]:
# Check for any remaining nulls in key columns

nulls = train_patron.select(
    [spark_sum(col(c).isNull().cast("int")).alias(c) for c in train_patron.columns]
)

null_counts_row = nulls.collect()[0].asDict()

nulls_df = pd.DataFrame(list(null_counts_row.items()), columns=["Column", "Null_Count"])
nulls_df = nulls_df.sort_values(by="Null_Count", ascending=False)

display(nulls_df)

Unnamed: 0,Column,Null_Count
0,Business_Date,0
1,Day_of_Week,0
2,Day_Type,0
3,Mode,0
4,Train_Number,0
5,Line_Name,0
6,Group,0
7,Direction,0
8,Origin_Station,0
9,Destination_Station,0


In [None]:
# Investigate any remaining nulls in key columns

null_passenger_rows = train_patron.filter(
    col("Passenger_Departure_Load").isNull() |
    col("Passenger_Arrival_Load").isNull() |
    col("Passenger_Boardings").isNull() |
    col("Passenger_Alightings").isNull()
)

print(f"Null passenger count rows: {null_passenger_rows.count()}")
display(null_passenger_rows.limit(10).toPandas())

# Issue Identified: There are null values present in key passenger columns.
# Solution:
# - Use coalesce to fill in missing values for passenger columns when Stop_Sequence_Number is 1
# - Fill in missing values for other Stop_Sequence_Numbers with the previous non-null value (forward fill)

Null passenger count rows: 0


Unnamed: 0,Business_Date,Day_of_Week,Day_Type,Mode,Train_Number,Line_Name,Group,Direction,Origin_Station,Destination_Station,Station_Name,Station_Latitude,Station_Longitude,Station_Chainage,Stop_Sequence_Number,Arrival_Time_Scheduled,Departure_Time_Scheduled,Passenger_Boardings,Passenger_Alightings,Passenger_Arrival_Load,Passenger_Departure_Load,Arrival_Time,Departure_Time,Arrival_Time_Bucket,Departure_Time_Bucket,year,month,day


In [None]:
# Check if negative passenger counts still exist

passenger_columns = [
    "Passenger_Boardings",
    "Passenger_Alightings",
    "Passenger_Arrival_Load",
    "Passenger_Departure_Load",
]

neg_check = train_patron.select([
    spark_min(col(c)).alias(c) for c in passenger_columns
])

display(neg_check.toPandas())

Unnamed: 0,Passenger_Boardings,Passenger_Alightings,Passenger_Arrival_Load,Passenger_Departure_Load
0,0,0,0,0


In [23]:
# Count 'Unknown' Group values

group_unknown = train_patron.select(
    count(when(col("Group").isNull(), 1)).alias("null_group_count"),
    (count(when(col("Group").isNull(), 1)) / lit(train_patron.count())).alias("null_group_pct")
)

group_unknown.show()

+-------------------+-----------------+
|unknown_group_count|unknown_group_pct|
+-------------------+-----------------+
|                  0|              0.0|
+-------------------+-----------------+



In [24]:
# Peek at the 'Unknown' Group data

group_unknown = train_patron.filter(col("Group").isNull())

group_unknown.show(5, truncate=False)

+-------------+-----------+--------+----+------------+---------+-----+---------+--------------+-------------------+------------+----------------+-----------------+----------------+--------------------+----------------------+------------------------+-------------------+--------------------+----------------------+------------------------+------------+--------------+-------------------+---------------------+----+-----+---+
|Business_Date|Day_of_Week|Day_Type|Mode|Train_Number|Line_Name|Group|Direction|Origin_Station|Destination_Station|Station_Name|Station_Latitude|Station_Longitude|Station_Chainage|Stop_Sequence_Number|Arrival_Time_Scheduled|Departure_Time_Scheduled|Passenger_Boardings|Passenger_Alightings|Passenger_Arrival_Load|Passenger_Departure_Load|Arrival_Time|Departure_Time|Arrival_Time_Bucket|Departure_Time_Bucket|year|month|day|
+-------------+-----------+--------+----+------------+---------+-----+---------+--------------+-------------------+------------+----------------+-------

In [25]:
# Check which lines are labeled as 'Unknown'

group_unknown_line = train_patron.filter(col("Group") == "Unknown").select("Line_Name").distinct()

group_unknown_line.show()

# Issue identified: 'Stony Point' is the only line affected (seems to be a rural line).
# Solution: I will be using 'Stony Point' for the Group.

+---------+
|Line_Name|
+---------+
+---------+



In [1]:
# Stop the Spark session
spark.stop()

NameError: name 'spark' is not defined