In [4]:
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.ml.classification import RandomForestClassificationModel
from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.ml import Pipeline
import os

In [5]:
spark = SparkSession.builder \
    .appName("FlightDelayPrediction") \
    .config("spark.driver.memory", "8g") \
    .config("spark.driver.maxResultSize", "4g") \
    .getOrCreate()

In [7]:
# Load the saved Random Forest model
model_path = f"file://{os.path.abspath('flight_delay_rf_model')}"
rf_model = RandomForestClassificationModel.load(model_path)

# Load new data
new_data_path =f"file://{os.path.abspath('may_2025.csv')}"
may_df = spark.read.csv(new_data_path, header=True, inferSchema=False)

                                                                                

In [8]:
if "FL_DATE" in may_df.columns:
    may_df = may_df.drop("FL_DATE")

# Convert required columns to correct types
int_cols = ["YEAR", "MONTH", "DAY_OF_MONTH", "DAY_OF_WEEK", 
            "CRS_DEP_TIME", "CRS_ARR_TIME"]
float_cols = ["DISTANCE", "CRS_ELAPSED_TIME"]

for col in int_cols:
    may_df = may_df.withColumn(col, F.expr(f"try_cast({col} as int)"))
for col in float_cols:
    may_df = may_df.withColumn(col, F.expr(f"try_cast({col} as double)"))

In [9]:
# Define features (must match training features)
numerical_features = [
    'MONTH', 'DAY_OF_MONTH', 'DAY_OF_WEEK',
    'CRS_DEP_TIME', 'CRS_ARR_TIME',
    'DISTANCE', 'CRS_ELAPSED_TIME'
]

categorical_features = [
    'OP_UNIQUE_CARRIER',  # Airline
    'ORIGIN',             # Origin airport
    'DEST'                # Destination airport
]

# Select only required columns and drop nulls
may_df_clean = may_df.select(numerical_features + categorical_features).dropna()
print(f"âœ“ After cleaning: {may_df_clean.count():,} rows")

[Stage 17:>                                                       (0 + 12) / 12]

âœ“ After cleaning: 667,586 rows


                                                                                

In [10]:
# Encode categorical features (must match training encoding)
indexers = [
    StringIndexer(inputCol=col, outputCol=f"{col}_indexed", handleInvalid="keep")
    for col in categorical_features
]

pipeline = Pipeline(stages=indexers)
indexer_model = pipeline.fit(may_df_clean)
may_df_indexed = indexer_model.transform(may_df_clean)

# Prepare feature vector
indexed_categorical = [f"{col}_indexed" for col in categorical_features]
all_features = numerical_features + indexed_categorical

assembler = VectorAssembler(
    inputCols=all_features,
    outputCol="features",
    handleInvalid="skip"
)

may_df_features = assembler.transform(may_df_indexed)

                                                                                

In [11]:
# Make predictions
print("\nðŸ”® Making predictions...")
predictions = rf_model.transform(may_df_features)


ðŸ”® Making predictions...


In [12]:
# Show results
print(f"\nTotal predictions: {predictions.count():,}")
print("\nPrediction distribution:")
predictions.groupBy("prediction").count().orderBy("prediction").show()


Total predictions: 667,586

Prediction distribution:


25/12/04 20:57:06 WARN DAGScheduler: Broadcasting large task binary with size 20.8 MiB

+----------+------+
|prediction| count|
+----------+------+
|       0.0|644310|
|       1.0| 23276|
+----------+------+



25/12/04 20:57:10 WARN DAGScheduler: Broadcasting large task binary with size 20.8 MiB
                                                                                

In [13]:
# Show sample predictions with details
print("\nSample predictions:")
predictions.select(
    "OP_UNIQUE_CARRIER", "ORIGIN", "DEST", 
    "CRS_DEP_TIME", "prediction", "probability"
).show(10, truncate=False)


Sample predictions:


25/12/04 20:57:29 WARN DAGScheduler: Broadcasting large task binary with size 20.7 MiB


+-----------------+------+----+------------+----------+----------------------------------------+
|OP_UNIQUE_CARRIER|ORIGIN|DEST|CRS_DEP_TIME|prediction|probability                             |
+-----------------+------+----+------------+----------+----------------------------------------+
|9E               |ABE   |ATL |615         |0.0       |[0.8720900919679332,0.12790990803206678]|
|9E               |ABE   |ATL |1307        |0.0       |[0.8343566124938726,0.16564338750612745]|
|9E               |ABE   |ATL |1721        |0.0       |[0.835034670961952,0.16496532903804792] |
|9E               |AEX   |ATL |605         |0.0       |[0.930576299475767,0.069423700524233]   |
|9E               |AEX   |ATL |1200        |0.0       |[0.8521998928604496,0.14780010713955044]|
|9E               |AEX   |ATL |1700        |0.0       |[0.8049884932453756,0.1950115067546244] |
|9E               |AGS   |ATL |535         |0.0       |[0.7655126585839932,0.23448734141600686]|
|9E               |AGS   |ATL 

In [15]:
from pyspark.sql.types import DoubleType
from pyspark.sql.functions import udf

# Define UDF to extract delay probability (index 1)
@udf(returnType=DoubleType())
def get_delay_prob(probability):
    return float(probability[1]) if probability else 0.0

# Apply the UDF
predictions_with_delay_prob = predictions.withColumn(
    "delay_probability", 
    get_delay_prob(F.col("probability"))
)

print("\nFlights with highest delay risk:")
predictions_with_delay_prob.select(
    "OP_UNIQUE_CARRIER", "ORIGIN", "DEST", 
    "CRS_DEP_TIME", "prediction", "delay_probability"
).orderBy(F.desc("delay_probability")).show(10, truncate=False)



Flights with highest delay risk:


25/12/04 20:59:23 WARN DAGScheduler: Broadcasting large task binary with size 20.8 MiB
[Stage 45:====>                                                   (1 + 11) / 12]

+-----------------+------+----+------------+----------+------------------+
|OP_UNIQUE_CARRIER|ORIGIN|DEST|CRS_DEP_TIME|prediction|delay_probability |
+-----------------+------+----+------------+----------+------------------+
|OO               |SLC   |WYS |1325        |1.0       |0.8622533379469889|
|OO               |SLC   |WYS |1325        |1.0       |0.8618205990387859|
|OO               |COS   |DEN |1720        |1.0       |0.8486105501798048|
|OO               |COS   |DEN |1830        |1.0       |0.8407211619339582|
|OO               |COS   |DEN |1720        |1.0       |0.8385090820826845|
|DL               |SLC   |BZN |2247        |1.0       |0.8313764035873757|
|OO               |COS   |DEN |1830        |1.0       |0.8298196938368378|
|MQ               |CMI   |ORD |1630        |1.0       |0.8269252336348607|
|OO               |SPS   |DFW |1413        |1.0       |0.825372551561373 |
|OO               |RDD   |SFO |2021        |1.0       |0.8224167691385342|
+-----------------+------

                                                                                