## Data Cleaning Notebook:

Arun Agarwal

In [0]:
from pyspark.sql.functions import col, when, count, mean, lit, first, desc, isnan, sum, avg, countDistinct, lit, unix_timestamp, to_timestamp, datediff
from pyspark.sql.types import DoubleType, IntegerType, StringType
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


### Data Description:
4 sources of data:
1. Airlines Data: This is the raw flight information. You have 3 months, 6 months, 1 year, and the full data from 2015 to 2019. Remember the maxim "Test, Test, Test": do a lot of testing on smaller samples before scaling up! Location of the data: dbfs:/mnt/mids-w261/datasets_final_project_2022/parquet_airlines_data/, dbfs:/mnt/mids-w261/datasets_final_project_2022/parquet_airlines_data_1y/, etc. (use the dbutils listing below to see the folders).
2. Weather Data: Raw data for weather information. Same as before, we are sharing 3 months, 6 months, 1 year
3. Stations data: Extra information about the location of the different weather stations. Location: dbfs:/mnt/mids-w261/datasets_final_project_2022/stations_data/stations_with_neighbors.parquet/
4. OTPW Data: This is our joined data (We joined Airlines and Weather). This is the main dataset for your project, the previous 3 are given for reference. You can attempt your own join for Extra Credit. Location dbfs:/mnt/mids-w261/OTPW_60M/OTPW_60M/ and more, several samples are given!

### Load Data:

In [0]:
# Root of the shared course mount; list its top-level entries.
data_BASE_DIR = "dbfs:/mnt/mids-w261/"
display(dbutils.fs.ls(data_BASE_DIR))

path,name,size,modificationTime
dbfs:/mnt/mids-w261/HW5/,HW5/,0,1763996765061
dbfs:/mnt/mids-w261/OTPW_12M/,OTPW_12M/,0,1763996765061
dbfs:/mnt/mids-w261/OTPW_1D_CSV/,OTPW_1D_CSV/,0,1763996765061
dbfs:/mnt/mids-w261/OTPW_36M/,OTPW_36M/,0,1763996765061
dbfs:/mnt/mids-w261/OTPW_3M/,OTPW_3M/,0,1763996765061
dbfs:/mnt/mids-w261/OTPW_3M_2015.csv,OTPW_3M_2015.csv,1500620247,1741625185000
dbfs:/mnt/mids-w261/OTPW_3M_2015_delta/,OTPW_3M_2015_delta/,0,1763996765061
dbfs:/mnt/mids-w261/OTPW_60M/,OTPW_60M/,0,1763996765061
dbfs:/mnt/mids-w261/OTPW_60M_Backup/,OTPW_60M_Backup/,0,1763996765061
dbfs:/mnt/mids-w261/airport-codes_csv.csv,airport-codes_csv.csv,6232459,1740508595000


In [0]:
# Airline Data (3-month parquet sample)
df_flights = (
    spark.read
    .format("parquet")
    .load("dbfs:/mnt/mids-w261/datasets_final_project_2022/parquet_airlines_data_3m/")
)
display(df_flights)

In [0]:
# Weather data (3-month parquet sample)
df_weather = (
    spark.read
    .format("parquet")
    .load("dbfs:/mnt/mids-w261/datasets_final_project_2022/parquet_weather_data_3m/")
)
display(df_weather)

In [0]:
# Stations data (stations with their neighbors)
df_stations = (
    spark.read
    .format("parquet")
    .load("dbfs:/mnt/mids-w261/datasets_final_project_2022/stations_data/stations_with_neighbors.parquet/")
)
display(df_stations)

In [0]:
# OTPW (pre-joined airlines + weather), read from CSV with a header row.
# No schema inference is requested, so every column loads as a string.
df_otpw = spark.read.csv("dbfs:/mnt/mids-w261/OTPW_3M_2015.csv", header=True)
display(df_otpw)

#### Group Checkpoint Folder:

In [0]:
# Create folder (RUN THIS ONCE)
section = "04"
number = "04"
folder_path = f"dbfs:/student-groups/Group_{section}_{number}"

# Probe the folder by listing it; any failure of the listing is treated as
# "folder does not exist" and triggers its creation.
try:
    dbutils.fs.ls(folder_path)
except Exception:
    dbutils.fs.mkdirs(folder_path)
    print(f"Created folder: {folder_path}")
else:
    print(f"Folder already exists: {folder_path}")

Folder already exists: dbfs:/student-groups/Group_04_04


In [0]:
# List the contents of the group folder.
display(dbutils.fs.ls(folder_path))
#display(dbutils.fs.ls(f"{folder_path}/checkpoints/"))

path,name,size,modificationTime
dbfs:/student-groups/Group_04_04/appendix_b_column_classification.csv,appendix_b_column_classification.csv,8138,1762036459000
dbfs:/student-groups/Group_04_04/checkpoints/,checkpoints/,0,1763996805686
dbfs:/student-groups/Group_04_04/df_custom_3M_initial_features.parquet/,df_custom_3M_initial_features.parquet/,0,1763996805686
dbfs:/student-groups/Group_04_04/df_custom_3M_test_data.parquet/,df_custom_3M_test_data.parquet/,0,1763996805686
dbfs:/student-groups/Group_04_04/df_custom_3M_train_data.parquet/,df_custom_3M_train_data.parquet/,0,1763996805686
dbfs:/student-groups/Group_04_04/df_otpw.parquet/,df_otpw.parquet/,0,1763996805686
dbfs:/student-groups/Group_04_04/df_otpw_3M_baseline_features.parquet/,df_otpw_3M_baseline_features.parquet/,0,1763996805686
dbfs:/student-groups/Group_04_04/df_otpw_3M_clean.parquet/,df_otpw_3M_clean.parquet/,0,1763996805686
dbfs:/student-groups/Group_04_04/df_otpw_3M_features.parquet/,df_otpw_3M_features.parquet/,0,1763996805686
dbfs:/student-groups/Group_04_04/df_otpw_3M_initial_features.parquet/,df_otpw_3M_initial_features.parquet/,0,1763996805686


In [0]:
# df_otpw_3m = spark.read.parquet("dbfs:/student-groups/Group_4_4/2015_final_feature_engineered_data_with_dep_delay")

# print("Loaded pre-joined dataset")
# print(f"Num rows: {df_otpw_3m.count():,}")
# print(f"Num cols: {len(df_otpw_3m.columns)}")
# df_otpw_3m.printSchema()

Loaded pre-joined dataset
Num rows: 5,704,114
Num cols: 105
root
 |-- FL_DATE: date (nullable = true)
 |-- OP_UNIQUE_CARRIER: string (nullable = true)
 |-- OP_CARRIER_FL_NUM: integer (nullable = true)
 |-- ORIGIN: string (nullable = true)
 |-- DEST: string (nullable = true)
 |-- CRS_DEP_TIME: integer (nullable = true)
 |-- ORIGIN_AIRPORT_ID: integer (nullable = true)
 |-- departure_hour: integer (nullable = true)
 |-- prediction_utc: timestamp (nullable = true)
 |-- origin_obs_utc: timestamp (nullable = true)
 |-- asof_minutes: long (nullable = true)
 |-- YEAR: integer (nullable = true)
 |-- QUARTER: integer (nullable = true)
 |-- DAY_OF_MONTH: integer (nullable = true)
 |-- DAY_OF_WEEK: integer (nullable = true)
 |-- CRS_ARR_TIME: integer (nullable = true)
 |-- ORIGIN_STATE_ABR: string (nullable = true)
 |-- DEST_AIRPORT_ID: integer (nullable = true)
 |-- DEST_STATE_ABR: string (nullable = true)
 |-- HourlyDewPointTemperature: double (nullable = true)
 |-- HourlyPrecipitation: double 

In [0]:
# from pyspark.sql.functions import col, when, count, mean, lit, first, desc, isnan, sum as spark_sum, avg, countDistinct, lit, unix_timestamp, to_timestamp, datediff
# import pyspark.sql.functions as F

# # ============================================================================
# # 1. BASIC COUNTS AND NULL CHECKS
# # ============================================================================
# print("\n1. DATASET OVERVIEW")
# print("-" * 80)

# total_rows = df_otpw_3m.count()
# print(f"Total rows: {total_rows:,}")

# null_checks = df_otpw_3m.select(
#     F.count(F.when(F.col("DEP_DELAY").isNull(), 1)).alias("DEP_DELAY_nulls"),
#     F.count(F.when(F.col("DEP_DEL15").isNull(), 1)).alias("DEP_DEL15_nulls"),
#     F.count(F.when(F.col("DEP_DELAY").isNotNull(), 1)).alias("DEP_DELAY_non_null"),
#     F.count(F.when(F.col("DEP_DEL15").isNotNull(), 1)).alias("DEP_DEL15_non_null")
# ).collect()[0]

# print(f"\nDEP_DELAY:")
# print(f"  - Null values: {null_checks['DEP_DELAY_nulls']:,} ({null_checks['DEP_DELAY_nulls']/total_rows*100:.2f}%)")
# print(f"  - Non-null values: {null_checks['DEP_DELAY_non_null']:,} ({null_checks['DEP_DELAY_non_null']/total_rows*100:.2f}%)")

# print(f"\nDEP_DEL15:")
# print(f"  - Null values: {null_checks['DEP_DEL15_nulls']:,} ({null_checks['DEP_DEL15_nulls']/total_rows*100:.2f}%)")
# print(f"  - Non-null values: {null_checks['DEP_DEL15_non_null']:,} ({null_checks['DEP_DEL15_non_null']/total_rows*100:.2f}%)")

# # ============================================================================
# # 2. DEP_DELAY DESCRIPTIVE STATISTICS
# # ============================================================================
# print("\n\n2. DEP_DELAY DESCRIPTIVE STATISTICS")
# print("-" * 80)

# delay_stats = df_otpw_3m.select("DEP_DELAY").summary("count", "mean", "stddev", "min", "max", "25%", "50%", "75%")
# print("\nBasic Statistics:")
# for row in delay_stats.collect():
#     print(f"  {row[0]:>10s}: {row[1]}")

# # Additional percentiles
# percentiles = df_otpw_3m.select(
#     F.percentile_approx("DEP_DELAY", [0.01, 0.05, 0.10, 0.90, 0.95, 0.99]).alias("percentiles")
# ).collect()[0]['percentiles']

# print(f"\nAdditional Percentiles:")
# print(f"  1st percentile:  {percentiles[0]:.2f}")
# print(f"  5th percentile:  {percentiles[1]:.2f}")
# print(f"  10th percentile: {percentiles[2]:.2f}")
# print(f"  90th percentile: {percentiles[3]:.2f}")
# print(f"  95th percentile: {percentiles[4]:.2f}")
# print(f"  99th percentile: {percentiles[5]:.2f}")

# # Distribution bins
# print("\nDistribution of DEP_DELAY (minutes):")
# delay_distribution = df_otpw_3m.groupBy(
#     F.when(F.col("DEP_DELAY") < -30, "< -30 (Very early)")
#     .when((F.col("DEP_DELAY") >= -30) & (F.col("DEP_DELAY") < -15), "-30 to -15")
#     .when((F.col("DEP_DELAY") >= -15) & (F.col("DEP_DELAY") < 0), "-15 to 0")
#     .when((F.col("DEP_DELAY") >= 0) & (F.col("DEP_DELAY") < 15), "0 to 15 (On-time)")
#     .when((F.col("DEP_DELAY") >= 15) & (F.col("DEP_DELAY") < 30), "15 to 30")
#     .when((F.col("DEP_DELAY") >= 30) & (F.col("DEP_DELAY") < 60), "30 to 60")
#     .when((F.col("DEP_DELAY") >= 60) & (F.col("DEP_DELAY") < 120), "60 to 120")
#     .when(F.col("DEP_DELAY") >= 120, ">= 120 (Severe)")
#     .otherwise("NULL").alias("delay_range")
# ).agg(
#     F.count("*").alias("count")
# ).orderBy("delay_range")

# for row in delay_distribution.collect():
#     pct = (row['count'] / total_rows) * 100
#     print(f"  {row['delay_range']:20s}: {row['count']:>10,} ({pct:>5.2f}%)")

# # ============================================================================
# # 3. DEP_DEL15 ANALYSIS (Binary Classification Target)
# # ============================================================================
# print("\n\n3. DEP_DEL15 ANALYSIS (Classification Target)")
# print("-" * 80)

# del15_distribution = df_otpw_3m.groupBy("DEP_DEL15").agg(
#     F.count("*").alias("count")
# ).orderBy("DEP_DEL15")

# print("\nClass Distribution:")
# for row in del15_distribution.collect():
#     if row['DEP_DEL15'] is not None:
#         pct = (row['count'] / total_rows) * 100
#         label = "On-time (< 15 min)" if row['DEP_DEL15'] == 0 else "Delayed (>= 15 min)"
#         print(f"  DEP_DEL15 = {row['DEP_DEL15']}: {row['count']:>10,} ({pct:>5.2f}%) - {label}")
#     else:
#         pct = (row['count'] / total_rows) * 100
#         print(f"  DEP_DEL15 = NULL: {row['count']:>10,} ({pct:>5.2f}%)")

# # Calculate class imbalance ratio
# non_null_del15 = df_otpw_3m.filter(F.col("DEP_DEL15").isNotNull())
# class_counts = non_null_del15.groupBy("DEP_DEL15").count().collect()
# if len(class_counts) == 2:
#     count_0 = next((r['count'] for r in class_counts if r['DEP_DEL15'] == 0), 0)
#     count_1 = next((r['count'] for r in class_counts if r['DEP_DEL15'] == 1), 0)
#     if count_1 > 0:
#         imbalance_ratio = count_0 / count_1
#         print(f"\nClass Imbalance Ratio (0:1): {imbalance_ratio:.2f}:1")

# # ============================================================================
# # 4. RELATIONSHIP BETWEEN DEP_DELAY AND DEP_DEL15
# # ============================================================================
# print("\n\n4. RELATIONSHIP BETWEEN DEP_DELAY AND DEP_DEL15")
# print("-" * 80)

# # Statistics grouped by DEP_DEL15
# delay_by_del15 = df_otpw_3m.filter(F.col("DEP_DEL15").isNotNull()).groupBy("DEP_DEL15").agg(
#     F.count("DEP_DELAY").alias("count"),
#     F.mean("DEP_DELAY").alias("mean_delay"),
#     F.stddev("DEP_DELAY").alias("stddev_delay"),
#     F.min("DEP_DELAY").alias("min_delay"),
#     F.max("DEP_DELAY").alias("max_delay"),
#     F.expr("percentile_approx(DEP_DELAY, 0.5)").alias("median_delay")
# ).orderBy("DEP_DEL15")

# print("\nDEP_DELAY statistics by DEP_DEL15 class:")
# for row in delay_by_del15.collect():
#     label = "On-time" if row['DEP_DEL15'] == 0 else "Delayed"
#     print(f"\n  DEP_DEL15 = {row['DEP_DEL15']} ({label}):")
#     print(f"    Count:   {row['count']:>10,}")
#     print(f"    Mean:    {row['mean_delay']:>10.2f} minutes")
#     print(f"    Std Dev: {row['stddev_delay']:>10.2f} minutes")
#     print(f"    Median:  {row['median_delay']:>10.2f} minutes")
#     print(f"    Min:     {row['min_delay']:>10.2f} minutes")
#     print(f"    Max:     {row['max_delay']:>10.2f} minutes")

# # ============================================================================
# # 5. DATA QUALITY CHECKS
# # ============================================================================
# print("\n\n5. DATA QUALITY CHECKS")
# print("-" * 80)

# # Check for inconsistencies: DEP_DEL15=1 but DEP_DELAY < 15
# inconsistent_delayed = df_otpw_3m.filter(
#     (F.col("DEP_DEL15") == 1) & (F.col("DEP_DELAY") < 15)
# ).count()
# print(f"\nInconsistencies (DEP_DEL15=1 but DEP_DELAY < 15): {inconsistent_delayed:,}")

# # Check for inconsistencies: DEP_DEL15=0 but DEP_DELAY >= 15
# inconsistent_ontime = df_otpw_3m.filter(
#     (F.col("DEP_DEL15") == 0) & (F.col("DEP_DELAY") >= 15)
# ).count()
# print(f"Inconsistencies (DEP_DEL15=0 but DEP_DELAY >= 15): {inconsistent_ontime:,}")

# # Rows where both are null
# both_null = df_otpw_3m.filter(
#     F.col("DEP_DEL15").isNull() & F.col("DEP_DELAY").isNull()
# ).count()
# print(f"Rows where both DEP_DEL15 and DEP_DELAY are NULL: {both_null:,}")

# # Rows where DEP_DEL15 is null but DEP_DELAY is not
# del15_null_delay_not = df_otpw_3m.filter(
#     F.col("DEP_DEL15").isNull() & F.col("DEP_DELAY").isNotNull()
# ).count()
# print(f"Rows where DEP_DEL15 is NULL but DEP_DELAY is not: {del15_null_delay_not:,}")

# # Check for extreme values
# extreme_negative = df_otpw_3m.filter(F.col("DEP_DELAY") < -100).count()
# extreme_positive = df_otpw_3m.filter(F.col("DEP_DELAY") > 500).count()
# print(f"\nExtreme values:")
# print(f"  DEP_DELAY < -100 minutes: {extreme_negative:,}")
# print(f"  DEP_DELAY > 500 minutes:  {extreme_positive:,}")

# # ============================================================================
# # 6. CORRELATION ANALYSIS
# # ============================================================================
# print("\n\n6. CORRELATION ANALYSIS")
# print("-" * 80)

# # Calculate correlation between numeric representation
# correlation = df_otpw_3m.stat.corr("DEP_DELAY", "DEP_DEL15")
# print(f"\nPearson Correlation (DEP_DELAY vs DEP_DEL15): {correlation:.4f}")

# print("\n" + "=" * 80)
# print("ANALYSIS COMPLETE")
# print("=" * 80)


1. DATASET OVERVIEW
--------------------------------------------------------------------------------
Total rows: 5,704,114

DEP_DELAY:
  - Null values: 0 (0.00%)
  - Non-null values: 5,704,114 (100.00%)

DEP_DEL15:
  - Null values: 0 (0.00%)
  - Non-null values: 5,704,114 (100.00%)


2. DEP_DELAY DESCRIPTIVE STATISTICS
--------------------------------------------------------------------------------

Basic Statistics:
       count: 5704114
        mean: 0.18390077757912973
      stddev: 0.3874032884423996
         min: 0.0
         max: 1.0
         25%: 0.0
         50%: 0.0
         75%: 0.0

Additional Percentiles:
  1st percentile:  0.00
  5th percentile:  0.00
  10th percentile: 0.00
  90th percentile: 1.00
  95th percentile: 1.00
  99th percentile: 1.00

Distribution of DEP_DELAY (minutes):
  0 to 15 (On-time)   :  5,704,114 (100.00%)


3. DEP_DEL15 ANALYSIS (Classification Target)
--------------------------------------------------------------------------------

Class Distributio

In [0]:
# """
# Corrected script to add DEP_DELAY (continuous delay in minutes) back to the final dataset
# FIXED: Now selects actual DEP_DELAY instead of DEP_DEL15
# """

# from pyspark.sql import SparkSession
# from pyspark.sql.functions import col
# from pyspark.sql import functions as F

# # Initialize Spark session (if not already available)
# spark = SparkSession.builder.getOrCreate()

# # Define file paths - adjust these to match your actual file locations
# BASE_PATH = "dbfs:/student-groups/Group_4_4/"

# # Load the final feature engineered dataset (without DEP_DELAY)
# df_final = df_otpw_3m
# if 'DEP_DELAY' in df_final.columns:
#     print("\nDropping existing (incorrect) DEP_DELAY column from final dataset...")
#     df_final = df_final.drop('DEP_DELAY')
#     print(f"Columns after drop: {len(df_final.columns)}")

# print(f"Final dataset shape: {df_final.count()} rows, {len(df_final.columns)} columns")
# print(f"Final dataset columns include DEP_DELAY: {'DEP_DELAY' in df_final.columns}")

# # Load the dataset before DEP_DELAY was removed (with DEP_DELAY)
# # This could be your merged dataset or an earlier feature engineered version
# dataset_with_dep_delay_path = f"{BASE_PATH}JOINED_1Y_2015.parquet"  # Adjust path as needed
# df_with_dep_delay = spark.read.parquet(dataset_with_dep_delay_path)

# print(f"\nDataset with dep_delay shape: {df_with_dep_delay.count()} rows, {len(df_with_dep_delay.columns)} columns")

# # Check which column name exists (DEP_DELAY vs dep_delay)
# has_dep_delay = 'DEP_DELAY' in df_with_dep_delay.columns
# has_dep_delay_lower = 'dep_delay' in df_with_dep_delay.columns

# if has_dep_delay:
#     print(f"Found 'DEP_DELAY' column in source dataset")
#     delay_col_name = 'DEP_DELAY'
# elif has_dep_delay_lower:
#     print(f"Found 'dep_delay' column in source dataset")
#     delay_col_name = 'dep_delay'
# else:
#     print("ERROR: Neither 'DEP_DELAY' nor 'dep_delay' found in source dataset!")
#     print(f"Available columns: {df_with_dep_delay.columns}")
#     raise ValueError("DEP_DELAY column not found")

# # Select only DEP_DELAY and the key columns needed for joining
# # Adjust join keys based on your data structure
# join_keys = ['FL_DATE', 'OP_UNIQUE_CARRIER', 'OP_CARRIER_FL_NUM', 'ORIGIN', 'DEST', 'CRS_DEP_TIME', 'ORIGIN_AIRPORT_ID']

# # CORRECTED: Select the actual DEP_DELAY (continuous minutes), not DEP_DEL15 (binary)
# if delay_col_name == 'dep_delay':
#     # If lowercase, alias it to uppercase for consistency
#     df_dep_delay_only = df_with_dep_delay.select(*join_keys, col('dep_delay').alias('DEP_DELAY'))
# else:
#     # If already uppercase, use as-is
#     df_dep_delay_only = df_with_dep_delay.select(*join_keys, 'DEP_DELAY')

# # Show sample of what we're about to join
# print("\nSample of DEP_DELAY values to be joined:")
# df_dep_delay_only.select('DEP_DELAY').describe().show()
# print("\nFirst 10 rows:")
# df_dep_delay_only.show(10)

# # Join DEP_DELAY back to the final dataset
# df_final_with_dep_delay = df_final.join(
#     df_dep_delay_only,
#     on=join_keys,
#     how='left'
# )

# print(f"\nFinal dataset with DEP_DELAY: {df_final_with_dep_delay.count()} rows, {len(df_final_with_dep_delay.columns)} columns")
# print(f"Includes DEP_DELAY: {'DEP_DELAY' in df_final_with_dep_delay.columns}")

# # Check for any nulls in DEP_DELAY after join
# null_count = df_final_with_dep_delay.filter(col('DEP_DELAY').isNull()).count()
# print(f"\nNull values in DEP_DELAY after join: {null_count}")

# # Verify DEP_DELAY is continuous (not just 0/1)
# print("\nDEP_DELAY statistics (should show continuous values, not just 0/1):")
# df_final_with_dep_delay.select('DEP_DELAY').describe().show()

# # Check distinct values to confirm it's not binary
# distinct_values = df_final_with_dep_delay.select('DEP_DELAY').distinct().count()
# print(f"Number of distinct DEP_DELAY values: {distinct_values} (should be >> 2)")

# # Display sample to verify both DEP_DELAY and DEP_DEL15
# print("\nSample of data with both DEP_DELAY (continuous) and DEP_DEL15 (binary):")
# df_final_with_dep_delay.select(
#     'FL_DATE', 'OP_UNIQUE_CARRIER', 'ORIGIN', 'DEST', 
#     'DEP_DELAY', 'DEP_DEL15'
# ).show(20)

# # Verify the relationship: DEP_DEL15 should be 1 when DEP_DELAY >= 15, and 0 otherwise
# print("\nVerifying DEP_DELAY and DEP_DEL15 relationship:")
# relationship_check = df_final_with_dep_delay.groupBy('DEP_DEL15').agg(
#     col('DEP_DEL15'),
#     F.min('DEP_DELAY').alias('min_delay'),
#     F.max('DEP_DELAY').alias('max_delay'),
#     F.mean('DEP_DELAY').alias('avg_delay'),
#     F.count('*').alias('count')
# ).orderBy('DEP_DEL15')
# relationship_check.show()

# # Save the updated dataset
# output_path = f"{BASE_PATH}2015_final_feature_engineered_data_with_dep_delay"
# df_final_with_dep_delay.write.mode('overwrite').parquet(output_path)

# print(f"\n{'='*80}")
# print(f"SUCCESS: Dataset with DEP_DELAY saved to: {output_path}")
# print(f"{'='*80}")


Dropping existing (incorrect) DEP_DELAY column from final dataset...
Columns after drop: 104
Final dataset shape: 5704114 rows, 104 columns
Final dataset columns include DEP_DELAY: False

Dataset with dep_delay shape: 5819079 rows, 75 columns
Found 'DEP_DELAY' column in source dataset

Sample of DEP_DELAY values to be joined:
+-------+------------------+
|summary|         DEP_DELAY|
+-------+------------------+
|  count|           5732926|
|   mean| 9.370158275198389|
| stddev|37.080942496787074|
|    min|             -82.0|
|    max|            1988.0|
+-------+------------------+


First 10 rows:
+----------+-----------------+-----------------+------+----+------------+-----------------+---------+
|   FL_DATE|OP_UNIQUE_CARRIER|OP_CARRIER_FL_NUM|ORIGIN|DEST|CRS_DEP_TIME|ORIGIN_AIRPORT_ID|DEP_DELAY|
+----------+-----------------+-----------------+------+----+------------+-----------------+---------+
|2015-01-01|               DL|             1199|   STL| ATL|        1712|            15

#### Convert CSV to Parquet:

In [0]:
# data_BASE_DIR already ends with "/", so strip it before appending the
# sub-path; otherwise the listing is requested for ".../mids-w261//OTPW_3M/...".
display(dbutils.fs.ls(f"{data_BASE_DIR.rstrip('/')}/OTPW_3M/OTPW_3M/"))

In [0]:
# Read the gzipped 3-month OTPW CSV, using the header row for column names
# and letting Spark infer column types, then mark the frame for caching.
df_otpw_3m = spark.read.csv(
    "dbfs:/mnt/mids-w261/OTPW_3M/OTPW_3M/OTPW_3M_2015.csv.gz",
    header=True,
    inferSchema=True,
)
df_otpw_3m.cache()

#### Save Parquet to Group Folder:

In [0]:
# Write with an explicit save mode: the default mode raises when the target
# already exists, which made this cell fail on every re-run of the notebook.
# Overwriting is safe here because df_otpw_3m is read from the source CSV,
# not from this parquet path.
df_otpw_3m.write.mode("overwrite").parquet(f"{folder_path}/otpw_3m.parquet")

In [0]:
# List the group folder to confirm the parquet output is present.
display(dbutils.fs.ls(folder_path))

#### Load Parquet from Group Folder:

In [0]:
# Reload the 3-month OTPW data from the group folder's parquet copy.
otpw_3m = spark.read.format("parquet").load(f"{folder_path}/otpw_3m.parquet")

In [0]:
# Report the size of the reloaded frame (row count triggers a Spark job).
row_total = otpw_3m.count()
col_total = len(otpw_3m.columns)
print("Num rows: ", row_total)
print("Num cols: ", col_total)

In [0]:
# Print the column names and types Spark holds for the reloaded parquet data.
otpw_3m.printSchema()

### Data Preprocessing:

#### Comprehensive Type Casting (Data Type Standardization):

In [0]:
print("=== Standardizing Data Types ===")
integer_cols = [
    "QUARTER", "DAY_OF_MONTH", "DAY_OF_WEEK",
    "DEP_DELAY_NEW", "DEP_DEL15", "DEP_DELAY_GROUP",
    "TAXI_OUT", "WHEELS_OFF", "WHEELS_ON", "TAXI_IN",
    "ARR_DELAY_NEW", "ARR_DEL15", "ARR_DELAY_GROUP",
    "CANCELLED", "DIVERTED",
    "FLIGHTS", "DISTANCE", "DISTANCE_GROUP",
    "YEAR", "MONTH",
    "OP_CARRIER_FL_NUM", "ORIGIN_AIRPORT_ID", "ORIGIN_AIRPORT_SEQ_ID",
    "ORIGIN_CITY_MARKET_ID", "ORIGIN_STATE_FIPS", "ORIGIN_WAC",
    "DEST_AIRPORT_ID", "DEST_AIRPORT_SEQ_ID", "DEST_CITY_MARKET_ID",
    "DEST_STATE_FIPS", "DEST_WAC", "CRS_DEP_TIME", "CRS_ARR_TIME",
    "HourlyPressureTendency", "Sunrise", "Sunset"
]

float_cols = [
    "DEP_DELAY", "ARR_DELAY",
    "CRS_ELAPSED_TIME", "ACTUAL_ELAPSED_TIME", "AIR_TIME",
    "CARRIER_DELAY", "WEATHER_DELAY", "NAS_DELAY", "SECURITY_DELAY", "LATE_AIRCRAFT_DELAY",
    "FIRST_DEP_TIME", "TOTAL_ADD_GTIME", "LONGEST_ADD_GTIME",
    "origin_station_lat", "origin_station_lon", "origin_airport_lat", "origin_airport_lon", "origin_station_dis",
    "dest_station_lat", "dest_station_lon", "dest_airport_lat", "dest_airport_lon", "dest_station_dis",
    "LATITUDE", "LONGITUDE", "ELEVATION",
    "HourlyAltimeterSetting", "HourlyDewPointTemperature", "HourlyDryBulbTemperature", "HourlyPrecipitation",
    "HourlyPressureChange", "HourlyRelativeHumidity", "HourlySeaLevelPressure",
    "HourlyStationPressure", "HourlyVisibility", "HourlyWetBulbTemperature", "HourlyWindDirection",
    "HourlyWindGustSpeed", "HourlyWindSpeed",
    "DailyAverageDewPointTemperature", "DailyAverageDryBulbTemperature", "DailyAverageRelativeHumidity",
    "DailyAverageSeaLevelPressure", "DailyAverageStationPressure", "DailyAverageWetBulbTemperature",
    "DailyAverageWindSpeed", "DailyCoolingDegreeDays", "DailyDepartureFromNormalAverageTemperature",
    "DailyHeatingDegreeDays", "DailyMaximumDryBulbTemperature", "DailyMinimumDryBulbTemperature",
    "DailyPeakWindDirection", "DailyPeakWindSpeed", "DailyPrecipitation", "DailySnowDepth", "DailySnowfall",
    "DailySustainedWindDirection", "DailySustainedWindSpeed"
]

date_cols = [
    "FL_DATE",
    "sched_depart_date_time", "sched_depart_date_time_UTC",
    "four_hours_prior_depart_UTC", "two_hours_prior_depart_UTC",
    "DATE"
]

# Snapshot the incoming schema and print a per-type tally, so the "before"
# header is followed by actual content and can be compared with the result.
print("Schema before casting:")
original_types = dict(otpw_3m.dtypes)
type_tally = {}
for dtype in original_types.values():
    type_tally[dtype] = type_tally.get(dtype, 0) + 1
for dtype, n_of_type in sorted(type_tally.items()):
    print(f"  {dtype}: {n_of_type} columns")

# Map every listed column to (Spark cast type, summary bucket). The lists are
# applied lowest-priority first so a name present in more than one list is
# resolved in the order integer -> float -> date.
cast_plan = {}
for names, spark_type, bucket in (
    (date_cols, "timestamp", "timestamp"),
    (float_cols, "double", "double"),
    (integer_cols, "int", "integer"),
):
    for name in names:
        cast_plan[name] = (spark_type, bucket)

# Cast columns according to type.
# All casts are expressed in ONE select: calling withColumn once per column in
# a loop stacks a projection per call and produces a very large query plan.
cast_counts = {"integer": 0, "double": 0, "timestamp": 0, "unchanged": 0}

projection = []
for c in otpw_3m.columns:
    if c in cast_plan:
        spark_type, bucket = cast_plan[c]
        projection.append(col(c).cast(spark_type).alias(c))
        cast_counts[bucket] += 1
    else:
        projection.append(col(c))
        cast_counts["unchanged"] += 1

# Column order and names are preserved; only the listed columns change type.
otpw_3m = otpw_3m.select(*projection)

print(f"\nType Casting Summary:")
print(f"  Cast to integer: {cast_counts['integer']} columns")
print(f"  Cast to double: {cast_counts['double']} columns")
print(f"  Cast to timestamp: {cast_counts['timestamp']} columns")
print(f"  Unchanged (strings, etc.): {cast_counts['unchanged']} columns")

# A cast to the type a column already had is a no-op; report the real changes.
changed_cols = [name for name, dtype in otpw_3m.dtypes if original_types[name] != dtype]
print(f"  Columns whose type actually changed: {len(changed_cols)}")

print(f"\n Data types standardized")

#### Dropping Rows Missing Important Values:

In [0]:

# Columns checked for missing values.
required_cols = ["DEP_DEL15", "FL_DATE", "CRS_DEP_TIME", "OP_CARRIER_FL_NUM"]

# One null counter per required column, all evaluated in a single select.
missing_counters = []
for required in required_cols:
    missing_counters.append(
        count(when(col(required).isNull(), required)).alias(f"{required}_missing_vals")
    )
otpw_3m.select(missing_counters).show()


In [0]:
# Drop every row that is null in any of the required columns, then re-run
# the per-column null counters on the result.
otpw_3m_clean = otpw_3m.na.drop(subset=required_cols)

remaining_missing = [
    count(when(col(name).isNull(), name)).alias(f"{name}_missing_vals")
    for name in required_cols
]
otpw_3m_clean.select(*remaining_missing).show()

In [0]:
# Size of the frame after dropping rows with missing required values.
row_total = otpw_3m_clean.count()
col_total = len(otpw_3m_clean.columns)
print("Num rows: ", row_total)
print("Num cols: ", col_total)

#### Drop Duplicate Rows:

In [0]:
# Key that identifies one scheduled flight. The previous grouping key
# (required_cols) left out carrier, origin and destination and included the
# DEP_DEL15 label, so unrelated flights from different carriers that share a
# flight number, date and scheduled time were reported as duplicates.
flight_key_cols = [
    "FL_DATE", "OP_UNIQUE_CARRIER", "OP_CARRIER_FL_NUM",
    "ORIGIN", "DEST", "CRS_DEP_TIME",
]

# Groups of rows that share the full flight key more than once.
dup_groups = (
    otpw_3m_clean.groupBy(*flight_key_cols)
    .agg(count("*").alias("row_count"))
    .filter("row_count > 1")
)
print("Num of duplicate groups: ", dup_groups.count())
dup_groups.show()

In [0]:
# De-duplicate on a key that identifies one scheduled flight. The previous
# subset (required_cols) omitted carrier, origin and destination, so distinct
# flights that merely shared flight number, date, scheduled time and delay
# label were collapsed into a single row.
flight_key_cols = [
    "FL_DATE", "OP_UNIQUE_CARRIER", "OP_CARRIER_FL_NUM",
    "ORIGIN", "DEST", "CRS_DEP_TIME",
]

before = otpw_3m_clean.count()
otpw_3m_clean = otpw_3m_clean.dropDuplicates(subset=flight_key_cols)
after = otpw_3m_clean.count()
print(f"Dropped {before - after} duplicate rows")

In [0]:
# Size of the frame after removing duplicate rows.
row_total = otpw_3m_clean.count()
col_total = len(otpw_3m_clean.columns)
print("Num rows: ", row_total)
print("Num cols: ", col_total)

#### Filter Out Canceled and Diverted Flights Data:

In [0]:
# Keep rows whose CANCELLED flag is anything other than 1; rows with a null
# flag are kept explicitly, since a plain `!= 1` comparison would drop them.
not_cancelled = (col("CANCELLED") != 1) | col("CANCELLED").isNull()
otpw_3m_clean = otpw_3m_clean.where(not_cancelled)

row_total = otpw_3m_clean.count()
col_total = len(otpw_3m_clean.columns)
print("Num rows: ", row_total)
print("Num cols: ", col_total)

In [0]:
# Drop Diverted Flights:
print("=== Dropping Diverted Flights ===")

# Rows with a null DIVERTED flag are kept, same as rows flagged 0.
not_diverted = (col("DIVERTED") != 1) | col("DIVERTED").isNull()

before_count = otpw_3m_clean.count()
otpw_3m_clean = otpw_3m_clean.where(not_diverted)
after_count = otpw_3m_clean.count()

diverted_dropped = before_count - after_count
print(f"Dropped {diverted_dropped:,} diverted flights ({diverted_dropped/before_count*100:.2f}%)")
print(f"Remaining flights: {after_count:,}")

#### Remove Columns with Insufficient or Unreliable Data:

In [0]:
total_rows = otpw_3m_clean.count()

# Count nulls for every column in ONE aggregation pass. The previous version
# ran a separate filter+count Spark job per column (hundreds of full scans).
null_counts = otpw_3m_clean.select([
    count(when(col(c).isNull(), c)).alias(c)
    for c in otpw_3m_clean.columns
]).first()

# (column, null_count, null_fraction); the fraction is 0.0 for an empty frame
# instead of raising ZeroDivisionError.
null_stats = [
    (c, null_count, null_count / total_rows if total_rows else 0.0)
    for c, null_count in zip(otpw_3m_clean.columns, null_counts)
]

null_df = pd.DataFrame(null_stats, columns=["column", "null_count", "null_pct"])
null_df = null_df.sort_values(by="null_pct", ascending=False)
# Show every column's row rather than a truncated table (global pandas option).
pd.set_option('display.max_rows', None)
display(null_df)

In [0]:
# Columns with more than this fraction of nulls are dropped ...
null_threshold = 0.5
# ... unless they are listed here, in which case they are kept regardless.
important_sparse_cols_keep = [
    "SECURITY_DELAY",
    "LATE_AIRCRAFT_DELAY",
    "WEATHER_DELAY",
    "CARRIER_DELAY",
    "NAS_DELAY",
    "HourlyWindGustSpeed",
    "HourlyPresentWeatherType",
    "HourlyPressureChange",
    "HourlyPressureTendency"
]

too_sparse = null_df["null_pct"] > null_threshold
protected = null_df["column"].isin(important_sparse_cols_keep)
drop_cols = null_df.loc[too_sparse & ~protected, "column"].tolist()

otpw_3m_filtered = otpw_3m_clean.drop(*drop_cols)

In [0]:
# Size of the frame after dropping sparse columns.
row_total = otpw_3m_filtered.count()
col_total = len(otpw_3m_filtered.columns)
print("Num rows: ", row_total)
print("Num cols: ", col_total)

In [0]:
# Save checkpoint to group folder.
# An explicit save mode is required for re-runs: the default mode raises when
# the target path already exists. Overwriting is safe because
# otpw_3m_filtered is derived from otpw_3m.parquet, not from this output path.
otpw_3m_filtered.write.mode("overwrite").parquet(f"{folder_path}/otpw_3m_clean.parquet")
display(dbutils.fs.ls(f"{folder_path}"))


#### Cast Certain Columns to Appropriate Data Types:

In [0]:
otpw_3m_imputed = spark.read.parquet(f"{folder_path}/otpw_3m_clean.parquet")

# Define columns that need type casting from string to numeric
string_to_double_cols = [
    # Hourly weather
    "HourlyAltimeterSetting",
    "HourlyDewPointTemperature",
    "HourlyDryBulbTemperature",
    "HourlyPrecipitation",
    "HourlyRelativeHumidity",
    "HourlySeaLevelPressure",
    "HourlyStationPressure",
    "HourlyVisibility",
    "HourlyWetBulbTemperature",
    "HourlyWindDirection",
    "HourlyWindGustSpeed",
    "HourlyWindSpeed",
    # Daily weather
    "DailyAverageDryBulbTemperature",
    "DailyMinimumDryBulbTemperature",
    "DailyCoolingDegreeDays",
    "DailyDepartureFromNormalAverageTemperature",
    "DailyHeatingDegreeDays",
    "DailyPeakWindDirection",
    "DailyPeakWindSpeed",
    "DailyPrecipitation",
    "DailySnowDepth",
    "DailySnowfall",
    "DailySustainedWindDirection",
    "DailySustainedWindSpeed",
    # Normals
    "AWND", "CDSD", "CLDD", "DSNW", "HDSD", "HTDD",
    "NormalsCoolingDegreeDay",
    "NormalsHeatingDegreeDay"
]

# Add all Monthly columns (they're all strings that should be numeric).
# The comprehension variable is deliberately NOT called `col`, so it cannot
# be confused with the pyspark `col` function used further down.
monthly_cols = [name for name in otpw_3m_imputed.columns if name.startswith("Monthly")]
string_to_double_cols.extend(monthly_cols)

# Add all ShortDurationPrecipitationValue columns
short_duration_cols = [
    name for name in otpw_3m_imputed.columns
    if "ShortDurationPrecipitationValue" in name
]
string_to_double_cols.extend(short_duration_cols)

# Remove duplicates while keeping first-seen order, so the log below is
# printed in the same order on every run (a set has no stable order).
string_to_double_cols = list(dict.fromkeys(string_to_double_cols))

# Cast string columns to double (will convert non-numeric values to null).
# Only columns actually present in the frame are cast, and all casts go into
# ONE select instead of one withColumn call (one extra projection) per column.
print("=== Converting String Columns to Numeric ===")
existing_cols = set(otpw_3m_imputed.columns)
cols_to_cast = [name for name in string_to_double_cols if name in existing_cols]
cast_targets = set(cols_to_cast)

otpw_3m_imputed = otpw_3m_imputed.select([
    col(name).cast(DoubleType()).alias(name) if name in cast_targets else col(name)
    for name in otpw_3m_imputed.columns
])

for col_name in cols_to_cast:
    print(f"Converted {col_name} to DoubleType")
cast_count = len(cols_to_cast)

print(f"\nTotal columns converted: {cast_count}")
display(otpw_3m_imputed)

#### Impute Null Values with a Tailored Strategy:

In [0]:
#### Impute Null Variables with Tailored Strategy:

# Define variable types for imputation
continuous_cols = [
    "CRS_ELAPSED_TIME", "TAXI_OUT", "TAXI_IN", "WHEELS_OFF", "WHEELS_ON",
    "DISTANCE", "CRS_DEP_TIME", "CRS_ARR_TIME",
    "HourlyAltimeterSetting", "HourlyPrecipitation", "HourlyVisibility", 
    "HourlyWindSpeed", "HourlyWindGustSpeed", "HourlyWindDirection",
    "HourlyDryBulbTemperature", "HourlyDewPointTemperature", "HourlyWetBulbTemperature",
    "HourlyRelativeHumidity", "HourlySeaLevelPressure", "HourlyStationPressure"
]

categorical_cols = [
    "ORIGIN", "DEST", "OP_UNIQUE_CARRIER", "TAIL_NUM", 
    "ORIGIN_AIRPORT_SEQ_ID", "DEST_AIRPORT_SEQ_ID",
    "ORIGIN_CITY_MARKET_ID", "DEST_CITY_MARKET_ID",
    "ORIGIN_CITY_NAME", "DEST_CITY_NAME", "ORIGIN_STATE_ABR", "DEST_STATE_ABR",
    "HourlyPresentWeatherType", "CANCELLATION_CODE"
]

boolean_cols = [
    # Add boolean columns if they exist in your dataset
    # "IS_WEEKEND"
]

# 1. Cast continuous columns to DoubleType if needed
print("=== Casting Non-Numeric Continuous Columns ===")
for col_name in continuous_cols:
    if col_name in otpw_3m_imputed.columns:
        col_type = dict(otpw_3m_imputed.dtypes)[col_name]
        if col_type not in ["double", "float", "int", "bigint", "integer"]:
            otpw_3m_imputed = otpw_3m_imputed.withColumn(
                col_name,
                col(col_name).cast(DoubleType())
            )
            print(f"Cast {col_name} from {col_type} to DoubleType")

# 2. Impute continuous variables with median
print("\n=== Imputing Continuous Variables with Median ===")

# Only columns that actually exist in the DataFrame can be imputed.
present_continuous = [c for c in continuous_cols if c in otpw_3m_imputed.columns]

# Count the nulls of all those columns in a single aggregation (one Spark job)
# instead of launching a separate filter().count() job per column.
continuous_null_counts = {}
if present_continuous:
    continuous_null_counts = otpw_3m_imputed.agg(
        *[count(when(col(c).isNull(), lit(1))).alias(c) for c in present_continuous]
    ).first().asDict()

# Maps column name -> approximate median used to fill its nulls.
impute_values = {}
for col_name in present_continuous:
    try:
        # Approximate median (relative error 0.01) over the non-null values.
        quantiles = (
            otpw_3m_imputed.select(col_name).na.drop().approxQuantile(col_name, [0.5], 0.01)
        )
    except Exception as e:
        # Best-effort: a column that cannot be summarised is reported and skipped
        # rather than aborting the whole imputation.
        print(f"Warning: Could not impute {col_name}: {e}")
        continue

    # An empty result means there were no values to summarise (e.g. the column
    # is entirely null); report that explicitly instead of via an IndexError.
    if not quantiles or quantiles[0] is None:
        print(f"Warning: Could not impute {col_name}: no non-null values to compute a median")
        continue

    median_value = quantiles[0]
    impute_values[col_name] = median_value
    null_count = continuous_null_counts[col_name]
    print(f"{col_name}: median = {median_value:.2f}, imputing {null_count} nulls")

# Fill every continuous column's nulls with its own median in one call.
otpw_3m_imputed = otpw_3m_imputed.fillna(impute_values)

# 3. CATEGORICAL VARIABLES: Impute with "UNK" (Unknown)
print("\n=== Imputing Categorical Variables with 'UNK' ===")

# Only columns that actually exist in the DataFrame can be imputed.
present_categorical = [c for c in categorical_cols if c in otpw_3m_imputed.columns]

# Count the nulls of all those columns in a single aggregation (one Spark job)
# instead of one filter().count() job per column. Counting before the string
# cast below is safe: the cast does not change which rows are null.
categorical_null_counts = {}
if present_categorical:
    categorical_null_counts = otpw_3m_imputed.agg(
        *[count(when(col(c).isNull(), lit(1))).alias(c) for c in present_categorical]
    ).first().asDict()

for col_name in present_categorical:
    col_type = dict(otpw_3m_imputed.dtypes)[col_name]

    # Integer-coded categoricals must become strings so they can hold "UNK".
    if col_type in ["int", "bigint", "integer"]:
        print(f"Converting {col_name} from {col_type} to StringType for UNK imputation")
        otpw_3m_imputed = otpw_3m_imputed.withColumn(
            col_name,
            col(col_name).cast("string")
        )

    null_count = categorical_null_counts[col_name]
    if null_count > 0:  # Only touch columns that actually contain nulls
        otpw_3m_imputed = otpw_3m_imputed.withColumn(
            col_name,
            when(col(col_name).isNull(), lit("UNK")).otherwise(col(col_name))
        )
        print(f"{col_name}: {null_count} nulls replaced with 'UNK'")

# BOOLEAN VARIABLES: fill nulls with the most frequent non-null value.
# Skipped entirely while boolean_cols is empty.
if boolean_cols:
    print("\n=== Imputing Boolean Variables with Majority Class ===")
    for col_name in boolean_cols:
        if col_name not in otpw_3m_imputed.columns:
            continue  # column not present in this DataFrame

        # Value frequencies, most common first, ignoring the null group.
        value_counts = (
            otpw_3m_imputed.groupBy(col_name)
            .agg(count("*").alias("count"))
            .orderBy(desc("count"))
            .filter(col(col_name).isNotNull())
        )
        mode_row = value_counts.first()
        if not mode_row:
            # No non-null group was returned, so there is nothing to impute with.
            print(f"Warning: Could not determine mode for {col_name}")
            continue

        mode_value = mode_row[0]
        null_count = otpw_3m_imputed.filter(col(col_name).isNull()).count()
        if null_count > 0:  # Only impute if there are nulls
            filled = when(col(col_name).isNull(), lit(mode_value)).otherwise(col(col_name))
            otpw_3m_imputed = otpw_3m_imputed.withColumn(col_name, filled)
            print(f"{col_name}: {null_count} nulls replaced with majority class '{mode_value}'")

# Display imputed data
display(otpw_3m_imputed)

# Check remaining nulls
print("\n=== Remaining Null Values ===")
total_rows = otpw_3m_imputed.count()
all_columns = otpw_3m_imputed.columns

# Count nulls for every column in a single aggregation (one Spark job) instead
# of one filter().count() job per column. The result row is read positionally,
# in the same order as all_columns.
remaining_null_row = otpw_3m_imputed.agg(
    *[count(when(col(c).isNull(), lit(1))).alias(c) for c in all_columns]
).first() if all_columns else ()

# Keep only columns that still contain nulls: (column, null_count, null fraction).
# The fraction is only evaluated when null_count > 0, which implies total_rows > 0.
null_stats = [
    (c, null_count, null_count / total_rows)
    for c, null_count in zip(all_columns, remaining_null_row)
    if null_count > 0
]

if null_stats:
    null_df = pd.DataFrame(null_stats, columns=["column", "null_count", "null_pct"])
    null_df = null_df.sort_values(by="null_pct", ascending=False)
    display(null_df)
else:
    print("✓ No remaining null values!")

# Reuse the row count computed above rather than triggering another count() job.
print("\nFinal dataset shape:")
print(f"Num rows: {total_rows}")
print(f"Num cols: {len(all_columns)}")

# Save checkpoint to group folder (overwrites any previous checkpoint)
otpw_3m_imputed.write.mode("overwrite").parquet(f"{folder_path}/otpw_3m_imputed.parquet")
print(f"\n✓ Saved to: {folder_path}/otpw_3m_imputed.parquet")
display(dbutils.fs.ls(f"{folder_path}"))

In [0]:
# Null count and null fraction for EVERY column (including those with none),
# sorted by null fraction, displayed without pandas row truncation.
total_rows = otpw_3m_imputed.count()
all_columns = otpw_3m_imputed.columns

# One aggregation (one Spark job) for all columns instead of one
# filter().count() job per column; the result row is read positionally.
null_count_row = otpw_3m_imputed.agg(
    *[count(when(col(c).isNull(), lit(1))).alias(c) for c in all_columns]
).first() if all_columns else ()

null_stats = [
    # Guard the division so an empty DataFrame reports 0.0 instead of raising.
    (c, null_count, null_count / total_rows if total_rows else 0.0)
    for c, null_count in zip(all_columns, null_count_row)
]
null_df = pd.DataFrame(null_stats, columns=["column", "null_count", "null_pct"])
null_df = null_df.sort_values(by="null_pct", ascending=False)
pd.set_option('display.max_rows', None)  # show all rows of the table
display(null_df)

In [0]:
# Report the current dimensions of the imputed DataFrame.
n_rows = otpw_3m_imputed.count()
n_cols = len(otpw_3m_imputed.columns)
print("Num rows: ", n_rows)
print("Num cols: ", n_cols)

#### Convert Target Variable to Binary Integer:

In [0]:
print("=== Converting Target Variable DEP_DEL15 to Binary Integer ===")

# Show the label's dtype and value counts before the cast
dtype_before = dict(otpw_3m_imputed.dtypes)['DEP_DEL15']
print(f"Current DEP_DEL15 type: {dtype_before}")
print("\nDEP_DEL15 value distribution:")
otpw_3m_imputed.groupBy("DEP_DEL15").count().orderBy("DEP_DEL15").show()

# Cast the label to IntegerType for binary classification
# (assumes the stored values are 0/1 -- the distribution printed above and
# below lets the reader confirm this).
dep_del15_as_int = col("DEP_DEL15").cast(IntegerType())
otpw_3m_imputed = otpw_3m_imputed.withColumn("DEP_DEL15", dep_del15_as_int)

# Show the dtype and value counts again after the cast
dtype_after = dict(otpw_3m_imputed.dtypes)['DEP_DEL15']
print(f"\nNew DEP_DEL15 type: {dtype_after}")
print("\nConfirming binary values:")
otpw_3m_imputed.groupBy("DEP_DEL15").count().orderBy("DEP_DEL15").show()

# Apply the same cast to ARR_DEL15 when that column is present
if "ARR_DEL15" in otpw_3m_imputed.columns:
    print("\n=== Converting ARR_DEL15 to Binary Integer ===")
    arr_del15_as_int = col("ARR_DEL15").cast(IntegerType())
    otpw_3m_imputed = otpw_3m_imputed.withColumn("ARR_DEL15", arr_del15_as_int)
    print(f"New ARR_DEL15 type: {dict(otpw_3m_imputed.dtypes)['ARR_DEL15']}")

#### Remove All Leakage Features:

In [0]:
print("\n=== Removing Data Leakage Features ===")

# Columns treated as leakage (per the notebook: known only after the flight
# has completed), listed one per line and grouped by kind.
leakage_features = [
    # Actual times
    "DEP_TIME",
    "ARR_TIME",
    "WHEELS_OFF",
    "WHEELS_ON",
    # Actual delays (target-related)
    "DEP_DELAY",
    "DEP_DELAY_NEW",
    "DEP_DELAY_GROUP",
    "ARR_DELAY",
    "ARR_DELAY_NEW",
    "ARR_DELAY_GROUP",
    # Taxi times
    "TAXI_OUT",
    "TAXI_IN",
    # Flight durations
    "ACTUAL_ELAPSED_TIME",
    "AIR_TIME",
    # Delay breakdowns
    "CARRIER_DELAY",
    "WEATHER_DELAY",
    "NAS_DELAY",
    "SECURITY_DELAY",
    "LATE_AIRCRAFT_DELAY",
    # Other post-flight info
    "FIRST_DEP_TIME",
    "TOTAL_ADD_GTIME",
    "LONGEST_ADD_GTIME",
    # Arrival-delay label: dropped here because the prediction target is DEP_DEL15
    "ARR_DEL15",
]

# Restrict the list to columns that are actually present, then report them
present_columns = otpw_3m_imputed.columns
existing_leakage = [feature for feature in leakage_features if feature in present_columns]
print(f"Found {len(existing_leakage)} leakage features to remove:")
for feat in existing_leakage:
    print(f"  - {feat}")

# New DataFrame without the leakage columns; otpw_3m_imputed itself is left intact
otpw_3m_clean_no_leakage = otpw_3m_imputed.drop(*existing_leakage)

n_cols_before = len(otpw_3m_imputed.columns)
n_cols_after = len(otpw_3m_clean_no_leakage.columns)
print(f"\nColumns before: {n_cols_before}")
print(f"Columns after: {n_cols_after}")
print(f"Removed: {n_cols_before - n_cols_after} columns")


#### Create Comprehensive Column Classification Table (Appendix B):

In [0]:
# Column classification lookup: column name -> functional type, expected data
# type and a short description. Each row below is
# (column name, functional type, expected data type, description).
column_classification = {
    name: {"functional_type": functional_type, "data_type": data_type, "description": description}
    for name, functional_type, data_type, description in [
        # Target Variables
        ("DEP_DEL15", "Binary Target", "integer", "Departure delay ≥15 minutes (0/1)"),
        ("ARR_DEL15", "Binary Target", "integer", "Arrival delay ≥15 minutes (0/1)"),

        # Temporal Features
        ("YEAR", "Categorical (Ordinal)", "integer", "Year of flight"),
        ("QUARTER", "Categorical (Ordinal)", "integer", "Quarter (1-4)"),
        ("MONTH", "Categorical (Ordinal)", "integer", "Month (1-12)"),
        ("DAY_OF_MONTH", "Categorical (Ordinal)", "integer", "Day of month (1-31)"),
        ("DAY_OF_WEEK", "Categorical (Ordinal)", "integer", "Day of week (1-7)"),
        ("FL_DATE", "Temporal", "date", "Flight date"),

        # Flight Identifiers
        ("OP_UNIQUE_CARRIER", "Categorical (Nominal)", "string", "Unique carrier code"),
        ("OP_CARRIER_AIRLINE_ID", "Categorical (Nominal)", "integer", "Airline ID"),
        ("OP_CARRIER", "Categorical (Nominal)", "string", "Carrier code"),
        ("TAIL_NUM", "Categorical (Nominal)", "string", "Tail number"),
        ("OP_CARRIER_FL_NUM", "Categorical (Nominal)", "integer", "Flight number"),

        # Origin Airport Features
        ("ORIGIN_AIRPORT_ID", "Categorical (Nominal)", "integer", "Origin airport ID"),
        ("ORIGIN_AIRPORT_SEQ_ID", "Categorical (Nominal)", "string", "Origin airport sequence ID"),
        ("ORIGIN_CITY_MARKET_ID", "Categorical (Nominal)", "string", "Origin city market ID"),
        ("ORIGIN", "Categorical (Nominal)", "string", "Origin airport code"),
        ("ORIGIN_CITY_NAME", "Categorical (Nominal)", "string", "Origin city name"),
        ("ORIGIN_STATE_ABR", "Categorical (Nominal)", "string", "Origin state abbreviation"),
        ("ORIGIN_STATE_FIPS", "Categorical (Nominal)", "integer", "Origin state FIPS code"),
        ("ORIGIN_STATE_NM", "Categorical (Nominal)", "string", "Origin state name"),
        ("ORIGIN_WAC", "Categorical (Nominal)", "integer", "Origin World Area Code"),

        # Destination Airport Features
        ("DEST_AIRPORT_ID", "Categorical (Nominal)", "integer", "Destination airport ID"),
        ("DEST_AIRPORT_SEQ_ID", "Categorical (Nominal)", "string", "Destination airport sequence ID"),
        ("DEST_CITY_MARKET_ID", "Categorical (Nominal)", "string", "Destination city market ID"),
        ("DEST", "Categorical (Nominal)", "string", "Destination airport code"),
        ("DEST_CITY_NAME", "Categorical (Nominal)", "string", "Destination city name"),
        ("DEST_STATE_ABR", "Categorical (Nominal)", "string", "Destination state abbreviation"),
        ("DEST_STATE_FIPS", "Categorical (Nominal)", "integer", "Destination state FIPS code"),
        ("DEST_STATE_NM", "Categorical (Nominal)", "string", "Destination state name"),
        ("DEST_WAC", "Categorical (Nominal)", "integer", "Destination World Area Code"),

        # Timing Features
        ("CRS_DEP_TIME", "Continuous (Cyclic)", "integer", "Scheduled departure time (HHMM)"),
        ("CRS_ARR_TIME", "Continuous (Cyclic)", "integer", "Scheduled arrival time (HHMM)"),
        ("DEP_TIME", "Continuous (Cyclic)", "integer", "Actual departure time (HHMM) - LEAKAGE"),
        ("ARR_TIME", "Continuous (Cyclic)", "integer", "Actual arrival time (HHMM) - LEAKAGE"),
        ("DEP_TIME_BLK", "Categorical (Ordinal)", "string", "Departure time block"),
        ("ARR_TIME_BLK", "Categorical (Ordinal)", "string", "Arrival time block"),

        # Flight Performance - LEAKAGE FEATURES
        ("DEP_DELAY", "Continuous - LEAKAGE", "double", "Departure delay in minutes"),
        ("DEP_DELAY_NEW", "Continuous - LEAKAGE", "double", "Departure delay (new calculation)"),
        ("DEP_DELAY_GROUP", "Categorical - LEAKAGE", "integer", "Departure delay group"),
        ("ARR_DELAY", "Continuous - LEAKAGE", "double", "Arrival delay in minutes"),
        ("ARR_DELAY_NEW", "Continuous - LEAKAGE", "double", "Arrival delay (new calculation)"),
        ("ARR_DELAY_GROUP", "Categorical - LEAKAGE", "integer", "Arrival delay group"),
        ("TAXI_OUT", "Continuous - LEAKAGE", "double", "Taxi out time in minutes"),
        ("TAXI_IN", "Continuous - LEAKAGE", "double", "Taxi in time in minutes"),
        ("WHEELS_OFF", "Continuous - LEAKAGE", "integer", "Wheels off time (HHMM)"),
        ("WHEELS_ON", "Continuous - LEAKAGE", "integer", "Wheels on time (HHMM)"),
        ("ACTUAL_ELAPSED_TIME", "Continuous - LEAKAGE", "double", "Actual elapsed time"),
        ("AIR_TIME", "Continuous - LEAKAGE", "double", "Air time in minutes"),

        # Flight Distance
        ("CRS_ELAPSED_TIME", "Continuous", "double", "Scheduled elapsed time"),
        ("DISTANCE", "Continuous", "double", "Flight distance in miles"),
        ("DISTANCE_GROUP", "Categorical (Ordinal)", "integer", "Distance group"),

        # Cancellation/Diversion
        ("CANCELLED", "Binary", "double", "Cancelled indicator (0/1)"),
        ("CANCELLATION_CODE", "Categorical (Nominal)", "string", "Cancellation reason code"),
        ("DIVERTED", "Binary", "double", "Diverted indicator (0/1)"),

        # Delay Breakdown - SPARSE/LEAKAGE
        ("CARRIER_DELAY", "Continuous - SPARSE/LEAKAGE", "double", "Carrier delay in minutes"),
        ("WEATHER_DELAY", "Continuous - SPARSE/LEAKAGE", "double", "Weather delay in minutes"),
        ("NAS_DELAY", "Continuous - SPARSE/LEAKAGE", "double", "NAS delay in minutes"),
        ("SECURITY_DELAY", "Continuous - SPARSE/LEAKAGE", "double", "Security delay in minutes"),
        ("LATE_AIRCRAFT_DELAY", "Continuous - SPARSE/LEAKAGE", "double", "Late aircraft delay in minutes"),

        # Hourly Weather Features (Origin)
        ("HourlyAltimeterSetting", "Continuous", "double", "Altimeter setting (converted from string)"),
        ("HourlyDewPointTemperature", "Continuous", "double", "Dew point temperature (converted from string)"),
        ("HourlyDryBulbTemperature", "Continuous", "double", "Dry bulb temperature (converted from string)"),
        ("HourlyPrecipitation", "Continuous", "double", "Precipitation amount (converted from string)"),
        ("HourlyRelativeHumidity", "Continuous", "double", "Relative humidity (converted from string)"),
        ("HourlySeaLevelPressure", "Continuous", "double", "Sea level pressure (converted from string)"),
        ("HourlyStationPressure", "Continuous", "double", "Station pressure (converted from string)"),
        ("HourlyVisibility", "Continuous", "double", "Visibility (converted from string)"),
        ("HourlyWetBulbTemperature", "Continuous", "double", "Wet bulb temperature (converted from string)"),
        ("HourlyWindDirection", "Continuous (Cyclic)", "double", "Wind direction in degrees (converted from string)"),
        ("HourlyWindGustSpeed", "Continuous", "double", "Wind gust speed (converted from string)"),
        ("HourlyWindSpeed", "Continuous", "double", "Wind speed (converted from string)"),
        ("HourlyPresentWeatherType", "Categorical (Nominal)", "string", "Weather condition codes"),
        ("HourlyPressureChange", "Continuous", "double", "Pressure change"),
        ("HourlyPressureTendency", "Categorical (Ordinal)", "integer", "Pressure tendency code"),

        # Location Features
        ("origin_station_lat", "Continuous", "double", "Origin station latitude"),
        ("origin_station_lon", "Continuous", "double", "Origin station longitude"),
        ("origin_airport_lat", "Continuous", "double", "Origin airport latitude"),
        ("origin_airport_lon", "Continuous", "double", "Origin airport longitude"),
        ("dest_station_lat", "Continuous", "double", "Destination station latitude"),
        ("dest_station_lon", "Continuous", "double", "Destination station longitude"),
        ("dest_airport_lat", "Continuous", "double", "Destination airport latitude"),
        ("dest_airport_lon", "Continuous", "double", "Destination airport longitude"),

        # Text/Metadata
        ("NAME", "Text/Metadata", "string", "Weather station name"),
        ("REPORT_TYPE", "Categorical (Nominal)", "string", "Weather report type"),
        ("SOURCE", "Categorical (Nominal)", "string", "Weather data source"),
        ("HourlySkyConditions", "Text", "string", "Sky conditions description"),
        ("REM", "Text", "string", "Remarks/comments"),
    ]
}

# Get actual columns from dataset
actual_columns = otpw_3m_imputed.columns
actual_dtypes = dict(otpw_3m_imputed.dtypes)

# Build one Appendix B row per DataFrame column.
# NOTE: the loop variable is deliberately NOT called `col`. At notebook scope
# that would rebind the `col` function imported from pyspark.sql.functions to a
# string and break every later (or re-run) cell that calls col(...).
appendix_b_columns = [
    "Column Name", "Functional Type", "Current Data Type", "Expected Data Type", "Description"
]
appendix_b_data = []
for col_name in actual_columns:
    if col_name in column_classification:
        info = column_classification[col_name]
        appendix_b_data.append({
            "Column Name": col_name,
            "Functional Type": info["functional_type"],
            "Current Data Type": actual_dtypes[col_name],
            "Expected Data Type": info["data_type"],
            "Description": info["description"]
        })
    else:
        # For columns not explicitly classified: expected type mirrors the current one
        appendix_b_data.append({
            "Column Name": col_name,
            "Functional Type": "Unclassified",
            "Current Data Type": actual_dtypes[col_name],
            "Expected Data Type": actual_dtypes[col_name],
            "Description": "Not yet classified"
        })

# Create DataFrame (explicit columns keep the schema stable even with no rows)
appendix_b = pd.DataFrame(appendix_b_data, columns=appendix_b_columns)

# Display the table
print("=" * 100)
print("APPENDIX B: Comprehensive Column Classification and Type Information")
print("=" * 100)
display(appendix_b)

# Save to CSV for documentation; pandas writes through the local /dbfs mount,
# so the dbfs: scheme prefix is rewritten to that path.
appendix_b.to_csv(f"{folder_path.replace('dbfs:', '/dbfs')}/appendix_b_column_classification.csv", index=False)
print(f"\n✓ Saved Appendix B to: {folder_path}/appendix_b_column_classification.csv")

# Summary statistics by functional type
print("\n" + "=" * 100)
print("Summary by Functional Type:")
print("=" * 100)
summary = appendix_b.groupby("Functional Type").size().reset_index(name="Count")
summary = summary.sort_values("Count", ascending=False)
display(summary)

In [0]:
# Save checkpoint to group folder.
# This path was already written by the imputation cell earlier in the notebook,
# and Spark's default save mode raises if the target exists, so overwrite
# explicitly to keep the notebook runnable end-to-end and on re-runs.
otpw_3m_imputed.write.mode("overwrite").parquet(f"{folder_path}/otpw_3m_imputed.parquet")
display(dbutils.fs.ls(f"{folder_path}"))