## 1: Data Cleaning

### 1.1 Target Variable Overview and Label Cleaning
Before performing exploratory analysis, we cleaned the raw OTPW dataset to ensure data quality.

In [0]:
from pyspark.sql.functions import col

data_BASE_DIR = "dbfs:/mnt/mids-w261/"
display(dbutils.fs.ls(f"{data_BASE_DIR}"))

path,name,size,modificationTime
dbfs:/mnt/mids-w261/HW5/,HW5/,0,1763760471327
dbfs:/mnt/mids-w261/OTPW_12M/,OTPW_12M/,0,1763760471327
dbfs:/mnt/mids-w261/OTPW_1D_CSV/,OTPW_1D_CSV/,0,1763760471327
dbfs:/mnt/mids-w261/OTPW_36M/,OTPW_36M/,0,1763760471327
dbfs:/mnt/mids-w261/OTPW_3M/,OTPW_3M/,0,1763760471327
dbfs:/mnt/mids-w261/OTPW_3M_2015.csv,OTPW_3M_2015.csv,1500620247,1741625185000
dbfs:/mnt/mids-w261/OTPW_3M_2015_delta/,OTPW_3M_2015_delta/,0,1763760471327
dbfs:/mnt/mids-w261/OTPW_60M/,OTPW_60M/,0,1763760471327
dbfs:/mnt/mids-w261/OTPW_60M_Backup/,OTPW_60M_Backup/,0,1763760471327
dbfs:/mnt/mids-w261/airport-codes_csv.csv,airport-codes_csv.csv,6232459,1740508595000


In [0]:
# OTPW: load the 12-month dataset with spark.read.csv + gzip compression
df_otpw = (
    spark.read.format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load("dbfs:/mnt/mids-w261/OTPW_12M/OTPW_12M/OTPW_12M_2015.csv.gz")
)

display(df_otpw)

QUARTER,DAY_OF_MONTH,DAY_OF_WEEK,FL_DATE,OP_UNIQUE_CARRIER,OP_CARRIER_AIRLINE_ID,OP_CARRIER,TAIL_NUM,OP_CARRIER_FL_NUM,ORIGIN_AIRPORT_ID,ORIGIN_AIRPORT_SEQ_ID,ORIGIN_CITY_MARKET_ID,ORIGIN,ORIGIN_CITY_NAME,ORIGIN_STATE_ABR,ORIGIN_STATE_FIPS,ORIGIN_STATE_NM,ORIGIN_WAC,DEST_AIRPORT_ID,DEST_AIRPORT_SEQ_ID,DEST_CITY_MARKET_ID,DEST,DEST_CITY_NAME,DEST_STATE_ABR,DEST_STATE_FIPS,DEST_STATE_NM,DEST_WAC,CRS_DEP_TIME,DEP_TIME,DEP_DELAY,DEP_DELAY_NEW,DEP_DEL15,DEP_DELAY_GROUP,DEP_TIME_BLK,TAXI_OUT,WHEELS_OFF,WHEELS_ON,TAXI_IN,CRS_ARR_TIME,ARR_TIME,ARR_DELAY,ARR_DELAY_NEW,ARR_DEL15,ARR_DELAY_GROUP,ARR_TIME_BLK,CANCELLED,CANCELLATION_CODE,DIVERTED,CRS_ELAPSED_TIME,ACTUAL_ELAPSED_TIME,AIR_TIME,FLIGHTS,DISTANCE,DISTANCE_GROUP,CARRIER_DELAY,WEATHER_DELAY,NAS_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY,FIRST_DEP_TIME,TOTAL_ADD_GTIME,LONGEST_ADD_GTIME,YEAR,MONTH,origin_airport_name,origin_station_name,origin_station_id,origin_iata_code,origin_icao,origin_type,origin_region,origin_station_lat,origin_station_lon,origin_airport_lat,origin_airport_lon,origin_station_dis,dest_airport_name,dest_station_name,dest_station_id,dest_iata_code,dest_icao,dest_type,dest_region,dest_station_lat,dest_station_lon,dest_airport_lat,dest_airport_lon,dest_station_dis,sched_depart_date_time,sched_depart_date_time_UTC,four_hours_prior_depart_UTC,two_hours_prior_depart_UTC,STATION,DATE,LATITUDE,LONGITUDE,ELEVATION,NAME,REPORT_TYPE,SOURCE,HourlyAltimeterSetting,HourlyDewPointTemperature,HourlyDryBulbTemperature,HourlyPrecipitation,HourlyPresentWeatherType,HourlyPressureChange,HourlyPressureTendency,HourlyRelativeHumidity,HourlySkyConditions,HourlySeaLevelPressure,HourlyStationPressure,HourlyVisibility,HourlyWetBulbTemperature,HourlyWindDirection,HourlyWindGustSpeed,HourlyWindSpeed,Sunrise,Sunset,DailyAverageDewPointTemperature,DailyAverageDryBulbTemperature,DailyAverageRelativeHumidity,DailyAverageSeaLevelPressure,DailyAverageStationPressure,DailyAverageWetBulbTemperature,DailyAverageWindSpeed,DailyCoolingDegreeDays,DailyDepartureFromNormalAverageTemperature,DailyHeatingDegreeDays,DailyMaximumDryBulbTemperature,DailyMinimumDryBulbTemperature,DailyPeakWindDirection,DailyPeakWindSpeed,DailyPrecipitation,DailySnowDepth,DailySnowfall,DailySustainedWindDirection,DailySustainedWindSpeed,DailyWeather,MonthlyAverageRH,MonthlyDaysWithGT001Precip,MonthlyDaysWithGT010Precip,MonthlyDaysWithGT32Temp,MonthlyDaysWithGT90Temp,MonthlyDaysWithLT0Temp,MonthlyDaysWithLT32Temp,MonthlyDepartureFromNormalAverageTemperature,MonthlyDepartureFromNormalCoolingDegreeDays,MonthlyDepartureFromNormalHeatingDegreeDays,MonthlyDepartureFromNormalMaximumTemperature,MonthlyDepartureFromNormalMinimumTemperature,MonthlyDepartureFromNormalPrecipitation,MonthlyDewpointTemperature,MonthlyGreatestPrecip,MonthlyGreatestPrecipDate,MonthlyGreatestSnowDepth,MonthlyGreatestSnowDepthDate,MonthlyGreatestSnowfall,MonthlyGreatestSnowfallDate,MonthlyMaxSeaLevelPressureValue,MonthlyMaxSeaLevelPressureValueDate,MonthlyMaxSeaLevelPressureValueTime,MonthlyMaximumTemperature,MonthlyMeanTemperature,MonthlyMinSeaLevelPressureValue,MonthlyMinSeaLevelPressureValueDate,MonthlyMinSeaLevelPressureValueTime,MonthlyMinimumTemperature,MonthlySeaLevelPressure,MonthlyStationPressure,MonthlyTotalLiquidPrecipitation,MonthlyTotalSnowfall,MonthlyWetBulb,AWND,CDSD,CLDD,DSNW,HDSD,HTDD,NormalsCoolingDegreeDay,NormalsHeatingDegreeDay,ShortDurationEndDate005,ShortDurationEndDate010,ShortDurationEndDate015,ShortDurationEndDate020,ShortDurationEndDate030,ShortDurationEndDate045,ShortDurationEndDate060,ShortDurationEndDate080,ShortDurationEndDate100,ShortDurationEndDate120,ShortDurationEndDate150,ShortDurationEndDate180,ShortDurationPrecipitationValue005,ShortDurationPrecipitationValue010,ShortDurationPrecipitationValue015,ShortDurationPrecipitationValue020,ShortDurationPrecipitationValue030,ShortDurationPrecipitationValue045,ShortDurationPrecipitationValue060,ShortDurationPrecipitationValue080,ShortDurationPrecipitationValue100,ShortDurationPrecipitationValue120,ShortDurationPrecipitationValue150,ShortDurationPrecipitationValue180,REM,BackupDirection,BackupDistance,BackupDistanceUnit,BackupElements,BackupElevation,BackupEquipment,BackupLatitude,BackupLongitude,BackupName,WindEquipmentChangeDate,_row_desc
1,25,3,2015-03-25,AA,19805,AA,N793AA,1,12478,1247802,31703,JFK,"New York, NY",NY,36,New York,22,12892,1289203,32575,LAX,"Los Angeles, CA",CA,6,California,91,900,851.0,-9.0,0.0,0.0,-1.0,0900-0959,22.0,913.0,1202.0,19.0,1241,1221.0,-20.0,0.0,0.0,-2.0,1200-1259,0.0,,0.0,401.0,390.0,349.0,1.0,2475.0,10,,,,,,,,,2015,3,John F Kennedy International Airport,JOHN F KENNEDY INTERNATIONAL,74486094789,JFK,KJFK,large_airport,US-NY,40.6,-73.8,40.6,-73.8,0.0,Los Angeles International Airport,LOS ANGELES INTERNATIONAL AIR,72295023174,LAX,KLAX,large_airport,US-CA,33.9,-118.4,33.9,-118.4,0.0,2015-03-25T09:00:00Z,2015-03-25T13:00:00Z,2015-03-25T09:00:00Z,2015-03-25T11:00:00Z,74486094789,2015-03-25T09:51:00Z,40.63915,-73.76401,3.4,"JFK INTERNATIONAL AIRPORT, NY US",FM-15,7,30.43,16.0,44,0.00,,-0.01,1.0,32.0,BKN:07 250,30.43,30.4,10.0,34.0,200,,7.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,MET10503/25/15 09:51:02 METAR KJFK 251451Z 20006KT 10SM BKN250 07/M09 A3043 RMK AO2 SLP304 T00671089 51003 (BH),ESE,1.5,mi,"TEMP, PRECIP, SNOW",,"PSY, SRG, SNOWBOARD",,,FAA CWO,2009-05-01,1
1,25,7,2015-01-25,AA,19805,AA,N786AA,20,14771,1477101,32457,SFO,"San Francisco, CA",CA,6,California,91,12478,1247802,31703,JFK,"New York, NY",NY,36,New York,22,1505,1455.0,-10.0,0.0,0.0,-1.0,1500-1559,15.0,1510.0,2320.0,5.0,2327,2325.0,-2.0,0.0,0.0,-1.0,2300-2359,0.0,,0.0,322.0,330.0,310.0,1.0,2586.0,11,,,,,,,,,2015,1,San Francisco International Airport,SAN FRANCISCO INTERNATIONAL A,72494023234,SFO,KSFO,large_airport,US-CA,37.6,-122.4,37.6,-122.4,0.0,John F Kennedy International Airport,JOHN F KENNEDY INTERNATIONAL,74486094789,JFK,KJFK,large_airport,US-NY,40.6,-73.8,40.6,-73.8,0.0,2015-01-25T15:05:00Z,2015-01-25T23:05:00Z,2015-01-25T19:05:00Z,2015-01-25T21:05:00Z,72494023234,2015-01-25T19:56:00Z,37.6197,-122.3647,2.4,"SAN FRANCISCO INTERNATIONAL AIRPORT, CA US",FM-15,7,30.03,49.0,56,0.00,,,,77.0,CLR:00,30.03,30.01,10.0,52.0,000,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,MET09001/25/15 19:56:02 METAR KSFO 260356Z 00000KT 10SM CLR 13/09 A3003 RMK AO2 SLP169 T01330094,,,,,,,,,,,1
4,18,3,2015-11-18,AA,19805,AA,N796AA,33,12478,1247803,31703,JFK,"New York, NY",NY,36,New York,22,12892,1289203,32575,LAX,"Los Angeles, CA",CA,6,California,91,800,800.0,0.0,0.0,0.0,0.0,0800-0859,20.0,820.0,1053.0,14.0,1125,1107.0,-18.0,0.0,0.0,-2.0,1100-1159,0.0,,0.0,385.0,367.0,333.0,1.0,2475.0,10,,,,,,,,,2015,11,John F Kennedy International Airport,JOHN F KENNEDY INTERNATIONAL,74486094789,JFK,KJFK,large_airport,US-NY,40.6,-73.8,40.6,-73.8,0.0,Los Angeles International Airport,LOS ANGELES INTERNATIONAL AIR,72295023174,LAX,KLAX,large_airport,US-CA,33.9,-118.4,33.9,-118.4,0.0,2015-11-18T08:00:00Z,2015-11-18T13:00:00Z,2015-11-18T09:00:00Z,2015-11-18T11:00:00Z,74486094789,2015-11-18T09:51:00Z,40.63915,-73.76401,3.4,"JFK INTERNATIONAL AIRPORT, NY US",FM-15,7,30.52,41.0,57,0.00,,0.01,8.0,55.0,SCT:04 37 SCT:04 250,30.52,30.5,10.0,49.0,140,,13.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,MET11311/18/15 09:51:02 METAR KJFK 181451Z 14011KT 10SM SCT037 SCT250 14/05 A3052 RMK AO2 SLP336 T01390050 58003 $ (BH),ESE,1.5,mi,"TEMP, PRECIP, SNOW",,"PSY, SRG, SNOWBOARD",,,FAA CWO,2009-05-01,1
3,12,7,2015-07-12,AA,19805,AA,N3MHAA,47,13930,1393003,30977,ORD,"Chicago, IL",IL,17,Illinois,41,14057,1405702,34057,PDX,"Portland, OR",OR,41,Oregon,92,1000,955.0,-5.0,0.0,0.0,-1.0,1000-1059,19.0,1014.0,1212.0,4.0,1223,1216.0,-7.0,0.0,0.0,-1.0,1200-1259,0.0,,0.0,263.0,261.0,238.0,1.0,1739.0,7,,,,,,,,,2015,7,Chicago O'Hare International Airport,CHICAGO O'HARE INTERNATIONAL,72530094846,ORD,KORD,large_airport,US-IL,42.0,-87.9,42.0,-87.9,0.0,Portland International Airport,PORTLAND INTERNATIONAL AIRPOR,72698024229,PDX,KPDX,large_airport,US-OR,45.6,-122.6,45.6,-122.6,0.0,2015-07-12T10:00:00Z,2015-07-12T15:00:00Z,2015-07-12T11:00:00Z,2015-07-12T13:00:00Z,72530094846,2015-07-12T11:49:00Z,41.96019,-87.93162,201.8,"CHICAGO OHARE INTERNATIONAL AIRPORT, IL US",FM-16,6,29.98,64.0,79,,,,,61.0,BKN:07 28 BKN:07 250,,29.26,10.0,69.0,000,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,MET09307/12/15 11:49:02 SPECI KORD 121749Z 00000KT 10SM BKN028 BKN250 26/18 A2998 RMK AO2 FIBI (JR),,,,,,,,,,,1
3,3,4,2015-09-03,AA,19805,AA,N022AA,63,13303,1330303,32467,MIA,"Miami, FL",FL,12,Florida,33,12266,1226603,31453,IAH,"Houston, TX",TX,48,Texas,74,1945,2109.0,84.0,84.0,1.0,5.0,1900-1959,23.0,2132.0,2240.0,6.0,2128,2246.0,78.0,78.0,1.0,5.0,2100-2159,0.0,,0.0,163.0,157.0,128.0,1.0,964.0,4,78.0,0.0,0.0,0.0,0.0,,,,2015,9,Miami International Airport,MIAMI INTERNATIONAL AIRPORT,72202012839,MIA,KMIA,large_airport,US-FL,25.8,-80.3,25.8,-80.3,0.0,George Bush Intercontinental Houston Airport,G BUSH INTERCONTINENTAL AP/HO,72243012960,IAH,KIAH,large_airport,US-TX,30.0,-95.4,30.0,-95.4,0.0,2015-09-03T19:45:00Z,2015-09-03T23:45:00Z,2015-09-03T19:45:00Z,2015-09-03T21:45:00Z,72202012839,2015-09-03T19:53:00Z,25.7881,-80.3169,8.8,"MIAMI INTERNATIONAL AIRPORT, FL US",FM-15,7,29.96,74.0,81,0.00,,,,79.0,SCT:04 48 BKN:07 80 BKN:07 250,29.96,29.93,10.0,76.0,320,,5.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,MET10709/03/15 19:53:01 METAR KMIA 040053Z 32004KT 10SM SCT048 BKN080 BKN250 27/23 A2996 RMK AO2 SLP146 T02720233,,,,,,,,,,2009-07-14,1
3,22,3,2015-07-22,AA,19805,AA,N3ENAA,82,11298,1129804,30194,DFW,"Dallas/Fort Worth, TX",TX,48,Texas,74,14771,1477102,32457,SFO,"San Francisco, CA",CA,6,California,91,1200,1159.0,-1.0,0.0,0.0,-1.0,1200-1259,15.0,1214.0,1331.0,12.0,1350,1343.0,-7.0,0.0,0.0,-1.0,1300-1359,0.0,,0.0,230.0,224.0,197.0,1.0,1464.0,6,,,,,,,,,2015,7,Dallas Fort Worth International Airport,DALLAS/FT WORTH INTERNATIONAL,72259003927,DFW,KDFW,large_airport,US-TX,32.9,-97.0,32.9,-97.0,0.0,San Francisco International Airport,SAN FRANCISCO INTERNATIONAL A,72494023234,SFO,KSFO,large_airport,US-CA,37.6,-122.4,37.6,-122.4,0.0,2015-07-22T12:00:00Z,2015-07-22T17:00:00Z,2015-07-22T13:00:00Z,2015-07-22T15:00:00Z,72259003927,2015-07-22T13:53:00Z,32.8978,-97.0189,170.7,"DAL FTW WSCMO AIRPORT, TX US",FM-15,7,29.85,68.0,97,0.00,,,,39.0,FEW:02 60 SCT:04 300,29.82,29.21,10.0,77.0,180,21.0,13.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,MET10807/22/15 13:53:02 METAR KDFW 221953Z 18011G18KT 10SM FEW060 SCT300 36/20 A2985 RMK AO2 SLP098 T03610200 (NJ),,,,,,,,,,2009-05-27,1
3,16,7,2015-08-16,AA,19805,AA,N858AA,98,11298,1129804,30194,DFW,"Dallas/Fort Worth, TX",TX,48,Texas,74,14107,1410702,30466,PHX,"Phoenix, AZ",AZ,4,Arizona,81,1400,1358.0,-2.0,0.0,0.0,-1.0,1400-1459,14.0,1412.0,1408.0,11.0,1434,1419.0,-15.0,0.0,0.0,-1.0,1400-1459,0.0,,0.0,154.0,141.0,116.0,1.0,868.0,4,,,,,,,,,2015,8,Dallas Fort Worth International Airport,DALLAS/FT WORTH INTERNATIONAL,72259003927,DFW,KDFW,large_airport,US-TX,32.9,-97.0,32.9,-97.0,0.0,Phoenix Sky Harbor International Airport,PHOENIX SKY HARBOR INTL AIRPO,72278023183,PHX,KPHX,large_airport,US-AZ,33.4,-112.0,33.4,-112.0,0.0,2015-08-16T14:00:00Z,2015-08-16T19:00:00Z,2015-08-16T15:00:00Z,2015-08-16T17:00:00Z,72259003927,2015-08-16T15:00:00Z,32.8978,-97.0189,170.7,"DAL FTW WSCMO AIRPORT, TX US",FM-12,4,,60.0,97,,,0.06,8.0,29.0,74,29.9,29.33,9.94,73.0,140,,11.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,SYN06472259 32866 21410 10361 20156 39931 40126 58021 92053 555 91621=,,,,,,,,,,2009-05-27,1
4,6,2,2015-10-06,AA,19805,AA,N3KDAA,124,13303,1330303,32467,MIA,"Miami, FL",FL,12,Florida,33,14107,1410702,30466,PHX,"Phoenix, AZ",AZ,4,Arizona,81,705,702.0,-3.0,0.0,0.0,-1.0,0700-0759,16.0,718.0,844.0,9.0,915,853.0,-22.0,0.0,0.0,-2.0,0900-0959,0.0,,0.0,310.0,291.0,266.0,1.0,1972.0,8,,,,,,,,,2015,10,Miami International Airport,MIAMI INTERNATIONAL AIRPORT,72202012839,MIA,KMIA,large_airport,US-FL,25.8,-80.3,25.8,-80.3,0.0,Phoenix Sky Harbor International Airport,PHOENIX SKY HARBOR INTL AIRPO,72278023183,PHX,KPHX,large_airport,US-AZ,33.4,-112.0,33.4,-112.0,0.0,2015-10-06T07:05:00Z,2015-10-06T11:05:00Z,2015-10-06T07:05:00Z,2015-10-06T09:05:00Z,72202012839,2015-10-06T07:53:00Z,25.7881,-80.3169,8.8,"MIAMI INTERNATIONAL AIRPORT, FL US",FM-15,7,29.91,72.0,79,0.00,,,,79.0,FEW:02 28 SCT:04 80,29.91,29.88,10.0,74.0,240,,6.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,MET10610/06/15 07:53:02 METAR KMIA 061253Z 24005KT 10SM FEW028 SCT080 26/22 A2991 RMK AO2 SLP129 T02610222 (RLW),,,,,,,,,,2009-07-14,1
3,21,2,2015-07-21,AA,19805,AA,N3GJAA,143,14747,1474703,30559,SEA,"Seattle, WA",WA,53,Washington,93,11298,1129804,30194,DFW,"Dallas/Fort Worth, TX",TX,48,Texas,74,1315,1314.0,-1.0,0.0,0.0,-1.0,1300-1359,17.0,1331.0,1853.0,12.0,1910,1905.0,-5.0,0.0,0.0,-1.0,1900-1959,0.0,,0.0,235.0,231.0,202.0,1.0,1660.0,7,,,,,,,,,2015,7,Seattle Tacoma International Airport,SEATTLE-TACOMA INTERNATIONAL,72793024233,SEA,KSEA,large_airport,US-WA,47.4,-122.3,47.4,-122.3,0.0,Dallas Fort Worth International Airport,DALLAS/FT WORTH INTERNATIONAL,72259003927,DFW,KDFW,large_airport,US-TX,32.9,-97.0,32.9,-97.0,0.0,2015-07-21T13:15:00Z,2015-07-21T20:15:00Z,2015-07-21T16:15:00Z,2015-07-21T18:15:00Z,72793024233,2015-07-21T16:53:00Z,47.4444,-122.3138,112.8,"SEATTLE TACOMA AIRPORT, WA US",FM-15,7,29.93,49.0,73,0.00,,,,43.0,FEW:02 40 BKN:07 75 BKN:07 220,29.94,29.47,10.0,59.0,290,,7.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,MET10707/21/15 16:53:01 METAR KSEA 220053Z 29006KT 10SM FEW040 BKN075 BKN220 23/09 A2993 RMK AO2 SLP139 T02280094,,,,,,,,,,2007-05-17,1
1,1,7,2015-03-01,AA,19805,AA,N3FDAA,152,14107,1410702,30466,PHX,"Phoenix, AZ",AZ,4,Arizona,81,13303,1330303,32467,MIA,"Miami, FL",FL,12,Florida,33,1429,1620.0,111.0,111.0,1.0,7.0,1400-1459,15.0,1635.0,2207.0,5.0,2034,2212.0,98.0,98.0,1.0,6.0,2000-2059,0.0,,0.0,245.0,232.0,212.0,1.0,1972.0,8,36.0,0.0,0.0,0.0,62.0,,,,2015,3,Phoenix Sky Harbor International Airport,PHOENIX SKY HARBOR INTL AIRPO,72278023183,PHX,KPHX,large_airport,US-AZ,33.4,-112.0,33.4,-112.0,0.0,Miami International Airport,MIAMI INTERNATIONAL AIRPORT,72202012839,MIA,KMIA,large_airport,US-FL,25.8,-80.3,25.8,-80.3,0.0,2015-03-01T14:29:00Z,2015-03-01T21:29:00Z,2015-03-01T17:29:00Z,2015-03-01T19:29:00Z,72278023183,2015-03-01T17:51:00Z,33.4277,-112.0038,337.4,"PHOENIX AIRPORT, AZ US",FM-15,7,29.86,43.0,74,0.00,,,,33.0,FEW:02 60 FEW:02 90 SCT:04 190,29.83,28.69,10.0,57.0,190,24.0,14.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,MET11003/01/15 17:51:01 METAR KPHX 020051Z 19012G21KT 10SM FEW060 FEW090 SCT190 23/06 A2986 RMK AO2 SLP100 T02330061,N,0.5,mi,"TEMP, PRECIP",1107.0,"MXMN, SRG",33.4442,-112.0247,CONTRACTOR,2007-04-03,1


In [0]:
path = "dbfs:/mnt/mids-w261/OTPW_12M/OTPW_12M/OTPW_12M_2015.csv.gz"

df = (spark.read.format("csv")
      .option("header", "true")
      .option("inferSchema", "true") 
      .load(path)
     ).cache()

print(f"Rows: {df.count():,}, Cols: {len(df.columns)}")
df.printSchema()
display(df.limit(5))

Rows: 5,811,854, Cols: 216
root
 |-- QUARTER: integer (nullable = true)
 |-- DAY_OF_MONTH: integer (nullable = true)
 |-- DAY_OF_WEEK: integer (nullable = true)
 |-- FL_DATE: date (nullable = true)
 |-- OP_UNIQUE_CARRIER: string (nullable = true)
 |-- OP_CARRIER_AIRLINE_ID: integer (nullable = true)
 |-- OP_CARRIER: string (nullable = true)
 |-- TAIL_NUM: string (nullable = true)
 |-- OP_CARRIER_FL_NUM: integer (nullable = true)
 |-- ORIGIN_AIRPORT_ID: integer (nullable = true)
 |-- ORIGIN_AIRPORT_SEQ_ID: integer (nullable = true)
 |-- ORIGIN_CITY_MARKET_ID: integer (nullable = true)
 |-- ORIGIN: string (nullable = true)
 |-- ORIGIN_CITY_NAME: string (nullable = true)
 |-- ORIGIN_STATE_ABR: string (nullable = true)
 |-- ORIGIN_STATE_FIPS: integer (nullable = true)
 |-- ORIGIN_STATE_NM: string (nullable = true)
 |-- ORIGIN_WAC: integer (nullable = true)
 |-- DEST_AIRPORT_ID: integer (nullable = true)
 |-- DEST_AIRPORT_SEQ_ID: integer (nullable = true)
 |-- DEST_CITY_MARKET_ID: integer (

QUARTER,DAY_OF_MONTH,DAY_OF_WEEK,FL_DATE,OP_UNIQUE_CARRIER,OP_CARRIER_AIRLINE_ID,OP_CARRIER,TAIL_NUM,OP_CARRIER_FL_NUM,ORIGIN_AIRPORT_ID,ORIGIN_AIRPORT_SEQ_ID,ORIGIN_CITY_MARKET_ID,ORIGIN,ORIGIN_CITY_NAME,ORIGIN_STATE_ABR,ORIGIN_STATE_FIPS,ORIGIN_STATE_NM,ORIGIN_WAC,DEST_AIRPORT_ID,DEST_AIRPORT_SEQ_ID,DEST_CITY_MARKET_ID,DEST,DEST_CITY_NAME,DEST_STATE_ABR,DEST_STATE_FIPS,DEST_STATE_NM,DEST_WAC,CRS_DEP_TIME,DEP_TIME,DEP_DELAY,DEP_DELAY_NEW,DEP_DEL15,DEP_DELAY_GROUP,DEP_TIME_BLK,TAXI_OUT,WHEELS_OFF,WHEELS_ON,TAXI_IN,CRS_ARR_TIME,ARR_TIME,ARR_DELAY,ARR_DELAY_NEW,ARR_DEL15,ARR_DELAY_GROUP,ARR_TIME_BLK,CANCELLED,CANCELLATION_CODE,DIVERTED,CRS_ELAPSED_TIME,ACTUAL_ELAPSED_TIME,AIR_TIME,FLIGHTS,DISTANCE,DISTANCE_GROUP,CARRIER_DELAY,WEATHER_DELAY,NAS_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY,FIRST_DEP_TIME,TOTAL_ADD_GTIME,LONGEST_ADD_GTIME,YEAR,MONTH,origin_airport_name,origin_station_name,origin_station_id,origin_iata_code,origin_icao,origin_type,origin_region,origin_station_lat,origin_station_lon,origin_airport_lat,origin_airport_lon,origin_station_dis,dest_airport_name,dest_station_name,dest_station_id,dest_iata_code,dest_icao,dest_type,dest_region,dest_station_lat,dest_station_lon,dest_airport_lat,dest_airport_lon,dest_station_dis,sched_depart_date_time,sched_depart_date_time_UTC,four_hours_prior_depart_UTC,two_hours_prior_depart_UTC,STATION,DATE,LATITUDE,LONGITUDE,ELEVATION,NAME,REPORT_TYPE,SOURCE,HourlyAltimeterSetting,HourlyDewPointTemperature,HourlyDryBulbTemperature,HourlyPrecipitation,HourlyPresentWeatherType,HourlyPressureChange,HourlyPressureTendency,HourlyRelativeHumidity,HourlySkyConditions,HourlySeaLevelPressure,HourlyStationPressure,HourlyVisibility,HourlyWetBulbTemperature,HourlyWindDirection,HourlyWindGustSpeed,HourlyWindSpeed,Sunrise,Sunset,DailyAverageDewPointTemperature,DailyAverageDryBulbTemperature,DailyAverageRelativeHumidity,DailyAverageSeaLevelPressure,DailyAverageStationPressure,DailyAverageWetBulbTemperature,DailyAverageWindSpeed,DailyCoolingDegreeDays,DailyDepartureFromNormalAverageTemperature,DailyHeatingDegreeDays,DailyMaximumDryBulbTemperature,DailyMinimumDryBulbTemperature,DailyPeakWindDirection,DailyPeakWindSpeed,DailyPrecipitation,DailySnowDepth,DailySnowfall,DailySustainedWindDirection,DailySustainedWindSpeed,DailyWeather,MonthlyAverageRH,MonthlyDaysWithGT001Precip,MonthlyDaysWithGT010Precip,MonthlyDaysWithGT32Temp,MonthlyDaysWithGT90Temp,MonthlyDaysWithLT0Temp,MonthlyDaysWithLT32Temp,MonthlyDepartureFromNormalAverageTemperature,MonthlyDepartureFromNormalCoolingDegreeDays,MonthlyDepartureFromNormalHeatingDegreeDays,MonthlyDepartureFromNormalMaximumTemperature,MonthlyDepartureFromNormalMinimumTemperature,MonthlyDepartureFromNormalPrecipitation,MonthlyDewpointTemperature,MonthlyGreatestPrecip,MonthlyGreatestPrecipDate,MonthlyGreatestSnowDepth,MonthlyGreatestSnowDepthDate,MonthlyGreatestSnowfall,MonthlyGreatestSnowfallDate,MonthlyMaxSeaLevelPressureValue,MonthlyMaxSeaLevelPressureValueDate,MonthlyMaxSeaLevelPressureValueTime,MonthlyMaximumTemperature,MonthlyMeanTemperature,MonthlyMinSeaLevelPressureValue,MonthlyMinSeaLevelPressureValueDate,MonthlyMinSeaLevelPressureValueTime,MonthlyMinimumTemperature,MonthlySeaLevelPressure,MonthlyStationPressure,MonthlyTotalLiquidPrecipitation,MonthlyTotalSnowfall,MonthlyWetBulb,AWND,CDSD,CLDD,DSNW,HDSD,HTDD,NormalsCoolingDegreeDay,NormalsHeatingDegreeDay,ShortDurationEndDate005,ShortDurationEndDate010,ShortDurationEndDate015,ShortDurationEndDate020,ShortDurationEndDate030,ShortDurationEndDate045,ShortDurationEndDate060,ShortDurationEndDate080,ShortDurationEndDate100,ShortDurationEndDate120,ShortDurationEndDate150,ShortDurationEndDate180,ShortDurationPrecipitationValue005,ShortDurationPrecipitationValue010,ShortDurationPrecipitationValue015,ShortDurationPrecipitationValue020,ShortDurationPrecipitationValue030,ShortDurationPrecipitationValue045,ShortDurationPrecipitationValue060,ShortDurationPrecipitationValue080,ShortDurationPrecipitationValue100,ShortDurationPrecipitationValue120,ShortDurationPrecipitationValue150,ShortDurationPrecipitationValue180,REM,BackupDirection,BackupDistance,BackupDistanceUnit,BackupElements,BackupElevation,BackupEquipment,BackupLatitude,BackupLongitude,BackupName,WindEquipmentChangeDate,_row_desc
1,25,3,2015-03-25,AA,19805,AA,N793AA,1,12478,1247802,31703,JFK,"New York, NY",NY,36,New York,22,12892,1289203,32575,LAX,"Los Angeles, CA",CA,6,California,91,900,851,-9.0,0.0,0.0,-1,0900-0959,22.0,913,1202,19.0,1241,1221,-20.0,0.0,0.0,-2,1200-1259,0.0,,0.0,401.0,390.0,349.0,1.0,2475.0,10,,,,,,,,,2015,3,John F Kennedy International Airport,JOHN F KENNEDY INTERNATIONAL,74486094789,JFK,KJFK,large_airport,US-NY,40.6,-73.8,40.6,-73.8,0.0,Los Angeles International Airport,LOS ANGELES INTERNATIONAL AIR,72295023174,LAX,KLAX,large_airport,US-CA,33.9,-118.4,33.9,-118.4,0.0,2015-03-25T09:00:00Z,2015-03-25T13:00:00Z,2015-03-25T09:00:00Z,2015-03-25T11:00:00Z,74486094789,2015-03-25T09:51:00Z,40.63915,-73.76401,3.4,"JFK INTERNATIONAL AIRPORT, NY US",FM-15,7,30.43,16,44,0.0,,-0.01,1.0,32,BKN:07 250,30.43,30.4,10.0,34,200,,7,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,MET10503/25/15 09:51:02 METAR KJFK 251451Z 20006KT 10SM BKN250 07/M09 A3043 RMK AO2 SLP304 T00671089 51003 (BH),ESE,1.5,mi,"TEMP, PRECIP, SNOW",,"PSY, SRG, SNOWBOARD",,,FAA CWO,2009-05-01,1
1,25,7,2015-01-25,AA,19805,AA,N786AA,20,14771,1477101,32457,SFO,"San Francisco, CA",CA,6,California,91,12478,1247802,31703,JFK,"New York, NY",NY,36,New York,22,1505,1455,-10.0,0.0,0.0,-1,1500-1559,15.0,1510,2320,5.0,2327,2325,-2.0,0.0,0.0,-1,2300-2359,0.0,,0.0,322.0,330.0,310.0,1.0,2586.0,11,,,,,,,,,2015,1,San Francisco International Airport,SAN FRANCISCO INTERNATIONAL A,72494023234,SFO,KSFO,large_airport,US-CA,37.6,-122.4,37.6,-122.4,0.0,John F Kennedy International Airport,JOHN F KENNEDY INTERNATIONAL,74486094789,JFK,KJFK,large_airport,US-NY,40.6,-73.8,40.6,-73.8,0.0,2015-01-25T15:05:00Z,2015-01-25T23:05:00Z,2015-01-25T19:05:00Z,2015-01-25T21:05:00Z,72494023234,2015-01-25T19:56:00Z,37.6197,-122.3647,2.4,"SAN FRANCISCO INTERNATIONAL AIRPORT, CA US",FM-15,7,30.03,49,56,0.0,,,,77,CLR:00,30.03,30.01,10.0,52,0,,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,MET09001/25/15 19:56:02 METAR KSFO 260356Z 00000KT 10SM CLR 13/09 A3003 RMK AO2 SLP169 T01330094,,,,,,,,,,,1
4,18,3,2015-11-18,AA,19805,AA,N796AA,33,12478,1247803,31703,JFK,"New York, NY",NY,36,New York,22,12892,1289203,32575,LAX,"Los Angeles, CA",CA,6,California,91,800,800,0.0,0.0,0.0,0,0800-0859,20.0,820,1053,14.0,1125,1107,-18.0,0.0,0.0,-2,1100-1159,0.0,,0.0,385.0,367.0,333.0,1.0,2475.0,10,,,,,,,,,2015,11,John F Kennedy International Airport,JOHN F KENNEDY INTERNATIONAL,74486094789,JFK,KJFK,large_airport,US-NY,40.6,-73.8,40.6,-73.8,0.0,Los Angeles International Airport,LOS ANGELES INTERNATIONAL AIR,72295023174,LAX,KLAX,large_airport,US-CA,33.9,-118.4,33.9,-118.4,0.0,2015-11-18T08:00:00Z,2015-11-18T13:00:00Z,2015-11-18T09:00:00Z,2015-11-18T11:00:00Z,74486094789,2015-11-18T09:51:00Z,40.63915,-73.76401,3.4,"JFK INTERNATIONAL AIRPORT, NY US",FM-15,7,30.52,41,57,0.0,,0.01,8.0,55,SCT:04 37 SCT:04 250,30.52,30.5,10.0,49,140,,13,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,MET11311/18/15 09:51:02 METAR KJFK 181451Z 14011KT 10SM SCT037 SCT250 14/05 A3052 RMK AO2 SLP336 T01390050 58003 $ (BH),ESE,1.5,mi,"TEMP, PRECIP, SNOW",,"PSY, SRG, SNOWBOARD",,,FAA CWO,2009-05-01,1
3,12,7,2015-07-12,AA,19805,AA,N3MHAA,47,13930,1393003,30977,ORD,"Chicago, IL",IL,17,Illinois,41,14057,1405702,34057,PDX,"Portland, OR",OR,41,Oregon,92,1000,955,-5.0,0.0,0.0,-1,1000-1059,19.0,1014,1212,4.0,1223,1216,-7.0,0.0,0.0,-1,1200-1259,0.0,,0.0,263.0,261.0,238.0,1.0,1739.0,7,,,,,,,,,2015,7,Chicago O'Hare International Airport,CHICAGO O'HARE INTERNATIONAL,72530094846,ORD,KORD,large_airport,US-IL,42.0,-87.9,42.0,-87.9,0.0,Portland International Airport,PORTLAND INTERNATIONAL AIRPOR,72698024229,PDX,KPDX,large_airport,US-OR,45.6,-122.6,45.6,-122.6,0.0,2015-07-12T10:00:00Z,2015-07-12T15:00:00Z,2015-07-12T11:00:00Z,2015-07-12T13:00:00Z,72530094846,2015-07-12T11:49:00Z,41.96019,-87.93162,201.8,"CHICAGO OHARE INTERNATIONAL AIRPORT, IL US",FM-16,6,29.98,64,79,,,,,61,BKN:07 28 BKN:07 250,,29.26,10.0,69,0,,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,MET09307/12/15 11:49:02 SPECI KORD 121749Z 00000KT 10SM BKN028 BKN250 26/18 A2998 RMK AO2 FIBI (JR),,,,,,,,,,,1
3,3,4,2015-09-03,AA,19805,AA,N022AA,63,13303,1330303,32467,MIA,"Miami, FL",FL,12,Florida,33,12266,1226603,31453,IAH,"Houston, TX",TX,48,Texas,74,1945,2109,84.0,84.0,1.0,5,1900-1959,23.0,2132,2240,6.0,2128,2246,78.0,78.0,1.0,5,2100-2159,0.0,,0.0,163.0,157.0,128.0,1.0,964.0,4,78.0,0.0,0.0,0.0,0.0,,,,2015,9,Miami International Airport,MIAMI INTERNATIONAL AIRPORT,72202012839,MIA,KMIA,large_airport,US-FL,25.8,-80.3,25.8,-80.3,0.0,George Bush Intercontinental Houston Airport,G BUSH INTERCONTINENTAL AP/HO,72243012960,IAH,KIAH,large_airport,US-TX,30.0,-95.4,30.0,-95.4,0.0,2015-09-03T19:45:00Z,2015-09-03T23:45:00Z,2015-09-03T19:45:00Z,2015-09-03T21:45:00Z,72202012839,2015-09-03T19:53:00Z,25.7881,-80.3169,8.8,"MIAMI INTERNATIONAL AIRPORT, FL US",FM-15,7,29.96,74,81,0.0,,,,79,SCT:04 48 BKN:07 80 BKN:07 250,29.96,29.93,10.0,76,320,,5,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,MET10709/03/15 19:53:01 METAR KMIA 040053Z 32004KT 10SM SCT048 BKN080 BKN250 27/23 A2996 RMK AO2 SLP146 T02720233,,,,,,,,,,2009-07-14,1


In [0]:
from pyspark.sql import functions as F

total = df.count()

target_summary = (
    df.withColumn(
        "DEP_DEL15_status",
        F.when(F.col("DEP_DEL15").isNull(), "Null")
         .when(F.col("DEP_DEL15") == 1, "Delayed (≥15 min)")
         .otherwise("On Time (<15 min)")
    )
    .groupBy("DEP_DEL15_status")
    .agg(F.count("*").alias("count"))
    .withColumn("percentage", F.round(F.col("count")/F.lit(total)*100, 2))
    .orderBy("DEP_DEL15_status")
)

display(target_summary)


DEP_DEL15_status,count,percentage
Delayed (≥15 min),1055735,18.17
Null,86059,1.48
On Time (<15 min),4670060,80.35


In [0]:
# Remove rows where target variable is null
df = df.filter(df.DEP_DEL15.isNotNull())

print(f"After removing null targets: {df.count():,} rows remain.")

After removing null targets: 5,725,795 rows remain.


In [0]:
total = df.count()

# Count non-null values for every column
non_null_agg = df.agg(*[F.count(c).alias(c) for c in df.columns])

# Reshape into (column, non_null)
nulls_long = non_null_agg.selectExpr(
    "stack({}, {}) as (column, non_null)".format(
        len(df.columns),
        ", ".join([f"'{c}', `{c}`" for c in df.columns])
    )
).withColumn("null_count", F.lit(total) - F.col("non_null")) \
 .withColumn("null_pct", F.round(F.col("null_count") / F.lit(total) * 100, 2)) \
 .orderBy(F.desc("null_pct"))

display(nulls_long)

column,non_null,null_count,null_pct
MonthlyDaysWithGT010Precip,0,5725795,100.0
CLDD,0,5725795,100.0
ShortDurationEndDate180,0,5725795,100.0
MonthlyDaysWithGT90Temp,0,5725795,100.0
ShortDurationPrecipitationValue045,0,5725795,100.0
ShortDurationPrecipitationValue100,0,5725795,100.0
ShortDurationEndDate005,0,5725795,100.0
ShortDurationEndDate080,0,5725795,100.0
ShortDurationPrecipitationValue010,0,5725795,100.0
MonthlyGreatestSnowfallDate,0,5725795,100.0


In [0]:
# Drop columns with excessive missingness (>80%), always keep the target
threshold = 80.0
cols_to_drop = [r["column"] for r in nulls_long.filter(F.col("null_pct") > threshold).collect()
                if r["column"] != "DEP_DEL15"]

print(f"Drop {len(cols_to_drop)} columns (> {threshold}% null).")
df = df.drop(*cols_to_drop)
print(f"Remaining columns: {len(df.columns)}")

Drop 99 columns (> 80.0% null).
Remaining columns: 117


- removed columns with more than 80% missing values. 
- Kept the target column [DEP_DEL15] intact

### 1.2 Column Selection and Type Normalization (for EDA)

In [0]:
# Select relevant columns ----
cols_keep  = [
    "DEP_DEL15",
    "CRS_DEP_TIME", "DAY_OF_WEEK",
    "OP_UNIQUE_CARRIER", "ORIGIN", "DEST",
    "DISTANCE",
    "HourlyDryBulbTemperature",
    "HourlyWindSpeed",
    "HourlyPrecipitation"
]
df_base = df.select(*[c for c in cols_keep if c in df.columns])


In [0]:
df1 = df_base
row_cnt = df1.count()

null_report = (
    df1.select([
        F.count(F.when(F.col(c).isNull(), c)).alias(f"{c}__nulls")
        for c in df1.columns
    ]).withColumn("rows", F.lit(row_cnt))
)

# Display as (column, nulls, null_rate)
long_nulls = []
for c in df1.columns:
    long_nulls.append(
        F.struct(F.lit(c).alias("column"),
                 F.col(f"{c}__nulls").alias("nulls"),
                 (F.col(f"{c}__nulls")/F.col("rows")).alias("null_rate"))
    )

null_long_df = null_report.select(F.array(*long_nulls).alias("arr")) \
                          .select(F.explode("arr").alias("rec")) \
                          .select("rec.*") \
                          .orderBy(F.desc("null_rate"))
null_long_df.show(truncate=False)

+------------------------+------+---------------------+
|column                  |nulls |null_rate            |
+------------------------+------+---------------------+
|HourlyPrecipitation     |626064|0.10934097361152469  |
|HourlyWindSpeed         |16745 |0.002924484722208881 |
|HourlyDryBulbTemperature|14728 |0.0025722192289455003|
|OP_UNIQUE_CARRIER       |0     |0.0                  |
|DEP_DEL15               |0     |0.0                  |
|DAY_OF_WEEK             |0     |0.0                  |
|ORIGIN                  |0     |0.0                  |
|DEST                    |0     |0.0                  |
|DISTANCE                |0     |0.0                  |
+------------------------+------+---------------------+



In [0]:
# Remove non-numeric characters before casting to double; unparseable values become null
def to_double_safe(colname: str):
    # Keep digits, dot, minus; empty after cleanup -> null
    cleaned = F.regexp_replace(F.col(colname), r"[^0-9\.\-]", "")
    return F.when(cleaned == "", None).otherwise(cleaned).cast("double")

df2 = df1
for c in ["HourlyDryBulbTemperature", "HourlyWindSpeed", "HourlyPrecipitation"]:
    if c in df2.columns:
        df2 = df2.withColumn(c, to_double_safe(c))

In [0]:
# ---- Step 6: Enforce valid ranges and canonical dtypes ----
# Ensure label is 0/1, hour in 0–23, weekday in 1–7, distance positive and plausible
if "DEP_DEL15" in df2.columns:
    df2 = df2.withColumn(
        "DEP_DEL15",
        F.when(F.col("DEP_DEL15").isin(0, 1), F.col("DEP_DEL15")).otherwise(None)
    ).withColumn("DEP_DEL15", F.col("DEP_DEL15").cast("int"))

if "CRS_DEP_HOUR" in df2.columns:
    df2 = df2.withColumn(
            "CRS_DEP_HOUR",
            F.when((F.col("CRS_DEP_HOUR").cast("int") >= 0) & (F.col("CRS_DEP_HOUR").cast("int") <= 23),
                   F.col("CRS_DEP_HOUR").cast("int"))
             .otherwise(F.lit(None))  # invalid hours become null
        )

if "DAY_OF_WEEK" in df2.columns:
    df2 = df2.withColumn(
            "DAY_OF_WEEK",
            F.when((F.col("DAY_OF_WEEK").cast("int") >= 1) & (F.col("DAY_OF_WEEK").cast("int") <= 7),
                   F.col("DAY_OF_WEEK").cast("int"))
             .otherwise(F.lit(None))
        )
    
if "DISTANCE" in df2.columns:
    df2 = df2.withColumn(
        "DISTANCE",
        F.when((F.col("DISTANCE") > 0) & (F.col("DISTANCE") < 5000), F.col("DISTANCE")).otherwise(None)
        .cast("double")
    )

# Airport/carrier codes: keep alphanumeric only (avoid punctuation/hidden chars)
for c in ["OP_UNIQUE_CARRIER", "ORIGIN", "DEST"]:
    if c in df2.columns:
        df2 = df2.withColumn(c, F.regexp_replace(F.col(c), r"[^A-Z0-9]", ""))
        df2 = df2.withColumn(c, F.when(F.length(F.col(c)) == 0, None).otherwise(F.col(c)))


In [0]:
# Sanity-check distributions to confirm conversions look reasonable
qc_cols = [c for c in ["DEP_DEL15","CRS_DEP_HOUR","DAY_OF_WEEK","DISTANCE",
                       "HourlyDryBulbTemperature","HourlyWindSpeed","HourlyPrecipitation"] if c in df2.columns]

# Basic stats for numeric columns
df2.select([F.mean(c).alias(f"{c}_mean") for c in qc_cols if dict(df2.dtypes)[c] in ("int", "double")]).show()

# Cardinalities for key categorical IDs
for c in ["OP_UNIQUE_CARRIER", "ORIGIN", "DEST"]:
    if c in df2.columns:
        print(c, "distinct =", df2.select(c).distinct().count())


+-------------------+-----------------+-----------------+-----------------------------+--------------------+------------------------+
|     DEP_DEL15_mean| DAY_OF_WEEK_mean|    DISTANCE_mean|HourlyDryBulbTemperature_mean|HourlyWindSpeed_mean|HourlyPrecipitation_mean|
+-------------------+-----------------+-----------------+-----------------------------+--------------------+------------------------+
|0.18438225608845585|3.932258664517329|823.9484504771826|            64.21350408771448|   8.867760931393182|    0.003307467744053...|
+-------------------+-----------------+-----------------+-----------------------------+--------------------+------------------------+

OP_UNIQUE_CARRIER distinct = 14
ORIGIN distinct = 320
DEST distinct = 322


In [0]:
# ---- Step 8: Baseline imputation (median for numeric fields) ----
# Keep it simple for Phase 1; can refine later (e.g., groupwise median by ORIGIN×hour)
median_cols = [c for c in ["HourlyDryBulbTemperature","HourlyWindSpeed","HourlyPrecipitation","DISTANCE"] if c in df2.columns]
if median_cols:
    med_row = df2.select([F.expr(f"percentile_approx({c}, 0.5)").alias(c) for c in median_cols]).first()
    med_map = {c: med_row[c] for c in median_cols if med_row[c] is not None}
    df3 = df2.fillna(med_map)
else:
    df3 = df2

# Optionally drop rows missing required keys for modeling (keep a copy before dropping)
required_keys = [c for c in ["ORIGIN","DEST","OP_UNIQUE_CARRIER","CRS_DEP_HOUR","DAY_OF_WEEK"] if c in df3.columns]
df_clean = df3.na.drop(subset=required_keys)  # or keep df3 if you want to impute keys later

df_clean.printSchema()
print("Rows before:", row_cnt, " | after basic cleaning:", df_clean.count())


root
 |-- DEP_DEL15: integer (nullable = true)
 |-- DAY_OF_WEEK: integer (nullable = true)
 |-- OP_UNIQUE_CARRIER: string (nullable = true)
 |-- ORIGIN: string (nullable = true)
 |-- DEST: string (nullable = true)
 |-- DISTANCE: double (nullable = false)
 |-- HourlyDryBulbTemperature: double (nullable = false)
 |-- HourlyWindSpeed: double (nullable = false)
 |-- HourlyPrecipitation: double (nullable = false)

Rows before: 5725795  | after basic cleaning: 5725795


In [0]:
from functools import reduce
from pyspark.sql import functions as F

after_cnt = df_clean.count()
summaries = []

for c, t in df_clean.dtypes:
    # Make sure the column actually exists
    if c not in df_clean.columns:
        continue

    # Base summary: column name, dtype, null rate
    base = df_clean.select(
        F.lit(c).alias("column"),
        F.lit(t).alias("dtype"),
        (F.sum(F.col(c).isNull().cast("int")) / F.lit(after_cnt)).alias("null_rate")
    )

    # If numeric column, include min / median / max
    if t in ("int", "bigint", "double", "float"):
        base = df_clean.select(
            F.lit(c).alias("column"),
            F.lit(t).alias("dtype"),
            (F.sum(F.col(c).isNull().cast("int")) / F.lit(after_cnt)).alias("null_rate"),
            F.min(F.col(c)).alias("min"),
            F.expr(f"percentile_approx({c}, 0.5)").alias("median"),
            F.max(F.col(c)).alias("max")
        )

    summaries.append(base)

# Combine all summaries
dq_summary = reduce(lambda a, b: a.unionByName(b, allowMissingColumns=True), summaries)
dq_summary.orderBy(F.desc("null_rate")).show(truncate=False)



+------------------------+------+---------+-----+------+------+
|column                  |dtype |null_rate|min  |median|max   |
+------------------------+------+---------+-----+------+------+
|OP_UNIQUE_CARRIER       |string|0.0      |NULL |NULL  |NULL  |
|HourlyWindSpeed         |double|0.0      |0.0  |8.0   |138.0 |
|DEP_DEL15               |int   |0.0      |0.0  |0.0   |1.0   |
|HourlyDryBulbTemperature|double|0.0      |-40.0|67.0  |116.0 |
|ORIGIN                  |string|0.0      |NULL |NULL  |NULL  |
|DAY_OF_WEEK             |int   |0.0      |1.0  |4.0   |7.0   |
|HourlyPrecipitation     |double|0.0      |0.0  |0.0   |5.76  |
|DEST                    |string|0.0      |NULL |NULL  |NULL  |
|DISTANCE                |double|0.0      |21.0 |650.0 |4983.0|
+------------------------+------+---------+-----+------+------+



## 2: Exploratory Data Analysis (EDA)

### 2.1: Time-Based Delay Patterns
In this section, we explore how flight delay rates vary across different time dimensions.
- **By Hour of Day:** Early-morning flights (5–8 AM) tend to have the lowest delay rates, while evening flights (5–9 PM) experience the highest delays due to accumulated congestion.
- **By Day of Week:** Delay patterns often rise toward the weekend, reflecting increased flight volume and operational complexity.

#### 2.1.1 Hourly Delay Rate

In [0]:
# 2.1: Extract scheduled departure hour (0–23)
# CRS_DEP_TIME is stored as HHMM (e.g., 945 = 9:45am)
df = df_clean.withColumn("CRS_DEP_HOUR", (F.col("CRS_DEP_TIME")/100).cast("int"))

# Average delay rate by hour of the day
delay_by_hour = (
    df_clean.groupBy("CRS_DEP_HOUR")
      .agg(F.round(F.mean("DEP_DEL15") * 100, 2).alias("delay_rate_pct"))
      .orderBy(F.desc("delay_rate_pct"))
)
display(delay_by_hour)

{"ts": "2025-11-21 21:51:44.654", "level": "ERROR", "logger": "DataFrameQueryContextLogger", "msg": "[UNRESOLVED_COLUMN.WITH_SUGGESTION] A column, variable, or function parameter with name `CRS_DEP_TIME` cannot be resolved. Did you mean one of the following? [`DEST`, `DEP_DEL15`, `DISTANCE`, `ORIGIN`, `DAY_OF_WEEK`]. SQLSTATE: 42703", "context": {"file": "<command-1792055957778472>, line 3 in cell [31]", "line": "", "fragment": "col", "errorClass": "UNRESOLVED_COLUMN.WITH_SUGGESTION"}, "exception": {"class": "Py4JJavaError", "msg": "An error occurred while calling o3031.withColumn.\n: org.apache.spark.sql.catalyst.ExtendedAnalysisException: [UNRESOLVED_COLUMN.WITH_SUGGESTION] A column, variable, or function parameter with name `CRS_DEP_TIME` cannot be resolved. Did you mean one of the following? [`DEST`, `DEP_DEL15`, `DISTANCE`, `ORIGIN`, `DAY_OF_WEEK`]. SQLSTATE: 42703;\n'Project [DEP_DEL15#136696, DAY_OF_WEEK#136706, OP_UNIQUE_CARRIER#136736, ORIGIN#136756, DEST#136776, DISTANCE#1683

[0;31m---------------------------------------------------------------------------[0m
[0;31mAnalysisException[0m                         Traceback (most recent call last)
File [0;32m<command-1792055957778472>, line 3[0m
[1;32m      1[0m [38;5;66;03m# 2.1: Extract scheduled departure hour (0–23)[39;00m
[1;32m      2[0m [38;5;66;03m# CRS_DEP_TIME is stored as HHMM (e.g., 945 = 9:45am)[39;00m
[0;32m----> 3[0m df [38;5;241m=[39m df_clean[38;5;241m.[39mwithColumn([38;5;124m"[39m[38;5;124mCRS_DEP_HOUR[39m[38;5;124m"[39m, (F[38;5;241m.[39mcol([38;5;124m"[39m[38;5;124mCRS_DEP_TIME[39m[38;5;124m"[39m)[38;5;241m/[39m[38;5;241m100[39m)[38;5;241m.[39mcast([38;5;124m"[39m[38;5;124mint[39m[38;5;124m"[39m))
[1;32m      5[0m [38;5;66;03m# Average delay rate by hour of the day[39;00m
[1;32m      6[0m delay_by_hour [38;5;241m=[39m (
[1;32m      7[0m     df_clean[38;5;241m.[39mgroupBy([38;5;124m"[39m[38;5;124mCRS_DEP_HOUR[39m[38;5;124m"[39m)


#### 2.1.2 Day-of-Week Delay Rate

In [0]:
# Average delay rate by day of week (1=Mon, 7=Sun)
delay_by_day = (
    df_clean.groupBy("DAY_OF_WEEK")
      .agg(F.round(F.mean("DEP_DEL15") * 100, 2).alias("delay_rate_pct"))
      .orderBy(F.desc("delay_rate_pct"))
)
display(delay_by_day)

Databricks visualization. Run in Databricks to view.

DAY_OF_WEEK,delay_rate_pct
1,19.59
4,19.35
5,18.93
7,18.5
2,18.05
3,17.86
6,16.4


### 2.2 Airline Delay Patterns
In this section, we analyze **airline-level delay behavior** to identify carriers 
that tend to have higher on-time performance issues.  
Examining delay rates by carrier helps uncover whether delays are primarily influenced 
by **airline-specific operational efficiency**, **network size**, or **route congestion**. 

#### 2.2 Airline Delay Rate vs Flight Volume

In [0]:
# Airline-level delay statistics
delay_by_carrier = (
    df_clean.groupBy("OP_UNIQUE_CARRIER")
      .agg(
          F.count("*").alias("num_flights"),
          F.round(F.mean("DEP_DEL15") * 100, 2).alias("delay_rate_pct")
      )
      .orderBy(F.desc("delay_rate_pct"))
)

display(delay_by_carrier)

Databricks visualization. Run in Databricks to view.

OP_UNIQUE_CARRIER,num_flights,delay_rate_pct
NK,115291,27.38
UA,508551,23.65
F9,90262,23.02
B6,258428,21.86
WN,1245811,21.3
MQ,280248,20.09
VX,61384,17.97
EV,557189,17.41
AA,715153,17.29
OO,579007,16.68


### 2.3 Airport Delay Patterns
In this section, we analyze **departure delay patterns by airport** to identify 
which airports tend to experience the highest delay rates.  
Understanding airport-level performance helps reveal the impact of local congestion, 
weather conditions, and hub operations on flight punctuality.


In [0]:
# Airport-level delay statistics
delay_by_airport = (
    df_clean.groupBy("ORIGIN")
      .agg(
          F.count("*").alias("num_flights"),
          F.round(F.mean("DEP_DEL15") * 100, 2).alias("delay_rate_pct")
      )
      .orderBy(F.desc("delay_rate_pct"))
)

display(delay_by_airport)

ORIGIN,num_flights,delay_rate_pct
ADK,96,43.75
GST,71,40.85
ILG,97,40.21
STC,78,32.05
MVY,216,31.02
UST,150,30.67
CEC,174,29.31
OTH,296,29.05
PBG,292,28.08
ASE,3348,27.57


#### 2.3.1 Top 20 Busiest Airports

In [0]:
topN_busiest = (delay_by_airport.orderBy(F.desc("num_flights")).limit(20))
topN_sorted_by_busiest = topN_busiest.orderBy(F.desc("delay_rate_pct"))
display(topN_sorted_by_busiest)


Databricks visualization. Run in Databricks to view.

ORIGIN,num_flights,delay_rate_pct
BWI,92604,24.06
ORD,305355,23.39
EWR,108280,23.23
DEN,212166,22.19
LGA,103676,22.01
DFW,254303,21.43
LAS,144928,21.07
IAH,157780,20.82
LAX,210193,20.51
JFK,100201,20.5


**2.3.1 Observation**

Among the busiest airports, **ORD** (Chicago O’Hare), **ATL** (Atlanta), and **DFW** (Dallas/Fort Worth) 
handle the highest flight volumes but also experience significant delays.  
High delay rates are often observed at major hubs due to **air traffic congestion, 
weather sensitivity, and connecting flight dependencies**.  
Conversely, airports like **PHX** and **SEA** maintain relatively low delay percentages 
despite heavy traffic, suggesting more efficient scheduling and operational management.


#### 2.3.2 Top 20 Most Delayed Airports

In [0]:
topN_Most_Delay = (delay_by_airport.orderBy(F.desc("delay_rate_pct")).limit(20))
topN_sorted_by_delay = topN_Most_Delay.orderBy(F.desc("delay_rate_pct"))
display(topN_sorted_by_delay)

Databricks visualization. Run in Databricks to view.

ORIGIN,num_flights,delay_rate_pct
ADK,96,43.75
GST,71,40.85
ILG,97,40.21
STC,78,32.05
MVY,216,31.02
UST,150,30.67
CEC,174,29.31
OTH,296,29.05
PBG,292,28.08
ASE,3348,27.57


### 2.4 Weather Impact on Delays

#### 2.4.1 Temperature vs Delay Patterns 

In [0]:
# Calculate delay rate by temperature bins (every 5°F)
temp_delay = (
    df_clean
      .withColumn("Temp_bin", F.floor(F.col("HourlyDryBulbTemperature") / 5) * 5)
      .groupBy("Temp_bin")
      .agg(
          F.round(F.mean("DEP_DEL15") * 100, 2).alias("delay_rate_pct"),
          F.count("*").alias("num_flights")
      )
      .orderBy("Temp_bin")
)
display(temp_delay)

Databricks visualization. Run in Databricks to view.

Temp_bin,delay_rate_pct,num_flights
-40,21.74,23
-35,13.33,45
-30,14.81,54
-25,11.67,120
-20,12.39,218
-15,20.2,401
-10,24.88,1720
-5,30.42,5316
0,34.79,12921
5,32.41,20420


#### 2.4.2 Wind Speed vs Delay Patterns

In [0]:
# Calculate delay rate and flight count by wind speed bins (every 5 mph)
wind_delay = (
    df_clean
      .withColumn("Wind_bin", F.floor(F.col("HourlyWindSpeed") / 2.5) * 2.5)
      .groupBy("Wind_bin")
      .agg(
          F.round(F.mean("DEP_DEL15") * 100, 2).alias("delay_rate_pct"),
          F.count("*").alias("num_flights")
      )
      .orderBy("Wind_bin")
)
display(wind_delay)

Databricks visualization. Run in Databricks to view.

Wind_bin,delay_rate_pct,num_flights
0.0,17.05,497099
2.5,17.22,413185
5.0,18.01,1553545
7.5,18.33,1030277
10.0,18.63,818145
12.5,19.0,576658
15.0,19.45,494896
17.5,20.28,92141
20.0,20.96,161391
22.5,22.26,46828


#### 2.4.3 Precipitation vs Delay Patterns  


To ensure statistical reliability, bins with fewer than **100 flights** were excluded, as their delay rates fluctuate excessively due to small sample sizes.  
Additionally, the **no-precipitation group (0.00 inches)** was removed since it represents over 90% of all flights and dominates the visualization.  

In [0]:
from pyspark.sql.types import DecimalType

BIN = 0.01

df_precip_num = (
    df_clean
      .withColumn("precip_raw", F.col("HourlyPrecipitation").cast("string"))
      .withColumn(
          "precip_num",
          F.when(F.col("precip_raw") == "T", 0.0)
           .otherwise(F.regexp_replace("precip_raw", "[^0-9\\.]", ""))
           .cast("double")
      )
)
precip_binned = (
    df_precip_num
      .filter(F.col("precip_num").isNotNull())             
      .withColumn("bin_idx", F.floor(F.col("precip_num")/F.lit(BIN)).cast("int"))
      .withColumn("Precip_bin",
                  (F.col("bin_idx")*F.lit(BIN)).cast(DecimalType(4,2))) 
      .groupBy("bin_idx","Precip_bin")
      .agg(
          F.count("*").alias("num_flights"),
          F.round(F.mean("DEP_DEL15")*100, 2).alias("delay_rate_pct")
      )
      .orderBy("bin_idx")
      .drop("bin_idx")
)

display(precip_binned)

Precip_bin,num_flights,delay_rate_pct
0.0,5472731,17.98
0.01,81813,26.79
0.02,40316,26.93
0.03,25735,26.54
0.04,18551,27.09
0.05,13904,27.5
0.06,11417,29.17
0.07,8678,31.31
0.08,6597,30.2
0.09,5537,30.14


In [0]:
# Filter out bins with very few samples and precipitation = 0
precip_filtered = precip_binned.filter(
    (F.col("Precip_bin") > 0) & (F.col("num_flights") >= 100)
)
display(precip_filtered)

Databricks visualization. Run in Databricks to view.

Precip_bin,num_flights,delay_rate_pct
0.01,81813,26.79
0.02,40316,26.93
0.03,25735,26.54
0.04,18551,27.09
0.05,13904,27.5
0.06,11417,29.17
0.07,8678,31.31
0.08,6597,30.2
0.09,5537,30.14
0.1,4677,30.34


### 2.5 Distance Impact on Delays
Understanding how flight distance and route characteristics influence delay rates helps identify whether delays are primarily due to operational congestion or weather dependencies.


#### 2.5.1 Distance Delay Patterns

In [0]:
# Calculate delay rate by distance bin (every 250 miles)

distance_delay = (
    df_clean.filter(F.col("DISTANCE").isNotNull())
      .withColumn("distance_bin", (F.floor(F.col("DISTANCE") / 250) * 250))
      .groupBy("distance_bin")
      .agg(
          F.count("*").alias("num_flights"),
          F.round(F.mean("DEP_DEL15") * 100, 2).alias("delay_rate_pct")
      )
      .orderBy("distance_bin")
)

display(distance_delay)


Databricks visualization. Run in Databricks to view.

distance_bin,num_flights,delay_rate_pct
0,723594,16.14
250,1355425,17.85
500,1150274,17.95
750,872706,19.5
1000,593123,20.41
1250,252315,19.89
1500,271927,19.86
1750,135861,19.33
2000,94222,18.67
2250,156767,18.99


In [0]:
dbutils.fs.mkdirs("dbfs:/student-groups/Group_4_4/plots_phase2")

True