## EDA for Stations Data

In [0]:
import pyspark
from pyspark.sql.functions import col, concat, lit, regexp_replace, when, length, lpad, to_timestamp, max, rank
from pyspark.sql.types import *
import pyspark.sql.functions as F
from pyspark.sql.window import Window
import airporttime
from datetime import datetime, timedelta

In [0]:
# --- Azure Blob Storage settings ---
# Container and storage account, both created in https://portal.azure.com
blob_container = "w261-scrr"
storage_account = "midsw261rv"

# Secret scope and secret key name, both created on your local computer using the Databricks CLI
secret_scope = "w261scrr"
secret_key = "w261scrrkey"

# wasbs:// URL addressing the container inside the storage account
blob_url = f"wasbs://{blob_container}@{storage_account}.blob.core.windows.net"

# DBFS path prefix; the shared project datasets are read from under this path
mount_path = "/mnt/mids-w261"

# Register the container's credential with Spark. The value is pulled from the Databricks
# secret store at run time so that it never appears in the notebook source
# (presumably a SAS token, given the fs.azure.sas.* key -- confirm with whoever created the secret).
sas_conf_key = f"fs.azure.sas.{blob_container}.{storage_account}.blob.core.windows.net"
spark.conf.set(sas_conf_key, dbutils.secrets.get(scope=secret_scope, key=secret_key))

In [0]:
# Airport reference table. `ident` and `iata_code` are kept so airports can later be mapped
# to flight origins and to the stations' neighbor call signs.
codes_cols = ['ident', 'name', 'elevation_ft', 'iata_code', 'coordinates']
airport_codes = (
    spark.read.format("csv")
    .option("header", "true")
    .load("dbfs:/FileStore/airport_codes_csv.csv")
    .select(*codes_cols)
    # Keep only airports that have an IATA code. Missing codes are dropped explicitly with
    # isNotNull() instead of relying on `NULL != 'null'` evaluating to NULL (which filter()
    # treats as false); the literal string 'null' is excluded as before.
    .filter(col('iata_code').isNotNull() & (col('iata_code') != 'null'))
)
display(airport_codes)

# Folder holding the shared final-project datasets, built once from the configured mount path
project_dir = f"{mount_path}/datasets_final_project"

# Inspect the Mount's Final Project folder
display(dbutils.fs.ls(project_dir))

# Load 2015 Q1 for Flights (the 3-month airlines extract)
df_airlines = spark.read.parquet(f"{project_dir}/parquet_airlines_data_3m/")

# Load the 2015 Q1 for Weather: the full weather dataset is read and only rows dated before
# 2015-04-01 are kept (no lower bound is applied here)
df_weather = (
    spark.read.parquet(f"{project_dir}/weather_data/*")
    .filter(col('DATE') < "2015-04-01T00:00:00.000")
)
display(df_weather)

# Load weather station dataset
df_stations = spark.read.parquet(f"{project_dir}/stations_data/*")

ident,name,elevation_ft,iata_code,coordinates
03N,Utirik Airport,4.0,UTK,"169.852005, 11.222"
07FA,Ocean Reef Club Airport,8.0,OCA,"-80.274803161621, 25.325399398804"
0AK,Pilot Station Airport,305.0,PQS,"-162.899994, 61.934601"
0CO2,Crested Butte Airpark,8980.0,CSE,"-106.928341, 38.851918"
0TE7,LBJ Ranch Airport,1515.0,JCY,"-98.62249755859999, 30.251800537100003"
13MA,Metropolitan Airport,418.0,PMX,"-72.31140136719999, 42.223300933800004"
13Z,Loring Seaplane Base,0.0,WLR,"-131.636993408, 55.6012992859"
16A,Nunapitchuk Airport,12.0,NUP,"-162.440454, 60.905591"
16K,Port Alice Seaplane Base,0.0,PTC,"-133.597, 55.803"
19AK,Icy Bay Airport,50.0,ICY,"-141.662002563, 59.96900177"


path,name,size
dbfs:/mnt/mids-w261/datasets_final_project/airlines/,airlines/,0
dbfs:/mnt/mids-w261/datasets_final_project/airlines_data/,airlines_data/,0
dbfs:/mnt/mids-w261/datasets_final_project/parquet_airlines_data/,parquet_airlines_data/,0
dbfs:/mnt/mids-w261/datasets_final_project/parquet_airlines_data_3m/,parquet_airlines_data_3m/,0
dbfs:/mnt/mids-w261/datasets_final_project/parquet_airlines_data_6m/,parquet_airlines_data_6m/,0
dbfs:/mnt/mids-w261/datasets_final_project/stations_data/,stations_data/,0
dbfs:/mnt/mids-w261/datasets_final_project/weather_data/,weather_data/,0
dbfs:/mnt/mids-w261/datasets_final_project/weather_data_6_hr/,weather_data_6_hr/,0
dbfs:/mnt/mids-w261/datasets_final_project/weather_data_single/,weather_data_single/,0


STATION,DATE,SOURCE,LATITUDE,LONGITUDE,ELEVATION,NAME,REPORT_TYPE,CALL_SIGN,QUALITY_CONTROL,WND,CIG,VIS,TMP,DEW,SLP,AW1,GA1,GA2,GA3,GA4,GE1,GF1,KA1,KA2,MA1,MD1,MW1,MW2,OC1,OD1,OD2,REM,EQD,AW2,AX4,GD1,AW5,GN1,AJ1,AW3,MK1,KA4,GG3,AN1,RH1,AU5,HL1,OB1,AT8,AW7,AZ1,CH1,RH3,GK1,IB1,AX1,CT1,AK1,CN2,OE1,MW5,AO1,KA3,AA3,CR1,CF2,KB2,GM1,AT5,AY2,MW6,MG1,AH6,AU2,GD2,AW4,MF1,AA1,AH2,AH3,OE3,AT6,AL2,AL3,AX5,IB2,AI3,CV3,WA1,GH1,KF1,CU2,CT3,SA1,AU1,KD2,AI5,GO1,GD3,CG3,AI1,AL1,AW6,MW4,AX6,CV1,ME1,KC2,CN1,UA1,GD5,UG2,AT3,AT4,GJ1,MV1,GA5,CT2,CG2,ED1,AE1,CO1,KE1,KB1,AI4,MW3,KG2,AA2,AX2,AY1,RH2,OE2,CU3,MH1,AM1,AU4,GA6,KG1,AU3,AT7,KD1,GL1,IA1,GG2,OD3,UG1,CB1,AI6,CI1,CV2,AZ2,AD1,AH1,WD1,AA4,KC1,IA2,CF3,AI2,AT1,GD4,AX3,AH4,KB3,CU1,CN4,AT2,CG1,CF1,GG1,MV2,CW1,GG4,AB1,AH5,CN3
3809099999,2015-01-01T00:00:00.000+0000,4,50.086092,-5.255711,81.38,"CULDROSE, UK",FM-12,99999,V020,"200,1,N,0077,1","00240,1,C,N",8000199,1131,991,103061,,"01,1,+00180,1,07,1","05,1,+00240,1,07,1","08,1,+00360,1,07,1",,"9,AGL ,+99999,+99999",08991011999001801999999,,,999999102131,"3,1,002,1,+999,9",511.0,,,39901441999.0,49901341999.0,SYN10603809 11358 82015 10113 20099 30213 40306 53002 69901 75165 887// 333 81706 85708 88712 90710 91128 91026=,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,51021.0,,,,,,,,6000021.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,61021.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3809099999,2015-01-01T00:50:00.000+0000,4,50.086092,-5.255711,81.38,"CULDROSE, UK",FM-15,99999,V020,"210,1,N,0077,1","00183,1,C,N",8000199,1101,1001,999999,,"02,1,+00122,1,99,9","04,1,+00183,1,99,9","08,1,+00305,1,99,9",,"9,AGL ,+99999,+99999",99999021999001221999999,,,102901999999,,511.0,,,,,MET079METAR EGDR 010050Z 21015KT 8000 -DZ FEW004 SCT006 OVC010 11/10 Q1029 YLO1=,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3809099999,2015-01-01T01:00:00.000+0000,4,50.086092,-5.255711,81.38,"CULDROSE, UK",FM-12,99999,V020,"210,1,N,0077,1","00300,1,9,N",8000199,1131,1011,103001,,"01,1,+00120,1,07,1","03,1,+00180,1,07,1","08,1,+00300,1,07,1",,"9,AGL ,+99999,+99999",08991011999001201999999,,,999999102061,"8,1,004,1,+999,9",511.0,,,39901341999.0,,SYN09403809 41258 82115 10113 20101 30206 40300 58004 75155 887// 333 81704 83706 88710 90710 91126=,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,51021.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,51021.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3809099999,2015-01-01T01:50:00.000+0000,4,50.086092,-5.255711,81.38,"CULDROSE, UK",FM-15,99999,V020,"200,1,N,0082,1","00244,1,9,N",8000199,1201,1001,999999,,"04,1,+00183,1,99,9","07,1,+00244,1,99,9","08,1,+00305,1,99,9",,"9,AGL ,+99999,+99999",99999041999001831999999,,,102901999999,,51.0,,1441.0,,,MET086METAR EGDR 010150Z 20016G28KT 8000 HZ SCT006 BKN008 OVC010 12/10 Q1029 REDZ YLO1=,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3809099999,2015-01-01T02:00:00.000+0000,4,50.086092,-5.255711,81.38,"CULDROSE, UK",FM-12,99999,V020,"200,1,N,0082,1","00240,1,C,N",8000199,1151,1001,102941,,"03,1,+00180,1,07,1","05,1,+00240,1,07,1","08,1,+00300,1,07,1",,"9,AGL ,+99999,+99999",08991031999001801999999,,,999999102011,"8,1,008,1,+999,9",201.0,,,39901491999.0,49901441999.0,SYN10003809 41358 82016 10115 20100 30201 40294 58008 72052 886// 333 83706 85708 88710 90710 91129 91028=,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,21021.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,51021.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3809099999,2015-01-01T02:50:00.000+0000,4,50.086092,-5.255711,81.38,"CULDROSE, UK",FM-15,99999,V020,"210,1,N,0093,1","00122,1,9,N",6000199,1101,1101,999999,,"02,1,+00061,1,99,9","07,1,+00122,1,99,9","08,1,+00213,1,99,9",,"9,AGL ,+99999,+99999",99999021999000611999999,,,102901999999,,511.0,,,,,MET079METAR EGDR 010250Z 21018KT 6000 -DZ FEW002 BKN004 OVC007 11/11 Q1029 YLO2=,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3809099999,2015-01-01T03:00:00.000+0000,4,50.086092,-5.255711,81.38,"CULDROSE, UK",FM-12,99999,V020,"210,1,N,0093,1","00120,1,C,N",6000199,1111,1061,102961,,"01,1,+00060,1,07,1","05,1,+00120,1,07,1","08,1,+00210,1,07,1",,"9,AGL ,+99999,+99999",08991011999000601999999,,,999999102031,"5,1,010,1,+999,9",501.0,,,39901441999.0,,SYN09403809 41156 82118 10111 20106 30203 40296 55010 75052 887// 333 81702 85704 88707 90710 91128=,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,21021.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,51021.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3809099999,2015-01-01T03:50:00.000+0000,4,50.086092,-5.255711,81.38,"CULDROSE, UK",FM-15,99999,V020,"200,1,N,0082,1","00122,1,9,N",6000199,1101,1101,999999,,"02,1,+00061,1,99,9","07,1,+00122,1,99,9","08,1,+00183,1,99,9",,"9,AGL ,+99999,+99999",99999021999000611999999,,,102801999999,,511.0,,1341.0,,,MET082METAR EGDR 010350Z 20016G26KT 6000 -DZ FEW002 BKN004 OVC006 11/11 Q1028 YLO2=,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3809099999,2015-01-01T04:00:00.000+0000,4,50.086092,-5.255711,81.38,"CULDROSE, UK",FM-12,99999,V020,"200,1,N,0082,1","00120,1,C,N",6000199,1131,1071,102901,,"01,1,+00060,1,07,1","05,1,+00120,1,07,1","08,1,+00180,1,07,1",,"9,AGL ,+99999,+99999",08991011999000601999999,,,999999101971,"7,1,010,1,+999,9",511.0,,,39901391999.0,49901341999.0,SYN10003809 41156 82016 10113 20107 30197 40290 57010 75152 887// 333 81702 85704 88706 90710 91127 91026=,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,21021.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,51021.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3809099999,2015-01-01T04:50:00.000+0000,4,50.086092,-5.255711,81.38,"CULDROSE, UK",FM-15,99999,V020,"200,1,N,0082,1","00122,1,9,N",2500199,1101,1101,999999,,"04,1,+00061,1,99,9","08,1,+00122,1,99,9",,,"9,AGL ,+99999,+99999",99999041999000611999999,,,102801999999,,581.0,,1391.0,,,MET076METAR EGDR 010450Z 20016G27KT 2500 -RADZ SCT002 OVC004 11/11 Q1028 AMB=,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


### Exploring how far away neighboring weather stations are from the airports
From this we can see that going out just three neighbors already takes us, on average, about 33 km from the original airport.

In [0]:
# Average distance to the i-th ranked neighbor, for i = 1..max_neighbor_rank, to inform how
# many neighboring weather stations to retrieve data from for each flight.
max_neighbor_rank = 10

# Within each station_id, order the rows by distance_to_neighbor so they can be ranked.
# Note that rank() gives tied distances the same rank and leaves gaps after ties.
window = Window.partitionBy(df_stations['station_id']).orderBy(df_stations['distance_to_neighbor'])

# The ranking does not depend on the loop variable, so it is computed once and cached:
# the same ranked frame feeds every per-rank aggregation below.
stations_with_rank = df_stations.select('*', rank().over(window).alias('dist_to_airport_rank')).cache()

try:
    for i in range(1, max_neighbor_rank + 1):
        print(f"RANK {i}:")
        # Rows whose neighbor is the i-th closest for their station
        df_stations_ranked = stations_with_rank.filter(col('dist_to_airport_rank') == i)
        df_stations_ranked.select(F.avg("distance_to_neighbor")).show()
        print("----------------------------------------------------------")
finally:
    # Release the cached data once the summary is printed so it does not stay pinned
    # in executor storage for the rest of the session.
    stations_with_rank.unpersist()