In [7]:
import sys,os,re
import glob
import pandas as pd
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import col,udf,lower,date_format

In [8]:
# Create (or reuse) the Spark session shared by every cell in this notebook.
# "local[*]" runs Spark on this machine only; the memory settings below are
# passed to the session exactly as written.
spark = (
    SparkSession.builder
    .master("local[*]")
    .config("spark.executor.memory", "70g")
    .config("spark.driver.memory", "50g")
    .config("spark.memory.offHeap.enabled", True)
    .config("spark.memory.offHeap.size", "16g")
    .appName("Airline")
    .getOrCreate()
)

Read flights, airports, and weather stations data.

In [13]:
# Load each raw input and immediately expose it to Spark SQL as a temp view.

# Flights: a directory of CSV files with a header row.
flights_df = spark.read.csv('./data/flights/', header=True)
flights_df.createOrReplaceTempView("Airlines")

# Weather stations: no header row, so columns are addressed positionally
# (_c0, _c1, _c2, ...) in later cells.
stations_df = spark.read.csv('./data/ghcnd-stations_clean.csv', header=False)
stations_df.createOrReplaceTempView("Stations")

# Airports reference table, with a header row.
airports_df = spark.read.csv('./data/us-airports_clean.csv', header=True)
airports_df.createOrReplaceTempView("Airports")

# Disabled: alternative airport-code source, kept for reference.
# apCodes_df = spark.read.option("header",True).csv('./data/airport-codes_clean.csv')
# apCodes_df.createOrReplaceTempView("AirportCodes")

Scan the ORIGIN and DEST columns of the flights dataset and collect every distinct airport code that appears in either one.

In [14]:
# Every airport code that occurs as an origin or a destination in the flights
# data. The result has a single column named ORIGIN (taken from the first
# SELECT of the UNION).
airport_code_sql = " \
    SELECT DISTINCT ORIGIN FROM AIRLINES \
    UNION \
    SELECT DISTINCT DEST FROM AIRLINES"
AP_CODEs = spark.sql(airport_code_sql)

# Expose the code list to later SQL cells.
AP_CODEs.createOrReplaceTempView("AirportCode")

In [15]:
# Headerless id/name file, joined to the airports table on its first column
# (_c0) = Airports.local_code. Rows with no matching airport are kept.
# NOTE(review): the recorded run of this cell raised AnalysisException because
# ./data/apIdName.csv did not exist -- confirm where that file is produced.
APIDNAMEs = spark.read.csv('./data/apIdName.csv', header=False)
APIDNAMEs.createOrReplaceTempView("APIDNAME")

apidname_join_sql = " \
        SELECT * FROM APIDNAME AS n \
        LEFT JOIN Airports AS p ON n._c0 = p.local_code \
    "
APIDNAMELOCAL = spark.sql(apidname_join_sql)

AnalysisException: Path does not exist: file:/home/ax/Develop/CS543/Project1/data/apIdName.csv

In [16]:
# Resolve every airport code seen in the flights data against the airports
# reference table. Output columns:
# (ORIGIN, STATE, CITY, AP LOCAL_CODE, AP IATA_CODE, AP IDENT_CODE, AP NAME)
# A code matches an airport when it equals either the airport's local_code or
# its iata_code. Codes with no match are kept (LEFT JOIN) with NULL airport
# fields.
FULL_DF = spark.sql(" \
        SELECT c.ORIGIN, \
            p.local_region AS STATE, \
            p.municipality AS CITY, \
            p.local_code AS LOCAL, \
            p.iata_code AS IATA, \
            p.ident AS IDENT, \
            p.name AS NAME \
        FROM AirportCode AS c \
        LEFT JOIN Airports AS p ON \
        (c.ORIGIN = p.local_code OR c.ORIGIN = p.iata_code)" \
    )

# Overwrite mode keeps this cell re-runnable: with Spark's default save mode a
# second run fails because ./output/full already exists.
FULL_DF.write.mode("overwrite").option("header",True).csv('./output/full')
FULL_DF.createOrReplaceTempView("FULL_AP")

                                                                                

In [17]:
# Keep only weather stations whose name column (_c2) contains " ap" or
# " airport" (case-insensitive); these are the candidate airport stations.
ap_stations = stations_df.filter(
    lower(col('_c2')).like('% ap%') |
    lower(col('_c2')).like('% airport%')
)

ap_stations.createOrReplaceTempView("APTABLE")
# Overwrite mode keeps this cell re-runnable: with Spark's default save mode a
# second run fails because ./output/ap_station already exists.
ap_stations.write.mode("overwrite").csv('./output/ap_station')

# TODO(review): draft query, never executed in this notebook. `p.column` is a
# placeholder -- FULL_AP exposes ORIGIN, STATE, CITY, LOCAL, IATA, IDENT and
# NAME -- so pick the real column before passing this to spark.sql().
sqlStr = "SELECT * FROM APTABLE AS t \
        LEFT JOIN FULL_AP AS p ON \
        t._c2 LIKE CONCAT('%', p.column ,'%') "

In [None]:
# Version2: Output AirportID,AirportName into ./output/airports.csv
# Each code is matched against the airport's iata_code when present, otherwise
# against its local_code. Results are sorted by code and written as CSV with a
# header row.
# NOTE(review): no visible cell registers a view named AIRPORTID; the
# single-column view created earlier is called AirportCode -- confirm which one
# is intended before running on a fresh kernel.
# NOTE(review): the WHERE clause filters on a column of the right-hand table,
# so codes without a matching US airport are dropped despite the LEFT JOIN.
(
    spark.sql(" \
        SELECT i.ORIGIN, p.name \
        FROM AIRPORTID i \
        LEFT JOIN \
        AIRPORTS p ON i.ORIGIN = ( \
        CASE WHEN (p.iata_code IS NOT NULL) THEN (p.iata_code) \
        ELSE (p.local_code) END) \
        WHERE p.iso_country = 'US'")
    .sort("ORIGIN")
    # Overwrite mode keeps the cell re-runnable once ./output/airports exists.
    .write.mode("overwrite")
    # Fixed typo: the option was spelled `heade`, so no header row was written.
    # Anything that reads this output must now treat the first line as a header.
    .options(header=True, delimiter=',')
    .csv("./output/airports")
)

In [None]:
# Build the sorted vocabulary of purely alphabetic words found in the second
# column of apIdName.csv (presumably the airport name -- confirm against the
# file) and print it.
apIdNameDF = spark.read.csv('./data/apIdName.csv', header=False)
apIdNames = apIdNameDF.collect()  # pulls every row to the driver

# Split each value on whitespace and keep only tokens made entirely of letters.
name_seg = {
    word
    for row in apIdNames
    for word in str(row[1]).split()
    if word.isalpha()
}

tl = sorted(name_seg)
print(tl)

In [None]:
# Scratch check: re-read the station list and dump the rows whose name column
# (_c2) contains " ap" or " airport" (case-insensitive) to test.csv.
ghcndDF = spark.read.option("header",False).csv('./data/ghcnd-stations_clean.csv')

# Keep the filtered DataFrame itself. Previously `potentials` was bound to the
# return value of .write.csv(...), which is None.
potentials = ghcndDF.filter(
    lower(col('_c2')).like('% ap%') |
    lower(col('_c2')).like('% airport%')
)

# Overwrite mode keeps the cell re-runnable once test.csv exists.
potentials.write.mode("overwrite").csv('test.csv')

In [None]:
# Airport-to-station mapping (has a header row; ORIGIN and STATION are used below).
full_airport_station = spark.read.csv('./data/airports_stations.csv', header=True)

# Per-year GHCN-D observation files: no header, so columns are _c0, _c1, ...
ghcnd_stations = spark.read.csv('./data/ghcnd_by_year/', header=False)

In [None]:
# Sanity check: print the first row of each freshly loaded DataFrame.
for frame in (full_airport_station, ghcnd_stations):
    print(frame.head())

In [None]:
# Tag each observation row with the airport code of its station.
# Only the two mapping columns are needed for the lookup.
selected_airport_station = full_airport_station.select('ORIGIN', 'STATION')

# Left outer join: observations from stations not in the mapping are kept here,
# with NULL ORIGIN/STATION.
station_matches = ghcnd_stations['_c0'] == selected_airport_station['STATION']
ghcnd_stations_clean = ghcnd_stations.join(
    selected_airport_station,
    station_matches,
    "LeftOuter",
)

In [None]:
# Keep only observations that matched an airport (ORIGIN is not NULL) and give
# the positional columns descriptive names, following the GHCN-D documentation.
ghcnd_column_names = [
    ('_c0', "STATION_ID"),
    ('ORIGIN', "AIRPORT_CODE"),
    ('_c1', "DATE"),
    ('_c2', "ELEMENT"),
    ('_c3', "DATA_VALUE"),
    ('_c4', "M_FLAG"),
    ('_c5', "Q_FLAG"),
    ('_c6', "S_FLAG"),
    ('_c7', "OBS_TIME"),
]

matched_observations = ghcnd_stations_clean.where(
    ghcnd_stations_clean['ORIGIN'].isNotNull()
)
filter_null = matched_observations.select(
    [col(source).alias(target) for source, target in ghcnd_column_names]
)

In [None]:
# Persist the airport-tagged observations as CSV with a header row.
# Uses Spark's default save mode, so this fails if the directory already exists.
filter_null.write.csv("./output/filter_all_v4", header=True)

In [None]:
# Attach weather observations to each flight.
# FL_DATE is reformatted to yyyyMMdd (as FL_DATE_T) so it can be compared with
# the observation DATE column; the original FL_DATE is dropped afterwards.
airlines = (
    spark.read.option("header", True).csv('./data/AIRLINES/')
    .select(date_format(col('FL_DATE'), "yyyyMMdd").alias("FL_DATE_T"), "*")
)

# Match on BOTH the date and the departure airport. Joining on the date alone
# pairs every flight with the observations of every station for that day.
# NOTE(review): assumes ./data/AIRLINES/ has the same ORIGIN column as the
# flights data queried earlier -- confirm. Use DEST instead (or as well) if
# destination weather is wanted.
same_day = airlines['FL_DATE_T'] == filter_null['DATE']
same_airport = airlines['ORIGIN'] == filter_null['AIRPORT_CODE']

# Left outer join: flights with no matching observation are kept with NULLs.
paired = airlines.join(filter_null, same_day & same_airport, "leftouter") \
                    .drop('FL_DATE')

In [None]:
# Persist the flight/weather pairs as CSV with a header row.
# Uses Spark's default save mode, so this fails if the directory already exists.
paired.write.csv("./output/FL_Weather", header=True)