In [2]:
import sys,os,re
import glob
import pandas as pd
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

Create or get a PySpark Instance

In [3]:
spark = SparkSession \
        .builder \
        .master("local[*]") \
        .config("spark.executor.memory", "70g") \
        .config("spark.driver.memory", "50g") \
        .config("spark.memory.offHeap.enabled",True) \
        .config("spark.memory.offHeap.size","16g") \
        .appName("Airline") \
        .getOrCreate()

22/11/03 19:44:27 WARN Utils: Your hostname, LinuxGUI resolves to a loopback address: 127.0.1.1; using 192.168.0.103 instead (on interface eno1)
22/11/03 19:44:27 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/11/03 19:44:27 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/11/03 19:44:28 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


Read flights, airports, and weather stations data.

In [5]:
# All flights dataset
flights_df = spark.read.option("header",True).csv('./data/flights/')
# GHCND stations information
stations_df = spark.read.option("header",False).csv('./data/ghcnd-stations_clean.csv')
# United States airport information
airports_df = spark.read.option("header",True).csv('./data/us-airports_clean.csv')

# Create tables for these dataframe in order to query
flights_df.createOrReplaceTempView("Flights")
stations_df.createOrReplaceTempView("Stations")
airports_df.createOrReplaceTempView("Airports")

Check all ORIGIN and DEST column in dataset, pick airport codes that appeared in dataset.

In [6]:
apCode_df = spark.sql("SELECT DISTINCT ORIGIN FROM Flights \
                        UNION \
                        SELECT DISTINCT DEST FROM Flights")
apCode_df.createOrReplaceTempView("Airport_Code")

Append full airport information with airport codes appeared in airline delay and cancellation dataset

In [11]:
# (ORIGIN, STATE, CITY, AP LOCAL_CODE, AP IATA_CODE, AP IDENT_CODE, AP NAME)
airport_full_df = spark.sql(" \
        SELECT c.ORIGIN, \
            p.local_region AS STATE, \
            p.municipality AS CITY, \
            p.local_code AS LOCAL, \
            p.iata_code AS IATA, \
            p.ident AS IDENT, \
            p.name AS NAME \
        FROM Airport_Code AS c \
        LEFT JOIN Airports AS p ON \
        (c.ORIGIN = p.local_code OR c.ORIGIN = p.iata_code)" \
    )
    
airport_full_df.write.option("header",True).csv('./output/airports_full')
airport_full_df.createOrReplaceTempView("Airport_Full")

                                                                                

*Use Regex Matching to filter out weather stations located in airports. "rlike" keywords

In [10]:
ap_stations = stations_df.filter( \
    lower(col('_c2')).like('% ap%') | \
    lower(col('_c2')).like('% airport%')) \

ap_stations.createOrReplaceTempView("APTABLE")
ap_stations.write.csv('./output/ap_station')

sqlStr = "SELECT * FROM APTABLE AS t \
        LEFT JOIN FULL_AP AS p ON \
        t._c2 LIKE CONCAT('%', p.column ,'%') "

AnalysisException: path file:/home/ax/Develop/CS543/Project1/output/ap_station already exists.

In [None]:
# Version2: Output AirportID,AirportName into ./output/airports.csv
spark.sql(" \
        SELECT i.ORIGIN, p.name \
        FROM AIRPORTID i \
        LEFT JOIN \
        AIRPORTS p ON i.ORIGIN = ( \
        CASE WHEN (p.iata_code IS NOT NULL) THEN (p.iata_code) \
        ELSE (p.local_code) END) \
        WHERE p.iso_country = 'US'" \
    ) \
    .sort("ORIGIN") \
    .write.options(heade=True, delimiter=',') \
    .csv("./output/airports")

In [None]:
# load stations info
apIdNameDF = spark.read.option("header",False).csv('./data/apIdName.csv')
apIdNames = apIdNameDF.collect()

name_seg = set()
for s in apIdNames:
    for i in str(s[1]).split():
        if i.isalpha():
            name_seg.add(i)

tl = list(name_seg)
tl.sort()
print(tl)

In [None]:
ghcndDF = spark.read.option("header",False).csv('./data/ghcnd-stations_clean.csv')
potentials = ghcndDF.filter( \
    lower(col('_c2')).like('% ap%') | \
    lower(col('_c2')).like('% airport%')).write.csv('test.csv')

Load preprocessed airport-station dataset and GHCND weather data

In [4]:
full_airport_station =  spark.read.option("header",True).csv('./data/airports_stations.csv')
ghcnd_stations = spark.read.option("header",False).csv('./data/ghcnd_by_year/')

In [None]:
print(full_airport_station.head())
print(ghcnd_stations.head())

Assign each existing airport codes into weather data value and filter out weather data that not relate to our dataset.

In [None]:
selected_airport_station = full_airport_station.select(col('ORIGIN'),col('STATION'))
ghcnd_stations_clean = ghcnd_stations.join( \
                                            selected_airport_station, \
                                            ghcnd_stations._c0 == selected_airport_station.STATION, \
                                            "LeftOuter" \
                                        )

In [None]:
# Keep not null rows and rename them based on GHCND documents
weatherData_df = ghcnd_stations_clean \
                .filter(ghcnd_stations_clean.ORIGIN.isNotNull()) \
                .select(col('_c0').alias("STATION_ID"), \
                        col('ORIGIN').alias("AIRPORT_CODE"), \
                        col('_c1').alias("DATE"), \
                        col('_c2').alias("ELEMENT"), \
                        col('_c3').alias("DATA_VALUE"), \
                        col('_c4').alias("M_FLAG"), \
                        col('_c5').alias("Q_FLAG"), \
                        col('_c6').alias("S_FLAG"), \
                        col('_c7').alias("OBS_TIME"))

Ouput filtered weather dataset

In [None]:
weatherData_df.write.option("header",True).csv("./output/filtered_weatherData")