In [56]:
# Set Spark environment variables for this kernel session.
import os
import sys

# Root of the PySpark installation.
# NOTE(review): machine-specific absolute path — adjust when running on another host.
os.environ["SPARK_HOME"] = '/home/ypang6/anaconda3/lib/python3.7/site-packages/pyspark'

# Use the interpreter that is running this notebook for the Spark Python workers
# (and for any driver process Spark launches), so driver and worker Python versions
# always match instead of depending on a hard-coded, user-specific binary path.
os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

# PYTHONPATH is intentionally not set: it is a search path of module directories,
# and this cell previously assigned it the interpreter binary, which is not a
# valid entry for that variable.

In [57]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *

In [58]:
# Create the SparkSession used by the rest of the notebook, or reuse one that
# already exists in this process (getOrCreate).
spark = (
    SparkSession.builder
    .appName("Terminal_Area_Flight_Data_Query")
    # NOTE(review): this key/value looks like a placeholder setting — confirm
    # whether it is needed before removing it.
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)

## Custom schema of the data
### Reference: IFF_2.15_Specs_Sherlock.doc

In [72]:
# Column layout applied when reading the IFF CSV: one (name, Spark type) pair per
# CSV column, listed in file order. The leading number in each comment is the
# 1-based column position.
_IFF_COLUMNS = [
    ("recType", ShortType()),                # 1   track point record type number
    ("recTime", StringType()),               # 2   seconds since midnight 1/1/70 UTC
    ("fltKey", LongType()),                  # 3   flight key
    ("bcnCode", IntegerType()),              # 4   digit range from 0 to 7
    ("cid", IntegerType()),                  # 5   computer flight id
    ("Source", StringType()),                # 6   source of the record
    ("msgType", StringType()),               # 7
    ("acId", StringType()),                  # 8   call sign
    ("recTypeCat", StringType()),            # 9
    ("lat", DoubleType()),                   # 10
    ("lon", DoubleType()),                   # 11
    ("alt", DoubleType()),                   # 12  in 100s of feet
    ("significance", ShortType()),           # 13  digit range from 1 to 10
    ("latAcc", DoubleType()),                # 14
    ("lonAcc", DoubleType()),                # 15
    ("altAcc", DoubleType()),                # 16
    ("groundSpeed", IntegerType()),          # 17  in knots
    ("course", DoubleType()),                # 18  in degrees from true north
    ("rateOfClimb", DoubleType()),           # 19  in feet per minute
    ("altQualifier", StringType()),          # 20  altitude qualifier (the "B4 character")
    ("altIndicator", StringType()),          # 21  altitude indicator (the "C4 character")
    ("trackPtStatus", StringType()),         # 22  track point status (e.g., 'C' for coast)
    ("leaderDir", IntegerType()),            # 23  int 0-8 representing the direction of the leader line
    ("scratchPad", StringType()),            # 24
    ("msawInhibitInd", ShortType()),         # 25  MSAW inhibit indicator (0 = not inhibited, 1 = inhibited)
    ("assignedAltString", StringType()),     # 26
    ("controllingFac", StringType()),        # 27
    ("controllingSec", StringType()),        # 28
    ("receivingFac", StringType()),          # 29
    ("receivingSec", StringType()),          # 30
    ("activeContr", IntegerType()),          # 31  the active control number
    ("primaryContr", IntegerType()),         # 32  the primary (previous, controlling, or possible next) controller number
    ("kybrdSubset", StringType()),           # 33  identifies a subset of controller keyboards
    ("kybrdSymbol", StringType()),           # 34  identifies a keyboard within the keyboard subsets
    ("adsCode", IntegerType()),              # 35  arrival departure status code
    ("opsType", StringType()),               # 36  operations type (O/E/A/D/I/U) from ARTS and ARTS 3A data
    ("airportCode", StringType()),           # 37
    ("trackNumber", IntegerType()),          # 38
    ("tptReturnType", StringType()),         # 39
    ("modeSCode", StringType()),             # 40
    ("sensorTrackNumberList", StringType()), # 41  a list of sensor/track number combinations
    ("spi", StringType()),                   # 42  representing the Ident feature
    ("dvs", StringType()),                   # 43  indicates the aircraft is within a suppression volume area
    ("dupM3a", StringType()),                # 44  indicates 2 aircraft have the same mode 3a code
    ("tid", StringType()),                   # 45  aircraft ident entered by pilot
]

# Every column is declared nullable (third StructField argument is True).
myschema = StructType(
    [StructField(col_name, col_type, True) for col_name, col_type in _IFF_COLUMNS]
)

In [73]:
# Day of data to load; this integer is substituted into the IFF file-name pattern
# when the input file is located (value appears to be in YYYYMMDD form — confirm).
date = 20190801

In [74]:
import glob

# Locate the IFF CSV for the selected day. The matches are sorted so the choice is
# deterministic when several files share the date prefix, and an explicit error
# naming the pattern is raised when nothing matches (instead of a bare IndexError
# from indexing an empty list).
_iff_pattern = "/media/ypang6/paralab/Research/data/ZTL/IFF_ZTL_{}*.csv".format(date)
_iff_matches = sorted(glob.glob(_iff_pattern))
if not _iff_matches:
    raise FileNotFoundError("No IFF file matches pattern {!r}".format(_iff_pattern))
file_path = _iff_matches[0]

In [75]:
# Read the comma-separated file (no header row) into a DataFrame, applying
# myschema rather than letting Spark infer column names and types.
df = spark.read.csv(
    path=file_path,
    schema=myschema,
    sep=",",
    header=False,
)

In [76]:
# Print the column names and types of df so they can be checked against myschema.
df.printSchema()

root
 |-- recType: short (nullable = true)
 |-- recTime: string (nullable = true)
 |-- fltKey: long (nullable = true)
 |-- bcnCode: integer (nullable = true)
 |-- cid: integer (nullable = true)
 |-- Source: string (nullable = true)
 |-- msgType: string (nullable = true)
 |-- acId: string (nullable = true)
 |-- recTypeCat: string (nullable = true)
 |-- lat: double (nullable = true)
 |-- lon: double (nullable = true)
 |-- alt: double (nullable = true)
 |-- significance: short (nullable = true)
 |-- latAcc: double (nullable = true)
 |-- lonAcc: double (nullable = true)
 |-- altAcc: double (nullable = true)
 |-- groundSpeed: integer (nullable = true)
 |-- course: double (nullable = true)
 |-- rateOfClimb: double (nullable = true)
 |-- altQualifier: string (nullable = true)
 |-- altIndicator: string (nullable = true)
 |-- trackPtStatus: string (nullable = true)
 |-- leaderDir: integer (nullable = true)
 |-- scratchPad: string (nullable = true)
 |-- msawInhibitInd: short (nullable = true)
 |

### Select Columns

In [77]:
# Columns kept for the recType == 3 subset.
cols = ['recType', 'recTime', 'acId', 'lat', 'lon', 'alt']

# Project to the columns above, keep only rows whose recType is 3, and replace the
# string recTime column with its integer cast.
df_rec3 = (
    df.select(*cols)
    .where(df['recType'] == 3)
    .withColumn("recTime", df['recTime'].cast(IntegerType()))
)

In [78]:
# Preview the selected columns of the recType == 3 rows.
df_rec3.show()

+-------+----------+-----+--------+---------+----+
|recType|   recTime| acId|     lat|      lon| alt|
+-------+----------+-----+--------+---------+----+
|      3|1564634988|SKQ74|36.10444|-79.43917|23.0|
|      3|1564634999|SKQ74|36.11417|-79.43611|26.0|
|      3|1564635012|SKQ74|36.12389| -79.4325|29.0|
|      3|1564635024|SKQ74|36.13333|-79.42889|32.0|
|      3|1564635036|SKQ74|36.14278|-79.42583|37.0|
|      3|1564635048|SKQ74| 36.1525|-79.42278|40.0|
|      3|1564635059|SKQ74|36.16083|   -79.42|43.0|
|      3|1564635072|SKQ74|36.17139|-79.41667|46.0|
|      3|1564635084|SKQ74|36.18056|-79.41389|48.0|
|      3|1564635096|SKQ74|36.19028|-79.41056|51.0|
|      3|1564635108|SKQ74|    36.2|-79.40722|54.0|
|      3|1564635120|SKQ74|   36.21|-79.40389|58.0|
|      3|1564635132|SKQ74|36.21944|-79.40056|60.0|
|      3|1564635143|SKQ74|36.22833|-79.39778|63.0|
|      3|1564635155|SKQ74|36.23833|-79.39444|66.0|
|      3|1564635167|SKQ74|36.24806|-79.39083|69.0|
|      3|1564635180|SKQ74|36.25

In [79]:
# Preview the rows whose recType is 2; they are displayed under the column names
# defined in myschema.
df.where(df['recType'] == 2).show()

+-------+--------------+------+-------+---+------+-------+-------+----------+----+----+----+------------+------+------+------+-----------+------+-----------+------------+------------+-------------+---------+----------+--------------+-----------------+--------------+--------------+------------+------------+-----------+------------+-----------+-----------+-------+-------+-----------+-----------+-------------+---------+---------------------+----+----+------+----+
|recType|       recTime|fltKey|bcnCode|cid|Source|msgType|   acId|recTypeCat| lat| lon| alt|significance|latAcc|lonAcc|altAcc|groundSpeed|course|rateOfClimb|altQualifier|altIndicator|trackPtStatus|leaderDir|scratchPad|msawInhibitInd|assignedAltString|controllingFac|controllingSec|receivingFac|receivingSec|activeContr|primaryContr|kybrdSubset|kybrdSymbol|adsCode|opsType|airportCode|trackNumber|tptReturnType|modeSCode|sensorTrackNumberList| spi| dvs|dupM3a| tid|
+-------+--------------+------+-------+---+------+-------+-------+----