In [17]:
# Installing required packages
#!pip install pyspark
#!pip install findspark

In [1]:
import findspark
findspark.init()

In [2]:
# PySpark is the Spark API for Python. In this lab, we use PySpark to initialize the spark context. 
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
from pyspark.sql.types import StructType,StructField, StringType, IntegerType, FloatType


In [9]:
from pyspark.sql.functions import lit, when, length, trim, ltrim, rtrim, col, udf, substring 
import numpy as np

In [3]:
# Creating a spark context class
sc = SparkContext()

# Creating a spark session
spark = (SparkSession.builder 
                     .appName("Python Spark DataFrames basic example") 
                     .config("spark.some.config.option", "some-value") 
                     .getOrCreate()
        )

In [4]:
# Ler o dataset
path_data = '../datasets/airports.csv'
#RDD
rdd  = spark.sparkContext.wholeTextFiles(path_data)

#print("initial partition count:"+str(rdd.getNumPartitions()))

#Schema
schema = (StructType([ 
    StructField("faa",StringType(),True), 
    StructField("name",StringType(),True), 
    StructField("lat",FloatType(),True), 
    StructField("lon", FloatType(), True), 
    StructField("alt", IntegerType(), True), 
    StructField("tz", FloatType(), True),
    StructField("dst", StringType(), True )
                    ])
         )

#DataFrame
airports = spark.read.csv(path_data, header = True, schema = schema)

airports.printSchema()

root
 |-- faa: string (nullable = true)
 |-- name: string (nullable = true)
 |-- lat: float (nullable = true)
 |-- lon: float (nullable = true)
 |-- alt: integer (nullable = true)
 |-- tz: float (nullable = true)
 |-- dst: string (nullable = true)



# Dicionário
#### - faa (string): Identificador do aeroporto determinado pela Federal Aviation Administration. Formato: 3-5 caracteres alfanuméricos.
#### - name (string): Nome do aeroporto.
#### - lat (float): Latitude do aeroporto. Intervalo de valores .
#### - lon (float): Longitude do aeroporto Intervalo de valores .
#### - alt (int): Altitude do aeroporto. Unidade de medida em pés. Intervalo de valores .
#### - tz (float): Fuso horário baseado no deslocamento de horas a partir de UTC/GMT. Intervalo de valores. Pode ser fuso fracionário [1]
#### - dst (category): Horário de verão. Descrição dos possíveis valores [2]:
     E (Europe)
     A (US/Canada)
     S (South America)
     O (Australia)
     Z (New Zealand)
     N (None)
     U (Unknown)


## Airport - Perguntas


In [None]:
df.createOrReplaceTempView('airports_table')

#### Pergunta 1


In [10]:
airports.where(length(airports.faa) == 3).distinct().show()

+---+--------------------+---------+-----------+----+----+---+------+
|faa|                name|      lat|        lon| alt|  tz|dst|qa_faa|
+---+--------------------+---------+-----------+----+----+---+------+
|2G9|Somerset County A...| 40.03887|  -79.01499|2275|-5.0|  A|  null|
|CWI|   Clinton Municipal| 41.83075|  -90.32897| 708|-6.0|  A|     F|
|DLL|Baraboo Wisconsin...|43.521786|  -89.77093| 979|-6.0|  A|     F|
|INJ|      Hillsboro Muni|32.083485|  -97.09723| 685|-6.0|  A|     F|
|IRK|Kirksville Region...|  40.0935|   -92.5449| 966|-6.0|  A|     F|
|KCL|Chignik Lagoon Ai...| 56.31111| -158.53416|  25|-9.0|  A|     F|
|SEA| Seattle Tacoma Intl|   47.449|  -122.3093| 433|-8.0|  A|     F|
|UNK|  Unalakleet Airport|63.888332| -160.79889|  21|-9.0|  A|     F|
|55J|Fernandina Beach ...|30.611834|   -81.4612|  16|-4.0|  A|  null|
|8M8|     Garland Airport|44.806526|  -84.27619|1218|-5.0|  A|  null|
|CZN|     Chisana Airport| 62.07111| -142.04834|1011|-9.0|  A|     F|
|ENA|          Kenai

In [12]:
airports = airports.withColumn('qa_faa', 
                        (when((airports.faa == '') | 
                              (airports.faa.isNull() == True) |
                              (airports.faa.rlike('\t')) |
                              (airports.faa.rlike(' +'))
                              , lit('M')) 
                        .when((length(airports.faa) < 3) | 
                              (length(airports.faa) > 5)  
                              , lit('F')) 
                        )
                   )

airports.groupBy('qa_faa').count().show()

+------+-----+
|qa_faa|count|
+------+-----+
|  null| 1397|
+------+-----+



#### Pergunta 2


In [13]:
airports = airports.withColumn('qa_name', 
                  (when(airports.name == None  |
                       (airports.name == ''), lit('M')
                       )
                  )
             )

airports.where(airports.qa_name == 'M').show()

+---+----+---+---+---+---+---+------+-------+
|faa|name|lat|lon|alt| tz|dst|qa_faa|qa_name|
+---+----+---+---+---+---+---+------+-------+
+---+----+---+---+---+---+---+------+-------+



#### Pergunta 3


In [17]:
airports = airports.withColumn('qa_lat', 
                 (when(airports.lat == None |
                       (airports.lat == ''), lit ('M')
                      )
                 .when((airports.lat < -180) | 
                       (airports.lat > 180), lit ('I'))
                 .when(airports.lat.rlike("^[a-zA-Z]+$"), lit('A'))
                 )
             )

airports.groupBy('qa_lat').count().show()

+------+-----+
|qa_lat|count|
+------+-----+
|  null| 1397|
+------+-----+



#### Pergunta 4


In [18]:
airports = airports.withColumn('qa_lon', 
                 (when((airports.lon == None) |
                      (airports.lon == ''),lit ('M')) 
                 .when((airports.lon < -180) | 
                       (airports.lon > 180), lit ('I')) 
                 .when(airports.lon.rlike("^[a-zA-Z]+$"), lit('A'))
                 )
             )

airports.groupBy('qa_lon').count().show()

+------+-----+
|qa_lon|count|
+------+-----+
|  null| 1397|
+------+-----+



#### Pergunta 5


In [19]:
airports = airports.withColumn('qa_alt', 
                 (when(((airports.alt == None) |
                        (airports.alt == '')
                       ), lit ('M')) 
                 .when((airports.alt < 0)  
                       , lit ('I')) 
                 .when(airports.alt.rlike("^[a-zA-Z]+$"), lit('A'))
                 )
             )

airports.groupBy('qa_alt').count().show()

+------+-----+
|qa_alt|count|
+------+-----+
|  null| 1395|
|     I|    2|
+------+-----+



#### Pergunta 6

In [20]:
airports = airports.withColumn('qa_tz',
                 (when((airports.tz.isNull()) | 
                        (airports.tz == ''),
                        lit('M'))
                 .when(
                        (airports.tz < - 11) |
                        (airports.tz > 14),
                        lit('I'))
                 .when(
                        (airports.tz.rlike("^[a-zA-Z]+$")),
                        lit('A'))
                 )
             )

airports.groupBy('qa_tz').count().show()

+-----+-----+
|qa_tz|count|
+-----+-----+
| null| 1397|
+-----+-----+



#### Pergunta 7

In [21]:
expected_categories = ['E', 'A', 'S', 'O', 'Z', 'N', 'U']
airports = airports.withColumn('qa_dst',
                 (when(
                     ((airports.dst.isNull()) |
                     (airports.dst == '')),
                     lit('M')
                     )
                  .when(
                      (~airports.dst.isin(expected_categories)),
                      lit('C')
                      )
                  .when(
                      (airports.dst.rlike("^[0-9]+$")),
                      lit('N')
                      )
                 )
             )

airports.groupBy('qa_dst').count().show()

+------+-----+
|qa_dst|count|
+------+-----+
|  null| 1397|
+------+-----+



## Saving as parquet file

In [22]:
airports.write.mode('overwrite').parquet('../datasets/qa_airports.parquet')

## Planes - Perguntas


#### tailnum (string): Identificação do avião. Formato "N-Number", é composto por 5-6 caracteres.
#### Primeira letra é sempre "N".
#### De 1 a 4 digitos seguidos por 1 letra (ex. N1234Z).
#### De 1 a 3 digitos seguidos por 2 letras (ex. N123AZ).
#### Não deve conter 0 (zero) como primeiro digito, e não deve conter as letras "I" ou "O".

In [5]:
path_data_planes = '../datasets/planes.csv'

#Schema
schema = (StructType([ 
    StructField("tailnum",StringType(),True), 
    StructField("year",IntegerType(),True), 
    StructField("type",StringType(),True), 
    StructField("manufacturer", StringType(), True), 
    StructField("model", StringType(), True), 
    StructField("engines", IntegerType(), True),
    StructField("seats", IntegerType(), True ),
    StructField("speed", IntegerType(), True),
    StructField("enginge", StringType(), True)
                    ])
         )

#DataFrame
planes = spark.read.csv(path_data_planes, header = True, schema = schema)

#### Pergunta 1

In [6]:
planes.show(5)

+-------+----+--------------------+----------------+--------+-------+-----+-----+---------+
|tailnum|year|                type|    manufacturer|   model|engines|seats|speed|  enginge|
+-------+----+--------------------+----------------+--------+-------+-----+-----+---------+
| N102UW|1998|Fixed wing multi ...|AIRBUS INDUSTRIE|A320-214|      2|  182| null|Turbo-fan|
| N103US|1999|Fixed wing multi ...|AIRBUS INDUSTRIE|A320-214|      2|  182| null|Turbo-fan|
| N104UW|1999|Fixed wing multi ...|AIRBUS INDUSTRIE|A320-214|      2|  182| null|Turbo-fan|
| N105UW|1999|Fixed wing multi ...|AIRBUS INDUSTRIE|A320-214|      2|  182| null|Turbo-fan|
| N107US|1999|Fixed wing multi ...|AIRBUS INDUSTRIE|A320-214|      2|  182| null|Turbo-fan|
+-------+----+--------------------+----------------+--------+-------+-----+-----+---------+
only showing top 5 rows



#### Pergunta 1

In [7]:
planes.show(1)

+-------+----+--------------------+----------------+--------+-------+-----+-----+---------+
|tailnum|year|                type|    manufacturer|   model|engines|seats|speed|  enginge|
+-------+----+--------------------+----------------+--------+-------+-----+-----+---------+
| N102UW|1998|Fixed wing multi ...|AIRBUS INDUSTRIE|A320-214|      2|  182| null|Turbo-fan|
+-------+----+--------------------+----------------+--------+-------+-----+-----+---------+
only showing top 1 row



In [13]:
planes = planes.withColumn("qa_tailnum",
                  (when(
                          (planes.tailnum.isNull()) |
                           (planes.tailnum == ''),
                           lit('M')
                        )
                   .when(
                         (length(trim(planes.tailnum))!= 6),
                       lit('S')
                      )
                   .when(
                         (planes.tailnum.rlike("^[N][0-9]{3}[a-zA-Z0-9]{2}$")) == False,
                        lit('F')
                        )
                   .when(
                       (planes.tailnum.rlike("^[N]") == False ),
                       lit('FN')
                       )
                   .when(
                       (planes.tailnum.rlike("^[N][I|O|0]") == True),
                       lit('FE')
                       )
                  )
              )

planes.groupBy('qa_tailnum').count().show()

+----------+-----+
|qa_tailnum|count|
+----------+-----+
|      null| 2609|
|         S|   19|
+----------+-----+



#### Pergunta 2

In [33]:
planes = planes.withColumn("qa_year",
                  (when((planes.year.isNull()) |
                        (planes.year == ''),
                        lit('M')
                      )
                   .when((planes.year < 1950),
                         lit('I')
                       )
                  )
)

planes.groupBy('qa_year').count().show()

+-------+-----+
|qa_year|count|
+-------+-----+
|   null| 2567|
|      M|   60|
|      I|    1|
+-------+-----+



#### Pergunta 3


In [34]:
engines = ['Fixed wing multi engine',
           'Fixed wing single engine',
           'Rotorcraft'
          ]

planes = planes.withColumn("qa_type",
                  (when((planes.type.isNull()) |
                        (planes.type == ''),
                        lit('M')
                       )
                   .when(~planes.type.isin(engines),
                         lit('C')
                        )
                  )
)

planes.groupBy('qa_type').count().show()

+-------+-----+
|qa_type|count|
+-------+-----+
|   null| 2628|
+-------+-----+



#### Pergunta 4


In [45]:
manufacturer = ['AIRBUS',
                'BOEING',
                'BOMBARDIER',
                'CESSNA',
                'EMBRAER',
                'SIKORSKY',
                'CANADAIR',
                'PIPER',
                'MCDONNELL DOUGLAS',
                'CIRRUS',
                'BELL',
                'KILDALL GARY',
                'LAMBERT RICHARD',
                'BARKER JACK',
                'ROBINSON HELICOPTER',
                'GULFSTREAM',
                'MARZ BARRY',
               ]

planes = planes.withColumn("qa_manufacturer",
                     (when((planes.manufacturer.isNull()) |
                           (planes.manufacturer == ''),
                           lit('M')
                         )
                      .when(~planes.manufacturer.isin(manufacturer),
                            lit('C')
                          )
                     )
                 )

planes.groupBy('qa_manufacturer').count().show()
planes.select('qa_manufacturer', 'manufacturer').distinct().sort('qa_manufacturer').show(truncate = False)

+---------------+-----+
|qa_manufacturer|count|
+---------------+-----+
|           null| 2007|
|              C|  621|
+---------------+-----+

+---------------+-----------------------------+
|qa_manufacturer|manufacturer                 |
+---------------+-----------------------------+
|null           |CESSNA                       |
|null           |EMBRAER                      |
|null           |BOEING                       |
|null           |KILDALL GARY                 |
|null           |CANADAIR                     |
|null           |AIRBUS                       |
|null           |LAMBERT RICHARD              |
|null           |PIPER                        |
|null           |SIKORSKY                     |
|null           |BELL                         |
|null           |MARZ BARRY                   |
|null           |MCDONNELL DOUGLAS            |
|C              |GULFSTREAM AEROSPACE         |
|C              |BOMBARDIER INC               |
|C              |CIRRUS DESIGN CORP    

#### Pergunta 5


In [47]:
planes = planes.withColumn('qa_model',
                     (when((planes.model.isNull()) |
                           (planes.model == ''),
                           lit('M')
                         )
                     )
                     .when((planes.manufacturer == 'AIRBUS INDUSTRIE') &
                           (~planes.model.rlike("^[A]")), 
                           lit ('F')
                     )
                     .when((planes.manufacturer == 'BOEING') &
                           (~planes.model.rlike("^[7]")), 
                           lit ('F')
                     )
                     .when(
                           ((planes.manufacturer == 'BOMBARDIER') | (planes.manufacturer == 'CANADAIR')) &
                           (~planes.model.rlike("^[CL]")), 
                           lit ('F')
                     )
                     .when((planes.manufacturer == 'MCDONELL DOUGLAS') &
                           ((~planes.model.rlike("^[MD]")) |
                            (~planes.model.rlike("^[DC]"))
                           )
                           , lit ('F')
                     )
                 )

planes.groupBy('qa_model').count().show()

+--------+-----+
|qa_model|count|
+--------+-----+
|       F|   15|
|    null| 2613|
+--------+-----+



### Pergunta 6

In [49]:
planes = planes.withColumn('qa_engines',
                     (when((planes.engines.isNull()) |
                           (planes.engines == ''),
                           lit('M')
                         )
                      .when(~planes.engines.between(1, 4),
                            lit('I')
                          )
                      .when(planes.engines.rlike('[a-zA-Z]+$'),
                            lit('A')
                          )
                     )
                 )

planes.groupBy('qa_engines').count().show()

+----------+-----+
|qa_engines|count|
+----------+-----+
|      null| 2628|
+----------+-----+



### Pergunta 7

In [51]:
planes = planes.withColumn('qa_seats',
                      (when((planes.seats.isNull()) |
                            (planes.seats == ''),
                            lit('M')
                          )
                       .when(~planes.seats.between(2,500),
                             lit('I')
                           )
                       .when(planes.seats.rlike('[a-zA-Z]+$'),
                             lit('A')
                           )
                      )
                 )

planes.groupBy('qa_seats').count().show()

+--------+-----+
|qa_seats|count|
+--------+-----+
|    null| 2628|
+--------+-----+



### Pergunta 8

In [52]:
planes = planes.withColumn('qa_speed',
                     (when((planes.speed.isNull()) |
                           (planes.speed == ''),
                           lit('M')
                         )
                      .when(~planes.speed.between(50,150),
                            lit('I')
                          )
                      .when(planes.speed.rlike('[a-zA-Z]+$'),
                            lit('A')
                          )
                     )
                 )

planes.groupBy('qa_speed').count().show()

+--------+-----+
|qa_speed|count|
+--------+-----+
|    null|    6|
|       M| 2622|
+--------+-----+



### Pergunta 9

In [60]:
enginge_categories = ['Turbo-fan',
                      'Turbo-jet',
                      'Turbo-prop',
                      'Turbo-shaft',
                      '4 Cycle'
                     ]

planes = planes.withColumn('qa_enginge',
                     (when((planes.enginge.isNull()) |
                           (planes.enginge == ''),
                           lit('M')
                         )
                      .when(~planes.enginge.isin(enginge_categories),
                            lit('C')
                          )
                     )
                 )

planes.groupBy('qa_enginge').count().show()
planes.select('qa_enginge', 'enginge').distinct().show()

+----------+-----+
|qa_enginge|count|
+----------+-----+
|      null| 2618|
|         C|   10|
+----------+-----+

+----------+-------------+
|qa_enginge|      enginge|
+----------+-------------+
|         C|Reciprocating|
|      null|    Turbo-jet|
|      null|      4 Cycle|
|      null|    Turbo-fan|
|      null|   Turbo-prop|
|      null|  Turbo-shaft|
+----------+-------------+



In [54]:
planes.select(planes.speed).distinct().show(50)

+-----+
|speed|
+-----+
|  108|
|  126|
| null|
|  112|
|  107|
|   90|
+-----+



In [61]:
planes.write.mode('overwrite').parquet('../datasets/qa_planes.parquet')

## Flights


#### year (int), month (int), day (int): Ano, Mês, Dia de partida. hour (int), minute (int): Hora e Minuto agendada para partida.
#### dep_time (string), arr_time (string): Horario real de partida/chegada do voo no horário local. Formato: HHMM ou HMM.
#### dep_delay (int), arr_delay (int): Atraso de partida/chegada do voo em minutos. Valores negativos representam partidas/chegadas antecipadas.
#### carrier (string): Identificador da empresa aérea.
#### tailnum (string): Identificador do avião. Veja dataset planes.
#### flight (string): Identificador do vôo. Formato: 4 dígitos (preenchidos com zero a esquerda caso necessário).
#### origin (string), dest (string): Identificadores dos aeroportos de origem e destino. Veja dataset airports.
#### air_time (int): Tempo de vôo. Unidade de medida em minutos. Intervalo de dados .

In [11]:
flights_schema = (StructType([ 
    StructField("year",IntegerType(),True), 
    StructField("month",IntegerType(),True), 
    StructField("day",IntegerType(),True), 
    StructField("dep_time", StringType(), True), 
    StructField("dep_delay", IntegerType(), True), 
    StructField("arr_time", StringType(), True),
    StructField("arr_delay", IntegerType(), True ),
    StructField("carrier", StringType(), True),
    StructField("tailnum", StringType(), True),
    StructField("flight", StringType(), True),
    StructField("origin", StringType(), True),
    StructField("dest", StringType(), True),
    StructField("air_time", IntegerType(), True),
    StructField("distance", IntegerType(), True),
    StructField("hour", IntegerType(), True),
    StructField("minute", IntegerType(), True)
                    ])
         )
flights = spark.read.csv('../datasets/flights.csv', header = True, schema = flights_schema)

In [12]:
flights = flights.select('year', 
                         'month',
                         'day',
                         'hour',
                         'minute',
                         'dep_time',
                         'arr_time',
                         'dep_delay',
                         'arr_delay',
                         'carrier',
                         'tailnum',
                         'flight',
                         'origin',
                         'dest',
                         'air_time',
                         'distance'
                        )

In [14]:
flights.printSchema()


root
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- day: integer (nullable = true)
 |-- hour: integer (nullable = true)
 |-- minute: integer (nullable = true)
 |-- dep_time: string (nullable = true)
 |-- arr_time: string (nullable = true)
 |-- dep_delay: integer (nullable = true)
 |-- arr_delay: integer (nullable = true)
 |-- carrier: string (nullable = true)
 |-- tailnum: string (nullable = true)
 |-- flight: string (nullable = true)
 |-- origin: string (nullable = true)
 |-- dest: string (nullable = true)
 |-- air_time: integer (nullable = true)
 |-- distance: integer (nullable = true)



#### Pergunta 1

In [64]:
flights = flights.withColumn('qa_year_month_day',
                                (when((flights.year.isNull()) |
                                      (flights.year == ''),
                                      lit('MY')
                                    )
                                 .when((flights.month.isNull()) |
                                      (flights.month == ''),
                                      lit('MM')
                                    )
                                 .when((flights.day.isNull()) |
                                      (flights.day == ''),
                                      lit('MD')
                                    )
                                 .when((flights.year< 1950), 
                                      lit('IY')
                                    )
                                 .when((~flights.month.between(1,12)), 
                                      lit('IM')
                                )
                                 .when(((flights.year == 2) & (~flights.day.between(1,29))) |
                                       ((flights.year != 2) & (~flights.day.between(1,31))), 
                                       lit('IY'))
                            ))

#### Pergunta 2

In [65]:
flights = flights.withColumn('qa_hour_minute',
                      (when((flights.hour.isNull()) |
                            (flights.hour == ''),
                            lit('MH')
                          )
                       .when((flights.minute.isNull()) |
                             (flights.minute == ''),
                             lit('MM')
                           )
                       .when(~flights.hour.between(0,23), lit('IH'))
                       .when(~flights.minute.between(0,59), lit('IM'))
                      )
                  )

#### Pergunta 3

In [66]:
flights = flights.withColumn('qa_dep_arr_time',
                       (when((flights.dep_time.isNull()) |
                             (flights.dep_time == ''),
                             lit('MD')
                           )
                        .when((flights.arr_time.isNull()) |
                              (flights.arr_time == ''),
                              lit('MA')
                            )
                        .when((length(flights.dep_time) < 3 ) |
                              (length(flights.dep_time) > 4 ),
                              lit('FD')
                             )
                        .when((length(flights.arr_time) < 3 ) |
                              (length(flights.arr_time) > 4 ),
                              lit('FD')
                             )
                        .when((length(flights.dep_time) == 3) &
                              (~flights.dep_time.substr(1,1).cast('int').between(0, 23)) &
                              (~flights.dep_time.substr(2,3).cast('int').between(0,59)),
                              lit('FD')
                            )
                        .when((length(flights.dep_time) == 4) &
                              (~flights.dep_time.substr(1,2).cast('int').between(0, 23)) &
                              (~flights.dep_time.substr(3,4).cast('int').between(0,59)), 
                              lit('FD')
                            )
                        .when((length(flights.arr_time) == 3) &
                              (~flights.arr_time.substr(1,1).cast('int').between(0, 23)) &
                              (~flights.arr_time.substr(2,3).cast('int').between(0,59)), 
                              lit('FD')
                            )
                        .when((length(flights.dep_time) == 4) &
                              (~flights.arr_time.substr(1,2).cast('int').between(0, 23)) &
                              (~flights.arr_time.substr(3,4).cast('int').between(0,59)), 
                              lit('FD')
                           )  
                       )
                  )

flights.groupBy('qa_dep_arr_time').count().show()

+---------------+-----+
|qa_dep_arr_time|count|
+---------------+-----+
|           null| 9710|
|             FD|  290|
+---------------+-----+



#### Pergunta 4

In [68]:
flights = flights.withColumn('qa_dep_arr_delay',
                      (when((flights.dep_delay.isNull()) |
                            (flights.dep_delay == ''),
                            lit('MD')
                          )
                       .when((flights.arr_delay.isNull()) |
                            (flights.arr_delay == ''),
                            lit('MA')
                           )
                      )
                  )

flights.groupBy('qa_dep_arr_delay').count().show()

+----------------+-----+
|qa_dep_arr_delay|count|
+----------------+-----+
|            null| 9925|
|              MD|   48|
|              MA|   27|
+----------------+-----+



#### Pergunta 5

In [70]:
flights = flights.withColumn('qa_carrier',
                      (when((flights.carrier.isNull()) |
                            (flights.carrier == ''),
                            lit('M')
                          )
                       .when((flights.carrier.rlike("[a-zA-Z-0-9]+$") == False),
                             lit('F')
                           )
                      )
                  )

flights.groupBy('qa_carrier').count().show()

+----------+-----+
|qa_carrier|count|
+----------+-----+
|      null|10000|
+----------+-----+



In [71]:
tailnum = flights.tailnum


In [73]:
flights = flights.withColumn('qa_tailnum',
                      (when((tailnum.isNull()) |
                            (tailnum == ''),
                            lit('M')
                          )
                       .when(length(tailnum )!= 6,
                            lit('S')
                           )
                       .when(tailnum.rlike("^[N][0-9]{3}[a-zA-Z0-9]{2}$") == False,
                            lit('F')
                            )
                       .when(tailnum.rlike('^[N]') == False,
                            lit('FN')
                            )
                       .when(tailnum.rlike("^[N][I|O|0]") == True,
                               lit('FE')
                            )
                      )
                  )

flights.groupBy('qa_tailnum').count().show()

+----------+-----+
|qa_tailnum|count|
+----------+-----+
|         F|  435|
|      null| 9507|
|         S|   58|
+----------+-----+



In [84]:
qa_flight = flights.flight
flights   = flights.withColumn('qa_flight',(
                        when((qa_flight.isNull()) |
                            (qa_flight == ''),
                            lit('M'))
                       .when((length(qa_flight) != 4) &
                             (~qa_flight.rlike("^[0-9]{4}$")),
                             lit('F')
                           )
                      )
                  )

flights.groupBy('qa_flight').count().show()

+---------+-----+
|qa_flight|count|
+---------+-----+
|        F| 6158|
|     null| 3842|
+---------+-----+



In [85]:
origin = flights.origin
dest   = flights.dest

flights = flights.withColumn('qa_origin_dest',
                      (when((origin.isNull()) |
                            (origin == ''),
                            lit('MO')
                          )
                       .when((dest.isNull()) |
                            (dest == ''),
                            lit('MD')
                      )
                       .when(origin.rlike("[a-zA-Z0-9]{3}") == False,
                            lit('FO')
                            )
                       .when(dest.rlike("[a-zA-Z0-9]{3}") == False,
                            lit('FD')
                            )
                  )
                )

flights.groupBy('qa_origin_dest').count().show()

+--------------+-----+
|qa_origin_dest|count|
+--------------+-----+
|          null|10000|
+--------------+-----+



In [86]:
air_time = flights.air_time 

flights = flights.withColumn('qa_air_time',
                      (when((air_time.isNull()) |
                            (air_time == ''),
                            lit('M')
                          )
                       .when(~air_time.between(20,500),
                            lit('I')
                            )
                      )
                  )

flights.groupBy('qa_air_time').count().show()

+-----------+-----+
|qa_air_time|count|
+-----------+-----+
|       null| 9925|
|          M|   75|
+-----------+-----+



In [87]:
distance = flights.distance 

flights = flights.withColumn('qa_distance',
                      (when((distance.isNull()) |
                            (distance == ''),
                            lit('M')
                          )
                       .when(~distance.between(50,3000),
                            lit('I')
                            )
                      )
                  )

flights.groupBy('qa_distance').count().show()

+-----------+-----+
|qa_distance|count|
+-----------+-----+
|       null|10000|
+-----------+-----+



In [88]:
flights = flights.withColumn('qa_distance_airtime',
                  (when((air_time.isNull()) |
                        (air_time == '') |
                        (distance.isNull()) |
                        (distance == ''),
                        lit('M')
                      )
                   .when(air_time >= (distance * 0.1) + 30,
                         lit('TL')
                       )
                   .when(air_time <= (distance * 0.1) + 10,
                         lit('TS')
                       )
                   .when((air_time > (distance * 0.1) + 10) &
                         (air_time < (distance * 0.1) + 30),
                         lit('TR')
                        )
                  )
                )

flights.groupBy('qa_distance_airtime').count().show()

+-------------------+-----+
|qa_distance_airtime|count|
+-------------------+-----+
|                  M|   75|
|                 TR| 4831|
|                 TS|   67|
|                 TL| 5027|
+-------------------+-----+



In [89]:
flights.write.mode('overwrite').parquet('../datasets/qa_flights.parquet')