In [32]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

In [33]:
# Get the Spark session for this notebook under the application name below;
# getOrCreate() hands back an existing session instead of building a second one.
spark = (
    SparkSession.builder
    .appName("ExploracaoDadosVoos")
    .getOrCreate()
)

## Carregando os Dados

In [34]:
# Relative path to the raw flights CSV.
data_path = "../data/raw/alljoined_airlines.csv"

# The first line of the file is used as the header, and column types are
# inferred from the data instead of being declared up front.
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv(data_path)
)

# Print the resulting column names and inferred types.
df.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- YEAR: integer (nullable = true)
 |-- MONTH: integer (nullable = true)
 |-- FL_DATE: string (nullable = true)
 |-- OP_UNIQUE_CARRIER: string (nullable = true)
 |-- OP_CARRIER_FL_NUM: integer (nullable = true)
 |-- ORIGIN_AIRPORT_ID: integer (nullable = true)
 |-- DEST_AIRPORT_ID: integer (nullable = true)
 |-- DEP_DELAY: string (nullable = true)
 |-- DEP_DELAY_NEW: string (nullable = true)
 |-- DEP_DEL15: string (nullable = true)
 |-- DEP_DELAY_GROUP: string (nullable = true)
 |-- ARR_DELAY: string (nullable = true)
 |-- ARR_DELAY_NEW: string (nullable = true)
 |-- ARR_DEL15: string (nullable = true)
 |-- ARR_DELAY_GROUP: string (nullable = true)
 |-- CANCELLED: integer (nullable = true)
 |-- CANCELLATION_CODE: string (nullable = true)
 |-- DIVERTED: integer (nullable = true)
 |-- CARRIER_DELAY: string (nullable = true)
 |-- WEATHER_DELAY: string (nullable = true)
 |-- NAS_DELAY: string (nullable = true)
 |-- SECURITY_DELAY: string (nullable

In [35]:
# Preview the first 5 rows of the loaded DataFrame.
df.show(n=5)

+---+----+-----+-------------+-----------------+-----------------+-----------------+---------------+---------+-------------+---------+---------------+---------+-------------+---------+---------------+---------+-----------------+--------+-------------+-------------+---------+--------------+-------------------+
|_c0|YEAR|MONTH|      FL_DATE|OP_UNIQUE_CARRIER|OP_CARRIER_FL_NUM|ORIGIN_AIRPORT_ID|DEST_AIRPORT_ID|DEP_DELAY|DEP_DELAY_NEW|DEP_DEL15|DEP_DELAY_GROUP|ARR_DELAY|ARR_DELAY_NEW|ARR_DEL15|ARR_DELAY_GROUP|CANCELLED|CANCELLATION_CODE|DIVERTED|CARRIER_DELAY|WEATHER_DELAY|NAS_DELAY|SECURITY_DELAY|LATE_AIRCRAFT_DELAY|
+---+----+-----+-------------+-----------------+-----------------+-----------------+---------------+---------+-------------+---------+---------------+---------+-------------+---------+---------------+---------+-----------------+--------+-------------+-------------+---------+--------------+-------------------+
|  1|2018|    1|1/26/18 00:00|               UA|             1252| 

### Número total de registros

In [36]:
# count() is a Spark action, so this line runs a job over the data; the total
# is printed with thousands separators.
total_registros = df.count()
print(f"Número total de registros: {total_registros:,}")

Número total de registros: 19,174,431


In [37]:
# Print the label and the list of column names on separate lines.
print("Colunas disponíveis:", df.columns, sep="\n")

Colunas disponíveis:
['_c0', 'YEAR', 'MONTH', 'FL_DATE', 'OP_UNIQUE_CARRIER', 'OP_CARRIER_FL_NUM', 'ORIGIN_AIRPORT_ID', 'DEST_AIRPORT_ID', 'DEP_DELAY', 'DEP_DELAY_NEW', 'DEP_DEL15', 'DEP_DELAY_GROUP', 'ARR_DELAY', 'ARR_DELAY_NEW', 'ARR_DEL15', 'ARR_DELAY_GROUP', 'CANCELLED', 'CANCELLATION_CODE', 'DIVERTED', 'CARRIER_DELAY', 'WEATHER_DELAY', 'NAS_DELAY', 'SECURITY_DELAY', 'LATE_AIRCRAFT_DELAY']


### Verificando Valores Nulos

In [38]:
from pyspark.sql.functions import isnan, when, count

In [39]:
# Count missing values per column.
# NULL is tested on every column. NaN is tested only on float/double columns:
# isnan() is defined for floating-point input, so calling it on any other type
# depends on an implicit cast to double. That cast is rejected for types such
# as date/timestamp/boolean and raises on non-numeric text when ANSI mode is
# enabled. Consequence: in text columns only true NULLs are counted.
# NOTE(review): the delay columns are inferred as string; if the file stores
# missing values as text tokens, they are not counted here -- confirm.
floating_cols = {name for name, dtype in df.dtypes if dtype in ("float", "double")}


def _is_missing(name):
    """Return the missing-value predicate (a Column) for column `name`."""
    null_check = col(name).isNull()
    if name in floating_cols:
        return null_check | isnan(col(name))
    return null_check


# when() yields a non-null value only for rows matching the predicate, and
# count() counts non-null values, giving the number of missing rows per column.
df.select(
    [count(when(_is_missing(c), c)).alias(c) for c in df.columns]
).show()

+---+----+-----+-------+-----------------+-----------------+-----------------+---------------+---------+-------------+---------+---------------+---------+-------------+---------+---------------+---------+-----------------+--------+-------------+-------------+---------+--------------+-------------------+
|_c0|YEAR|MONTH|FL_DATE|OP_UNIQUE_CARRIER|OP_CARRIER_FL_NUM|ORIGIN_AIRPORT_ID|DEST_AIRPORT_ID|DEP_DELAY|DEP_DELAY_NEW|DEP_DEL15|DEP_DELAY_GROUP|ARR_DELAY|ARR_DELAY_NEW|ARR_DEL15|ARR_DELAY_GROUP|CANCELLED|CANCELLATION_CODE|DIVERTED|CARRIER_DELAY|WEATHER_DELAY|NAS_DELAY|SECURITY_DELAY|LATE_AIRCRAFT_DELAY|
+---+----+-----+-------+-----------------+-----------------+-----------------+---------------+---------+-------------+---------+---------------+---------+-------------+---------+---------------+---------+-----------------+--------+-------------+-------------+---------+--------------+-------------------+
|  0|   0|    0|      0|                0|                0|                0|       

### Voos por Ano e por Mês

In [40]:
# Flights per year: one row per distinct year with its record count, sorted
# ascending. The column is addressed as "year" while the schema prints it as
# YEAR, so this relies on Spark resolving column names case-insensitively.
(
    df.groupBy("year")
    .count()
    .withColumnRenamed("count", "voos_por_ano")
    .orderBy("year")
    .show()
)

+----+------------+
|year|voos_por_ano|
+----+------------+
|2018|     4867146|
|2019|     5020429|
|2020|     3067969|
|2021|     3892457|
|2022|     2326430|
+----+------------+



In [50]:
# Flights per calendar month. The grouping key is the month alone, so each
# row pools the records of that month across every year in the data.
voos_por_mes = df.groupBy("month").count().withColumnRenamed("count", "voos_por_mes")
voos_por_mes.orderBy("month").show()

+-----+------------+
|month|voos_por_mes|
+-----+------------+
|    1|     1756591|
|    2|     1628973|
|    3|     1947389|
|    4|     1314594|
|    5|     1682637|
|    6|     1767378|
|    7|     1929255|
|    8|     1507554|
|    9|     1359196|
|   10|     1446441|
|   11|     1408958|
|   12|     1425465|
+-----+------------+



In [51]:
# Stop the Spark session now that the exploration is finished. Cells that use
# `spark` or `df` need the session to be created again before they can be re-run.
spark.stop()