In [1]:
import findspark
findspark.init()

In [2]:
# PySpark is the Spark API for Python. In this lab, we use PySpark to initialize the spark context. 
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
from pyspark.sql import functions as F
from pyspark.sql.types import StructType,StructField, StringType, IntegerType, FloatType, BinaryType, DateType


In [3]:
# Criar o contexto do spark
sc = SparkContext()

# Instancia o criador de sessao do spark
spark = (SparkSession.builder
                     .master("local[7]")
                     .appName("Aceleração PySpark - Capgemini"))

In [4]:
airports_path_data = '../data/qa_airports.parquet'
planes_path_data   = '../data/qa_planes.parquet'
flights_path_data  = '../data/qa_flights.parquet'


airports = (spark.getOrCreate().read
                  .format("parquet")
                  .option("header", "true")
                  .load(airports_path_data))

planes   = (spark.getOrCreate().read
                  .format("parquet")
                  .option("header", "true")
                  .load(planes_path_data))

flights  = (spark.getOrCreate().read
                  .format("parquet")
                  .option("header", "true")
                  .load(flights_path_data))

## Perguntas para Qualidade

### Unificando os datasets

In [5]:
planes  = planes.withColumnRenamed('tailnum', 'planes_tailnum').withColumnRenamed('qa_tailnum', 'planes_qa_tailnum')
flights = flights.withColumnRenamed('tailnum', 'flights_tailnum').withColumnRenamed('qa_tailnum', 'flights_qa_tailnum')

In [6]:
origin_to_join = airports.select('faa',
                                   'qa_faa',
                                   'qa_name',
                                   'qa_lat',
                                   'qa_lon',
                                   'qa_alt',
                                   'qa_tz',
                                   'qa_dst')

origin_to_join = (airports.withColumnRenamed('faa', 'origin_faa')
                         .withColumnRenamed('qa_faa', 'origin_qa_faa')
                         .withColumnRenamed('qa_name', 'origin_qa_name')
                         .withColumnRenamed('qa_lat', 'origin_qa_lat')
                         .withColumnRenamed('qa_lon', 'origin_qa_lon')
                         .withColumnRenamed('qa_alt', 'origin_qa_alt')
                         .withColumnRenamed('qa_tz', 'origin_qa_tz')
                         .withColumnRenamed('qa_dst', 'origin_qa_dst'))


dest_to_join = (airports.withColumnRenamed('faa', 'dest_faa')
                         .withColumnRenamed('qa_faa', 'dest_qa_faa')
                         .withColumnRenamed('qa_name', 'dest_qa_name')
                         .withColumnRenamed('qa_lat', 'dest_qa_lat')
                         .withColumnRenamed('qa_lon', 'dest_qa_lon')
                         .withColumnRenamed('qa_alt', 'dest_qa_alt')
                         .withColumnRenamed('qa_tz', 'dest_qa_tz')
                         .withColumnRenamed('qa_dst', 'dest_qa_dst'))

planes_to_join   = planes.select( 'planes_tailnum',
                                    'planes_qa_tailnum',
                                    'qa_year',
                                    'qa_type',
                                    'qa_manufacturer',
                                    'qa_model',
                                    'qa_engines',
                                    'qa_seats',
                                    'qa_speed',
                                    'qa_enginge')

flights_to_join  = flights.select( 'flights_tailnum',
                                    'origin',
                                    'dest',
                                    'qa_year_month_day',
                                    'qa_hour_minute',
                                    'qa_dep_arr_time',
                                    'qa_dep_arr_delay',
                                    'qa_carrier',
                                    'flights_qa_tailnum',
                                    'qa_flight',
                                    'qa_origin_dest',
                                    'qa_air_time',
                                    'qa_distance',
                                    'qa_distance_airtime')

In [7]:
df = flights_to_join.join(origin_to_join, 
                     (flights_to_join.origin == origin_to_join.origin_faa),                    
                    'left'
                    )

df = df.join(dest_to_join, 
                     (flights_to_join.dest == dest_to_join.dest_faa),                    
                    'left'
                    )

df = df.join(planes_to_join, 
                     (df.flights_tailnum == planes_to_join.planes_tailnum),
                    'left'
                    )

#### M - dado faltante
#### F  - indica que não respeita o formato esperado 
#### I   - indica que o valor excede o intervalo esperado
#### S - Indica que não tem exatamente 5 caracteres.
#### T

#### Formatando os dados para conter apenas valores M, F, I, S, T e null

In [8]:
for column in df.schema.names:
    if 'qa_' in column:
        df = df.withColumn(column, (
                          F.when(F.col(column).startswith('M'), 'M')
                           .when(F.col(column).startswith('F'), 'F')
                           .when(F.col(column).startswith('I'), 'I')
                           .when(F.col(column).startswith('S'), 'S')
                           .when(F.col(column).startswith('T'), 'T')
                           .otherwise(F.col(column))
        ))

#### Criando o dataframe base

#### Função para juntar os groupbys em um dataframe.
#### Uso de union para juntar todos eles

In [4]:
qualidade_data  = '../data/datasets_relatorios/qualidade.parquet'


qualidade = (spark.getOrCreate().read
                  .format("parquet")
                  .option("header", "true")
                  .load(qualidade_data))

qualidade.show(5)

+-------------------+---+---+---+---+----+-----+
|                qa_|  M|  F|  I|  S|   T| null|
+-------------------+---+---+---+---+----+-----+
|qa_distance_airtime| 75|  0|  0|  0|9925|    0|
| flights_qa_tailnum|  0|435|  0| 58|   0| 9507|
|  qa_year_month_day|  0|  0|  0|  0|   0|10000|
|  planes_qa_tailnum|  0|  0|  0| 44|   0| 9956|
|   qa_dep_arr_delay| 75|  0|  0|  0|   0| 9925|
+-------------------+---+---+---+---+----+-----+
only showing top 5 rows



### Checar flights_qa_tailnum, qa_dep_arr, planes_qa_tailnum 

### qa_manufacturer -> classificações C não entraram no dataset
### qa_enginge          -> classificações C não entraram no dataset


In [5]:
qualidade.createOrReplaceTempView('qualidade')

In [112]:
#qualidade.withColumn('check_quantidade',(
#                    qualidade.M + qualidade.F + qualidade.I + qualidade.S + qualidade.T + qualidade.null 
# )).show(100)

+-------------------+----+----+---+---+----+-----+----------------+
|                qa_|   M|   F|  I|  S|   T| null|check_quantidade|
+-------------------+----+----+---+---+----+-----+----------------+
|qa_distance_airtime|  75|   0|  0|  0|9925|    0|           10000|
| flights_qa_tailnum|   0| 435|  0| 58|   0| 9507|           10000|
|  qa_year_month_day|   0|   0|  0|  0|   0|10000|           10000|
|  planes_qa_tailnum|   0|   0|  0| 44|   0| 9956|           10000|
|   qa_dep_arr_delay|  75|   0|  0|  0|   0| 9925|           10000|
|    qa_dep_arr_time|   0| 290|  0|  0|   0| 9710|           10000|
|    qa_manufacturer|   0|   0|  0|  0|   0| 8356|            8356|
|     qa_hour_minute|  48|   0|  1|  0|   0| 9951|           10000|
|     qa_origin_dest|   0|   0|  0|  0|   0|10000|           10000|
|     origin_qa_name|   0|   0|  0|  0|   0|10000|           10000|
|      origin_qa_faa|   0|   0|  0|  0|   0|10000|           10000|
|      origin_qa_lat|   0|   0|  0|  0|   0|1000

In [130]:
#qualidade.show(100)

### Pergunta 3

In [6]:
qualidade.groupby('qa_').agg({'M': 'max'}).sort('max(M)', ascending = False).show(5)

spark.getOrCreate().sql('''
            SELECT *
            FROM qualidade
            ORDER BY M DESC 
            LIMIT 5
''').show()

+-------------------+------+
|                qa_|max(M)|
+-------------------+------+
|           qa_speed|  9443|
|            qa_year|    94|
|   qa_dep_arr_delay|    75|
|qa_distance_airtime|    75|
|        qa_air_time|    75|
+-------------------+------+
only showing top 5 rows

+-------------------+----+---+---+---+----+----+
|                qa_|   M|  F|  I|  S|   T|null|
+-------------------+----+---+---+---+----+----+
|           qa_speed|9443|  0|  0|  0|   0| 557|
|            qa_year|  94|  0|  8|  0|   0|9898|
|qa_distance_airtime|  75|  0|  0|  0|9925|   0|
|   qa_dep_arr_delay|  75|  0|  0|  0|   0|9925|
|        qa_air_time|  75|  0|  0|  0|   0|9925|
+-------------------+----+---+---+---+----+----+



### Pergunta 4

In [12]:
qualidade.groupby('qa_').agg({'F': 'max'}).sort('max(F)', ascending = False).show(5)

spark.getOrCreate().sql('''
    SELECT * 
    FROM qualidade
    ORDER BY F desc
    LIMIT 5
''').show()

+------------------+------+
|               qa_|max(F)|
+------------------+------+
|         qa_flight|  6158|
|flights_qa_tailnum|   435|
|   qa_dep_arr_time|   290|
|          qa_model|     9|
| qa_year_month_day|     0|
+------------------+------+
only showing top 5 rows

+-------------------+---+----+---+---+----+----+
|                qa_|  M|   F|  I|  S|   T|null|
+-------------------+---+----+---+---+----+----+
|          qa_flight|  0|6158|  0|  0|   0|3842|
| flights_qa_tailnum|  0| 435|  0| 58|   0|9507|
|    qa_dep_arr_time|  0| 290|  0|  0|   0|9710|
|           qa_model|  0|   9|  0|  0|   0|9991|
|qa_distance_airtime| 75|   0|  0|  0|9925|   0|
+-------------------+---+----+---+---+----+----+



### Pergunta 5

In [13]:
qualidade.groupby('qa_').agg({'I': 'max'}).sort('max(I)', ascending = False).show(5)

spark.getOrCreate().sql('''
        SELECT *
        FROM qualidade
        ORDER BY I desc
        LIMIT 5
''').show()

+------------------+------+
|               qa_|max(I)|
+------------------+------+
|           qa_year|     8|
|    qa_hour_minute|     1|
| qa_year_month_day|     0|
|flights_qa_tailnum|     0|
| planes_qa_tailnum|     0|
+------------------+------+
only showing top 5 rows

+-------------------+---+---+---+---+----+-----+
|                qa_|  M|  F|  I|  S|   T| null|
+-------------------+---+---+---+---+----+-----+
|            qa_year| 94|  0|  8|  0|   0| 9898|
|     qa_hour_minute| 48|  0|  1|  0|   0| 9951|
|qa_distance_airtime| 75|  0|  0|  0|9925|    0|
| flights_qa_tailnum|  0|435|  0| 58|   0| 9507|
|  qa_year_month_day|  0|  0|  0|  0|   0|10000|
+-------------------+---+---+---+---+----+-----+



### Perguntas para Negócio

In [54]:
airports_proc_path_data = '../data/datasets_transformados/airports.parquet'
planes_proc_path_data   = '../data/datasets_transformados/planes.parquet'
flights_proc_path_data  = '../data/datasets_transformados/flights.parquet'


airports_proc = (spark.getOrCreate().read
                  .format("parquet")
                  .option("header", "true")
                  .load(airports_proc_path_data))

planes_proc   = (spark.getOrCreate().read
                  .format("parquet")
                  .option("header", "true")
                  .load(planes_proc_path_data))

flights_proc  = (spark.getOrCreate().read
                  .format("parquet")
                  .option("header", "true")
                  .load(flights_proc_path_data))



In [22]:
df2 = (spark.getOrCreate().read
                  .format("parquet")
                  .option("header", "true")
                  .load('../data/datasets_relatorios/transformado1.parquet'))

df2.createOrReplaceTempView('df2')

In [15]:
#df2.schema.names

### Pergunta 2

In [64]:
df2.groupBy('dest_region').agg(F.countDistinct('dest_region', 'dest_name').alias('count')).show()
df2.groupBy('origin_region').agg(F.countDistinct('origin_region', 'origin_name').alias('count')).show()

print('******SQL******\n')

spark.getOrCreate().sql('''
            SELECT dest_region,
            COUNT (DISTINCT dest_name) as count
            FROM df2
            GROUP BY dest_region
''').show()

spark.getOrCreate().sql('''
            SELECT origin_region,
            COUNT (DISTINCT origin_name) as count
            FROM df2
            GROUP BY origin_region
''').show()

+-------------+-----+
|  dest_region|count|
+-------------+-----+
|       ALASKA|    9|
|MAINLAND-EAST|   24|
|MAINLAND-WEST|   36|
+-------------+-----+

+-------------+-----+
|origin_region|count|
+-------------+-----+
|MAINLAND-WEST|    2|
+-------------+-----+

******SQL******

+-------------+-----+
|  dest_region|count|
+-------------+-----+
|       ALASKA|    9|
|MAINLAND-EAST|   24|
|MAINLAND-WEST|   36|
+-------------+-----+

+-------------+-----+
|origin_region|count|
+-------------+-----+
|MAINLAND-WEST|    2|
+-------------+-----+



### Pergunta 3

In [42]:
df2 = df2.withColumn('aux_alt',(
                F.when((df2.origin_alt - df2.dest_alt > 0), df2.origin_alt - df2.dest_alt)
                .otherwise(None)
))

In [65]:
#df2.groupBy('dest_region', 'origin_region').agg({'aux_alt': 'max'}).sort('max(aux_alt)', ascending =False).show(5)
df2.select('flights_origin','flights_dest','aux_alt').distinct().sort('aux_alt', ascending = False).show(5)

print('******SQL******\n')

spark.getOrCreate().sql('''
            SELECT 
                distinct 
                flights_origin,
                flights_dest,
                    CASE 
                        WHEN
                            origin_alt - dest_alt > 0 THEN origin_alt - dest_alt
                            ELSE 0
                        END AS diff_alt
            from df2
            ORDER BY diff_alt desc
            limit 5
''').show()

+--------------+------------+-------+
|flights_origin|flights_dest|aux_alt|
+--------------+------------+-------+
|           SEA|         MSY|    429|
|           SEA|         MIA|    425|
|           SEA|         OAK|    424|
|           SEA|         FLL|    424|
|           SEA|         SBA|    423|
+--------------+------------+-------+
only showing top 5 rows

******SQL******

+--------------+------------+--------+
|flights_origin|flights_dest|diff_alt|
+--------------+------------+--------+
|           SEA|         MSY|     429|
|           SEA|         MIA|     425|
|           SEA|         OAK|     424|
|           SEA|         FLL|     424|
|           SEA|         SBA|     423|
+--------------+------------+--------+



### Pergunta 4

In [66]:
#df2.select(F.mean('flights_dep_delay'), F.when(df2.flights_dep_delay > 0, 1)).show()
#df2.select(F.mean('flights_arr_delay')).show()

df2.where(df2.flights_dep_delay > 0).agg(F.ceil(F.mean('flights_dep_delay')).alias('atraso médio')).show()
df2.where(df2.flights_arr_delay > 0).agg(F.ceil(F.mean('flights_arr_delay')).alias('atraso médio')).show()

print('******SQL******\n')

spark.getOrCreate().sql('''
            SELECT CEIL(AVG(flights_dep_delay))
            FROM df2
                WHERE flights_dep_delay >= 0      
''').show()

spark.getOrCreate().sql('''
            SELECT CEIL(AVG(flights_arr_delay))
            FROM df2
                WHERE flights_arr_delay >= 0      
''').show()

+------------+
|atraso médio|
+------------+
|          22|
+------------+

+------------+
|atraso médio|
+------------+
|          23|
+------------+

******SQL******

+----------------------------+
|CEIL(avg(flights_dep_delay))|
+----------------------------+
|                          22|
+----------------------------+

+----------------------------+
|CEIL(avg(flights_arr_delay))|
+----------------------------+
|                          23|
+----------------------------+



### Pergunta 5

In [102]:
df2.where(df2.flights_dep_delay > 0).groupBy('dest_region', 'origin_region').agg(F.round(F.mean("flights_dep_delay"),2).alias('avg_flights_dep_delay')).show()
df2.where(df2.flights_arr_delay > 0).groupBy('dest_region', 'origin_region').agg(F.round(F.mean("flights_arr_delay"),2).alias('avg_flights_arr_delay')).show()

print('******SQL******\n')

spark.getOrCreate().sql('''
            SELECT distinct dest_region, 
                            origin_region,
                            CEIL(avg(flights_dep_delay)) as avg_flights_dep_delay
            FROM df2 
            WHERE flights_dep_delay >=0
            GROUP BY dest_region, origin_region
''').show()

spark.getOrCreate().sql('''
            SELECT distinct dest_region, 
                            origin_region,
                            CEIL(avg(flights_arr_delay)) as avg_flights_arr_delay
            FROM df2 
            WHERE flights_arr_delay >=0
            GROUP BY dest_region, origin_region
''').show()

+-------------+-------------+---------------------+
|  dest_region|origin_region|avg_flights_dep_delay|
+-------------+-------------+---------------------+
|MAINLAND-EAST|MAINLAND-WEST|                26.33|
|       ALASKA|MAINLAND-WEST|                20.82|
|MAINLAND-WEST|MAINLAND-WEST|                25.98|
+-------------+-------------+---------------------+

+-------------+-------------+---------------------+
|  dest_region|origin_region|avg_flights_arr_delay|
+-------------+-------------+---------------------+
|MAINLAND-EAST|MAINLAND-WEST|                28.53|
|       ALASKA|MAINLAND-WEST|                22.21|
|MAINLAND-WEST|MAINLAND-WEST|                23.79|
+-------------+-------------+---------------------+

******SQL******

+-------------+-------------+---------------------+
|  dest_region|origin_region|avg_flights_dep_delay|
+-------------+-------------+---------------------+
|MAINLAND-EAST|MAINLAND-WEST|                   22|
|       ALASKA|MAINLAND-WEST|                

### Pergunta 6

In [103]:
df2.where(df2.flights_dep_delay > 0).groupBy(F.year('flights_dep_datetime')).agg({"flights_dep_delay": 'sum'}).show()
df2.where(df2.flights_arr_delay > 0).groupBy(F.year('flights_dep_datetime')).agg({"flights_arr_delay": 'sum'}).show()

print('******SQL******\n')

spark.getOrCreate().sql('''
                SELECT  YEAR(flights_dep_datetime),
                        SUM(flights_dep_delay) as sum_flights_dep_delay
                FROM df2
                where flights_dep_delay >= 0 
                GROUP BY YEAR(flights_dep_datetime)

''').show()

spark.getOrCreate().sql('''
                SELECT  YEAR(flights_dep_datetime),
                        SUM(flights_arr_delay) as sum_flights_arr_delay
                FROM df2
                where flights_arr_delay >= 0 
                GROUP BY YEAR(flights_dep_datetime)

''').show()
#df2.select('flights_dep_datetime').show(1)

+--------------------------+----------------------+
|year(flights_dep_datetime)|sum(flights_dep_delay)|
+--------------------------+----------------------+
|                      2014|                 88314|
+--------------------------+----------------------+

+--------------------------+----------------------+
|year(flights_dep_datetime)|sum(flights_arr_delay)|
+--------------------------+----------------------+
|                      2014|                 91820|
+--------------------------+----------------------+

******SQL******

+--------------------------+---------------------+
|year(flights_dep_datetime)|sum_flights_dep_delay|
+--------------------------+---------------------+
|                      2014|                88314|
+--------------------------+---------------------+

+--------------------------+---------------------+
|year(flights_dep_datetime)|sum_flights_arr_delay|
+--------------------------+---------------------+
|                      2014|                91820|
+

### Pergunta 7

In [79]:
df2.where(df2.flights_dep_delay >= 0).groupBy(F.year('flights_dep_datetime'), 'dest_region',).agg({"flights_dep_delay": 'sum'}).show()
df2.where(df2.flights_arr_delay >= 0).groupBy(F.year('flights_dep_datetime'), 'dest_region').agg({"flights_arr_delay": 'sum'}).show()

df2.where(df2.flights_dep_delay >= 0).groupBy(F.year('flights_dep_datetime'), 'origin_region',).agg({"flights_dep_delay": 'sum'}).show()
df2.where(df2.flights_arr_delay >= 0).groupBy(F.year('flights_dep_datetime'), 'origin_region').agg({"flights_arr_delay": 'sum'}).show()

print('******SQL******\n')

spark.getOrCreate().sql('''
            SELECT YEAR(flights_dep_datetime) as year_datetime,
                   dest_region,
                   SUM(flights_dep_delay) as sum_flights_dep_delay
            FROM df2
            WHERE flights_dep_delay >= 0
            GROUP BY YEAR(flights_dep_datetime), dest_region        
''').show()

+--------------------------+-------------+----------------------+
|year(flights_dep_datetime)|  dest_region|sum(flights_dep_delay)|
+--------------------------+-------------+----------------------+
|                      2014|       ALASKA|                  7515|
|                      2014|MAINLAND-EAST|                 21538|
|                      2014|MAINLAND-WEST|                 59261|
+--------------------------+-------------+----------------------+

+--------------------------+-------------+----------------------+
|year(flights_dep_datetime)|  dest_region|sum(flights_arr_delay)|
+--------------------------+-------------+----------------------+
|                      2014|       ALASKA|                  8640|
|                      2014|MAINLAND-EAST|                 22938|
|                      2014|MAINLAND-WEST|                 60242|
+--------------------------+-------------+----------------------+

+--------------------------+-------------+----------------------+
|year(fl

### Pergunta 8

In [80]:
df2.select(F.ceil(F.mean('flights_air_time'))).show()

print('******SQL******\n')

spark.getOrCreate().sql('''
        SELECT ceil(avg(flights_air_time)) as avg_flights_air_time
        FROM df2
''').show()

+---------------------------+
|CEIL(avg(flights_air_time))|
+---------------------------+
|                        153|
+---------------------------+

******SQL******

+--------------------+
|avg_flights_air_time|
+--------------------+
|                 153|
+--------------------+



### Pergunta 9

In [83]:
df2.groupBy('dest_region',).agg(F.round(F.mean("flights_air_time"),2).alias('avg_dest_flights_air_time')).show()

df2.groupBy('origin_region',).agg(F.round(F.mean("flights_air_time"),2).alias('avg_origin_flights_dep_delay')).show()

print('******SQL******\n')

spark.getOrCreate().sql('''
        SELECT dest_region,
               CEIL(avg(flights_air_time)) as avg_flights_air_time
        FROM df2
        GROUP BY dest_region
''').show()

+-------------+-------------------------+
|  dest_region|avg_dest_flights_air_time|
+-------------+-------------------------+
|       ALASKA|                   227.87|
|MAINLAND-EAST|                   237.11|
|MAINLAND-WEST|                   115.39|
+-------------+-------------------------+

+-------------+----------------------------+
|origin_region|avg_origin_flights_dep_delay|
+-------------+----------------------------+
|MAINLAND-WEST|                      152.87|
+-------------+----------------------------+

******SQL******

+-------------+--------------------+
|  dest_region|avg_flights_air_time|
+-------------+--------------------+
|       ALASKA|                 228|
|MAINLAND-EAST|                 238|
|MAINLAND-WEST|                 116|
+-------------+--------------------+



### Pergunta 10

In [98]:
df2 = df2.withColumn('origin_dest', (
            F.concat_ws('-',df2.flights_origin, df2.flights_dest)
))

In [86]:
df2.groupBy('flights_origin', 'flights_dest').agg(F.ceil(F.mean('flights_air_time'))).show(5)

print('******SQL******\n')

spark.getOrCreate().sql('''
            SELECT flights_origin,
                   flights_dest,
                   ceil(avg(flights_air_time)) as avg_flights_air_time
            FROM df2
            GROUP BY flights_origin, flights_dest
            LIMIT 5
''').show()

+--------------+------------+---------------------------+
|flights_origin|flights_dest|CEIL(avg(flights_air_time))|
+--------------+------------+---------------------------+
|           SEA|         RNO|                         75|
|           SEA|         DTW|                        220|
|           SEA|         CLE|                        234|
|           SEA|         LAX|                        127|
|           PDX|         SEA|                         35|
+--------------+------------+---------------------------+
only showing top 5 rows

******SQL******

+--------------+------------+--------------------+
|flights_origin|flights_dest|avg_flights_air_time|
+--------------+------------+--------------------+
|           SEA|         RNO|                  75|
|           SEA|         DTW|                 220|
|           SEA|         CLE|                 234|
|           SEA|         LAX|                 127|
|           PDX|         SEA|                  35|
+--------------+------------

### Pergunta 11

In [70]:
df2.groupBy(F.year('flights_dep_datetime')).agg({'flights_air_time': 'sum'}).show()

+--------------------------+---------------------+
|year(flights_dep_datetime)|sum(flights_air_time)|
+--------------------------+---------------------+
|                      2014|              1528696|
+--------------------------+---------------------+



### Pergunta 12

In [71]:
df2.groupBy('dest_region').agg({'flights_air_time': 'sum'}).show()
df2.groupBy('origin_region').agg({'flights_air_time': 'sum'}).show()

+-------------+---------------------+
|  dest_region|sum(flights_air_time)|
+-------------+---------------------+
|       ALASKA|               230607|
|MAINLAND-EAST|               508366|
|MAINLAND-WEST|               789723|
+-------------+---------------------+

+-------------+---------------------+
|origin_region|sum(flights_air_time)|
+-------------+---------------------+
|MAINLAND-WEST|              1528696|
+-------------+---------------------+



### Pergunta 13

In [100]:
df2.select(F.ceil(F.mean('flights_distance'))).show()

+---------------------------+
|CEIL(avg(flights_distance))|
+---------------------------+
|                       1209|
+---------------------------+



### Pergunta 14

In [103]:
df2.groupBy('origin_region').agg({'flights_distance': 'avg'}).show()
df2.groupBy('dest_region').agg({'flights_distance': 'avg'}).show()

+-------------+---------------------+
|origin_region|avg(flights_distance)|
+-------------+---------------------+
|MAINLAND-WEST|            1208.1516|
+-------------+---------------------+

+-------------+---------------------+
|  dest_region|avg(flights_distance)|
+-------------+---------------------+
|       ALASKA|    1741.653162055336|
|MAINLAND-EAST|   2042.3983208955224|
|MAINLAND-WEST|    867.9224137931035|
+-------------+---------------------+



### Pergunta 15

#### Para negócios faz mais sentido ver a média das diferenças ou desvio padrão. Se esses valores não se aproximarem a 0 tem algo de errado.


In [126]:
df2.groupBy('flights_origin', 'flights_dest').agg(F.ceil(F.mean('flights_distance'))).show(5, truncate = False)

+--------------+------------+---------------------------+
|flights_origin|flights_dest|CEIL(avg(flights_distance))|
+--------------+------------+---------------------------+
|SEA           |RNO         |564                        |
|SEA           |DTW         |1927                       |
|SEA           |CLE         |2021                       |
|SEA           |LAX         |954                        |
|PDX           |SEA         |129                        |
+--------------+------------+---------------------------+
only showing top 5 rows



### Pergunta 16

In [75]:
df2.groupBy(F.year('flights_dep_datetime')).agg({'flights_distance': 'sum'}).show(5, truncate = False)

+--------------------------+---------------------+
|year(flights_dep_datetime)|sum(flights_distance)|
+--------------------------+---------------------+
|2014                      |12081516             |
+--------------------------+---------------------+



### Pergunta 17

In [76]:
df2.groupBy(F.year('flights_dep_datetime'), 'origin_region').agg({'flights_distance': 'sum'}).show(5, truncate = False)
df2.groupBy(F.year('flights_dep_datetime'), 'dest_region').agg({'flights_distance': 'sum'}).show(5, truncate = False)

+--------------------------+-------------+---------------------+
|year(flights_dep_datetime)|origin_region|sum(flights_distance)|
+--------------------------+-------------+---------------------+
|2014                      |MAINLAND-WEST|12081516             |
+--------------------------+-------------+---------------------+

+--------------------------+-------------+---------------------+
|year(flights_dep_datetime)|dest_region  |sum(flights_distance)|
+--------------------------+-------------+---------------------+
|2014                      |ALASKA       |1762553              |
|2014                      |MAINLAND-EAST|4378902              |
|2014                      |MAINLAND-WEST|5940061              |
+--------------------------+-------------+---------------------+



### Pergunta 18

In [125]:
df2.groupBy('flights_origin', 'flights_dest', 'planes_seats').agg({'planes_seats': 'avg'}).show(5)

+--------------+------------+------------+-----------------+
|flights_origin|flights_dest|planes_seats|avg(planes_seats)|
+--------------+------------+------------+-----------------+
|           SEA|         PSP|         149|            149.0|
|           PDX|         PHX|         199|            199.0|
|           PDX|         EWR|         191|            191.0|
|           PDX|         SJC|         149|            149.0|
|           PDX|         SFO|         200|            200.0|
+--------------+------------+------------+-----------------+
only showing top 5 rows



### Pergunta 19

In [107]:
df2.groupBy(F.year('flights_dep_datetime')).agg({'planes_seats': 'sum'}).show()

+--------------------------+-----------------+
|year(flights_dep_datetime)|sum(planes_seats)|
+--------------------------+-----------------+
|                      2014|        1509544.0|
+--------------------------+-----------------+



### Pergunta 20

In [79]:
df2.groupBy('flights_dest').agg({'flights_dest': 'count'}).sort('count(flights_dest)', ascending = False).show(5)

+------------+-------------------+
|flights_dest|count(flights_dest)|
+------------+-------------------+
|         SFO|                787|
|         LAX|                615|
|         DEN|                586|
|         PHX|                530|
|         LAS|                520|
+------------+-------------------+
only showing top 5 rows



### Pergunta 21

In [80]:
df2.groupBy('flights_dest').agg({'planes_seats': 'sum'}).sort('sum(planes_seats)', ascending = False).show(5)

+------------+-----------------+
|flights_dest|sum(planes_seats)|
+------------+-----------------+
|         SFO|         119635.0|
|         PHX|          96317.0|
|         LAX|          91406.0|
|         DEN|          88218.0|
|         LAS|          76354.0|
+------------+-----------------+
only showing top 5 rows



### Pergunta 22

In [94]:
df2.groupBy('flights_origin', 'flights_dest').agg({'flights_distance': 'max'}).filter(df2.flights_origin == 'PDX').sort('max(flights_distance)', ascending = False).show(5)

+--------------+------------+---------------------+
|flights_origin|flights_dest|max(flights_distance)|
+--------------+------------+---------------------+
|           PDX|         LIH|                 2631|
|           PDX|         KOA|                 2607|
|           PDX|         HNL|                 2603|
|           PDX|         OGG|                 2562|
|           PDX|         BOS|                 2537|
+--------------+------------+---------------------+
only showing top 5 rows



### Pergunta 23

In [112]:
df2.groupBy('flights_dest', F.month('flights_dep_datetime').alias('month')).count().sort('count', ascending = False).show(5)

+------------+-----+-----+
|flights_dest|month|count|
+------------+-----+-----+
|         LAX|    5|   77|
|         SFO|   12|   76|
|         SFO|    8|   75|
|         SFO|    5|   73|
|         SFO|    7|   71|
+------------+-----+-----+
only showing top 5 rows



### Pergunta 24

In [83]:
df2.groupBy('planes_model').count().sort('count', ascending = False).show(5)

+------------+-----+
|planes_model|count|
+------------+-----+
|     737-890| 1463|
|     737-7H4|  851|
|   737-990ER|  664|
|    A320-232|  612|
|     737-790|  581|
+------------+-----+
only showing top 5 rows



### Pergunta 25

In [84]:
df2.where(df2.planes_model.isNotNull()).groupby('planes_model', 'flights_dest').count().sort('count', ascending = False).show(5)

+------------+------------+-----+
|planes_model|flights_dest|count|
+------------+------------+-----+
|     737-7H4|         OAK|  141|
|     737-890|         ANC|  138|
|     737-790|         SNA|  122|
|     737-7H4|         SMF|  114|
|     737-890|         LAX|  110|
+------------+------------+-----+
only showing top 5 rows



### Pergunta 26

In [101]:
df2.where(df2.planes_engines.isNotNull()).groupBy('flights_haul_duration').agg(F.ceil(F.mean(df2.planes_engines)).alias('avg_engines')).sort('avg_engines').show()

+---------------------+-----------+
|flights_haul_duration|avg_engines|
+---------------------+-----------+
|            LONG-HAUL|          2|
|          MEDIUM-HAUL|          2|
|           SHORT-HAUL|          2|
+---------------------+-----------+



### Pergunta 27

In [86]:
df2.groupBy('flights_dep_season').agg({'flights_dep_season': 'count'}).show()

+------------------+-------------------------+
|flights_dep_season|count(flights_dep_season)|
+------------------+-------------------------+
|            WINTER|                     2149|
|            SPRING|                     2560|
|              FALL|                     2373|
|            SUMMER|                     2918|
+------------------+-------------------------+



### Pergunta 28

In [88]:
df2.groupBy('flights_dep_season', 'flights_dest').count().sort('count', ascending = False).show()

+------------------+------------+-----+
|flights_dep_season|flights_dest|count|
+------------------+------------+-----+
|            SUMMER|         SFO|  217|
|            SPRING|         SFO|  199|
|              FALL|         SFO|  198|
|            SPRING|         LAX|  176|
|            WINTER|         SFO|  173|
|            SUMMER|         DEN|  172|
|            SUMMER|         LAX|  163|
|            SPRING|         DEN|  151|
|            SPRING|         PHX|  148|
|              FALL|         DEN|  145|
|            SPRING|         LAS|  145|
|            SUMMER|         ANC|  145|
|            WINTER|         LAX|  139|
|              FALL|         LAX|  137|
|            SUMMER|         LAS|  136|
|            SUMMER|         PHX|  134|
|            WINTER|         PHX|  131|
|              FALL|         LAS|  130|
|            SUMMER|         ORD|  129|
|            SUMMER|         DFW|  122|
+------------------+------------+-----+
only showing top 20 rows



### Pergunta 29

In [94]:
df2.where(df2.flights_dep_delay_category != 'ANTECIPATED').groupBy('flights_dep_delay_category').count().show()

+--------------------------+-----+
|flights_dep_delay_category|count|
+--------------------------+-----+
|                     MAJOR|  395|
|                     MINOR| 3065|
|                    INTIME|  646|
+--------------------------+-----+



### Pergunta 30

In [91]:
df2.where(df2.flights_dep_delay_category != 'ANTECIPATED').groupBy('flights_dep_delay_category', 'flights_dest', 'flights_origin').count().sort('count', ascending = False).show(5)

+--------------------------+------------+--------------+-----+
|flights_dep_delay_category|flights_dest|flights_origin|count|
+--------------------------+------------+--------------+-----+
|                     MINOR|         SFO|           SEA|  159|
|                     MINOR|         DEN|           SEA|  145|
|                     MINOR|         ANC|           SEA|  145|
|                     MINOR|         LAX|           SEA|  118|
|                     MINOR|         LAS|           SEA|  114|
+--------------------------+------------+--------------+-----+
only showing top 5 rows

