In [80]:
from pyspark.sql.types import *
import pyspark.sql.functions as F
from pyspark.context import SparkContext
from pyspark.sql import SparkSession

In [81]:
# Create the Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName("teste")
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)

In [82]:
# Schema of the input file: every column is declared as a string with
# nullable=False, in the same order as the columns listed below.
departamentSchema = StructType([
    StructField(column_name, StringType(), False)
    for column_name in (
        'Date (MM/DD/YYYY)',
        'Flight Number',
        'Destination Airport',
        'Actual elapsed time (Minutes)',
    )
])

In [83]:
# Read the CSV file (header option enabled) using the explicit schema above.
departament_df = (
    spark.read
    .format('csv')
    .options(Header=True)
    .schema(departamentSchema)
    .load('./dataFile/AA_DFW_2014_Departures_Short.csv')
)

In [85]:
# Rename the dataframe columns to short snake_case names.
departament_df = (
    departament_df
    .withColumnRenamed('Date (MM/DD/YYYY)', 'data')
    .withColumnRenamed('Flight Number', 'flight_number')
    .withColumnRenamed('Destination Airport', 'destination_airport')
    .withColumnRenamed('Actual elapsed time (Minutes)', 'actual_elapsed_time')
)

# Convert the 'data' string column to a date.
# The source column is labelled MM/DD/YYYY, so the pattern must be
# month/day/year. 'MM' is month-of-year; lowercase 'mm' is minute-of-hour and
# would leave the month unparsed while reading the day from the month field.
departament_df = departament_df.withColumn('data', F.to_date(F.col('data'), 'MM/dd/yyyy'))

# Report how many rows the dataframe holds.
print('O arquivo contém : ' , departament_df.count() , 'linhas')

O arquivo contém :  157198 linhas


In [86]:
# Keep only the rows whose destination airport equals 'STL'.
# Plain equality states the intent directly; like() is a SQL pattern match and
# is only needed when the value contains wildcards.
stl_df = departament_df.filter(departament_df['destination_airport'] == 'STL')
print('O dataframe filtrado contém : ' , stl_df.count() , 'linhas')

O dataframe filtrado contém :  2798 linhas


In [87]:
# Project the filtered dataframe down to the single 'flight_number' column.
filter_col_df = stl_df.select(stl_df['flight_number'])

In [89]:
# Add an 'ano' column holding the year extracted from the 'data' column.
withColumn_df = departament_df.withColumn(
    'ano',
    F.year(F.col('data')),
)

In [93]:
# Show the dataframe with 'flight_number' cast to an integer.
# The result is only displayed; it is not assigned to any variable.
withColumn_df.withColumn('flight_number', F.col('flight_number').cast('int'))

DataFrame[data: date, flight_number: int, destination_airport: string, actual_elapsed_time: string, ano: int]

In [98]:
# Show the distinct values of the 'ano' column.
distinct_years_df = withColumn_df.select('ano').distinct()
distinct_years_df.show()

+----+
| ano|
+----+
|2014|
+----+

