# Data Cleaning in PySpark

Schemas
+ May contain various data types
+ Can filter garbage data during import
+ Improves read performance 

In [None]:
import pyspark.sql.types
# `import pyspark.sql.types` alone does not put the type classes in scope,
# so the names used below are imported explicitly.
from pyspark.sql.types import IntegerType, StringType, StructField, StructType

# Schema for the people data: one StructField per column, given as
# (column name, data type, nullable).
peopleSchema = StructType([
    StructField('name', StringType(), True),  # True: the column may contain nulls
    StructField('age', IntegerType(), True),
    StructField('city', StringType(), True),
])

In [None]:
# Read the CSV with an explicit schema instead of letting Spark infer one.
# The file location must be passed as `path`: load() has no `name` parameter,
# so an unrecognised keyword is treated as a reader option and no path is set.
people_df = spark.read.format('csv').load(path='rawdata.csv',
                                          schema=peopleSchema)

In [None]:
# Import the pyspark.sql.types library
from pyspark.sql.types import *

# Build the schema as a StructType holding one non-nullable StructField
# per (column name, data type) pair
people_schema = StructType([
    StructField(column, dtype, False)
    for column, dtype in (
        ('name', StringType()),
        ('age', IntegerType()),
        ('city', StringType()),
    )
])

In [None]:
# The lower() call below is reached through the alias F, so bind it here
from pyspark.sql import functions as F

# Load the gzipped CSV file, with the Header option enabled
aa_dfw_df = spark.read.format('csv').options(Header=True).load('AA_DFW_2018.csv.gz')

# Add an 'airport' column holding the lower-cased 'Destination Airport' values
aa_dfw_df = aa_dfw_df.withColumn('airport', F.lower(aa_dfw_df['Destination Airport']))

# Drop the original 'Destination Airport' column now that 'airport' replaces it
aa_dfw_df = aa_dfw_df.drop(aa_dfw_df['Destination Airport'])

# Show the DataFrame
aa_dfw_df.show()

Difficulties with CSV files
+ No defined schema
+ Nested data requires special handling
+ Slow to parse
+ If schema is not provided, all data must be read before inferring schema
+ No predicate pushdown - predicate pushdown means ordering tasks so that the least amount of work is done; filtering the data before processing it is one of the key optimizations for large datasets. This cannot be done with CSV files.
+ Any intermediate use requires redefining schemas

Parquet Format
+ A Columnar data format
+ Supported in Spark 
+ Supports predicate pushdown
+ Automatically stores schema information
+ Binary file format

In [None]:
# Reading parquet files
# The next two lines show two alternative spellings of the same read: the
# generic format(...).load(...) form and the parquet(...) shortcut. The second
# assignment simply replaces the first.
df = spark.read.format('parquet').load('filename.parquet')
df = spark.read.parquet('filename.parquet')
# Register the DataFrame as a temporary view named 'flights'
df.createOrReplaceTempView('flights')

# Writing parquet files
# Again two alternative spellings: format(...).save(...) and the parquet(...)
# shortcut.
# NOTE(review): both calls target the same path; presumably only one would be
# used in practice, since with the writer's default save mode the second call
# would likely fail because the path already exists - confirm.
df.write.format('parquet').save('filename.parquet')
df.write.parquet('filename.parquet')

In [None]:
# Report how many rows each input DataFrame holds
df1_rows = df1.count()
print("df1 Count: %d" % df1_rows)
df2_rows = df2.count()
print("df2 Count: %d" % df2_rows)

# Append the rows of df2 to those of df1
df3 = df1.union(df2)

# Persist the combined DataFrame as Parquet, replacing any previous output
df3.write.mode('overwrite').parquet('AA_DFW_ALL.parquet')

# Load the saved Parquet data back and print its row count
combined_df = spark.read.parquet('AA_DFW_ALL.parquet')
print(combined_df.count())

In [None]:
# Load the combined Parquet data and expose it to SQL as the 'flights' view
flights_df = spark.read.parquet('AA_DFW_ALL.parquet')
flights_df.createOrReplaceTempView('flights')

# Query the average flight duration and keep the first row of the result
avg_query = 'SELECT avg(flight_duration) from flights'
result_rows = spark.sql(avg_query).collect()
avg_duration = result_rows[0]

# Print the average flight time using the %d format
print('The average flight time is: %d' % avg_duration)