# Data Cleaning in PySpark

+ Schemas
+ May contain various data types
+ Can filter garbage data during import
+ Improves read performance 

In [None]:
import pyspark.sql.types
# `import pyspark.sql.types` alone does not bring the type classes into scope,
# so the names used unqualified below are imported explicitly.
from pyspark.sql.types import IntegerType, StringType, StructField, StructType

# Schema for the people data: one StructField per column, given as
# (column name, data type, nullable flag).
peopleSchema = StructType([
    StructField('name', StringType(), True),  # True: the column may contain nulls
    StructField('age', IntegerType(), True),
    StructField('city', StringType(), True),
])

In [None]:
# Read the raw CSV using the explicit schema rather than an inferred one.
# DataFrameReader.load() takes the file location as `path`; an unrecognised
# keyword such as `name` is passed through as a reader option, which leaves
# the reader without any path to load.
people_df = spark.read.format('csv').load(path='rawdata.csv',
                                          schema=peopleSchema)

In [None]:
# Import the pyspark.sql.types library
from pyspark.sql.types import *

# Build the people schema: three columns, each declared as non-nullable.
people_schema = StructType(
    fields=[
        # One StructField per column: name, data type, nullable flag
        StructField(name='name', dataType=StringType(), nullable=False),
        StructField(name='age', dataType=IntegerType(), nullable=False),
        StructField(name='city', dataType=StringType(), nullable=False),
    ]
)

In [None]:
# F.lower() is used below, so the Spark SQL functions module must be in scope.
from pyspark.sql import functions as F

# Load the CSV file, using its first row as the column names
# (`header` is the documented spelling of the option; keys are case-insensitive)
aa_dfw_df = spark.read.format('csv').options(header=True).load('AA_DFW_2018.csv.gz')

# Add the airport column: a lower-cased copy of 'Destination Airport'
aa_dfw_df = aa_dfw_df.withColumn('airport', F.lower(aa_dfw_df['Destination Airport']))

# Drop the original Destination Airport column now that 'airport' replaces it
aa_dfw_df = aa_dfw_df.drop(aa_dfw_df['Destination Airport'])

# Show the DataFrame
aa_dfw_df.show()

Difficulties with CSV files
+ No defined schema
+ Nested data requires special handling
+ Slow to parse
+ If schema is not provided, all data must be read before inferring schema
+ No predicate pushdown - Predicate pushdown means ordering tasks so that the least amount of work is done; filtering the data before processing it is one of the key optimizations for large datasets. With CSV files this cannot be done.
+ Any intermediate use requires redefining schemas

Parquet Format
+ A Columnar data format
+ Supported in Spark 
+ Supports predicate pushdown
+ Automatically stores schema information
+ Binary file format

In [None]:
# Two spellings of a Parquet read, shown side by side; no schema argument is
# passed to either call.
# Generic form: name the format, then load the path.
df = spark.read.format('parquet').load('filename.parquet')
# Format-specific reader method on the same file; rebinding `df` replaces the
# result of the previous line.
df = spark.read.parquet('filename.parquet')