# Data Cleaning in PySpark

+ Schemas
+ May contain various data types
+ Can filter garbage data during import
+ Improves read performance 

In [None]:
import pyspark.sql.types
# A bare `import pyspark.sql.types` only exposes the classes as
# `pyspark.sql.types.StructType` etc.; import the ones used below by name.
from pyspark.sql.types import IntegerType, StringType, StructField, StructType

# Schema for the people data: one StructField per column.
# The third StructField argument is `nullable` (True = the column may hold nulls).
peopleSchema = StructType([
    StructField('name', StringType(), True),
    StructField('age', IntegerType(), True),
    StructField('city', StringType(), True),
])

In [None]:
# Read rawdata.csv as CSV, applying peopleSchema instead of inferring column types.
# The file location goes in `path`; load() has no `name` parameter, so a `name=`
# keyword would be forwarded as a reader option and no file would be given.
people_df = spark.read.format('csv').load(path='rawdata.csv',
                                          schema=peopleSchema)

In [None]:
# Import the pyspark.sql.types library
from pyspark.sql.types import *

# Build a StructType schema in which every column is declared non-nullable
# (third StructField argument is False).
people_schema = StructType([
    StructField(column_name, column_type, False)
    for column_name, column_type in (
        ('name', StringType()),
        ('age', IntegerType()),
        ('city', StringType()),
    )
])

In [None]:
# pyspark.sql.functions provides lower(); imported here so this cell does not
# depend on a later cell having bound `F` first.
import pyspark.sql.functions as F

# Load the gzipped CSV file, treating the first line as the header row
aa_dfw_df = spark.read.format('csv').options(Header=True).load('AA_DFW_2018.csv.gz')

# Add the airport column: a lowercased copy of 'Destination Airport'
aa_dfw_df = aa_dfw_df.withColumn('airport', F.lower(aa_dfw_df['Destination Airport']))

# Drop the original Destination Airport column now that 'airport' replaces it
aa_dfw_df = aa_dfw_df.drop(aa_dfw_df['Destination Airport'])

# Show the DataFrame
aa_dfw_df.show()

Difficulties with CSV files
+ No Defined schema
+ Nested data requires special handling
+ Slow to parse
+ If schema is not provided, all data must be read before inferring schema
+ No predicate pushdown - Predicate pushdown means ordering tasks so that the least amount of work is done; filtering the data before processing it is one of the main optimizations for large datasets. This cannot be done with CSV files.
+ Any intermediate use requires redefining schemas

Parquet Format
+ A Columnar data format
+ Supported in Spark 
+ Supports predicate pushdown
+ Automatically stores schema information
+ Binary file format

In [None]:
# Reading parquet files
# Generic reader form: name the format, then load the path.
df = spark.read.format('parquet').load('filename.parquet')
# Shortcut form of the same read; rebinds df.
df = spark.read.parquet('filename.parquet')
# Register df under the view name 'flights' (a later cell queries a view of
# this name through spark.sql).
df.createOrReplaceTempView('flights')

# Writing parquet files
# Generic writer form: name the format, then save to the path.
df.write.format('parquet').save('filename.parquet')
# Shortcut form of the same write.
# NOTE(review): both writes target the same path; with the default save mode the
# second call is expected to fail because the path already exists. Treat the two
# lines as alternatives rather than a sequence - confirm before running as-is.
df.write.parquet('filename.parquet')

In [None]:
# Report how many rows each input DataFrame holds
df1_rows = df1.count()
print("df1 Count: %d" % df1_rows)
df2_rows = df2.count()
print("df2 Count: %d" % df2_rows)

# Stack df2's rows beneath df1's to form a single DataFrame
df3 = df1.union(df2)

# Persist the combined DataFrame as Parquet, replacing any previous output
df3.write.mode('overwrite').parquet('AA_DFW_ALL.parquet')

# Read the Parquet output back and print its row count
reloaded_df = spark.read.parquet('AA_DFW_ALL.parquet')
print(reloaded_df.count())

In [None]:
# Load the combined Parquet data and expose it to Spark SQL as the 'flights' view
flights_df = spark.read.parquet('AA_DFW_ALL.parquet')
flights_df.createOrReplaceTempView('flights')

# Compute the average flight duration with SQL; the query yields a single row
avg_query = 'SELECT avg(flight_duration) from flights'
result_rows = spark.sql(avg_query).collect()
avg_duration = result_rows[0]

# avg_duration is the whole first row (one field); %-formatting consumes it as
# its argument tuple and %d prints the value truncated to an integer.
print('The average flight time is: %d' % avg_duration)

### Manipulating DataFrames

In [None]:
# year() lives in pyspark.sql.functions; imported here so this cell does not
# depend on a later cell having bound `F` first.
import pyspark.sql.functions as F

# Filter / Where - both take a boolean Column and keep the rows where it is true
voter_df.filter(voter_df.date > '1/1/2019')
voter_df.where(voter_df.date > '1/1/2019')
voter_df.filter(voter_df['name'].isNotNull())  # Remove Nulls
voter_df.where(~ voter_df['name'].isNull())
# A Column has no `.year` attribute (attribute access means struct-field access),
# so the year is extracted with F.year().
voter_df.filter(F.year(voter_df.date) > 1800)
voter_df.where(voter_df['_C0'].contains('VOTE'))
voter_df.where(~voter_df._c1.isNull())  # using Negation

# Select
voter_df.select(voter_df.name)

# withColumn to create new column - (name_of_column, command to create)
voter_df.withColumn('year', F.year(voter_df.date))

# drop
voter_df.drop('unused_column')

In [None]:
# String transformations contained in pyspark.sql.functions
import pyspark.sql.functions as F

# Each call derives one new column from an existing one and returns a new DataFrame
name_col = F.col('name')
voter_df.withColumn('lower', F.lower(name_col))           # lowercased copy of name
voter_df.withColumn('splits', F.split(name_col, ' '))     # intermediary column: name split on ' '
voter_df.withColumn('year', F.col('_c4').cast(IntegerType()))  # _c4 cast to integer

In [None]:
# ArrayType column helpers (written as comments: a line that starts with `.`
# is not valid Python on its own).
#   F.size(column)          -> length of an ArrayType column
#   column.getItem(index)   -> retrieves the item at `index` from the array
#   column[index_column]    -> same lookup when the index is itself a Column

In [None]:
# Show the distinct VOTER_NAME entries
voter_df.select('VOTER_NAME').distinct().show(40, truncate=False)

# Keep only rows whose VOTER_NAME is 1-20 characters long. Both bounds are
# inclusive: the upper test is `<= 20`, since `< 20` would drop 20-character names.
voter_df = voter_df.filter('length(VOTER_NAME) > 0 and length(VOTER_NAME) <= 20')

# Filter out voter_df rows where the VOTER_NAME contains an underscore
voter_df = voter_df.filter(~ F.col('VOTER_NAME').contains('_'))

# Show the distinct VOTER_NAME entries again
voter_df.select('VOTER_NAME').distinct().show(40, truncate=False)

In [None]:
# Add a new column called splits separated on whitespace.
# Raw string: '\s' is not a valid Python escape, so the regex must be r'\s+'.
voter_df = voter_df.withColumn('splits', F.split(voter_df.VOTER_NAME, r'\s+'))

# Create a new column called first_name based on the first item in splits
voter_df = voter_df.withColumn('first_name', voter_df.splits.getItem(0))

# Get the last entry of the splits list and create a column called last_name.
# The last index is size - 1; the index operator is used because the index is
# itself a Column (passing a Column to getItem is deprecated).
voter_df = voter_df.withColumn('last_name', voter_df.splits[F.size('splits') - 1])

# Drop the splits column
voter_df = voter_df.drop('splits')

# Show the voter_df DataFrame
voter_df.show()

In [None]:
# Conditionals: F.when(if_condition, then_value) builds a column that holds
# then_value on the rows where if_condition is true.
adult_label = F.when(df.Age >= 18, "Adult")
df.select(df.Name, df.Age, adult_label)

In [None]:
# Columns shown next to the computed label in both examples
person_cols = (df.Name, df.Age)

# Chained when(): each branch supplies its own condition
adult_or_minor = F.when(df.Age >= 18, "Adult").when(df.Age < 18, "Minor")
df.select(*person_cols, adult_or_minor)

# otherwise() is like else: it supplies the value for rows no when() matched
adult_else_minor = F.when(df.Age >= 18, "Adult").otherwise("Minor")
df.select(*person_cols, adult_else_minor)

In [None]:
# Add a random_val column to voter_df that holds a random number for any voter
# with the title **Councilmember**. when() is reached through the `F` alias,
# like every other pyspark.sql.functions call in this file; a bare `when` is
# not defined.
voter_df = voter_df.withColumn('random_val',
                               F.when(voter_df.TITLE == 'Councilmember', F.rand()))

# Show some of the DataFrame rows, noting whether the when clause worked
voter_df.show()

In [None]:
# Build the random_val expression from the voter's TITLE:
#   Councilmember -> a random number, Mayor -> 2, anyone else -> 0
title_col = voter_df.TITLE
random_val_expr = (
    F.when(title_col == 'Councilmember', F.rand())
    .when(title_col == 'Mayor', 2)
    .otherwise(0)
)
voter_df = voter_df.withColumn('random_val', random_val_expr)

# Display some of the resulting rows
voter_df.show()

# Display only the rows that fell through to the otherwise() branch value
zero_rows = voter_df.filter(voter_df.random_val == 0)
zero_rows.show()

#### UDF

In [None]:
+ A UDF (user-defined function) starts as an ordinary Python method
+ It is wrapped via the pyspark.sql.functions.udf method
+ The wrapped function is stored as a variable and called like a normal Spark function

In [None]:
# Define a python method
def reverseString(mystr):
    """Return ``mystr`` with its characters in reverse order.

    Args:
        mystr: The string to reverse, or ``None``.

    Returns:
        The reversed string, or ``None`` when ``mystr`` is ``None``. Spark
        passes ``None`` to a UDF for null column values, so the null is
        propagated instead of raising ``TypeError`` on the slice.
    """
    if mystr is None:
        return None
    return mystr[::-1]

# Wrap the Python function as a Spark UDF returning strings and keep it in a variable
udfReverseString = F.udf(reverseString, StringType())

# Use it like any other Spark column function. withColumn is a DataFrame
# method, so it is called as user_df.withColumn (not the undefined name
# user_df_withColumn).
user_df = user_df.withColumn('ReverseName', udfReverseString(user_df.Name))

In [None]:

def getFirstAndMiddle(names):
    """Join every name except the last into one space-separated string.

    Args:
        names: A sequence of name parts (for example the result of splitting
            a full name on whitespace), or ``None``.

    Returns:
        The parts before the final one joined by single spaces; an empty
        string when there are fewer than two parts; ``None`` when ``names``
        is ``None``. Spark passes ``None`` to a UDF for null values, so the
        null is propagated instead of raising ``TypeError``.
    """
    if names is None:
        return None
    # names[:-1] drops the final element (the last name)
    return ' '.join(names[:-1])

# Wrap getFirstAndMiddle as a Spark UDF whose return type is a string
udfFirstAndMiddle = F.udf(getFirstAndMiddle, returnType=StringType())

# Apply the UDF to the splits column and store the result as first_and_middle_name.
# NOTE(review): this needs voter_df to still have a `splits` column - confirm
# the column exists at this point before running.
first_and_middle = udfFirstAndMiddle(voter_df['splits'])
voter_df = voter_df.withColumn('first_and_middle_name', first_and_middle)

# Display the DataFrame
voter_df.show()

#### Assigning IDs

+ Sequential IDs create a bottleneck in Spark
+ Monotonically increasing IDs can be used in Spark instead
+ pyspark.sql.functions.monotonically_increasing_id()
+ These IDs are not sequential, so they can be generated completely in parallel
+ IDs are assigned based on the partitions

In [None]:
# Reduce df to the distinct values of its "VOTER NAME" column
voter_df = df.select("VOTER NAME").distinct()

# Report how many distinct voters there are
voter_rows = voter_df.count()
print("\nThere are %d rows in the voter_df DataFrame.\n" % voter_rows)

# Attach a ROW_ID column generated by monotonically_increasing_id()
row_id = F.monotonically_increasing_id()
voter_df = voter_df.withColumn('ROW_ID', row_id)

# Display the 10 rows with the highest ROW_ID values
voter_df.orderBy(F.col('ROW_ID').desc()).show(10)