In [1]:
import os
import sys

# Point both the PySpark worker and driver interpreter settings at the
# Python executable that is running this kernel.
os.environ.update(
    dict.fromkeys(("PYSPARK_PYTHON", "PYSPARK_DRIVER_PYTHON"), sys.executable)
)

In [2]:
from pyspark.sql import SparkSession

# Build the SparkSession used by the rest of the notebook
# (getOrCreate returns the existing session if one is already active).
spark = (
    SparkSession.builder
    .appName("Testing PySpark Example")
    .getOrCreate()
)

Load the 2012 flight summary CSV into a DataFrame and create a temporary view over it.

In [5]:
# Read the 2012 summary CSV, using the first row as column names and
# asking Spark to infer the column types.
flightData2012 = (
    spark.read
    .option("inferSchema", "true")
    .option("header", "true")
    .csv("./data/2012-summary.csv")
)

# Register the DataFrame under a name that spark.sql() queries can reference.
flightData2012.createOrReplaceTempView("flight_data_2012")

Check execution plans

In [7]:
# The same per-destination row count, expressed first as SQL against the
# temp view ...
sqlWay = spark.sql("""
 SELECT DEST_COUNTRY_NAME, count(1)
 FROM flight_data_2012
 GROUP BY DEST_COUNTRY_NAME
 """)

# ... and then through the DataFrame API.
dataFrameWay = (
    flightData2012
    .groupBy("DEST_COUNTRY_NAME")
    .count()
)

# Print both physical plans so they can be compared side by side.
sqlWay.explain()

dataFrameWay.explain()

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- HashAggregate(keys=[DEST_COUNTRY_NAME#40], functions=[count(1)])
   +- Exchange hashpartitioning(DEST_COUNTRY_NAME#40, 200), ENSURE_REQUIREMENTS, [plan_id=55]
      +- HashAggregate(keys=[DEST_COUNTRY_NAME#40], functions=[partial_count(1)])
         +- FileScan csv [DEST_COUNTRY_NAME#40] Batched: false, DataFilters: [], Format: CSV, Location: InMemoryFileIndex(1 paths)[file:/c:/Users/chevron/Documents/py-projects/data/2012-summary.csv], PartitionFilters: [], PushedFilters: [], ReadSchema: struct<DEST_COUNTRY_NAME:string>


== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- HashAggregate(keys=[DEST_COUNTRY_NAME#40], functions=[count(1)])
   +- Exchange hashpartitioning(DEST_COUNTRY_NAME#40, 200), ENSURE_REQUIREMENTS, [plan_id=68]
      +- HashAggregate(keys=[DEST_COUNTRY_NAME#40], functions=[partial_count(1)])
         +- FileScan csv [DEST_COUNTRY_NAME#40] Batched: false, DataFilters: [], Format: CSV, Location: InMemory

Find the maximum number of flights to and from any given location.

In [8]:
# with SQL: largest value in the `count` column of the temp view
(
    spark
    .sql("SELECT max(count) from flight_data_2012")
    .take(1)
)

[Row(max(count)=347452)]

In [9]:
# with DataFrame
# Import the functions module under an alias instead of
# `from pyspark.sql.functions import max`: the bare import rebinds Python's
# built-in max() for every cell that runs afterwards in this kernel.
from pyspark.sql import functions as F

# Largest value in the `count` column; the result column is still named
# max(count).
flightData2012.select(F.max("count")).take(1)

[Row(max(count)=347452)]

Find the top five destination countries in the data.

In [10]:
# with SQL
# Sum the `count` column per destination country, order the totals from
# largest to smallest, and keep the first five rows.
maxSql = spark.sql("""
 SELECT DEST_COUNTRY_NAME, sum(count) as destination_total
 FROM flight_data_2012
 GROUP BY DEST_COUNTRY_NAME
 ORDER BY sum(count) DESC
 LIMIT 5
 """)

# show() is the action that actually runs the query and prints the rows.
maxSql.show()

+-----------------+-----------------+
|DEST_COUNTRY_NAME|destination_total|
+-----------------+-----------------+
|    United States|           384342|
|           Canada|             8034|
|           Mexico|             5983|
|   United Kingdom|             1852|
|            Japan|             1538|
+-----------------+-----------------+



In [11]:
# with DataFrame
from pyspark.sql.functions import desc

# Total the `count` column per destination, rename the aggregate column,
# sort descending on it, and print the first five rows.
(
    flightData2012
    .groupBy("DEST_COUNTRY_NAME")
    .sum("count")
    .withColumnRenamed("sum(count)", "destination_total")
    .sort(desc("destination_total"))
    .limit(5)
    .show()
)

+-----------------+-----------------+
|DEST_COUNTRY_NAME|destination_total|
+-----------------+-----------------+
|    United States|           384342|
|           Canada|             8034|
|           Mexico|             5983|
|   United Kingdom|             1852|
|            Japan|             1538|
+-----------------+-----------------+



Explain plan for the above transformations and action.

In [12]:
# Same top-five pipeline as the DataFrame cell, but ending in explain()
# so the physical plan is printed instead of the rows.
(
    flightData2012
    .groupBy("DEST_COUNTRY_NAME")
    .sum("count")
    .withColumnRenamed("sum(count)", "destination_total")
    .sort(desc("destination_total"))
    .limit(5)
    .explain()
)

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- TakeOrderedAndProject(limit=5, orderBy=[destination_total#131L DESC NULLS LAST], output=[DEST_COUNTRY_NAME#40,destination_total#131L])
   +- HashAggregate(keys=[DEST_COUNTRY_NAME#40], functions=[sum(count#42)])
      +- Exchange hashpartitioning(DEST_COUNTRY_NAME#40, 200), ENSURE_REQUIREMENTS, [plan_id=238]
         +- HashAggregate(keys=[DEST_COUNTRY_NAME#40], functions=[partial_sum(count#42)])
            +- FileScan csv [DEST_COUNTRY_NAME#40,count#42] Batched: false, DataFilters: [], Format: CSV, Location: InMemoryFileIndex(1 paths)[file:/c:/Users/chevron/Documents/py-projects/data/2012-summary.csv], PartitionFilters: [], PushedFilters: [], ReadSchema: struct<DEST_COUNTRY_NAME:string,count:int>




In [13]:
# Print the physical plan of the SQL top-five query (maxSql) for comparison
# with the DataFrame version's plan.
maxSql.explain()

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- TakeOrderedAndProject(limit=5, orderBy=[destination_total#79L DESC NULLS LAST], output=[DEST_COUNTRY_NAME#40,destination_total#79L])
   +- HashAggregate(keys=[DEST_COUNTRY_NAME#40], functions=[sum(count#42)])
      +- Exchange hashpartitioning(DEST_COUNTRY_NAME#40, 200), ENSURE_REQUIREMENTS, [plan_id=255]
         +- HashAggregate(keys=[DEST_COUNTRY_NAME#40], functions=[partial_sum(count#42)])
            +- FileScan csv [DEST_COUNTRY_NAME#40,count#42] Batched: false, DataFilters: [], Format: CSV, Location: InMemoryFileIndex(1 paths)[file:/c:/Users/chevron/Documents/py-projects/data/2012-summary.csv], PartitionFilters: [], PushedFilters: [], ReadSchema: struct<DEST_COUNTRY_NAME:string,count:int>


