In [1]:
from pyspark import SparkContext
from pyspark.sql import SparkSession

In [2]:
# Entry point for every Spark call in this notebook. getOrCreate() hands
# back the session that is already active when there is one and otherwise
# builds a new one under the application name given here.
spark = (
    SparkSession.builder
    .appName('SDG_Chapter02')
    .getOrCreate()
)

In [3]:
# Selects the inline matplotlib backend and pulls numpy/matplotlib names
# into the notebook namespace (hence the "Populating the interactive
# namespace" message printed below).
# NOTE(review): none of the cells visible here use those names; if that
# holds for the whole notebook, `%matplotlib inline` plus explicit imports
# would avoid the namespace-wide star import.
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [4]:
import pandas as pd

# Quick pandas preview of the raw CSV before it is loaded into Spark.
# The frame has its own name on purpose: the following cell binds
# `flightData2015` to the Spark DataFrame that every later cell uses, and
# sharing that name meant re-running this cell would silently swap the
# Spark DataFrame for a pandas one.
# NOTE(review): the path is an absolute, machine-specific location.
flightData2015Pandas = pd.read_csv('/home/jagadeesh/git/Spark-The-Definitive-Guide/data/flight-data/csv/2015-summary.csv')
flightData2015Pandas.head()

Unnamed: 0,DEST_COUNTRY_NAME,ORIGIN_COUNTRY_NAME,count
0,United States,Romania,15
1,United States,Croatia,1
2,United States,Ireland,344
3,Egypt,United States,15
4,United States,India,62


In [5]:
# Load the 2015 flight summary as a Spark DataFrame: the first line of the
# file provides the column names and Spark infers each column's type.
# NOTE(review): the path is an absolute, machine-specific location.
flightData2015 = (
    spark.read
    .options(inferSchema='true', header='true')
    .csv('/home/jagadeesh/git/Spark-The-Definitive-Guide/data/flight-data/csv/2015-summary.csv')
)

In [6]:
# Action: bring the first four rows back to the driver as a list of Row objects.
flightData2015.take(4)

[Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Romania', count=15),
 Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Croatia', count=1),
 Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Ireland', count=344),
 Row(DEST_COUNTRY_NAME='Egypt', ORIGIN_COUNTRY_NAME='United States', count=15)]

In [7]:
# Action: total number of rows in the DataFrame.
flightData2015.count()

256

In [8]:
# Action: return every row to the driver as a list of Row objects.
# Harmless for this file (count() above reports 256 rows), but collect()
# applies no limit -- prefer take(n) or show(n) on anything large.
flightData2015.collect()

[Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Romania', count=15),
 Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Croatia', count=1),
 Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Ireland', count=344),
 Row(DEST_COUNTRY_NAME='Egypt', ORIGIN_COUNTRY_NAME='United States', count=15),
 Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='India', count=62),
 Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Singapore', count=1),
 Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Grenada', count=62),
 Row(DEST_COUNTRY_NAME='Costa Rica', ORIGIN_COUNTRY_NAME='United States', count=588),
 Row(DEST_COUNTRY_NAME='Senegal', ORIGIN_COUNTRY_NAME='United States', count=40),
 Row(DEST_COUNTRY_NAME='Moldova', ORIGIN_COUNTRY_NAME='United States', count=1),
 Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Sint Maarten', count=325),
 Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Marshall Islands', count=39),
 

In [9]:
# Print the physical plan for sorting by `count`. The Exchange step in the
# output is the shuffle; its trailing number is the shuffle partition count.
flightData2015.sort('count').explain()

== Physical Plan ==
*(2) Sort [count#12 ASC NULLS FIRST], true, 0
+- Exchange rangepartitioning(count#12 ASC NULLS FIRST, 200)
   +- *(1) FileScan csv [DEST_COUNTRY_NAME#10,ORIGIN_COUNTRY_NAME#11,count#12] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/home/jagadeesh/git/Spark-The-Definitive-Guide/data/flight-data/csv/2015-s..., PartitionFilters: [], PushedFilters: [], ReadSchema: struct<DEST_COUNTRY_NAME:string,ORIGIN_COUNTRY_NAME:string,count:int>


In [10]:
# Lower the number of shuffle partitions to 5 (the plan printed above shows
# 200), then run the sort and fetch the two rows with the smallest `count`.
spark.conf.set('spark.sql.shuffle.partitions', '5')
flightData2015.sort('count').take(2)

[Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Singapore', count=1),
 Row(DEST_COUNTRY_NAME='Moldova', ORIGIN_COUNTRY_NAME='United States', count=1)]

In [11]:
# Same query with 15 shuffle partitions; the partition count changes how the
# sort is executed, not which rows it returns.
spark.conf.set('spark.sql.shuffle.partitions', '15')
flightData2015.sort('count').take(2)

[Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Singapore', count=1),
 Row(DEST_COUNTRY_NAME='Moldova', ORIGIN_COUNTRY_NAME='United States', count=1)]

In [12]:
# Set the shuffle partition count to 15 in this cell as well, so the plan
# below does not depend on which earlier cell ran last, then print the plan.
spark.conf.set('spark.sql.shuffle.partitions', '15')
flightData2015.sort('count').explain()

== Physical Plan ==
*(2) Sort [count#12 ASC NULLS FIRST], true, 0
+- Exchange rangepartitioning(count#12 ASC NULLS FIRST, 15)
   +- *(1) FileScan csv [DEST_COUNTRY_NAME#10,ORIGIN_COUNTRY_NAME#11,count#12] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/home/jagadeesh/git/Spark-The-Definitive-Guide/data/flight-data/csv/2015-s..., PartitionFilters: [], PushedFilters: [], ReadSchema: struct<DEST_COUNTRY_NAME:string,ORIGIN_COUNTRY_NAME:string,count:int>


In [13]:
# Show the column names, types and nullability of the DataFrame.
flightData2015.printSchema()

root
 |-- DEST_COUNTRY_NAME: string (nullable = true)
 |-- ORIGIN_COUNTRY_NAME: string (nullable = true)
 |-- count: integer (nullable = true)



In [15]:
# Register the DataFrame as a temp view named `flightData2015` so the
# spark.sql(...) queries in later cells can refer to it by that name.
flightData2015.createOrReplaceTempView('flightData2015')

In [16]:
# Action: print the leading rows of the DataFrame as a formatted text table.
flightData2015.show()

+--------------------+-------------------+-----+
|   DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+--------------------+-------------------+-----+
|       United States|            Romania|   15|
|       United States|            Croatia|    1|
|       United States|            Ireland|  344|
|               Egypt|      United States|   15|
|       United States|              India|   62|
|       United States|          Singapore|    1|
|       United States|            Grenada|   62|
|          Costa Rica|      United States|  588|
|             Senegal|      United States|   40|
|             Moldova|      United States|    1|
|       United States|       Sint Maarten|  325|
|       United States|   Marshall Islands|   39|
|              Guyana|      United States|   64|
|               Malta|      United States|    1|
|            Anguilla|      United States|   41|
|             Bolivia|      United States|   30|
|       United States|           Paraguay|    6|
|             Algeri

In [17]:
# Evaluating the DataFrame by itself displays only its column names and
# types (see the output below); it does not print any rows.
flightData2015

DataFrame[DEST_COUNTRY_NAME: string, ORIGIN_COUNTRY_NAME: string, count: int]

In [18]:
# Number of rows per destination country, expressed as SQL against the
# `flightData2015` temp view. This only defines the DataFrame; rows are
# produced when an action such as take() is called on it.
destinationCountQuery = """
SELECT DEST_COUNTRY_NAME, count(1)
FROM flightData2015
GROUP BY DEST_COUNTRY_NAME
"""
sqlWay = spark.sql(destinationCountQuery)

In [19]:
# The same per-destination row count as `sqlWay`, written with the
# DataFrame API instead of SQL.
dataFrameWay = (
    flightData2015
    .groupBy('DEST_COUNTRY_NAME')
    .count()
)

In [20]:
# Physical plan of the SQL version; compare it with the plan printed for
# dataFrameWay in the following cell.
sqlWay.explain()

== Physical Plan ==
*(2) HashAggregate(keys=[DEST_COUNTRY_NAME#10], functions=[count(1)])
+- Exchange hashpartitioning(DEST_COUNTRY_NAME#10, 15)
   +- *(1) HashAggregate(keys=[DEST_COUNTRY_NAME#10], functions=[partial_count(1)])
      +- *(1) FileScan csv [DEST_COUNTRY_NAME#10] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/home/jagadeesh/git/Spark-The-Definitive-Guide/data/flight-data/csv/2015-s..., PartitionFilters: [], PushedFilters: [], ReadSchema: struct<DEST_COUNTRY_NAME:string>


In [21]:
# Physical plan of the DataFrame-API version; it matches the plan printed
# for sqlWay.
dataFrameWay.explain()

== Physical Plan ==
*(2) HashAggregate(keys=[DEST_COUNTRY_NAME#10], functions=[count(1)])
+- Exchange hashpartitioning(DEST_COUNTRY_NAME#10, 15)
   +- *(1) HashAggregate(keys=[DEST_COUNTRY_NAME#10], functions=[partial_count(1)])
      +- *(1) FileScan csv [DEST_COUNTRY_NAME#10] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/home/jagadeesh/git/Spark-The-Definitive-Guide/data/flight-data/csv/2015-s..., PartitionFilters: [], PushedFilters: [], ReadSchema: struct<DEST_COUNTRY_NAME:string>


In [22]:
# Action: run the grouped count and bring back two of the result rows.
sqlWay.take(2)

[Row(DEST_COUNTRY_NAME='Pakistan', count(1)=1),
 Row(DEST_COUNTRY_NAME='Japan', count(1)=1)]

In [23]:
# Largest `count` value in the view, via SQL; the aggregate produces a
# single row, which take(1) returns.
spark.sql('SELECT max(count) from flightData2015').take(1)

[Row(max(count)=370002)]

In [24]:
# DataFrame-API counterpart of the SQL max(count) query.
# The functions module is imported under an alias instead of
# `from pyspark.sql.functions import max`: that form rebinds the name `max`
# for the whole notebook, so any later plain-Python max(...) call would hit
# the Spark column function instead of the builtin.
from pyspark.sql import functions as F

# An aggregate without groupBy yields exactly one row, so take(1) returns
# the complete result.
flightData2015.select(F.max('count')).take(1)

[Row(max(count)=370002)]

In [25]:
# Ten destination countries with the highest summed `count`, largest first,
# expressed as SQL against the `flightData2015` temp view.
topDestinationsQuery = """
SELECT DEST_COUNTRY_NAME, sum(count) as destination_total
FROM flightData2015
GROUP BY DEST_COUNTRY_NAME
ORDER BY sum(count) DESC
LIMIT 10
"""
maxSql = spark.sql(topDestinationsQuery)

In [26]:
# Action: run the top-10 query and print the result as a text table.
maxSql.show()

+------------------+-----------------+
| DEST_COUNTRY_NAME|destination_total|
+------------------+-----------------+
|     United States|           411352|
|            Canada|             8399|
|            Mexico|             7140|
|    United Kingdom|             2025|
|             Japan|             1548|
|           Germany|             1468|
|Dominican Republic|             1353|
|       South Korea|             1048|
|       The Bahamas|              955|
|            France|              935|
+------------------+-----------------+



In [27]:
# Action: return the result rows as a list of Row objects; the query's
# LIMIT 10 keeps the collected result to at most ten rows.
maxSql.collect()

[Row(DEST_COUNTRY_NAME='United States', destination_total=411352),
 Row(DEST_COUNTRY_NAME='Canada', destination_total=8399),
 Row(DEST_COUNTRY_NAME='Mexico', destination_total=7140),
 Row(DEST_COUNTRY_NAME='United Kingdom', destination_total=2025),
 Row(DEST_COUNTRY_NAME='Japan', destination_total=1548),
 Row(DEST_COUNTRY_NAME='Germany', destination_total=1468),
 Row(DEST_COUNTRY_NAME='Dominican Republic', destination_total=1353),
 Row(DEST_COUNTRY_NAME='South Korea', destination_total=1048),
 Row(DEST_COUNTRY_NAME='The Bahamas', destination_total=955),
 Row(DEST_COUNTRY_NAME='France', destination_total=935)]

In [28]:
from pyspark.sql.functions import desc

# DataFrame-API version of the top-10 SQL query: sum `count` per
# destination, rename the generated 'sum(count)' column, order by the total
# from largest to smallest, keep ten rows and print them.
(
    flightData2015
    .groupBy('DEST_COUNTRY_NAME')
    .sum('count')
    .withColumnRenamed('sum(count)', 'destination_total')
    .sort(desc('destination_total'))
    .limit(10)
    .show()
)

+------------------+-----------------+
| DEST_COUNTRY_NAME|destination_total|
+------------------+-----------------+
|     United States|           411352|
|            Canada|             8399|
|            Mexico|             7140|
|    United Kingdom|             2025|
|             Japan|             1548|
|           Germany|             1468|
|Dominican Republic|             1353|
|       South Korea|             1048|
|       The Bahamas|              955|
|            France|              935|
+------------------+-----------------+



In [30]:
# Top-10 destination totals once more, this time built in named steps.
# Relies on `desc` having been imported by an earlier cell.
totalsByDestination = (
    flightData2015
    .groupBy('DEST_COUNTRY_NAME')
    .sum('count')
    .withColumnRenamed('sum(count)',  'destination_total')
)
topDestinations = totalsByDestination.sort(desc('destination_total')).limit(10)
topDestinations.show()

+------------------+-----------------+
| DEST_COUNTRY_NAME|destination_total|
+------------------+-----------------+
|     United States|           411352|
|            Canada|             8399|
|            Mexico|             7140|
|    United Kingdom|             2025|
|             Japan|             1548|
|           Germany|             1468|
|Dominican Republic|             1353|
|       South Korea|             1048|
|       The Bahamas|              955|
|            France|              935|
+------------------+-----------------+

