In [13]:
import findspark
findspark.init()
import pyspark
from pyspark import SparkConf
from pyspark.sql import SparkSession

In [21]:
# Create (or reuse) the SparkSession that every later cell refers to as `spark`.
# No cell in this notebook defines `spark`, so on a fresh kernel this cell would
# otherwise stop with a NameError. getOrCreate() hands back the already-running
# session when there is one, so re-running the cell is harmless.
spark = SparkSession.builder.getOrCreate()

# Location of the 2015 flight-summary CSV.
# NOTE(review): absolute path on the author's machine - adjust when running elsewhere.
FLIGHT_DATA_2015_CSV = "C:/Users/pilla/Documents/Spark-The-Definitive-Guide-master/Spark-The-Definitive-Guide-master/data/flight-data/csv/2015-summary.csv"

# Read the CSV with the first line treated as the header and column types inferred.
flightData2015 = (
    spark.read
    .option("inferSchema", "true")
    .option("header", "true")
    .csv(FLIGHT_DATA_2015_CSV)
)

In [11]:
# Print a tabular preview of the loaded flight data.
flightData2015.show()

+--------------------+-------------------+-----+
|   DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+--------------------+-------------------+-----+
|       United States|            Romania|   15|
|       United States|            Croatia|    1|
|       United States|            Ireland|  344|
|               Egypt|      United States|   15|
|       United States|              India|   62|
|       United States|          Singapore|    1|
|       United States|            Grenada|   62|
|          Costa Rica|      United States|  588|
|             Senegal|      United States|   40|
|             Moldova|      United States|    1|
|       United States|       Sint Maarten|  325|
|       United States|   Marshall Islands|   39|
|              Guyana|      United States|   64|
|               Malta|      United States|    1|
|            Anguilla|      United States|   41|
|             Bolivia|      United States|   30|
|       United States|           Paraguay|    6|
|             Algeri

In [19]:
import findspark
findspark.init()
import pyspark
import random


In [20]:
# Build a one-column DataFrame from spark.range(1000) and name that column "number".
myrange = spark.range(1000).toDF("number")

In [22]:
# Print the physical plan Spark builds for sorting the flight data by "count".
(
    flightData2015
    .sort("count")
    .explain()
)

== Physical Plan ==
*(1) Sort [count#84 ASC NULLS FIRST], true, 0
+- *(1) Project [DEST_COUNTRY_NAME#82, ORIGIN_COUNTRY_NAME#83, count#84]
   +- BatchScan[DEST_COUNTRY_NAME#82, ORIGIN_COUNTRY_NAME#83, count#84] CSVScan Location: InMemoryFileIndex[file:/C:/Users/pilla/Documents/Spark-The-Definitive-Guide-master/Spark-The-Defi..., ReadSchema: struct<DEST_COUNTRY_NAME:string,ORIGIN_COUNTRY_NAME:string,count:int>




#### Set the number of shuffle output partitions to 5

#### The default value of `spark.sql.shuffle.partitions` is 200

In [24]:
# Set the session's spark.sql.shuffle.partitions setting to 5.
spark.conf.set("spark.sql.shuffle.partitions", "5")

In [25]:
# Sort by "count" (ascending) and bring the first two rows back to the driver.
(
    flightData2015
    .sort("count")
    .take(2)
)

[Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Singapore', count=1),
 Row(DEST_COUNTRY_NAME='Moldova', ORIGIN_COUNTRY_NAME='United States', count=1)]

#### Register the DataFrame as a temporary view so it can be queried with SQL

In [26]:
# Register the DataFrame as the temporary view "flights_data_2015" so that the
# spark.sql(...) queries in the following cells can refer to it by name.
flightData2015.createOrReplaceTempView("flights_data_2015")

In [29]:
# SQL way: group the view by destination country and count the rows in each group.
destination_counts_query = """SELECT DEST_COUNTRY_NAME, count(1) FROM flights_data_2015 
                      group by DEST_COUNTRY_NAME"""
sqlway = spark.sql(destination_counts_query)

In [30]:
# Print the physical plan Spark generated for the SQL query.
sqlway.explain()

== Physical Plan ==
*(1) HashAggregate(keys=[DEST_COUNTRY_NAME#82], functions=[count(1)])
+- *(1) HashAggregate(keys=[DEST_COUNTRY_NAME#82], functions=[partial_count(1)])
   +- *(1) Project [DEST_COUNTRY_NAME#82]
      +- BatchScan[DEST_COUNTRY_NAME#82] CSVScan Location: InMemoryFileIndex[file:/C:/Users/pilla/Documents/Spark-The-Definitive-Guide-master/Spark-The-Defi..., ReadSchema: struct<DEST_COUNTRY_NAME:string>




#### The same aggregation, the DataFrame way

In [31]:
# DataFrame way: group by destination country and count the rows in each group.
dataFrameway = (
    flightData2015
    .groupBy("DEST_COUNTRY_NAME")
    .count()
)

#### Max function: SQL way and DataFrame way

In [34]:
# SQL way: largest value of the "count" column, fetched as a one-element list of rows.
(
    spark
    .sql("SELECT MAX(count) from flights_data_2015")
    .take(1)
)

[Row(max(count)=370002)]

In [36]:
# Import the functions module under an alias instead of `from ... import max`:
# the latter rebinds the name `max` and hides Python's built-in max() for every
# cell that runs afterwards.
from pyspark.sql import functions as F

# DataFrame way: largest value of the "count" column, fetched as a one-element list of rows.
flightData2015.select(F.max("count")).take(1)

[Row(max(count)=370002)]

#### Top 5 destinations: SQL way and DataFrame way

In [38]:
# SQL way: total the "count" column per destination country, order the totals
# from largest to smallest and keep the first five.
top_destinations_query = """SELECT DEST_COUNTRY_NAME, sum(count) as destination_total
                     FROM flights_data_2015
                     group by DEST_COUNTRY_NAME
                     ORDER BY destination_total desc
                     LIMIT 5"""
maxsql = spark.sql(top_destinations_query)
maxsql.show()

+-----------------+-----------------+
|DEST_COUNTRY_NAME|destination_total|
+-----------------+-----------------+
|    United States|           411352|
|           Canada|             8399|
|           Mexico|             7140|
|   United Kingdom|             2025|
|            Japan|             1548|
+-----------------+-----------------+



In [40]:
from pyspark.sql.functions import desc

# DataFrame way: sum "count" per destination country, rename the aggregate column
# to "destination_total", sort it in descending order and print the first five rows.
(
    flightData2015
    .groupBy("DEST_COUNTRY_NAME")
    .sum("count")
    .withColumnRenamed("sum(count)", "destination_total")
    .sort(desc("destination_total"))
    .limit(5)
    .show()
)

+-----------------+-----------------+
|DEST_COUNTRY_NAME|destination_total|
+-----------------+-----------------+
|    United States|           411352|
|           Canada|             8399|
|           Mexico|             7140|
|   United Kingdom|             2025|
|            Japan|             1548|
+-----------------+-----------------+



In [41]:
# Same top-5 pipeline as the previous cell, but print its physical plan instead of
# the rows. Uses `desc`, which the previous cell imports.
(
    flightData2015
    .groupBy("DEST_COUNTRY_NAME")
    .sum("count")
    .withColumnRenamed("sum(count)", "destination_total")
    .sort(desc("destination_total"))
    .limit(5)
    .explain()
)

== Physical Plan ==
TakeOrderedAndProject(limit=5, orderBy=[destination_total#185L DESC NULLS LAST], output=[DEST_COUNTRY_NAME#82,destination_total#185L])
+- *(1) HashAggregate(keys=[DEST_COUNTRY_NAME#82], functions=[sum(cast(count#84 as bigint))])
   +- *(1) HashAggregate(keys=[DEST_COUNTRY_NAME#82], functions=[partial_sum(cast(count#84 as bigint))])
      +- *(1) Project [DEST_COUNTRY_NAME#82, count#84]
         +- BatchScan[DEST_COUNTRY_NAME#82, count#84] CSVScan Location: InMemoryFileIndex[file:/C:/Users/pilla/Documents/Spark-The-Definitive-Guide-master/Spark-The-Defi..., ReadSchema: struct<DEST_COUNTRY_NAME:string,count:int>




### End of Chapters 1 and 2 of *Spark: The Definitive Guide*