In [1]:
from spark_init import start_spark

spark = start_spark()
spark


In [2]:
flightData2015 = spark\
    .read\
    .option("inferSchema", "true")\
    .option("header", "true")\
    .csv("../data/flight-data/csv/2015-summary.csv")


In [25]:
flightData2015.explain(True)

== Parsed Logical Plan ==
UnresolvedDataSource format: csv, isStreaming: false, paths: 1 provided

== Analyzed Logical Plan ==
DEST_COUNTRY_NAME: string, ORIGIN_COUNTRY_NAME: string, count: int
Relation [DEST_COUNTRY_NAME#17,ORIGIN_COUNTRY_NAME#18,count#19] csv

== Optimized Logical Plan ==
Relation [DEST_COUNTRY_NAME#17,ORIGIN_COUNTRY_NAME#18,count#19] csv

== Physical Plan ==
FileScan csv [DEST_COUNTRY_NAME#17,ORIGIN_COUNTRY_NAME#18,count#19] Batched: false, DataFilters: [], Format: CSV, Location: InMemoryFileIndex(1 paths)[file:/c:/Users/ryanh/dv/spark-definitive-guide-ryan/data/flight-data/c..., PartitionFilters: [], PushedFilters: [], ReadSchema: struct<DEST_COUNTRY_NAME:string,ORIGIN_COUNTRY_NAME:string,count:int>



In [3]:
spark.conf.set("spark.sql.shuffle.partitions", "5")
flightData2015.sort("count").take(5)

[Row(DEST_COUNTRY_NAME='Malta', ORIGIN_COUNTRY_NAME='United States', count=1),
 Row(DEST_COUNTRY_NAME='Saint Vincent and the Grenadines', ORIGIN_COUNTRY_NAME='United States', count=1),
 Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Croatia', count=1),
 Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Gibraltar', count=1),
 Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Singapore', count=1)]

In [4]:
flightData2015.createOrReplaceGlobalTempView("flight_data_2015")

In [6]:
spark.sql("SELECT max(count) FROM global_temp.flight_data_2015").take(1)

[Row(max(count)=370002)]

In [7]:
flightData2015.agg({"count": "max"}).take(1)

[Row(max(count)=370002)]

In [8]:
from pyspark.sql.functions import max

flightData2015.select(max("count")).take(1)


[Row(max(count)=370002)]

In [12]:
maxSql = spark.sql("""
    SELECT DEST_COUNTRY_NAME, sum(count) as destination_total
    FROM global_temp.flight_data_2015
    GROUP BY DEST_COUNTRY_NAME
    ORDER BY destination_total DESC
    LIMIT 5
""")

maxSql.show()



+-----------------+-----------------+
|DEST_COUNTRY_NAME|destination_total|
+-----------------+-----------------+
|    United States|           411352|
|           Canada|             8399|
|           Mexico|             7140|
|   United Kingdom|             2025|
|            Japan|             1548|
+-----------------+-----------------+



In [18]:
nextSql = spark.sql("""
    SELECT ORIGIN_COUNTRY_NAME FROM global_temp.flight_data_2015
    WHERE ORIGIN_COUNTRY_NAME
        LIKE '%Federated States%'
""")

nextSql.show()

+--------------------+
| ORIGIN_COUNTRY_NAME|
+--------------------+
|Federated States ...|
+--------------------+



In [None]:
from pyspark.sql.functions import desc

flightData2015\
    .groupBy("DEST_COUNTRY_NAME")\
    .sum("count")\
    .withColumnRenamed("sum(count)", "destination_total")\
    .sort("destination_total")\
    .limit(5)\
    .show()


+-----------------+-----------------+
|DEST_COUNTRY_NAME|destination_total|
+-----------------+-----------------+
|    United States|           411352|
|           Canada|             8399|
|           Mexico|             7140|
|   United Kingdom|             2025|
|            Japan|             1548|
+-----------------+-----------------+



In [24]:
from pyspark.sql.functions import desc

flightData2015.groupBy("DEST_COUNTRY_NAME").sum("count").withColumnRenamed("sum(count)", "destination_total").sort(desc("destination_total")).limit(5).show()

+-----------------+-----------------+
|DEST_COUNTRY_NAME|destination_total|
+-----------------+-----------------+
|    United States|           411352|
|           Canada|             8399|
|           Mexico|             7140|
|   United Kingdom|             2025|
|            Japan|             1548|
+-----------------+-----------------+

