In [1]:
import pandas as pd
import numpy as np
import pyspark

In [2]:
# Obtain the SparkSession used by every cell below (getOrCreate reuses an
# existing session when one is already active).
spark = pyspark.sql.SparkSession.builder.getOrCreate()

# Chapters 1 and 2

In [3]:
# Single-column DataFrame holding the integers 0..999; the column is named "number".
myRange = spark.range(1000).toDF("number")

In [4]:
# Preview the first five rows.
myRange.show(5)

+------+
|number|
+------+
|     0|
|     1|
|     2|
|     3|
|     4|
+------+
only showing top 5 rows



In [5]:
# Keep only the even values, using a SQL expression string as the filter.
divisBy2 = myRange.where("number % 2 = 0")

In [6]:
# Preview the first five even values.
divisBy2.show(5)

+------+
|number|
+------+
|     0|
|     2|
|     4|
|     6|
|     8|
+------+
only showing top 5 rows



In [7]:
# Count the rows that passed the even-number filter.
divisBy2.count()

500

### Spark UI
While this session is running, the Spark UI is available at http://localhost:4040 (the default port).

In [8]:
# Load the 2015 flight summary CSV: the first row is treated as the header
# and column types are inferred from the data.
flightData2015 = (
    spark.read
    .option("inferSchema", "true")
    .option("header", "true")
    .csv("2015-summary.csv")
)

In [9]:
# Pull the first three rows back to the driver as Row objects.
flightData2015.take(3)

[Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Romania', count=15),
 Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Croatia', count=1),
 Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Ireland', count=344)]

In [10]:
# Print the physical plan Spark would use to sort the flights by "count".
flightData2015.sort("count").explain()

== Physical Plan ==
*(1) Sort [count#38 ASC NULLS FIRST], true, 0
+- Exchange rangepartitioning(count#38 ASC NULLS FIRST, 200), ENSURE_REQUIREMENTS, [id=#85]
   +- FileScan csv [DEST_COUNTRY_NAME#36,ORIGIN_COUNTRY_NAME#37,count#38] Batched: false, DataFilters: [], Format: CSV, Location: InMemoryFileIndex[file:/C:/Users/brian.clements/Documents/Certifications_study_work/databricks/Da..., PartitionFilters: [], PushedFilters: [], ReadSchema: struct<DEST_COUNTRY_NAME:string,ORIGIN_COUNTRY_NAME:string,count:int>




In [11]:
# Use 5 shuffle partitions for subsequent shuffles, then fetch the first two
# rows of the flights sorted by "count" (ascending).
spark.conf.set("spark.sql.shuffle.partitions","5")
flightData2015.sort("count").take(2)

[Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Singapore', count=1),
 Row(DEST_COUNTRY_NAME='Moldova', ORIGIN_COUNTRY_NAME='United States', count=1)]

In [12]:
# Repeat the same sort with 4 shuffle partitions to compare against the
# other partition settings tried in this notebook.
spark.conf.set("spark.sql.shuffle.partitions","4")
flightData2015.sort("count").take(2)

[Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Singapore', count=1),
 Row(DEST_COUNTRY_NAME='Moldova', ORIGIN_COUNTRY_NAME='United States', count=1)]

In [13]:
# Repeat the same sort with 16 shuffle partitions. This setting stays in
# effect for the cells that follow (the later plans show 16 partitions).
spark.conf.set("spark.sql.shuffle.partitions","16")
flightData2015.sort("count").take(2)

[Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Singapore', count=1),
 Row(DEST_COUNTRY_NAME='Moldova', ORIGIN_COUNTRY_NAME='United States', count=1)]

## Create a SQL temp view

In [14]:
# Register the DataFrame as the temp view "flight_data_2015" so it can be
# queried through spark.sql(); re-running the cell replaces the view.
flightData2015.createOrReplaceTempView("flight_data_2015")

In [15]:
# SQL formulation: number of rows per destination country, read from the
# flight_data_2015 temp view.
sqlWay = spark.sql("""
SELECT DEST_COUNTRY_NAME, count(1)
FROM flight_data_2015
GROUP BY DEST_COUNTRY_NAME
""")

In [16]:
# DataFrame-API formulation of the same per-destination row count.
dataFrameWay = flightData2015.groupBy("DEST_COUNTRY_NAME").count()

In [17]:
# Print the physical plan of the SQL version followed by the DataFrame
# version so the two can be compared.
for plan_source in (sqlWay, dataFrameWay):
    plan_source.explain()

== Physical Plan ==
*(2) HashAggregate(keys=[DEST_COUNTRY_NAME#36], functions=[count(1)])
+- Exchange hashpartitioning(DEST_COUNTRY_NAME#36, 16), ENSURE_REQUIREMENTS, [id=#126]
   +- *(1) HashAggregate(keys=[DEST_COUNTRY_NAME#36], functions=[partial_count(1)])
      +- FileScan csv [DEST_COUNTRY_NAME#36] Batched: false, DataFilters: [], Format: CSV, Location: InMemoryFileIndex[file:/C:/Users/brian.clements/Documents/Certifications_study_work/databricks/Da..., PartitionFilters: [], PushedFilters: [], ReadSchema: struct<DEST_COUNTRY_NAME:string>


== Physical Plan ==
*(2) HashAggregate(keys=[DEST_COUNTRY_NAME#36], functions=[count(1)])
+- Exchange hashpartitioning(DEST_COUNTRY_NAME#36, 16), ENSURE_REQUIREMENTS, [id=#145]
   +- *(1) HashAggregate(keys=[DEST_COUNTRY_NAME#36], functions=[partial_count(1)])
      +- FileScan csv [DEST_COUNTRY_NAME#36] Batched: false, DataFilters: [], Format: CSV, Location: InMemoryFileIndex[file:/C:/Users/brian.clements/Documents/Certifications_study_work/da

In [18]:
# sqlWay.show(100)

In [19]:
# Largest "count" value in the dataset, computed through SQL.
spark.sql("SELECT max(count) from flight_data_2015").take(1)

[Row(max(count)=370002)]

In [20]:
# Largest "count" value in the dataset, computed through the DataFrame API.
# The Spark aggregate is imported under an alias: a bare
# `from pyspark.sql.functions import max` would replace Python's built-in
# max() for every later cell in the notebook.
from pyspark.sql.functions import max as spark_max

flightData2015.select(spark_max("count")).take(1)

[Row(max(count)=370002)]

In [21]:
# SQL formulation: total "count" per destination country, keeping the five
# destinations with the highest totals.
maxSql = spark.sql("""
SELECT DEST_COUNTRY_NAME, sum(count) as destination_total
FROM flight_data_2015
GROUP BY DEST_COUNTRY_NAME
ORDER BY sum(count) DESC
LIMIT 5
""")

In [22]:
# Display the top-five destinations returned by the SQL query.
maxSql.show()

+-----------------+-----------------+
|DEST_COUNTRY_NAME|destination_total|
+-----------------+-----------------+
|    United States|           411352|
|           Canada|             8399|
|           Mexico|             7140|
|   United Kingdom|             2025|
|            Japan|             1548|
+-----------------+-----------------+



In [23]:
from pyspark.sql.functions import desc

# DataFrame-API formulation of the top-destinations query: sum "count" per
# destination, rename the aggregate column, sort descending, show five rows.
(
    flightData2015
    .groupBy("DEST_COUNTRY_NAME")
    .sum("count")
    .withColumnRenamed("sum(count)", "destination_total")
    .sort(desc("destination_total"))
    .show(5)
)

+-----------------+-----------------+
|DEST_COUNTRY_NAME|destination_total|
+-----------------+-----------------+
|    United States|           411352|
|           Canada|             8399|
|           Mexico|             7140|
|   United Kingdom|             2025|
|            Japan|             1548|
+-----------------+-----------------+
only showing top 5 rows



In [24]:
# Print the physical plan for the top-five-destinations query. This relies
# on `desc` having been imported by an earlier cell.
(
    flightData2015
    .groupBy("DEST_COUNTRY_NAME")
    .sum("count")
    .withColumnRenamed("sum(count)", "destination_total")
    .sort(desc("destination_total"))
    .limit(5)
    .explain()
)

== Physical Plan ==
TakeOrderedAndProject(limit=5, orderBy=[destination_total#130L DESC NULLS LAST], output=[DEST_COUNTRY_NAME#36,destination_total#130L])
+- *(2) HashAggregate(keys=[DEST_COUNTRY_NAME#36], functions=[sum(cast(count#38 as bigint))])
   +- Exchange hashpartitioning(DEST_COUNTRY_NAME#36, 16), ENSURE_REQUIREMENTS, [id=#263]
      +- *(1) HashAggregate(keys=[DEST_COUNTRY_NAME#36], functions=[partial_sum(cast(count#38 as bigint))])
         +- FileScan csv [DEST_COUNTRY_NAME#36,count#38] Batched: false, DataFilters: [], Format: CSV, Location: InMemoryFileIndex[file:/C:/Users/brian.clements/Documents/Certifications_study_work/databricks/Da..., PartitionFilters: [], PushedFilters: [], ReadSchema: struct<DEST_COUNTRY_NAME:string,count:int>




# Chapter 3

In [25]:
import os

In [None]:
# Load every CSV matched by the glob into one DataFrame, with header parsing
# and schema inference enabled.
# NOTE(review): "/*.csv" is a glob rooted at "/", so it only matches CSV
# files sitting directly in the filesystem root -- confirm the intended
# data directory prefix before running this cell.
staticDataFrame = (
    spark.read.format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load("/*.csv")
)

In [None]:
# staticDataFrame.createOrReplaceTempView("retail_data")
# staticSchema = staticDataFrame.schema


# C:Users/brian.clements/Documents/Certifications_study_work/databricks/Spark-the-Definitive-Guide
# C:/Users/brian.clements/Documents/Certifications_study_work/databricks/Spark-the-Definitive-Guide/data/retail-data/by-day/*.csv"

# FIRST THING: ADDRESS THIS BLOCKER -- p. 43 in the book
### Can't load the CSVs from their absolute path, as attempted in the cell above.
### General environment problem: not sure where Spark is installed, and I can only access it through the Anaconda CLI anyway : /

# MOVING TO DATABRICKS
## At least until I can dedicate a little time to sorting out the issue above.