In [33]:
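# sc.stop()  # uncomment and run to shut down the current SparkContext (e.g. before starting one with a different configuration)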

In [34]:
from pyspark import SparkContext, SparkConf

# Build the configuration first: 'local[*]' runs Spark locally on all available cores.
conf = SparkConf().setMaster('local[*]')

# getOrCreate() returns the SparkContext that is already running, and only starts a
# new one (using `conf`) when none is active. Unlike testing whether the name `sc`
# exists in globals(), this also behaves correctly after sc.stop() (a fresh context is
# created instead of keeping the stopped one) and when `sc` was deleted while a
# context is still alive (the live context is reused instead of raising an error).
sc = SparkContext.getOrCreate(conf=conf)

### Create a Spark DataFrame from a schema and data, then print the DataFrame and its schema

In [35]:

from pyspark.sql import SparkSession
# Reuse the running SparkSession if there is one; otherwise getOrCreate() starts a new one.
spark = SparkSession.builder.getOrCreate()

# Column names and types, written as a DDL string.
schema = "id INT, name STRING, age INT"

# Ten (id, name, age) records, one tuple per row.
data = [
    (1, "John", 19),
    (2, "Smith", 29),
    (3, "Adam", 35),
    (4, "Henry", 50),
    (5, "Mike", 78),
    (6, "Mary", 19),
    (7, "Peter", 29),
    (8, "Cora", 35),
    (9, "Nancy", 50),
    (10, "Daniel", 78),
]

# Combine rows and schema into a DataFrame.
# Signature: createDataFrame(data, schema=None, samplingRatio=None, verifySchema=True)
df = spark.createDataFrame(data, schema=schema)

# Print the rows as a table, then the column names/types as a tree.
df.show()
df.printSchema()

+---+------+---+
| id|  name|age|
+---+------+---+
|  1|  John| 19|
|  2| Smith| 29|
|  3|  Adam| 35|
|  4| Henry| 50|
|  5|  Mike| 78|
|  6|  Mary| 19|
|  7| Peter| 29|
|  8|  Cora| 35|
|  9| Nancy| 50|
| 10|Daniel| 78|
+---+------+---+

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)



In [36]:
df.schema # the DataFrame's schema as a StructType object (one StructField per column); it can be reused elsewhere

StructType([StructField('id', IntegerType(), True), StructField('name', StringType(), True), StructField('age', IntegerType(), True)])

In [37]:
# access only a specific column: select("name") keeps just the name column
df.select("name").show()

+------+
|  name|
+------+
|  John|
| Smith|
|  Adam|
| Henry|
|  Mike|
|  Mary|
| Peter|
|  Cora|
| Nancy|
|Daniel|
+------+



In [38]:
# filter rows by a condition: keep only the rows whose age is greater than 30
df.filter(df.age > 30).show()

+---+------+---+
| id|  name|age|
+---+------+---+
|  3|  Adam| 35|
|  4| Henry| 50|
|  5|  Mike| 78|
|  8|  Cora| 35|
|  9| Nancy| 50|
| 10|Daniel| 78|
+---+------+---+



## End-to-end example

In [39]:


# Load the 2015 flight summary CSV. The first line supplies the column names, and
# Spark infers each column's type from the data instead of using a declared schema.
flightData2015 = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("Data/2015-summary.csv")
)

# Print a preview of the rows.
flightData2015.show()
# Last expression of the cell: the inferred schema is displayed as the cell output.
flightData2015.schema

+--------------------+-------------------+-----+
|   DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+--------------------+-------------------+-----+
|       United States|            Romania|   15|
|       United States|            Croatia|    1|
|       United States|            Ireland|  344|
|               Egypt|      United States|   15|
|       United States|              India|   62|
|       United States|          Singapore|    1|
|       United States|            Grenada|   62|
|          Costa Rica|      United States|  588|
|             Senegal|      United States|   40|
|             Moldova|      United States|    1|
|       United States|       Sint Maarten|  325|
|       United States|   Marshall Islands|   39|
|              Guyana|      United States|   64|
|               Malta|      United States|    1|
|            Anguilla|      United States|   41|
|             Bolivia|      United States|   30|
|       United States|           Paraguay|    6|
|             Algeri

StructType([StructField('DEST_COUNTRY_NAME', StringType(), True), StructField('ORIGIN_COUNTRY_NAME', StringType(), True), StructField('count', IntegerType(), True)])

In [40]:
# take(3) returns the first three rows as a Python list of Row objects
flightData2015.take(3) 

[Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Romania', count=15),
 Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Croatia', count=1),
 Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Ireland', count=344)]

In [41]:
flightData2015.sort("count").explain() # explain() prints the physical plan Spark builds for this sort (scan -> exchange -> sort)

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- Sort [count#377 ASC NULLS FIRST], true, 0
   +- Exchange rangepartitioning(count#377 ASC NULLS FIRST, 200), ENSURE_REQUIREMENTS, [id=#434]
      +- FileScan csv [DEST_COUNTRY_NAME#375,ORIGIN_COUNTRY_NAME#376,count#377] Batched: false, DataFilters: [], Format: CSV, Location: InMemoryFileIndex(1 paths)[file:/home/amrit/Apache_Spark_Fuse/Data/2015-summary.csv], PartitionFilters: [], PushedFilters: [], ReadSchema: struct<DEST_COUNTRY_NAME:string,ORIGIN_COUNTRY_NAME:string,count:int>




In [42]:
flightData2015.sort("count").count() # count() returns the number of rows in the DataFrame; NOTE(review): the sort does not change that number

256

In [55]:
# Set the number of partitions used for shuffles to 12 (the plan printed earlier used 200).
spark.conf.set("spark.sql.shuffle.partitions", "12")
# Sort ascending by count and fetch the first two rows, i.e. two rows with the smallest count.
flightData2015.sort("count").take(2)

[Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Singapore', count=1),
 Row(DEST_COUNTRY_NAME='Moldova', ORIGIN_COUNTRY_NAME='United States', count=1)]

In [56]:
# Register the DataFrame as a temporary view named "flight_data_2015" so it can be queried with spark.sql(...)
flightData2015.createOrReplaceTempView("flight_data_2015")

In [57]:
# Number of rows per destination country, written as SQL against the temp view.
sqlWay = spark.sql("""
SELECT DEST_COUNTRY_NAME, count(1)
FROM flight_data_2015
GROUP BY DEST_COUNTRY_NAME
""")

# The same aggregation written with the DataFrame API.
dataFrameWay = (
    flightData2015
    .groupBy("DEST_COUNTRY_NAME")
    .count()
)

# Print both physical plans so they can be compared.
sqlWay.explain()
dataFrameWay.explain()

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- HashAggregate(keys=[DEST_COUNTRY_NAME#375], functions=[count(1)])
   +- Exchange hashpartitioning(DEST_COUNTRY_NAME#375, 12), ENSURE_REQUIREMENTS, [id=#552]
      +- HashAggregate(keys=[DEST_COUNTRY_NAME#375], functions=[partial_count(1)])
         +- FileScan csv [DEST_COUNTRY_NAME#375] Batched: false, DataFilters: [], Format: CSV, Location: InMemoryFileIndex(1 paths)[file:/home/amrit/Apache_Spark_Fuse/Data/2015-summary.csv], PartitionFilters: [], PushedFilters: [], ReadSchema: struct<DEST_COUNTRY_NAME:string>


== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- HashAggregate(keys=[DEST_COUNTRY_NAME#375], functions=[count(1)])
   +- Exchange hashpartitioning(DEST_COUNTRY_NAME#375, 12), ENSURE_REQUIREMENTS, [id=#565]
      +- HashAggregate(keys=[DEST_COUNTRY_NAME#375], functions=[partial_count(1)])
         +- FileScan csv [DEST_COUNTRY_NAME#375] Batched: false, DataFilters: [], Format: CSV, Location: InMemoryFileIndex(

In [58]:
# Import the functions module under an alias rather than `from ... import max`,
# which would replace Python's built-in max() for every later cell in the notebook.
from pyspark.sql import functions as F

# Largest value in the `count` column, returned as a one-element list of Row.
flightData2015.select(F.max("count")).take(1)

[Row(max(count)=370002)]

In [59]:
# Sum the count column per destination country and keep the five destinations
# with the largest totals, largest first.
maxSql = spark.sql("""
SELECT DEST_COUNTRY_NAME, sum(count) as destination_total
FROM flight_data_2015
GROUP BY DEST_COUNTRY_NAME
ORDER BY sum(count) DESC
LIMIT 5
""")
maxSql.show()

+-----------------+-----------------+
|DEST_COUNTRY_NAME|destination_total|
+-----------------+-----------------+
|    United States|           411352|
|           Canada|             8399|
|           Mexico|             7140|
|   United Kingdom|             2025|
|            Japan|             1548|
+-----------------+-----------------+



In [60]:
# The same top-5 query expressed with the DataFrame API instead of SQL:
# group by destination, sum the counts, rename the aggregate column,
# sort by it in descending order and keep the first five rows.
from pyspark.sql.functions import desc

(
    flightData2015
    .groupBy("DEST_COUNTRY_NAME")
    .sum("count")
    .withColumnRenamed("sum(count)", "destination_total")
    .sort(desc("destination_total"))
    .limit(5)
    .show()
)

+-----------------+-----------------+
|DEST_COUNTRY_NAME|destination_total|
+-----------------+-----------------+
|    United States|           411352|
|           Canada|             8399|
|           Mexico|             7140|
|   United Kingdom|             2025|
|            Japan|             1548|
+-----------------+-----------------+



In [61]:
# Print the physical plan for the top-5 destinations query (group, sum, rename, sort, limit).
(
    flightData2015
    .groupBy("DEST_COUNTRY_NAME")
    .sum("count")
    .withColumnRenamed("sum(count)", "destination_total")
    .sort(desc("destination_total"))
    .limit(5)
    .explain()
)

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- TakeOrderedAndProject(limit=5, orderBy=[destination_total#520L DESC NULLS LAST], output=[DEST_COUNTRY_NAME#375,destination_total#520L])
   +- HashAggregate(keys=[DEST_COUNTRY_NAME#375], functions=[sum(count#377)])
      +- Exchange hashpartitioning(DEST_COUNTRY_NAME#375, 12), ENSURE_REQUIREMENTS, [id=#733]
         +- HashAggregate(keys=[DEST_COUNTRY_NAME#375], functions=[partial_sum(count#377)])
            +- FileScan csv [DEST_COUNTRY_NAME#375,count#377] Batched: false, DataFilters: [], Format: CSV, Location: InMemoryFileIndex(1 paths)[file:/home/amrit/Apache_Spark_Fuse/Data/2015-summary.csv], PartitionFilters: [], PushedFilters: [], ReadSchema: struct<DEST_COUNTRY_NAME:string,count:int>


