In [1]:
from pyspark.sql import SparkSession

In [2]:
spark=SparkSession.builder.appName("My app").getOrCreate()

In [3]:
myRange=spark.range(1000).toDF("number")

In [4]:
divisBy2 = myRange.where("number%2=0") #transformation

In [5]:
divisBy2.count() #action

500

In [6]:
flightData2015= spark.read.json('book-data/json/2015-summary.json')

In [7]:
#action
flightData2015.take(2)

[Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Romania', count=15),
 Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Croatia', count=1)]

In [8]:
flightData2015.explain() #plan

== Physical Plan ==
FileScan json [DEST_COUNTRY_NAME#18,ORIGIN_COUNTRY_NAME#19,count#20L] Batched: false, DataFilters: [], Format: JSON, Location: InMemoryFileIndex(1 paths)[file:/C:/Users/lenovo/book-data/json/2015-summary.json], PartitionFilters: [], PushedFilters: [], ReadSchema: struct<DEST_COUNTRY_NAME:string,ORIGIN_COUNTRY_NAME:string,count:bigint>




In [9]:
sortedFlightData2015 = flightData2015.sort("count")

In [10]:
sortedFlightData2015.explain()

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- Sort [count#20L ASC NULLS FIRST], true, 0
   +- Exchange rangepartitioning(count#20L ASC NULLS FIRST, 200), ENSURE_REQUIREMENTS, [plan_id=58]
      +- FileScan json [DEST_COUNTRY_NAME#18,ORIGIN_COUNTRY_NAME#19,count#20L] Batched: false, DataFilters: [], Format: JSON, Location: InMemoryFileIndex(1 paths)[file:/C:/Users/lenovo/book-data/json/2015-summary.json], PartitionFilters: [], PushedFilters: [], ReadSchema: struct<DEST_COUNTRY_NAME:string,ORIGIN_COUNTRY_NAME:string,count:bigint>




In [11]:
sortedFlightData2015.take(2)

[Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Singapore', count=1),
 Row(DEST_COUNTRY_NAME='Moldova', ORIGIN_COUNTRY_NAME='United States', count=1)]

In [12]:
flightData2015 = spark.read.option("inferSchema","true").option("header","true").csv("book-data/csv/2015-summary.csv")

In [13]:
flightData2015

DataFrame[DEST_COUNTRY_NAME: string, ORIGIN_COUNTRY_NAME: string, count: int]

In [14]:
csvSchema = spark.read.format("csv").option("inferSchema","true").option("header","true").load("book-data/csv/2015-summary.csv").schema

In [15]:
jsonSchema=spark.read.format("json").load("book-data/json/2015-summary.json").schema

In [16]:
print(csvSchema)
print(jsonSchema)

StructType(List(StructField(DEST_COUNTRY_NAME,StringType,true),StructField(ORIGIN_COUNTRY_NAME,StringType,true),StructField(count,IntegerType,true)))
StructType(List(StructField(DEST_COUNTRY_NAME,StringType,true),StructField(ORIGIN_COUNTRY_NAME,StringType,true),StructField(count,LongType,true)))


In [17]:
flightData2015 = spark.read.schema(jsonSchema).option("header","true").csv("book-data/csv/2015-summary.csv")

In [18]:
print(flightData2015.schema)

StructType(List(StructField(DEST_COUNTRY_NAME,StringType,true),StructField(ORIGIN_COUNTRY_NAME,StringType,true),StructField(count,LongType,true)))


In [19]:
flightData2015.createOrReplaceTempView("flight_data_2015")

In [20]:
sqlWay = spark.sql(""" SELECT 
                            DEST_COUNTRY_NAME, 
                            COUNT(1) 
                        FROM
                            flight_data_2015
                        GROUP BY
                            DEST_COUNTRY_NAME
                    """)

dataFrameWay = flightData2015.groupBy("DEST_COUNTRY_NAME").count()

In [21]:
sqlWay.explain()

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- HashAggregate(keys=[DEST_COUNTRY_NAME#91], functions=[count(1)])
   +- Exchange hashpartitioning(DEST_COUNTRY_NAME#91, 200), ENSURE_REQUIREMENTS, [plan_id=121]
      +- HashAggregate(keys=[DEST_COUNTRY_NAME#91], functions=[partial_count(1)])
         +- FileScan csv [DEST_COUNTRY_NAME#91] Batched: false, DataFilters: [], Format: CSV, Location: InMemoryFileIndex(1 paths)[file:/C:/Users/lenovo/book-data/csv/2015-summary.csv], PartitionFilters: [], PushedFilters: [], ReadSchema: struct<DEST_COUNTRY_NAME:string>




In [22]:
dataFrameWay.explain()

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- HashAggregate(keys=[DEST_COUNTRY_NAME#91], functions=[count(1)])
   +- Exchange hashpartitioning(DEST_COUNTRY_NAME#91, 200), ENSURE_REQUIREMENTS, [plan_id=134]
      +- HashAggregate(keys=[DEST_COUNTRY_NAME#91], functions=[partial_count(1)])
         +- FileScan csv [DEST_COUNTRY_NAME#91] Batched: false, DataFilters: [], Format: CSV, Location: InMemoryFileIndex(1 paths)[file:/C:/Users/lenovo/book-data/csv/2015-summary.csv], PartitionFilters: [], PushedFilters: [], ReadSchema: struct<DEST_COUNTRY_NAME:string>




In [23]:
from pyspark.sql.functions import max

flightData2015.select(max("count")).take(1)

[Row(max(count)=370002)]

In [24]:
spark.sql("SELECT max(count) from flight_Data_2015").take(1)

[Row(max(count)=370002)]

### What are the top five destination countries in the data set? 

In [25]:
maxSql = spark.sql(""" 
                        SELECT 
                            DEST_COUNTRY_NAME,
                            SUM(count) as destination_total
                        FROM
                            flight_data_2015
                        GROUP BY
                            DEST_COUNTRY_NAME
                        ORDER BY
                            sum(count) DESC
                        LIMIT 
                            5
                    """)
maxSql.collect()          

[Row(DEST_COUNTRY_NAME='United States', destination_total=411352),
 Row(DEST_COUNTRY_NAME='Canada', destination_total=8399),
 Row(DEST_COUNTRY_NAME='Mexico', destination_total=7140),
 Row(DEST_COUNTRY_NAME='United Kingdom', destination_total=2025),
 Row(DEST_COUNTRY_NAME='Japan', destination_total=1548)]

In [26]:
from pyspark.sql.functions import desc
flightData2015\
    .groupBy("DEST_COUNTRY_NAME")\
    .sum("count")\
    .withColumnRenamed("sum(count)","destination_total")\
    .sort(desc("destination_total"))\
    .limit(5)\
    .collect()

[Row(DEST_COUNTRY_NAME='United States', destination_total=411352),
 Row(DEST_COUNTRY_NAME='Canada', destination_total=8399),
 Row(DEST_COUNTRY_NAME='Mexico', destination_total=7140),
 Row(DEST_COUNTRY_NAME='United Kingdom', destination_total=2025),
 Row(DEST_COUNTRY_NAME='Japan', destination_total=1548)]

In [27]:
flightData2015\
    .groupBy("DEST_COUNTRY_NAME")\
    .sum("count")\
    .withColumnRenamed("sum(count)","destination_total")\
    .sort(desc("destination_total"))\
    .limit(5)\
    .explain()

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- TakeOrderedAndProject(limit=5, orderBy=[destination_total#163L DESC NULLS LAST], output=[DEST_COUNTRY_NAME#91,destination_total#163L])
   +- HashAggregate(keys=[DEST_COUNTRY_NAME#91], functions=[sum(count#93L)])
      +- Exchange hashpartitioning(DEST_COUNTRY_NAME#91, 200), ENSURE_REQUIREMENTS, [plan_id=304]
         +- HashAggregate(keys=[DEST_COUNTRY_NAME#91], functions=[partial_sum(count#93L)])
            +- FileScan csv [DEST_COUNTRY_NAME#91,count#93L] Batched: false, DataFilters: [], Format: CSV, Location: InMemoryFileIndex(1 paths)[file:/C:/Users/lenovo/book-data/csv/2015-summary.csv], PartitionFilters: [], PushedFilters: [], ReadSchema: struct<DEST_COUNTRY_NAME:string,count:bigint>




In [31]:
added10=df.select(df["number"]+10)

In [32]:
df = spark.range(500).toDF("number")

In [33]:
added10.show()

+-------------+
|(number + 10)|
+-------------+
|           10|
|           11|
|           12|
|           13|
|           14|
|           15|
|           16|
|           17|
|           18|
|           19|
|           20|
|           21|
|           22|
|           23|
|           24|
|           25|
|           26|
|           27|
|           28|
|           29|
+-------------+
only showing top 20 rows



In [34]:
spark.range(500).schema

StructType(List(StructField(id,LongType,false)))

In [35]:
spark.range(2).collect()

[Row(id=0), Row(id=1)]

In [32]:
from pyspark.sql.types import *
b=ByteType()

In [33]:
df = spark.read.format("json").load("book-data/json/2015-summary.json")

In [34]:
df.printSchema()

root
 |-- DEST_COUNTRY_NAME: string (nullable = true)
 |-- ORIGIN_COUNTRY_NAME: string (nullable = true)
 |-- count: long (nullable = true)



In [35]:
df.schema

StructType(List(StructField(DEST_COUNTRY_NAME,StringType,true),StructField(ORIGIN_COUNTRY_NAME,StringType,true),StructField(count,LongType,true)))

### schema
A schema is made up of structType which consists of lists of structfields(name,type,boolean null or not null)

In [36]:
df.schema

StructType(List(StructField(number,LongType,false)))

In [38]:
#manual schema
from pyspark.sql.types import StructField,StructType,StringType,LongType

myManualSchema= StructType([
    StructField("DEST_COUNTRY_NAME",StringType(),True),
    StructField("ORIGIN_COUNTRY_NAME",StringType(),True),
    StructField("count",LongType(),False)
])

In [39]:
df=spark.read.format("json")\
    .schema(myManualSchema)\
    .load("book-data/json/2015-summary.json")

In [40]:
df.schema

StructType(List(StructField(DEST_COUNTRY_NAME,StringType,true),StructField(ORIGIN_COUNTRY_NAME,StringType,true),StructField(count,LongType,true)))

### Columns and Expressions
Columns as expressions

In [41]:
from pyspark.sql.functions import col,column
col("someColumnNames")
#column("someColumnNames") 

Column<'someColumnNames'>

In [42]:
from pyspark.sql.functions import expr
expr("(((somecol+5)*200)-6)<otherCol")

Column<'((((somecol + 5) * 200) - 6) < otherCol)'>

In [45]:
#Accessing a DataFrame's Columns
df.columns

['DEST_COUNTRY_NAME', 'ORIGIN_COUNTRY_NAME', 'count']

### Records and Rows

In [43]:
#First row
df.first()

Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Romania', count=15)

In [41]:
#Creating row
from pyspark.sql import Row
myRow = Row("Hello",None,1,False)

In [42]:
myRow[0]

'Hello'

In [43]:
myRow[2]

1

### Creating DataFrames

In [47]:
df=spark.read.format("json")\
    .load("book-data/json/2015-summary.json")
df.createOrReplaceTempView("dfTable")

#### select and selectexpr
select : columns or expressions
selectexpr : exprsions in strings

In [49]:
df.select("DEST_COUNTRY_NAME").show(2)

+-----------------+
|DEST_COUNTRY_NAME|
+-----------------+
|    United States|
|    United States|
+-----------------+
only showing top 2 rows



In [51]:
result = spark.sql("SELECT DEST_COUNTRY_NAME FROM dfTable LIMIT 2")
result.collect()

[Row(DEST_COUNTRY_NAME='United States'),
 Row(DEST_COUNTRY_NAME='United States')]

In [52]:
df.select("DEST_COUNTRY_NAME","ORIGIN_COUNTRY_NAME").show(3)

+-----------------+-------------------+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|
+-----------------+-------------------+
|    United States|            Romania|
|    United States|            Croatia|
|    United States|            Ireland|
+-----------------+-------------------+
only showing top 3 rows



In [53]:
result1=spark.sql("SELECT DEST_COUNTRY_NAME,ORIGIN_COUNTRY_NAME FROM dfTable LIMIT 3")
result1.collect()

[Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Romania'),
 Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Croatia'),
 Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Ireland')]

In [47]:
from pyspark.sql.functions import expr,col,column

df.select(
    expr("DEST_COUNTRY_NAME"),
    col("DEST_COUNTRY_NAME"))\
    .show(2)

+-----------------+-----------------+
|DEST_COUNTRY_NAME|DEST_COUNTRY_NAME|
+-----------------+-----------------+
|    United States|    United States|
|    United States|    United States|
+-----------------+-----------------+
only showing top 2 rows



In [55]:
 df.select(col("DEST_COUNTRY_NAME"), "DEST_COUNTRY_NAME").show()


+--------------------+--------------------+
|   DEST_COUNTRY_NAME|   DEST_COUNTRY_NAME|
+--------------------+--------------------+
|       United States|       United States|
|       United States|       United States|
|       United States|       United States|
|               Egypt|               Egypt|
|       United States|       United States|
|       United States|       United States|
|       United States|       United States|
|          Costa Rica|          Costa Rica|
|             Senegal|             Senegal|
|             Moldova|             Moldova|
|       United States|       United States|
|       United States|       United States|
|              Guyana|              Guyana|
|               Malta|               Malta|
|            Anguilla|            Anguilla|
|             Bolivia|             Bolivia|
|       United States|       United States|
|             Algeria|             Algeria|
|Turks and Caicos ...|Turks and Caicos ...|
|       United States|       Uni

In [56]:
df.select(expr("DEST_COUNTRY_NAME AS destination")).show(2)

+-------------+
|  destination|
+-------------+
|United States|
|United States|
+-------------+
only showing top 2 rows



In [57]:
df.select(
    expr("DEST_COUNTRY_NAME as destination").alias("DEST_COUNTRY_NAME")
).show(2)

+-----------------+
|DEST_COUNTRY_NAME|
+-----------------+
|    United States|
|    United States|
+-----------------+
only showing top 2 rows



In [50]:
df.selectExpr(
    "DEST_COUNTRY_NAME as newColumnName",
    "DEST_COUNTRY_NAME"
).show(2)

+-------------+-----------------+
|newColumnName|DEST_COUNTRY_NAME|
+-------------+-----------------+
|United States|    United States|
|United States|    United States|
+-------------+-----------------+
only showing top 2 rows



In [51]:
df.selectExpr(
    "*",
    "(DEST_COUNTRY_NAME = ORIGIN_COUNTRY_NAME) as withinCountry")\
    .show(2)

+-----------------+-------------------+-----+-------------+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|withinCountry|
+-----------------+-------------------+-----+-------------+
|    United States|            Romania|   15|        false|
|    United States|            Croatia|    1|        false|
+-----------------+-------------------+-----+-------------+
only showing top 2 rows



In [52]:

df.selectExpr("avg(count)","count(distinct(DEST_COUNTRY_NAME))").show()

+-----------+---------------------------------+
| avg(count)|count(DISTINCT DEST_COUNTRY_NAME)|
+-----------+---------------------------------+
|1770.765625|                              132|
+-----------+---------------------------------+



### Converting to Spark Types(Literals)

In [58]:
from pyspark.sql.functions import lit
df.select(
    expr("*"),
    lit(1).alias("One")
).show(2)

+-----------------+-------------------+-----+---+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|One|
+-----------------+-------------------+-----+---+
|    United States|            Romania|   15|  1|
|    United States|            Croatia|    1|  1|
+-----------------+-------------------+-----+---+
only showing top 2 rows



### Adding Columns
withColumn

In [59]:
df.withColumn("numberOne",lit(1)).show(2)

+-----------------+-------------------+-----+---------+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|numberOne|
+-----------------+-------------------+-----+---------+
|    United States|            Romania|   15|        1|
|    United States|            Croatia|    1|        1|
+-----------------+-------------------+-----+---------+
only showing top 2 rows



In [55]:
df.withColumn(
    "withinCountry",
    expr("ORIGIN_COUNTRY_NAME == DEST_COUNTRY_NAME")
).show(2)

+-----------------+-------------------+-----+-------------+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|withinCountry|
+-----------------+-------------------+-----+-------------+
|    United States|            Romania|   15|        false|
|    United States|            Croatia|    1|        false|
+-----------------+-------------------+-----+-------------+
only showing top 2 rows



### Renaming Columns

In [65]:
df.withColumnRenamed("DEST_COUNTRY_NAME","dest").columns

['dest', 'ORIGIN_COUNTRY_NAME', 'count']

### Reserved Characters and Keywords in Column Names

In [66]:
dfWithLongColName =df\
    .withColumn(
        "This Long Column-Name",
        expr("ORIGIN_COUNTRY_NAME"))


In [67]:
dfWithLongColName\
.selectExpr(
    "`This Long Column-Name`",
    "`This Long Column-Name` as `new col`")\
    .show(2)

dfWithLongColName.createOrReplaceTempView("dfTableLong")

+---------------------+-------+
|This Long Column-Name|new col|
+---------------------+-------+
|              Romania|Romania|
|              Croatia|Croatia|
+---------------------+-------+
only showing top 2 rows



In [68]:
dfWithLongColName.select(expr("`This Long Column-Name`")).columns

['This Long Column-Name']

### Removing columns

In [69]:
#removing columns
df.drop("ORIGIN_COUNTRY_NAME").columns

['DEST_COUNTRY_NAME', 'count']

In [70]:
#removing multiple columns
dfWithLongColName.drop("ORIGIN_COUNTRY_NAME","DEST_COUNTRY_NAME").columns

['count', 'This Long Column-Name']

### Changing column's type(cast)

In [62]:
df.printSchema()

root
 |-- DEST_COUNTRY_NAME: string (nullable = true)
 |-- ORIGIN_COUNTRY_NAME: string (nullable = true)
 |-- count: long (nullable = true)



In [63]:
#changing a column's type
df.withColumn("count",col("count").cast("int")).printSchema()

root
 |-- DEST_COUNTRY_NAME: string (nullable = true)
 |-- ORIGIN_COUNTRY_NAME: string (nullable = true)
 |-- count: integer (nullable = true)



### Filtering Rows
To filter rows we create an expression that evaluates to true or false
where,filter

In [71]:
#filtering rows
colCondition =df.filter(col("count")<2).take(2)
conditional =df.where("count<2").take(2)
colCondition


[Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Croatia', count=1),
 Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Singapore', count=1)]

In [73]:
query = spark.sql("SELECT * FROM dfTable WHERE count<2")
query.collect()

[Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Croatia', count=1),
 Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Singapore', count=1),
 Row(DEST_COUNTRY_NAME='Moldova', ORIGIN_COUNTRY_NAME='United States', count=1),
 Row(DEST_COUNTRY_NAME='Malta', ORIGIN_COUNTRY_NAME='United States', count=1),
 Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Gibraltar', count=1),
 Row(DEST_COUNTRY_NAME='Saint Vincent and the Grenadines', ORIGIN_COUNTRY_NAME='United States', count=1),
 Row(DEST_COUNTRY_NAME='Suriname', ORIGIN_COUNTRY_NAME='United States', count=1),
 Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Cyprus', count=1),
 Row(DEST_COUNTRY_NAME='Burkina Faso', ORIGIN_COUNTRY_NAME='United States', count=1),
 Row(DEST_COUNTRY_NAME='Djibouti', ORIGIN_COUNTRY_NAME='United States', count=1),
 Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Estonia', count=1),
 Row(DEST_COUNTRY_NAME='Zambia', ORIGIN_COUNTRY_NAME='United States', count=

In [74]:
conditional

[Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Croatia', count=1),
 Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Singapore', count=1)]

In [66]:
df.where(col("count")<2)\
    .where(col("ORIGIN_COUNTRY_NAME") !="Croatia")\
    .show(2)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|          Singapore|    1|
|          Moldova|      United States|    1|
+-----------------+-------------------+-----+
only showing top 2 rows



In [79]:
query=spark.sql("SELECT * FROM dfTable WHERE count<2 AND ORIGIN_COUNTRY_NAME!='Croatia'")
query.show(2)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|          Singapore|    1|
|          Moldova|      United States|    1|
+-----------------+-------------------+-----+
only showing top 2 rows



### Getting unique Rows

In [80]:
#Getting unique Rows
df.select("ORIGIN_COUNTRY_NAME","DEST_COUNTRY_NAME").count()

256

In [81]:
df.select("ORIGIN_COUNTRY_NAME","DEST_COUNTRY_NAME").distinct().count()

256

### Random Samples

In [82]:
#random samples
seed=5
withReplacement = False
fraction=0.5

In [83]:
df.sample(withReplacement,fraction,seed).count()

138

In [84]:
dataFrames = df.randomSplit([0.25,0.75],seed)
dataFrames[0].count()>dataFrames[1].count()

False

### Concatenating and Appending Rows to a DataFrame

In [85]:
from pyspark.sql import Row

schema=df.schema

In [86]:
from pyspark.sql import Row
schema = df.schema
newRows = [
Row("New Country", "Other Country", 5),
Row("New Country 2", "Other Country 3", 1)
]
parallelizedRows = spark.sparkContext.parallelize(newRows)
newDF = spark.createDataFrame(parallelizedRows, schema)

Traceback (most recent call last):
  File "H:\anaconda\Lib\site-packages\pyspark\serializers.py", line 437, in dumps
    return cloudpickle.dumps(obj, pickle_protocol)
           ~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^
  File "H:\anaconda\Lib\site-packages\pyspark\cloudpickle\cloudpickle_fast.py", line 72, in dumps
    cp.dump(obj)
    ~~~~~~~^^^^^
  File "H:\anaconda\Lib\site-packages\pyspark\cloudpickle\cloudpickle_fast.py", line 540, in dump
    return Pickler.dump(self, obj)
           ~~~~~~~~~~~~^^^^^^^^^^^
  File "H:\anaconda\Lib\site-packages\pyspark\cloudpickle\cloudpickle_fast.py", line 630, in reducer_override
    return self._function_reduce(obj)
           ~~~~~~~~~~~~~~~~~~~~~^^^^^
  File "H:\anaconda\Lib\site-packages\pyspark\cloudpickle\cloudpickle_fast.py", line 503, in _function_reduce
    return self._dynamic_function_reduce(obj)
           ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^^^^^
  File "H:\anaconda\Lib\site-packages\pyspark\cloudpickle\cloudpickle_fast.py", line 484, in 

PicklingError: Could not serialize object: IndexError: tuple index out of range

### Sorting Rows
sort,orderBy

In [88]:
## Sorting rows
df.sort("count").show(5)

+--------------------+-------------------+-----+
|   DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+--------------------+-------------------+-----+
|               Malta|      United States|    1|
|Saint Vincent and...|      United States|    1|
|       United States|            Croatia|    1|
|       United States|          Gibraltar|    1|
|       United States|          Singapore|    1|
+--------------------+-------------------+-----+
only showing top 5 rows



In [89]:
df.orderBy("count","DEST_COUNTRY_NAME").show(5)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|     Burkina Faso|      United States|    1|
|    Cote d'Ivoire|      United States|    1|
|           Cyprus|      United States|    1|
|         Djibouti|      United States|    1|
|        Indonesia|      United States|    1|
+-----------------+-------------------+-----+
only showing top 5 rows



In [90]:
df.orderBy(col("count"),col("DEST_COUNTRY_NAME")).show(5)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|     Burkina Faso|      United States|    1|
|    Cote d'Ivoire|      United States|    1|
|           Cyprus|      United States|    1|
|         Djibouti|      United States|    1|
|        Indonesia|      United States|    1|
+-----------------+-------------------+-----+
only showing top 5 rows



In [94]:
from pyspark.sql.functions import desc,asc

df.orderBy(desc("count")).show(2)

+-----------------+-------------------+------+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME| count|
+-----------------+-------------------+------+
|    United States|      United States|370002|
|    United States|             Canada|  8483|
+-----------------+-------------------+------+
only showing top 2 rows



In [93]:
df.select("count").distinct().orderBy("count", ascending=False).show()


+------+
| count|
+------+
|370002|
|  8483|
|  8399|
|  7187|
|  7140|
|  2025|
|  1970|
|  1548|
|  1496|
|  1468|
|  1420|
|  1353|
|  1336|
|  1048|
|   986|
|   955|
|   952|
|   935|
|   920|
|   873|
+------+
only showing top 20 rows



In [95]:
df.orderBy(desc(col("count")),asc(col("DEST_COUNTRY_NAME"))).show(2)

+-----------------+-------------------+------+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME| count|
+-----------------+-------------------+------+
|    United States|      United States|370002|
|    United States|             Canada|  8483|
+-----------------+-------------------+------+
only showing top 2 rows



In [80]:
#sorting withing partition for optimization
spark.read.format("json")\
    .load("book-data/json/2015-summary.json")\
    .sortWithinPartitions("count")

DataFrame[DEST_COUNTRY_NAME: string, ORIGIN_COUNTRY_NAME: string, count: bigint]

### Limit

In [96]:
#limit
df.limit(5).show()

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Romania|   15|
|    United States|            Croatia|    1|
|    United States|            Ireland|  344|
|            Egypt|      United States|   15|
|    United States|              India|   62|
+-----------------+-------------------+-----+



In [97]:
df.orderBy(expr("count desc")).limit(6).show()

+--------------------+-------------------+-----+
|   DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+--------------------+-------------------+-----+
|               Malta|      United States|    1|
|Saint Vincent and...|      United States|    1|
|       United States|            Croatia|    1|
|       United States|          Gibraltar|    1|
|       United States|          Singapore|    1|
|             Moldova|      United States|    1|
+--------------------+-------------------+-----+



### Repartition and Coalesce

Repartitioln: full shuffle of the data

In [98]:
#Repartition and Coalesce
df.rdd.getNumPartitions()

1

In [85]:
df.repartition(5)

DataFrame[DEST_COUNTRY_NAME: string, ORIGIN_COUNTRY_NAME: string, count: bigint]

In [86]:
df.repartition(col("DEST_COUNTRY_NAME"))

DataFrame[DEST_COUNTRY_NAME: string, ORIGIN_COUNTRY_NAME: string, count: bigint]

In [87]:
df.rdd.getNumPartitions()

1

In [88]:
df.repartition(5,col("DEST_COUNTRY_NAME"))

DataFrame[DEST_COUNTRY_NAME: string, ORIGIN_COUNTRY_NAME: string, count: bigint]

In [89]:
df.repartition(5,col("DEST_COUNTRY_NAME")).coalesce(2)

DataFrame[DEST_COUNTRY_NAME: string, ORIGIN_COUNTRY_NAME: string, count: bigint]

### Collecting Rows to the Driver

In [99]:
#Collecting Rows to the Driver
collectDF=df.limit(10)
collectDF.take(5) # take works with an Integer count
collectDF.show() # this prints it out nicely
collectDF.show(5,False)
collectDF.collect()

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Romania|   15|
|    United States|            Croatia|    1|
|    United States|            Ireland|  344|
|            Egypt|      United States|   15|
|    United States|              India|   62|
|    United States|          Singapore|    1|
|    United States|            Grenada|   62|
|       Costa Rica|      United States|  588|
|          Senegal|      United States|   40|
|          Moldova|      United States|    1|
+-----------------+-------------------+-----+

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|United States    |Romania            |15   |
|United States    |Croatia            |1    |
|United States    |Ireland            |344  |
|Egypt            |United States      |15   |
|United States    |India         

[Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Romania', count=15),
 Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Croatia', count=1),
 Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Ireland', count=344),
 Row(DEST_COUNTRY_NAME='Egypt', ORIGIN_COUNTRY_NAME='United States', count=15),
 Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='India', count=62),
 Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Singapore', count=1),
 Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Grenada', count=62),
 Row(DEST_COUNTRY_NAME='Costa Rica', ORIGIN_COUNTRY_NAME='United States', count=588),
 Row(DEST_COUNTRY_NAME='Senegal', ORIGIN_COUNTRY_NAME='United States', count=40),
 Row(DEST_COUNTRY_NAME='Moldova', ORIGIN_COUNTRY_NAME='United States', count=1)]