In [72]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("Spark Certification").getOrCreate()

23/06/14 09:59:08 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [73]:
# Syntax of repartition()
# DataFrame.repartition(numPartitions, *cols)

In [74]:
# Create Spark Session
spark = SparkSession.builder.appName('Repartition and Coalesce') \
        .master("local[5]").getOrCreate()

# Create PySpark DataFrame
simpleData = [("James","Sales","NY",90000,34,10000),
    ("Michael","Sales","NY",86000,56,20000),
    ("Robert","Sales","CA",81000,30,23000),
    ("Maria","Finance","CA",90000,24,23000),
    ("Raman","Finance","CA",99000,40,24000),
    ("Scott","Finance","NY",83000,36,19000),
    ("Jen","Finance","NY",79000,53,15000),
    ("Jeff","Marketing","CA",80000,25,18000),
    ("Kumar","Marketing","NY",91000,50,21000)
  ]

schema = ["employee_name","department","state","salary","age","bonus"]
df = spark.createDataFrame(data=simpleData, schema = schema)
df.show()

# Write to CSV file
df.write.mode("overwrite").csv("data/partition/partition.csv") 

23/06/14 09:59:08 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


+-------------+----------+-----+------+---+-----+
|employee_name|department|state|salary|age|bonus|
+-------------+----------+-----+------+---+-----+
|        James|     Sales|   NY| 90000| 34|10000|
|      Michael|     Sales|   NY| 86000| 56|20000|
|       Robert|     Sales|   CA| 81000| 30|23000|
|        Maria|   Finance|   CA| 90000| 24|23000|
|        Raman|   Finance|   CA| 99000| 40|24000|
|        Scott|   Finance|   NY| 83000| 36|19000|
|          Jen|   Finance|   NY| 79000| 53|15000|
|         Jeff| Marketing|   CA| 80000| 25|18000|
|        Kumar| Marketing|   NY| 91000| 50|21000|
+-------------+----------+-----+------+---+-----+



# Repartition

In [75]:
# repartition by number
df2 = df.repartition(numPartitions=3)
print(df2.rdd.getNumPartitions())

# Write DataFrame to CSV file
df2.write.mode("overwrite").csv("data/partition/partitionbynumber.csv")

3


In [76]:
# repartition by column
df2 = df.repartition("state")
print(df2.rdd.getNumPartitions())

# Write
df2.write.mode("overwrite").csv("data/partition/partitionbycolumn.csv")

1


In [77]:
# repartition by number and column
df2 = df.repartition(4, "state")
print(df2.rdd.getNumPartitions())

# Write
df2.write.mode("overwrite").csv("data/partition/partitionbynumberandcolumn.csv")

4


In [78]:
# repartition by multiple column
df2 = df.repartition("state", "department")
print(df2.rdd.getNumPartitions())

# Write
df2.write.mode("overwrite").csv("data/partition/partitionbymultiplecolumn.csv")

1


In [79]:
# repartition by number and multiple column
df2 = df.repartition(2,"state", "department")
print(df2.rdd.getNumPartitions())

# Write
df2.write.mode("overwrite").csv("data/partition/partitionbynumberandmultiplecolumn.csv")

2


# Coalesce

In [80]:
# coalesce by number
df2 = df.coalesce(numPartitions=3)
print(df2.rdd.getNumPartitions())

# Write DataFrame to CSV file
df2.write.mode("overwrite").csv("data/partition1/partitionbynumber.csv")

3


In [81]:
# coalesce by column
df2 = df.coalesce(5)
print(df2.rdd.getNumPartitions())

# Write
df2.write.mode("overwrite").csv("data/partition1/partitionbynumbers.csv")

5



# cache and persist

In [82]:
# import the spark context
from pyspark import SparkContext

# create SparkContext
spark_context=SparkContext.getOrCreate()
spark_context

In [83]:
# Create PySpark DataFrame
simple_data = [
    ("Michael","Sales","NY",86000,56,20000),
    ("Maria","Finance","CA",90000,24,23000),
    ("Raman","Finance","CA",99000,40,24000),
    ("Scott","Finance","NY",83000,36,19000),
    ("Jeff","Marketing","CA",80000,25,1800),
    ("Kumar","Marketing","NY",9100,50,2100)
  ]

sample_schema = ["employee_name","department","state","salary","age","bonus"]
sample_df = spark.createDataFrame(data=simple_data, schema = sample_schema)
sample_df.show()

+-------------+----------+-----+------+---+-----+
|employee_name|department|state|salary|age|bonus|
+-------------+----------+-----+------+---+-----+
|      Michael|     Sales|   NY| 86000| 56|20000|
|        Maria|   Finance|   CA| 90000| 24|23000|
|        Raman|   Finance|   CA| 99000| 40|24000|
|        Scott|   Finance|   NY| 83000| 36|19000|
|         Jeff| Marketing|   CA| 80000| 25| 1800|
|        Kumar| Marketing|   NY|  9100| 50| 2100|
+-------------+----------+-----+------+---+-----+



In [87]:
# getting total time taken for the count before cache enabled
import timeit as t
start=t.default_timer()
sample_df.count()
sample_df.summary()
sample_df.collect()
end=t.default_timer()
time_elapsed=(end-start)
time_elapsed

0.4793530059978366

In [85]:
sample_df.cache()

DataFrame[employee_name: string, department: string, state: string, salary: bigint, age: bigint, bonus: bigint]

In [86]:
# getting total time taken for the count after cache enabled
import timeit as t
start=t.default_timer()
sample_df.count()
sample_df.summary()
sample_df.collect()
end=t.default_timer()
time_elapsed=(end-start)
time_elapsed

0.8706484040012583