In [3]:
from pyspark.sql import SparkSession
# Obtain the SparkSession used by every later cell, with the application
# name set to "Spark Core Concepts".
spark = (
    SparkSession.builder
    .appName("Spark Core Concepts")
    .getOrCreate()
)

In [4]:

# Column names, listed in the same order as the fields of each tuple in `data`.
columns = ["order_id", "city", "category", "order_amount", "status"]

# Ten sample orders: (order_id, city, category, order_amount, status).
data = [
    ("O001", "Hyderabad", "Electronics", 1200, "Delivered"),
    ("O002", "Delhi", "Clothing", 800, "Delivered"),
    ("O003", "Mumbai", "Electronics", 1500, "Cancelled"),
    ("O004", "Bangalore", "Grocery", 400, "Delivered"),
    ("O005", "Hyderabad", "Grocery", 300, "Delivered"),
    ("O006", "Delhi", "Electronics", 2000, "Delivered"),
    ("O007", "Mumbai", "Clothing", 700, "Delivered"),
    ("O008", "Bangalore", "Electronics", 1800, "Delivered"),
    ("O009", "Delhi", "Grocery", 350, "Cancelled"),
    ("O010", "Hyderabad", "Clothing", 900, "Delivered"),
]

# Only column names are supplied here, no column types; the printSchema()
# call below displays the types the resulting DataFrame ended up with.
df = spark.createDataFrame(data, schema=columns)

df.show()
df.printSchema()


+--------+---------+-----------+------------+---------+
|order_id|     city|   category|order_amount|   status|
+--------+---------+-----------+------------+---------+
|    O001|Hyderabad|Electronics|        1200|Delivered|
|    O002|    Delhi|   Clothing|         800|Delivered|
|    O003|   Mumbai|Electronics|        1500|Cancelled|
|    O004|Bangalore|    Grocery|         400|Delivered|
|    O005|Hyderabad|    Grocery|         300|Delivered|
|    O006|    Delhi|Electronics|        2000|Delivered|
|    O007|   Mumbai|   Clothing|         700|Delivered|
|    O008|Bangalore|Electronics|        1800|Delivered|
|    O009|    Delhi|    Grocery|         350|Cancelled|
|    O010|Hyderabad|   Clothing|         900|Delivered|
+--------+---------+-----------+------------+---------+

root
 |-- order_id: string (nullable = true)
 |-- city: string (nullable = true)
 |-- category: string (nullable = true)
 |-- order_amount: long (nullable = true)
 |-- status: string (nullable = true)



In [5]:
# Display the number of partitions of the RDD underlying `df`.
df.rdd.getNumPartitions()

2

In [6]:
# Derive a 4-partition version of `df` (the original `df` is left as-is),
# then display the new DataFrame's partition count.
df_repart = df.repartition(4)
df_repart.rdd.getNumPartitions()

4

In [7]:
# Collapse the repartitioned DataFrame down to a single partition,
# then display the resulting partition count.
df_coalesce = df_repart.coalesce(1)
df_coalesce.rdd.getNumPartitions()

1

In [8]:
# Step 1: keep only the rows whose city is "Delhi".
filtered_df = df.filter(df["city"] == "Delhi")

# Step 2: narrow those rows to the order id and amount columns.
# This cell displays nothing itself; the result is shown by a separate show() call.
selected_df = filtered_df.select("order_id", "order_amount")

In [9]:
# Print the rows of `selected_df` (Delhi orders: order_id and order_amount) as a text table.
selected_df.show()

+--------+------------+
|order_id|order_amount|
+--------+------------+
|    O002|         800|
|    O006|        2000|
|    O009|         350|
+--------+------------+



In [10]:
# Delivered orders with an amount above 500, reduced to city and amount.
# The two conditions are intentionally applied as two separate chained
# filter() calls rather than one combined predicate.
df_lineage = (
    df
    .filter(df["status"] == "Delivered")
    .filter(df["order_amount"] > 500)
    .select("city", "order_amount")
)

In [11]:
# Display the number of rows in `df_lineage`.
df_lineage.count()

6

In [12]:
# Print the extended plan output for the base DataFrame: parsed, analyzed
# and optimized logical plans followed by the physical plan.
df.explain(extended=True)

== Parsed Logical Plan ==
LogicalRDD [order_id#0, city#1, category#2, order_amount#3L, status#4], false

== Analyzed Logical Plan ==
order_id: string, city: string, category: string, order_amount: bigint, status: string
LogicalRDD [order_id#0, city#1, category#2, order_amount#3L, status#4], false

== Optimized Logical Plan ==
LogicalRDD [order_id#0, city#1, category#2, order_amount#3L, status#4], false

== Physical Plan ==
*(1) Scan ExistingRDD[order_id#0,city#1,category#2,order_amount#3L,status#4]



In [13]:
# Print the extended plan output for the filter/filter/select chain held in
# `df_lineage`: parsed, analyzed and optimized logical plans, then the physical plan.
df_lineage.explain(extended=True)

== Parsed Logical Plan ==
'Project ['city, 'order_amount]
+- Filter (order_amount#3L > cast(500 as bigint))
   +- Filter (status#4 = Delivered)
      +- LogicalRDD [order_id#0, city#1, category#2, order_amount#3L, status#4], false

== Analyzed Logical Plan ==
city: string, order_amount: bigint
Project [city#1, order_amount#3L]
+- Filter (order_amount#3L > cast(500 as bigint))
   +- Filter (status#4 = Delivered)
      +- LogicalRDD [order_id#0, city#1, category#2, order_amount#3L, status#4], false

== Optimized Logical Plan ==
Project [city#1, order_amount#3L]
+- Filter ((isnotnull(status#4) AND isnotnull(order_amount#3L)) AND ((status#4 = Delivered) AND (order_amount#3L > 500)))
   +- LogicalRDD [order_id#0, city#1, category#2, order_amount#3L, status#4], false

== Physical Plan ==
*(1) Project [city#1, order_amount#3L]
+- *(1) Filter ((isnotnull(status#4) AND isnotnull(order_amount#3L)) AND ((status#4 = Delivered) AND (order_amount#3L > 500)))
   +- *(1) Scan ExistingRDD[order_id#0,ci