In [1]:
!pip install pyspark



# Dataset with Spark

In [16]:
# Sample orders as (order_id, city, order_amt) tuples.
orders_data = [
    ("O001", "Hyderabad", 1200),
    ("O002", "Delhi", 800),
    ("O003", "Mumbai", 1500),
    ("O004", "Bangalore", 400),
    ("O005", "Hyderabad", 300),
    ("O006", "Delhi", 2000),
    ("O007", "Mumbai", 700),
    ("O008", "Bangalore", 1800),
    ("O009", "Delhi", 350),
    ("O010", "Hyderabad", 900),
]

In [17]:
# Column names for the orders DataFrame, in the same order as the tuple fields.
orders_col = [
    "order_id",
    "city",
    "order_amt",
]

In [18]:
# `spark` is not created anywhere earlier in this notebook, so obtain the
# session here. getOrCreate() returns the already-active SparkSession when one
# exists and starts a new one otherwise, so the cell also runs on a fresh kernel.
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

# Build the orders DataFrame from the in-memory rows and column names, then preview it.
orders_df = spark.createDataFrame(orders_data, orders_col)
orders_df.show()

+--------+---------+---------+
|order_id|     city|order_amt|
+--------+---------+---------+
|    O001|Hyderabad|     1200|
|    O002|    Delhi|      800|
|    O003|   Mumbai|     1500|
|    O004|Bangalore|      400|
|    O005|Hyderabad|      300|
|    O006|    Delhi|     2000|
|    O007|   Mumbai|      700|
|    O008|Bangalore|     1800|
|    O009|    Delhi|      350|
|    O010|Hyderabad|      900|
+--------+---------+---------+



In [19]:
# City lookup rows as (city, city_category) tuples.
city_data = [
    ("Hyderabad", "Tier-1"),
    ("Delhi", "Tier-1"),
    ("Mumbai", "Tier-1"),
    ("Bangalore", "Tier-1"),
]

In [20]:
# Column names for the city lookup DataFrame.
city_cols = [
    "city",
    "city_category",
]

In [21]:
# Build the city lookup DataFrame from the in-memory rows and preview it.
city_df = spark.createDataFrame(data=city_data, schema=city_cols)
city_df.show()

+---------+-------------+
|     city|city_category|
+---------+-------------+
|Hyderabad|       Tier-1|
|    Delhi|       Tier-1|
|   Mumbai|       Tier-1|
|Bangalore|       Tier-1|
+---------+-------------+



In [25]:
from pyspark.sql.functions import col

# Keep only the orders whose amount is strictly greater than 500.
filtered_orders = orders_df.where(col("order_amt") > 500)

In [26]:
# Inner join of the filtered orders with the city lookup on the shared "city" column.
joined_df = filtered_orders.join(city_df, "city", "inner")

In [28]:
# Project the joined result down to the four output columns, in display order.
final_df = joined_df.select("order_id", "city", "city_category", "order_amt")

In [29]:
# Print the extended query plan for the plain join.
final_df.explain(extended=True)

== Parsed Logical Plan ==
'Project ['order_id, 'city, 'city_category, 'order_amt]
+- Project [city#28, order_id#27, order_amt#29L, city_category#41]
   +- Join Inner, (city#28 = city#40)
      :- Filter (order_amt#29L > cast(500 as bigint))
      :  +- LogicalRDD [order_id#27, city#28, order_amt#29L], false
      +- LogicalRDD [city#40, city_category#41], false

== Analyzed Logical Plan ==
order_id: string, city: string, city_category: string, order_amt: bigint
Project [order_id#27, city#28, city_category#41, order_amt#29L]
+- Project [city#28, order_id#27, order_amt#29L, city_category#41]
   +- Join Inner, (city#28 = city#40)
      :- Filter (order_amt#29L > cast(500 as bigint))
      :  +- LogicalRDD [order_id#27, city#28, order_amt#29L], false
      +- LogicalRDD [city#40, city_category#41], false

== Optimized Logical Plan ==
Project [order_id#27, city#28, city_category#41, order_amt#29L]
+- Join Inner, (city#28 = city#40)
   :- Filter ((isnotnull(order_amt#29L) AND (order_amt#29L 

# Broadcast join

In [30]:
from pyspark.sql.functions import broadcast

In [31]:
# Same inner join on "city", but with city_df wrapped in a broadcast hint.
broadcast_join_df = filtered_orders.join(broadcast(city_df), "city", "inner")

# Project the joined result down to the four output columns, in display order.
final_broadcast_df = broadcast_join_df.select(
    "order_id", "city", "city_category", "order_amt"
)

In [32]:
# Print the extended query plan for the broadcast-hinted join.
final_broadcast_df.explain(extended=True)

== Parsed Logical Plan ==
'Project ['order_id, 'city, 'city_category, 'order_amt]
+- Project [city#28, order_id#27, order_amt#29L, city_category#41]
   +- Join Inner, (city#28 = city#40)
      :- Filter (order_amt#29L > cast(500 as bigint))
      :  +- LogicalRDD [order_id#27, city#28, order_amt#29L], false
      +- ResolvedHint (strategy=broadcast)
         +- LogicalRDD [city#40, city_category#41], false

== Analyzed Logical Plan ==
order_id: string, city: string, city_category: string, order_amt: bigint
Project [order_id#27, city#28, city_category#41, order_amt#29L]
+- Project [city#28, order_id#27, order_amt#29L, city_category#41]
   +- Join Inner, (city#28 = city#40)
      :- Filter (order_amt#29L > cast(500 as bigint))
      :  +- LogicalRDD [order_id#27, city#28, order_amt#29L], false
      +- ResolvedHint (strategy=broadcast)
         +- LogicalRDD [city#40, city_category#41], false

== Optimized Logical Plan ==
Project [order_id#27, city#28, city_category#41, order_amt#29L]
+-