In [1]:
from pyspark.sql import SparkSession
# Obtain the Spark session for this notebook, named "Order Details".
# The DataFrames in the later cells are created through this handle.
spark = (
    SparkSession.builder
    .appName("Order Details")
    .getOrCreate()
)

In [3]:
from pyspark.sql.functions import col

# Column names for the orders table; each row below follows this order.
orders_cols = ["order_id", "city", "order_amount"]

# Ten sample orders as (order_id, city, order_amount) tuples.
orders_data = [
    ("O001", "Hyderabad", 1200),
    ("O002", "Delhi", 800),
    ("O003", "Mumbai", 1500),
    ("O004", "Bangalore", 400),
    ("O005", "Hyderabad", 300),
    ("O006", "Delhi", 2000),
    ("O007", "Mumbai", 700),
    ("O008", "Bangalore", 1800),
    ("O009", "Delhi", 350),
    ("O010", "Hyderabad", 900),
]

# Only column names are supplied as the schema, so Spark infers the
# column types from the tuple values.
orders_df = spark.createDataFrame(data=orders_data, schema=orders_cols)
orders_df.show()


# Column names for the city lookup table.
city_cols = ["city", "city_category"]

# One (city, city_category) row per city that appears in the orders data.
city_data = [
    ("Hyderabad", "Tier-1"),
    ("Delhi", "Tier-1"),
    ("Mumbai", "Tier-1"),
    ("Bangalore", "Tier-1"),
]

# Small lookup DataFrame that the orders are joined against on "city".
city_df = spark.createDataFrame(data=city_data, schema=city_cols)
city_df.show()

##################################################################################


# Keep only the orders whose amount is strictly greater than 500.
filtered_orders = orders_df.where(col("order_amount") > 500)

# Attach the city category to each remaining order (inner join on "city").
joined_df = filtered_orders.join(city_df, on="city", how="inner")

# Pin the output column order for display.
final_df = joined_df.select(
    ["order_id", "city", "city_category", "order_amount"]
)

##################################################################################

# Print the extended plan output (logical plans as well as the physical
# plan) for the join without an explicit broadcast hint.
final_df.explain(extended=True)

+--------+---------+------------+
|order_id|     city|order_amount|
+--------+---------+------------+
|    O001|Hyderabad|        1200|
|    O002|    Delhi|         800|
|    O003|   Mumbai|        1500|
|    O004|Bangalore|         400|
|    O005|Hyderabad|         300|
|    O006|    Delhi|        2000|
|    O007|   Mumbai|         700|
|    O008|Bangalore|        1800|
|    O009|    Delhi|         350|
|    O010|Hyderabad|         900|
+--------+---------+------------+

+---------+-------------+
|     city|city_category|
+---------+-------------+
|Hyderabad|       Tier-1|
|    Delhi|       Tier-1|
|   Mumbai|       Tier-1|
|Bangalore|       Tier-1|
+---------+-------------+

== Parsed Logical Plan ==
'Project ['order_id, 'city, 'city_category, 'order_amount]
+- Project [city#23, order_id#22, order_amount#24L, city_category#36]
   +- Join Inner, (city#23 = city#35)
      :- Filter (order_amount#24L > cast(500 as bigint))
      :  +- LogicalRDD [order_id#22, city#23, order_amount#24L],

In [4]:
from pyspark.sql.functions import broadcast
# Join the filtered orders to the city table again, this time wrapping
# city_df in broadcast() so the plan carries a broadcast hint for it.
broadcast_join_df = filtered_orders.join(
    broadcast(city_df), on="city", how="inner"
)

# Same output columns, in the same order, as the un-hinted join.
final_broadcast_df = broadcast_join_df.select(
    ["order_id", "city", "city_category", "order_amount"]
)

In [5]:
# Display the joined rows, then print the extended plan output so the
# broadcast hint can be inspected in the plans.
final_broadcast_df.show()
final_broadcast_df.explain(extended=True)


+--------+---------+-------------+------------+
|order_id|     city|city_category|order_amount|
+--------+---------+-------------+------------+
|    O001|Hyderabad|       Tier-1|        1200|
|    O002|    Delhi|       Tier-1|         800|
|    O003|   Mumbai|       Tier-1|        1500|
|    O006|    Delhi|       Tier-1|        2000|
|    O007|   Mumbai|       Tier-1|         700|
|    O008|Bangalore|       Tier-1|        1800|
|    O010|Hyderabad|       Tier-1|         900|
+--------+---------+-------------+------------+

== Parsed Logical Plan ==
'Project ['order_id, 'city, 'city_category, 'order_amount]
+- Project [city#23, order_id#22, order_amount#24L, city_category#36]
   +- Join Inner, (city#23 = city#35)
      :- Filter (order_amount#24L > cast(500 as bigint))
      :  +- LogicalRDD [order_id#22, city#23, order_amount#24L], false
      +- ResolvedHint (strategy=broadcast)
         +- LogicalRDD [city#35, city_category#36], false

== Analyzed Logical Plan ==
order_id: string, ci