In [1]:
import findspark
findspark.init()
from pyspark.sql import SparkSession
from pyspark.sql.functions import broadcast
from pyspark.sql.types import *

# Start (or reuse) the Spark session used throughout this notebook
spark = (
    SparkSession.builder
    .appName("BroadcastExample")
    .getOrCreate()
)

# Small lookup table (id -> name); this is the side of the join that gets the broadcast hint
lookup_data = [(1, 'John'), (2, 'Jane'), (3, 'Alice')]
lookup_columns = ["id", "name"]
lookup_df = spark.createDataFrame(data=lookup_data, schema=lookup_columns)

# Main table (id -> description) that is enriched with names from the lookup table
data = [(1, 'Some data'), (2, 'More data'), (3, 'Additional data')]
columns = ["id", "description"]
df = spark.createDataFrame(data=data, schema=columns)

# broadcast() attaches a broadcast join hint to lookup_df.
# Note: this is a DataFrame join hint, not a SparkContext broadcast variable.
broadcast_lookup_df = broadcast(lookup_df)

# Join the main table with the hinted lookup table on the shared "id" column
result_df = df.join(other=broadcast_lookup_df, on="id")

# Print the joined rows
result_df.show()

24/02/03 12:21:17 WARN Utils: Your hostname, Tech-Buddy resolves to a loopback address: 127.0.1.1; using 172.17.35.137 instead (on interface eth0)
24/02/03 12:21:17 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/02/03 12:21:18 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
                                                                                

+---+---------------+-----+
| id|    description| name|
+---+---------------+-----+
|  1|      Some data| John|
|  2|      More data| Jane|
|  3|Additional data|Alice|
+---+---------------+-----+



In [32]:
# Showing the hinted DataFrame on its own still prints its rows, but Spark logs a
# HintErrorLogger warning because the broadcast hint is not part of any join here.
broadcast_lookup_df.show()

24/01/27 23:23:33 WARN HintErrorLogger: A join hint (strategy=broadcast) is specified but it is not part of a join relation.


+---+-----+
| id| name|
+---+-----+
|  1| John|
|  2| Jane|
|  3|Alice|
+---+-----+



In [22]:
from pyspark import SparkContext
# Shorthand handle to the SparkContext exposed by the existing `spark` session
sc = spark.sparkContext

In [29]:
# State code -> full state name; wrapped in a broadcast variable so that
# state_convert() can read it through `broadcastStates.value`
states = dict(NY="New York", CA="California", FL="Florida")
broadcastStates = spark.sparkContext.broadcast(states)

# Sample records; the last field of each tuple is the state code to translate
data = [
    ("James", "Smith", "USA", "CA"),
    ("Michael", "Rose", "USA", "NY"),
    ("Robert", "Williams", "USA", "CA"),
    ("Maria", "Jones", "USA", "FL"),
]

# Distribute the sample records as an RDD
rdd = spark.sparkContext.parallelize(data)

def state_convert(code):
    """Translate a state code into its full name via the broadcast lookup.

    Args:
        code: State code to look up in ``broadcastStates.value`` (e.g. "CA").

    Returns:
        The mapped full state name. If ``code`` is not a key of the broadcast
        mapping, ``code`` itself is returned unchanged, so a single unknown
        code does not raise ``KeyError`` inside the map task.
    """
    # .get() with the code as its own default keeps known codes unchanged in
    # behavior while making unknown codes pass through instead of raising.
    return broadcastStates.value.get(code, code)

# Keep the first three fields of every record, swap the state code (4th field)
# for its full name, then collect the converted rows and print them.
result = (
    rdd
    .map(lambda rec: (*rec[:3], state_convert(rec[3])))
    .collect()
)
print(result)

[('James', 'Smith', 'USA', 'California'), ('Michael', 'Rose', 'USA', 'New York'), ('Robert', 'Williams', 'USA', 'California'), ('Maria', 'Jones', 'USA', 'Florida')]


In [37]:
# Five id/name rows, one dict per row
data = [
    dict(id=1, name='x'),
    dict(id=2, name='y'),
    dict(id=3, name='z'),
    dict(id=4, name='a'),
    dict(id=5, name='b'),
]

# Explicit schema: `id` is a non-nullable integer, `name` is a nullable string
schema = (
    StructType()
    .add('id', IntegerType(), False)
    .add('name', StringType(), True)
)

# Build the frame with the explicit schema, then display its rows and its schema
df1 = spark.createDataFrame(data, schema)
df1.show()
df1.printSchema()

+---+----+
| id|name|
+---+----+
|  1|   x|
|  2|   y|
|  3|   z|
|  4|   a|
|  5|   b|
+---+----+

root
 |-- id: integer (nullable = false)
 |-- name: string (nullable = true)



In [38]:
# Attach a broadcast join hint to df1; the hinted frame is the right-hand side of the join with df2
broadcastdf = broadcast(df1)


In [39]:
# Second set of id/name rows; only ids 1, 2 and 5 also appear in df1
data1 = [
    {'id': row_id, 'name': row_name}
    for row_id, row_name in [(1, 'p'), (2, 'q'), (5, 'r'), (6, 's'), (9, 't')]
]

# Same layout as df1: non-nullable integer `id`, nullable string `name`
schema1 = StructType([
    StructField(name='id', dataType=IntegerType(), nullable=False),
    StructField(name='name', dataType=StringType(), nullable=True),
])

# Build the frame with the explicit schema, then display its rows and its schema
df2 = spark.createDataFrame(data1, schema1)
df2.show()
df2.printSchema()

+---+----+
| id|name|
+---+----+
|  1|   p|
|  2|   q|
|  5|   r|
|  6|   s|
|  9|   t|
+---+----+

root
 |-- id: integer (nullable = false)
 |-- name: string (nullable = true)



In [40]:
# Join df2 with the broadcast-hinted df1 on "id"; only ids present in both frames remain.
# NOTE: both inputs have a "name" column, so the result contains two columns called "name".
resultdf1 = df2.join(other=broadcastdf, on='id')
resultdf1.show()

+---+----+----+
| id|name|name|
+---+----+----+
|  1|   p|   x|
|  2|   q|   y|
|  5|   r|   b|
+---+----+----+



In [12]:
# Raw lookup rows, including one row with no id and an exact repeat of (1, 'John')
lookup_data = [
    (1, 'John'),
    (2, 'Jane'),
    (3, 'Alice'),
    (None, 'Sam'),
    (1, 'John'),
]
lookup_columns = ["id", "name"]

# Build the frame, discard rows whose id is null, then collapse exact duplicate rows
df = (
    spark.createDataFrame(lookup_data, lookup_columns)
    .dropna(subset=['id'])
    .dropDuplicates()
)

In [13]:
# Display the cleaned frame: the null-id row and the repeated (1, 'John') row are gone
df.show()

+---+-----+
| id| name|
+---+-----+
|  1| John|
|  2| Jane|
|  3|Alice|
+---+-----+



In [9]:
# Deduplicate only (no null filtering): the repeated (1, 'John') row collapses,
# while the row with a null id is kept.
#
# The frame is rebuilt from the raw rows instead of reusing `df`, because `df` has
# already had its null-id rows removed by the cleaning cell. Reusing it made this
# cell's result depend on execution order (it only showed the null row when run
# before the cleaning was applied). A separate name also leaves `df` untouched.
df_dedup_only = spark.createDataFrame(lookup_data, lookup_columns).drop_duplicates()
df_dedup_only.show()

+----+-----+
|  id| name|
+----+-----+
|   1| John|
|   2| Jane|
|   3|Alice|
|NULL|  Sam|
+----+-----+

