# Running PySpark / Spark SQL Joins and Applying Functions to a Pandas DataFrame



# Spark SQL Joins

In [0]:
from pyspark.sql import SparkSession

In [0]:
# Obtain the SparkSession (named "PySpark Joins") that the later cells use.
spark = (
    SparkSession.builder
    .appName("PySpark Joins")
    .getOrCreate()
)

In [0]:
# (Name, Age) rows for the left-hand table.
data1 = [
    ("Alice", 34),
    ("Bob", 45),
    ("Charlie", 28),
]
# (Name, Profession) rows for the right-hand table. "Charlie" appears only in
# data1 and "Dave" only in data2, so each join type yields a different result.
data2 = [
    ("Alice", "Engineer"),
    ("Bob", "Doctor"),
    ("Dave", "Artist"),
]

In [0]:
# Build one Spark DataFrame per sample list, naming the columns explicitly.
df1 = spark.createDataFrame(data=data1, schema=["Name", "Age"])
df2 = spark.createDataFrame(data=data2, schema=["Name", "Profession"])

# Register DataFrames as temporary views


In [0]:
# Make both DataFrames queryable from Spark SQL as "table1" and "table2";
# an existing view with the same name is replaced.
df1.createOrReplaceTempView("table1")
df2.createOrReplaceTempView("table2")

# Perform SparkSQL join

In [0]:

# LEFT JOIN keeps every row of table1; Profession is NULL when table2 has no
# row with the same Name. The query has no ORDER BY, so the order of the
# returned rows should not be relied upon.
joined_df = spark.sql("""
    SELECT t1.Name, t1.Age, t2.Profession 
    FROM table1 t1
    LEFT JOIN table2 t2 ON t1.Name = t2.Name
""")


In [0]:
# Convert the Spark SQL join result into a pandas DataFrame for the
# pandas-side processing at the end of the notebook.
pandas_df = joined_df.toPandas()

In [0]:
# Plain-text dump of the converted join result.
print(pandas_df)

      Name  Age Profession
0    Alice   34   Engineer
1      Bob   45     Doctor
2  Charlie   28       None


# PySpark Joins

# Inner Join

In [0]:
# Inner join on Name: only names present in both df1 and df2 are kept.
inner_join_df = df1.join(
    other=df2,
    on=['Name'],
    how='inner',
)

In [0]:
# Print a label, then render the inner-join rows as a text table.
print("Inner Join:")
inner_join_df.show()

Inner Join:
+-----+---+----------+
| Name|Age|Profession|
+-----+---+----------+
|Alice| 34|  Engineer|
|  Bob| 45|    Doctor|
+-----+---+----------+



# Left Join


In [0]:
# Left join on Name: every df1 row is kept; Profession is null when df2 has
# no row for that name.
left_join_df = df1.join(
    other=df2,
    on=['Name'],
    how='left',
)

In [0]:
# Print a label, then render the left-join rows as a text table.
print("Left Join:")
left_join_df.show()

Left Join:
+-------+---+----------+
|   Name|Age|Profession|
+-------+---+----------+
|  Alice| 34|  Engineer|
|    Bob| 45|    Doctor|
|Charlie| 28|      NULL|
+-------+---+----------+



# Right Join

In [0]:

# Right join on Name: every df2 row is kept; Age is null when df1 has no row
# for that name.
right_join_df = df1.join(
    other=df2,
    on=['Name'],
    how='right',
)

In [0]:
# Print a label, then render the right-join rows as a text table.
print("Right Join:")
right_join_df.show()

Right Join:
+-----+----+----------+
| Name| Age|Profession|
+-----+----+----------+
|Alice|  34|  Engineer|
|  Bob|  45|    Doctor|
| Dave|NULL|    Artist|
+-----+----+----------+



# Full Outer Join


In [0]:
# Full outer join on Name: rows from both sides are kept, with nulls filling
# whichever columns have no counterpart on the other side.
outer_join_df = df1.join(
    other=df2,
    on=['Name'],
    how='outer',
)

In [0]:
# Print a label, then render the full-outer-join rows as a text table.
print("Full Outer Join:")
outer_join_df.show()

Full Outer Join:
+-------+----+----------+
|   Name| Age|Profession|
+-------+----+----------+
|  Alice|  34|  Engineer|
|    Bob|  45|    Doctor|
|Charlie|  28|      NULL|
|   Dave|NULL|    Artist|
+-------+----+----------+



# Semi Join


In [0]:
# Left semi join on Name: keeps the df1 rows whose Name also appears in df2,
# and returns only df1's columns.
semi_join_df = df1.join(
    other=df2,
    on=['Name'],
    how='left_semi',
)

In [0]:
# Print a label, then render the semi-join rows as a text table.
print("semi Join:")
semi_join_df.show()

semi Join:
+-----+---+
| Name|Age|
+-----+---+
|Alice| 34|
|  Bob| 45|
+-----+---+



# Anti Join

In [0]:
# Left anti join on Name: keeps only the df1 rows whose Name does not appear
# in df2.
anti_join_df = df1.join(
    other=df2,
    on=['Name'],
    how='left_anti',
)

In [0]:
# Print a label, then render the anti-join rows as a text table.
print("Anti Join:")
anti_join_df.show()

Anti Join:
+-------+---+
|   Name|Age|
+-------+---+
|Charlie| 28|
+-------+---+



# Apply functions to Pandas DataFrame

In [0]:
def process_age(age, factor=2):
    """Scale an age value by ``factor`` (doubling it by default).

    Args:
        age: Numeric age to scale. ``None`` is passed through unchanged so
            that a missing value does not raise ``TypeError``.
        factor: Multiplier applied to ``age``. Defaults to 2, which keeps
            the original doubling behaviour for existing callers.

    Returns:
        ``age * factor``, or ``None`` when ``age`` is ``None``.
    """
    # Guard against missing values: ``None * factor`` would raise TypeError.
    if age is None:
        return None
    return age * factor

In [0]:
# Run process_age on each Age value and store the result as a new column.
pandas_df['Processed_Age'] = pandas_df['Age'].map(process_age)

In [0]:
# Plain-text dump of the frame, now including the Processed_Age column.
print(pandas_df)

      Name  Age Profession  Processed_Age
0    Alice   34   Engineer             68
1      Bob   45     Doctor             90
2  Charlie   28       None             56
