In [20]:
import findspark
findspark.init()
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
import os

# Obtain the notebook's SparkSession (master 'local[1]', app name 'bbi');
# getOrCreate() hands back an already-active session instead of building a new one.
spark = (
    SparkSession.builder
    .master('local[1]')
    .appName('bbi')
    .getOrCreate()
)

In [21]:
# Left-hand sample DataFrame (df1): one (Name, Value1) tuple per row.
columns1 = ["Name", "Value1"]
data1 = [
    ("Alice", 1),
    ("Bob", 2),
    ("Charlie", 3),
]
df1 = spark.createDataFrame(data1, schema=columns1)

In [22]:
# Right-hand sample DataFrame (df2): one (UserName, Value2) tuple per row.
# "David" has no counterpart in df1, and df1's "Charlie" has none here.
columns2 = ["UserName", "Value2"]
data2 = [
    ("Alice", "X"),
    ("Bob", "Y"),
    ("David", "Z"),
]
df2 = spark.createDataFrame(data2, schema=columns2)

In [23]:
# Inner join: only rows whose df1.Name equals some df2.UserName survive.
inner_join_df = df1.join(
    other=df2,
    on=df1.Name == df2.UserName,
    how="inner",
)
# Heading first, then the joined rows.
print("Inner Join:")
inner_join_df.show()

Inner Join:
+-----+------+--------+------+
| Name|Value1|UserName|Value2|
+-----+------+--------+------+
|Alice|     1|   Alice|     X|
|  Bob|     2|     Bob|     Y|
+-----+------+--------+------+



In [24]:
# Cross join: no join condition -- every df1 row is paired with every df2 row
# (3 x 3 = 9 rows for the sample data).
cross_join_df = df1.crossJoin(other=df2)
print("Cross Join:")
cross_join_df.show()

Cross Join:
+-------+------+--------+------+
|   Name|Value1|UserName|Value2|
+-------+------+--------+------+
|  Alice|     1|   Alice|     X|
|  Alice|     1|     Bob|     Y|
|  Alice|     1|   David|     Z|
|    Bob|     2|   Alice|     X|
|    Bob|     2|     Bob|     Y|
|    Bob|     2|   David|     Z|
|Charlie|     3|   Alice|     X|
|Charlie|     3|     Bob|     Y|
|Charlie|     3|   David|     Z|
+-------+------+--------+------+



In [25]:
# Full outer join requested via how="outer": rows from both sides are kept,
# with NULLs filling the columns of whichever side has no match.
outer_join_df = df1.join(
    other=df2,
    on=df1.Name == df2.UserName,
    how="outer",
)
print("Outer Join:")
outer_join_df.show()

Outer Join:
+-------+------+--------+------+
|   Name|Value1|UserName|Value2|
+-------+------+--------+------+
|  Alice|     1|   Alice|     X|
|    Bob|     2|     Bob|     Y|
|Charlie|     3|    NULL|  NULL|
|   NULL|  NULL|   David|     Z|
+-------+------+--------+------+



In [26]:
# Same full outer join as how="outer", spelled with the "full_outer" name.
full_outer_df = df1.join(
    other=df2,
    on=df1.Name == df2.UserName,
    how="full_outer",
)
print("Full Outer Join:")
full_outer_df.show()

Full Outer Join:
+-------+------+--------+------+
|   Name|Value1|UserName|Value2|
+-------+------+--------+------+
|  Alice|     1|   Alice|     X|
|    Bob|     2|     Bob|     Y|
|Charlie|     3|    NULL|  NULL|
|   NULL|  NULL|   David|     Z|
+-------+------+--------+------+



In [27]:
# Left join: every df1 row is kept; df2 columns are NULL where no UserName matches.
left_join_df = df1.join(
    other=df2,
    on=df1.Name == df2.UserName,
    how="left",
)
print("Left Join:")
left_join_df.show()

Left Join:
+-------+------+--------+------+
|   Name|Value1|UserName|Value2|
+-------+------+--------+------+
|Charlie|     3|    NULL|  NULL|
|    Bob|     2|     Bob|     Y|
|  Alice|     1|   Alice|     X|
+-------+------+--------+------+



In [28]:
# Same left join as how="left", spelled with the "leftouter" name.
left_outer_df = df1.join(
    other=df2,
    on=df1.Name == df2.UserName,
    how="leftouter",
)
print("Left Outer Join:")
left_outer_df.show()

Left Outer Join:
+-------+------+--------+------+
|   Name|Value1|UserName|Value2|
+-------+------+--------+------+
|Charlie|     3|    NULL|  NULL|
|    Bob|     2|     Bob|     Y|
|  Alice|     1|   Alice|     X|
+-------+------+--------+------+



In [29]:
# Right join: every df2 row is kept; df1 columns are NULL where no Name matches.
right_join_df = df1.join(
    other=df2,
    on=df1.Name == df2.UserName,
    how="right",
)
print("Right Join:")
right_join_df.show()

Right Join:
+-----+------+--------+------+
| Name|Value1|UserName|Value2|
+-----+------+--------+------+
|  Bob|     2|     Bob|     Y|
|Alice|     1|   Alice|     X|
| NULL|  NULL|   David|     Z|
+-----+------+--------+------+



In [30]:
# Same right join as how="right", spelled with the "rightouter" name.
right_outer_df = df1.join(
    other=df2,
    on=df1.Name == df2.UserName,
    how="rightouter",
)
print("right outer join:")
right_outer_df.show()

right outer join:
+-----+------+--------+------+
| Name|Value1|UserName|Value2|
+-----+------+--------+------+
|  Bob|     2|     Bob|     Y|
|Alice|     1|   Alice|     X|
| NULL|  NULL|   David|     Z|
+-----+------+--------+------+



In [31]:
# Semi join: df1 rows that have a match in df2; only df1's columns come back.
semi_join_df = df1.join(
    other=df2,
    on=df1.Name == df2.UserName,
    how="semi",
)
print("Semi Join:")
semi_join_df.show()

Semi Join:
+-----+------+
| Name|Value1|
+-----+------+
|Alice|     1|
|  Bob|     2|
+-----+------+



In [32]:
# Same semi join as how="semi", spelled with the "leftsemi" name.
leftsemi_join_df = df1.join(
    other=df2,
    on=df1.Name == df2.UserName,
    how="leftsemi",
)
print("Left Semi Join")
leftsemi_join_df.show()

Left Semi Join
+-----+------+
| Name|Value1|
+-----+------+
|Alice|     1|
|  Bob|     2|
+-----+------+



In [33]:
# Anti join: df1 rows that have NO match in df2; only df1's columns come back.
anti_join_df = df1.join(
    other=df2,
    on=df1.Name == df2.UserName,
    how="anti",
)
print('Anti Join:')
anti_join_df.show()

Anti Join:
+-------+------+
|   Name|Value1|
+-------+------+
|Charlie|     3|
+-------+------+



In [34]:
# Same anti join as how="anti", spelled with the "leftanti" name.
leftanti_join_df = df1.join(
    other=df2,
    on=df1.Name == df2.UserName,
    how="leftanti",
)
print("Left Anti Join:")
leftanti_join_df.show()

Left Anti Join:
+-------+------+
|   Name|Value1|
+-------+------+
|Charlie|     3|
+-------+------+



The following are typical business use cases for each type of join available on PySpark DataFrames:

1. **Inner Join:**
   - **Use Case:** Combine data from two tables to retrieve only the rows that have matching values in the specified columns.
   - **Example:** Merging customer information with their purchase history to identify customers who made purchases.

2. **Cross Join:**
   - **Use Case:** Generate all possible combinations of rows from two tables.
   - **Example:** Creating a matrix of all possible product combinations to analyze cross-product performance.

3. **Outer Join (Full Outer Join):**
   - **Use Case:** Retrieve all rows from both tables, filling in missing values with nulls where no match is found.
   - **Example:** Analyzing employee data and training data together, capturing both employees who have not undergone any training and training records that match no employee.

4. **Left Join:**
   - **Use Case:** Retrieve all rows from the left table and the matching rows from the right table, with nulls in the right table's columns where no match is found.
   - **Example:** Combining a list of customers with their corresponding orders, even if some customers have not placed orders.

5. **Right Join:**
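   - **Use Case:** Retrieve all rows from the right table and the matching rows from the left table, with nulls in the left table's columns where no match is found.
   - **Example:** Joining sales data (left) to the product catalog (right) so that products that have never been sold still appear in the result.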

6. **Semi Join:**
   - **Use Case:** Filter the rows from the left table based on the existence of matching rows in the right table; only the left table's columns are returned.
   - **Example:** Identifying customers who have made at least one purchase by filtering customer data with the list of purchasers.

7. **Left Semi Join (alternative name for Semi Join):**
   - **Use Case:** Retain only the rows from the left table where there is a match in the right table.
   - **Example:** Filtering a list of active customers using a list of customers who made recent purchases.

8. **Anti Join:**
   - **Use Case:** Filter the rows from the left table based on the absence of matching rows in the right table; only the left table's columns are returned.
   - **Example:** Identifying customers who have not made any purchases by using an anti-join with the list of purchasers.

9. **Left Anti Join (alternative name for Anti Join):**
   - **Use Case:** Retain only the rows from the left table where there is no match in the right table.
   - **Example:** Identifying inactive customers by using a left anti-join with the list of customers who made recent purchases.