In [1]:
from pyspark.errors import AnalysisException
from pyspark.sql import SparkSession

# Build (or reuse, via getOrCreate) the session for this notebook. The master
# URL "local[3]" requests a local run with three worker threads.
spark: SparkSession = (
    SparkSession.builder  # type: ignore
    .master("local[3]")
    .appName("Basic Joins")
    .getOrCreate()
)

# Last expression of the cell, so the Spark version is shown as its output.
spark.version

25/04/08 16:48:15 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


'3.5.5'

In [2]:
# One tuple per order line: (order_id, prod_id, unit_price, qty).
ordersList = [
    ("01", "02", 350, 1),
    ("01", "04", 580, 1),
    ("01", "07", 320, 2),
    ("02", "03", 450, 1),
    ("02", "06", 220, 1),
    ("03", "01", 195, 1),
    ("04", "09", 270, 3),
    ("04", "08", 410, 2),
    ("05", "02", 350, 1),
]

# Name the columns when the DataFrame is created; the column types are still
# inferred from the tuple values.
orderDF = spark.createDataFrame(
    ordersList,
    schema=["order_id", "prod_id", "unit_price", "qty"],
)

# Show the rows, then the inferred schema.
orderDF.show()
orderDF.printSchema()

                                                                                

+--------+-------+----------+---+
|order_id|prod_id|unit_price|qty|
+--------+-------+----------+---+
|      01|     02|       350|  1|
|      01|     04|       580|  1|
|      01|     07|       320|  2|
|      02|     03|       450|  1|
|      02|     06|       220|  1|
|      03|     01|       195|  1|
|      04|     09|       270|  3|
|      04|     08|       410|  2|
|      05|     02|       350|  1|
+--------+-------+----------+---+

root
 |-- order_id: string (nullable = true)
 |-- prod_id: string (nullable = true)
 |-- unit_price: long (nullable = true)
 |-- qty: long (nullable = true)



In [3]:
# One tuple per product: (prod_id, prod_name, list_price, qty).
productList = [
    ("01", "Scroll Mouse", 250, 20),
    ("02", "Optical Mouse", 350, 20),
    ("03", "Wireless Mouse", 450, 50),
    ("04", "Wireless Keyboard", 580, 50),
    ("05", "Standard Keyboard", 360, 10),
    ("06", "16 GB Flash Storage", 240, 100),
    ("07", "32 GB Flash Storage", 320, 50),
    ("08", "64 GB Flash Storage", 430, 25),
]

# Name the columns when the DataFrame is created; the column types are still
# inferred from the tuple values. Note that `qty` is also a column of orderDF.
productDF = spark.createDataFrame(
    productList,
    schema=["prod_id", "prod_name", "list_price", "qty"],
)

# Show the rows, then the inferred schema.
productDF.show()
productDF.printSchema()

+-------+-------------------+----------+---+
|prod_id|          prod_name|list_price|qty|
+-------+-------------------+----------+---+
|     01|       Scroll Mouse|       250| 20|
|     02|      Optical Mouse|       350| 20|
|     03|     Wireless Mouse|       450| 50|
|     04|  Wireless Keyboard|       580| 50|
|     05|  Standard Keyboard|       360| 10|
|     06|16 GB Flash Storage|       240|100|
|     07|32 GB Flash Storage|       320| 50|
|     08|64 GB Flash Storage|       430| 25|
+-------+-------------------+----------+---+

root
 |-- prod_id: string (nullable = true)
 |-- prod_name: string (nullable = true)
 |-- list_price: long (nullable = true)
 |-- qty: long (nullable = true)



In [4]:
# Join condition: match each order line to its product by product id.
# Later cells reuse `join_expr`, so it is kept as a notebook-level name.
join_expr = orderDF.prod_id == productDF.prod_id

# Both inputs have a `qty` column, so the joined frame carries two columns with
# that name and the bare string "qty" cannot be resolved. Spark rejects the
# select with AnalysisException [AMBIGUOUS_REFERENCE].
# The error is what this cell demonstrates, so it is caught and printed: an
# uncaught exception would stop "Restart & Run All" before the cells below run.
try:
    orderDF.join(productDF, on=join_expr, how="inner").select(
        "order_id",
        "prod_name",
        "unit_price",
        "qty",
    ).show()
except AnalysisException as exc:
    print(f"{type(exc).__name__}: {exc}")

AnalysisException: [AMBIGUOUS_REFERENCE] Reference `qty` is ambiguous, could be: [`qty`, `qty`].

## Handling Column Name Ambiguity in Spark Joins

Column name ambiguity is a frequent challenge when joining DataFrames that contain columns with identical names (especially columns *not* used as join keys). Attempting to select or operate on such an ambiguous column in the resulting joined DataFrame will lead to an `AnalysisException`.

**When Does It Typically Occur?**

* When joining DataFrames where non-key columns share the same name (e.g., both DataFrames have a 'description' or 'value' column).
* Even if the join key column has the same name (e.g., 'id') and you use `on="id"` (which correctly results in only one 'id' column), other identically named columns remain ambiguous.
* If using a Column expression like `left_df.id == right_df.id`, the resulting DataFrame contains *both* 'id' columns (one from left, one from right), so referring to 'id' by name is ambiguous unless handled.

**Strategies to Prevent or Resolve Ambiguity:**

1.  **Rename Columns Before Joining:** This is often the clearest and most recommended approach. Use `withColumnRenamed("old_name", "new_name")` on one or both DataFrames *before* performing the join to ensure all potentially conflicting column names are unique.
    ```python
    # Example: Both DFs have a 'value' column, join on 'id'
    right_renamed = right_df.withColumnRenamed("value", "right_value")
    joined_df = left_df.join(right_renamed, on="id")
    # Now 'value' (from left) and 'right_value' are distinct and usable
    joined_df.select("id", "value", "right_value").show()
    ```

2.  **Alias DataFrames Before Joining:** Use `df.alias("some_alias")`. This allows you to reference columns unambiguously using the alias prefix (e.g., `col("alias.column_name")`) both in the join condition and in subsequent operations like `select`.
    ```python
    from pyspark.sql.functions import col

    l = left_df.alias("l")
    r = right_df.alias("r")
    joined_df = l.join(r, l.id == r.id)
    # Select specific columns using aliases, renaming one 'value' column
    joined_df.select(
        col("l.id"),
        col("l.value"),
        col("r.value").alias("right_value")
    ).show()
    ```

3.  **Select Specific Columns After Join (Using Aliases or Careful Referencing):** If you used aliases (Method 2), you can select using `col("alias.column_name")`. If you joined without prior renaming or aliases and have ambiguous columns, selecting them by bare name (`joined_df.select("ambiguous_col")`) will fail. You would typically need to have used Method 1 or 2 beforehand. Passing a Column taken from one of the source DataFrames (`joined_df.select(left_df.ambiguous_col)`) does resolve the reference, as the next cell shows, but relying on this can be fragile: in a self-join, for example, both sides come from the same DataFrame, so such a reference cannot tell them apart. The most robust methods involve explicit renaming or aliasing *before* the join.

> Proactively renaming columns or using DataFrame aliases before joining is generally the safest and most readable way to handle potential column name ambiguities in Spark. Relying on selecting ambiguous columns after the join is often problematic.

In [5]:
# Resolve the duplicate `qty` by selecting Column objects taken from the source
# DataFrames instead of bare names: orderDF["qty"] is the orders-side quantity.
(
    orderDF.join(productDF, join_expr, "inner")
    .select(
        orderDF["order_id"],
        productDF["prod_name"],
        orderDF["unit_price"],
        orderDF["qty"],
    )
    .show()
)

+--------+-------------------+----------+---+
|order_id|          prod_name|unit_price|qty|
+--------+-------------------+----------+---+
|      03|       Scroll Mouse|       195|  1|
|      01|      Optical Mouse|       350|  1|
|      05|      Optical Mouse|       350|  1|
|      02|     Wireless Mouse|       450|  1|
|      01|  Wireless Keyboard|       580|  1|
|      02|16 GB Flash Storage|       220|  1|
|      01|32 GB Flash Storage|       320|  2|
|      04|64 GB Flash Storage|       410|  2|
+--------+-------------------+----------+---+



In [6]:
# Rename the product-side quantity before joining. After this, `qty` exists
# only on the orders side, so it can be selected by its bare name.
productRenamedDF = productDF.withColumnRenamed("qty", "prod_qty")

(
    orderDF.join(productRenamedDF, join_expr, "inner")
    .select("order_id", "prod_name", "unit_price", "qty")
    .show()
)

+--------+-------------------+----------+---+
|order_id|          prod_name|unit_price|qty|
+--------+-------------------+----------+---+
|      03|       Scroll Mouse|       195|  1|
|      01|      Optical Mouse|       350|  1|
|      05|      Optical Mouse|       350|  1|
|      02|     Wireless Mouse|       450|  1|
|      01|  Wireless Keyboard|       580|  1|
|      02|16 GB Flash Storage|       220|  1|
|      01|32 GB Flash Storage|       320|  2|
|      04|64 GB Flash Storage|       410|  2|
+--------+-------------------+----------+---+



In [7]:
# Joining on a Column expression keeps the `prod_id` column from both sides.
# Drop the product-side copy after the join so that `prod_id` can be selected
# by name without ambiguity.
(
    orderDF.join(productRenamedDF, join_expr, "inner")
    .drop(productRenamedDF["prod_id"])
    .select("order_id", "prod_id", "prod_name", "unit_price", "qty")
    .show()
)

+--------+-------+-------------------+----------+---+
|order_id|prod_id|          prod_name|unit_price|qty|
+--------+-------+-------------------+----------+---+
|      03|     01|       Scroll Mouse|       195|  1|
|      01|     02|      Optical Mouse|       350|  1|
|      05|     02|      Optical Mouse|       350|  1|
|      02|     03|     Wireless Mouse|       450|  1|
|      01|     04|  Wireless Keyboard|       580|  1|
|      02|     06|16 GB Flash Storage|       220|  1|
|      01|     07|32 GB Flash Storage|       320|  2|
|      04|     08|64 GB Flash Storage|       410|  2|
+--------+-------+-------------------+----------+---+



In [8]:
# Stop the Spark session created in the first cell now that the notebook is done.
spark.stop()