### Joins in Spark-SQL
- inner 
- left outer
- right outer
- full outer
- left semi
- left anti
- cross join


In [0]:
# Sample transactions: one row per transaction, linked to a customer by custid.
txns_df = spark.createDataFrame(
    [
        ("101", "C001", 500, "Tamil Nadu"),
        ("102", "C002", 1500, "Kerala"),
        ("103", "C003", 2000, "Karnataka"),
        ("104", "C004", 700, "Tamil Nadu"),
    ],
    schema=["txnid", "custid", "amount", "state"],
)

# Sample customers. C002 and C004 (used in txns_df) are absent here and C005
# has no transaction, so each join type below yields a different result.
cust_df = spark.createDataFrame(
    [
        ("C001", "Arjun"),
        ("C003", "Divya"),
        ("C005", "Ravi"),
    ],
    schema=["custid", "fname"],
)

# Print the column names and types of both DataFrames.
txns_df.printSchema()
cust_df.printSchema()


In [0]:
# Render both input DataFrames so the join outputs below can be compared
# against the original rows.
txns_df.display()
cust_df.display()


### Basic Join Syntax
- df1.join(df2, on="key_column", how="join_type")
- on → column(s) to join on: a column name (string), a list of column names, or a join expression such as `df1.key == df2.key`
- how → type of join: "inner", "left", "right", "full", "left_semi", "left_anti", "cross"

In [0]:
# Inner join: keeps only the rows whose custid exists in both DataFrames.
# With the sample data defined at the top of the notebook, that is C001 and C003.
(
    txns_df
    .join(cust_df, on="custid", how="inner")
    .display()
)

### Left Outer Join (Left Join)
- Returns all rows from the left DataFrame, and matched rows from the right DataFrame.
- If no match, right columns are null.

In [0]:
# Left join: every transaction is kept; fname is null for transactions whose
# custid has no row in cust_df.
(
    txns_df
    .join(cust_df, on="custid", how="left")
    .display()
)

### Right Outer Join (Right Join)
- Returns all rows from the right DataFrame, and matched rows from the left DataFrame.
- If no match, left columns are null.

In [0]:
# Right join: every customer is kept; the transaction columns are null for
# customers that have no row in txns_df.
(
    txns_df
    .join(cust_df, on="custid", how="right")
    .display()
)

### Full Outer Join (Full Join)
- Returns all rows from both DataFrames, with null where no match.

In [0]:
# Full outer join: rows from both sides are kept, with nulls filling the
# columns of whichever side has no matching custid.
(
    txns_df
    .join(cust_df, on="custid", how="full")
    .display()
)

### Left Semi Join
- Returns only rows from left DataFrame that have a match in right DataFrame.
- Columns come only from left DataFrame.

In [0]:
# Left semi join: acts as a filter on txns_df, keeping the transactions whose
# custid appears in cust_df. No cust_df columns are added to the output.
(
    txns_df
    .join(cust_df, on="custid", how="left_semi")
    .display()
)

### Left Anti Join
- Returns only rows from left DataFrame that do NOT have a match in right DataFrame.
- Columns come only from left DataFrame.

In [0]:
# Left anti join: the complement of the left semi join. Keeps the transactions
# whose custid does NOT appear in cust_df.
(
    txns_df
    .join(cust_df, on="custid", how="left_anti")
    .display()
)

### Cross Join (Cartesian Product)
- Returns all combinations of rows from left and right DataFrames.
- Can be very large if both DataFrames are big.

In [0]:
# Cross join: pairs every transaction with every customer; no join key is used.
# The output has len(txns_df) * len(cust_df) rows.
(
    txns_df
    .crossJoin(cust_df)
    .display()
)

In [0]:
# Rebuild txns_df with the customer key named cust_id (instead of custid), so
# the two DataFrames no longer share a key column name.
# NOTE: this overwrites the earlier txns_df; the cells above that join with
# on="custid" will not work against this version.
txns_df = spark.createDataFrame(
    [
        ("101", "C001", 500, "Tamil Nadu"),
        ("102", "C002", 1500, "Kerala"),
        ("103", "C003", 2000, "Karnataka"),
        ("104", "C004", 700, "Tamil Nadu"),
    ],
    schema=["txnid", "cust_id", "amount", "state"],
)

# Customer data, same rows and column names as before.
cust_df = spark.createDataFrame(
    [
        ("C001", "Arjun"),
        ("C003", "Divya"),
        ("C005", "Ravi"),
    ],
    schema=["custid", "fname"],
)


In [0]:
# The key columns have different names (cust_id vs custid), so the join
# condition is given as a column expression instead of a column name.
(
    txns_df
    .join(cust_df, on=txns_df["cust_id"] == cust_df["custid"], how="inner")
    .display()
)

### Join using multiple columns

In [0]:
# Join on a compound condition: combine several column comparisons with `&`.
# Each comparison needs its own parentheses because `&` binds tighter than `==`.
# NOTE: state == fname is used only to illustrate the syntax; no state value
# equals a first name in the sample data, so this join returns no rows.
joined_df = txns_df.join(
    cust_df,
    (txns_df.cust_id == cust_df.custid) & (txns_df.state == cust_df.fname),
    how="inner",
)

# Display in a separate statement: .display() returns None, so chaining it onto
# the assignment would leave joined_df bound to None instead of the DataFrame.
joined_df.display()

### Join with multiple tables

In [0]:
# Transactions, now carrying a payment-type code in the paymenttype column.
txns_df = spark.createDataFrame(
    [
        ("101", "C001", 500, "Tamil Nadu", "PT1"),
        ("102", "C002", 1500, "Kerala", "PT2"),
        ("103", "C003", 2000, "Karnataka", "PT3"),
        ("104", "C004", 700, "Tamil Nadu", "PT3"),
    ],
    schema=["txnid", "cust_id", "amount", "state", "paymenttype"],
)

# Customer lookup.
cust_df = spark.createDataFrame(
    [
        ("C001", "Arjun"),
        ("C003", "Divya"),
        ("C005", "Ravi"),
    ],
    schema=["custid", "fname"],
)

# Payment-type lookup: code -> description.
ptype_df = spark.createDataFrame(
    [
        ("PT1", "Cash"),
        ("PT2", "Debit"),
        ("PT3", "Credit"),
    ],
    schema=["paymenttype", "paymentdesc"],
)

# SQL equivalent:
#   select *
#   from txns_df
#   inner join cust_df on txns_df.cust_id = cust_df.custid
#   left outer join ptype_df on txns_df.paymenttype = ptype_df.paymenttype
#
# Joins are chained: the second join is applied to the output of the first.
# Both conditions are column expressions, so the key columns from both sides
# (including the two paymenttype columns) are kept in the result.
(
    txns_df
    .join(cust_df, on=txns_df["cust_id"] == cust_df["custid"], how="inner")
    .join(ptype_df, on=txns_df["paymenttype"] == ptype_df["paymenttype"], how="left")
    .display()
)


In [0]:
# Render the three input DataFrames used by the multi-table join for reference.
txns_df.display()
cust_df.display()
ptype_df.display()
