#### Create Spark Session

In [None]:
import os
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

# Spark configuration for this notebook. Only the Ivy cache location is active;
# the S3 / MinIO settings below are a commented-out template.
sparkConf = (
    SparkConf()
    # Resolve the Ivy cache under the current user's home directory rather than
    # one specific user's absolute path, so the notebook is portable.
    .set("spark.jars.ivy", os.path.join(os.path.expanduser("~"), ".ivy2"))
    #.set("spark.jars.packages","org.apache.hadoop:hadoop-aws:3.0.0")
    #.set("spark.executor.heartbeatInterval", "300000")
    #.set("spark.network.timeout", "400000")
    #.set("spark.hadoop.fs.s3a.endpoint", "http://minio.sandbox.net:9010")
    # Credentials must be supplied through the environment; never commit
    # literal keys to the notebook source.
    #.set("spark.hadoop.fs.s3a.access.key", os.environ["AWS_ACCESS_KEY_ID"])
    #.set("spark.hadoop.fs.s3a.secret.key", os.environ["AWS_SECRET_ACCESS_KEY"])
    #.set("spark.hadoop.fs.s3a.path.style.access", "true")
    #.set("spark.hadoop.fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider")
    #.set("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    #.set("spark.sql.warehouse.dir", "s3a://defaultfs/spark/warehouse")
    #.set("spark.hadoop.fs.defaultFS", "s3a://defaultfs/")
    #.set("spark.eventLog.enabled", "true")
    #.set("spark.eventLog.dir", "file:///apps/var/logs/spark-events")
)

# Create (or reuse) a local session that uses every available core.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName('spark-joins-notebook')
    .config(conf=sparkConf)
    .getOrCreate()
)

# Keep the notebook output readable: only ERROR-level Spark logs are shown.
spark.sparkContext.setLogLevel('ERROR')

# Display the session summary as the cell output.
spark

### Spark Configurations

#### Set

In [None]:
# Set the number of partitions Spark SQL uses when shuffling data for joins
# and aggregations in this session.
shuffle_partitions_key = "spark.sql.shuffle.partitions"
spark.conf.set(shuffle_partitions_key, "8")

#### Get

In [None]:
# Read the setting back from the session's runtime configuration and print it.
shuffle_partitions = spark.conf.get("spark.sql.shuffle.partitions")
print(shuffle_partitions)

### Read Data

In [None]:

# Employee column names, listed in the same order as the schema below.
employee_columns = [
    'emp_id', 'emp_name', 'emp_role', 'emp_manager',
    'emp_hiredate', 'emp_salary', 'emp_comm', 'emp_dept',
]

# Explicit schema for the employee CSV; every field is nullable.
employee_schema = StructType([
    StructField("emp_id", IntegerType(), True),
    StructField("emp_name", StringType(), True),
    StructField("emp_role", StringType(), True),
    StructField("emp_manager", StringType(), True),
    StructField("emp_hiredate", DateType(), True),
    StructField("emp_salary", IntegerType(), True),
    StructField("emp_comm", IntegerType(), True),
    StructField("emp_dept", IntegerType(), True),
])

# Load the employee file, treating its first line as a header row.
employee_df = (
    spark.read
    .schema(employee_schema)
    .option("header", True)
    .csv("file:///apps/sandbox/defaultfs/employee.csv")
)

# Optional inspection helpers (disabled):
# employee_df.printSchema()
# print(employee_df.rdd.getNumPartitions())

# Preview the loaded rows without truncating long values.
employee_df.show(truncate=False)

In [None]:

# Department column names, listed in the same order as the schema below.
dept_columns = ['dept_id', 'dept_name', 'dept_location']

# Explicit schema for the departments CSV; every field is nullable.
dept_schema = StructType([
    StructField("dept_id", IntegerType(), True),
    StructField("dept_name", StringType(), True),
    StructField("dept_location", StringType(), True),
])

# Load the departments file as CSV, treating its first line as a header row.
dept_df = spark.read.load(
    "file:///apps/sandbox/defaultfs/departments.csv",
    format="csv",
    schema=dept_schema,
    header=True,
)

# Optional inspection helper (disabled):
#dept_df.printSchema()

# Preview the loaded rows without truncating long values.
dept_df.show(truncate=False)

### Spark Joins

#### Inner Join
Returns only the rows from both dataframes whose join-key columns have matching values.

```python
df1.join(df2, df1['key'] == df2['key'], 'inner').show()
```

In [None]:
# Inner join: keep only the employee/department pairs where emp_dept equals dept_id.
inner_condition = employee_df["emp_dept"] == dept_df["dept_id"]
employee_df.join(dept_df, on=inner_condition, how="inner").show(truncate=False)

#### Left / Left Outer Join
Returns all the rows from the left dataframe and the matching rows from the right dataframe. If a left row has no match in the right dataframe, the right dataframe's columns are null for that row.

`Syntax`
```python
df1.join(df2, df1['key'] == df2['key'], 'left').show()
(OR)
df1.join(df2, df1['key'] == df2['key'], 'leftouter').show()
```

In [None]:
# Left outer join of employees (left) with departments (right) on emp_dept == dept_id.
(
    employee_df
    .join(dept_df, employee_df.emp_dept == dept_df.dept_id, how="leftouter")
    .show(truncate=False)
)

#### Right / Right Outer Join
```python
df1.join(df2, df1['key'] == df2['key'], 'right').show()
(OR)
df1.join(df2, df1['key'] == df2['key'], 'rightouter').show()
```

In [None]:
# Right join of employees (left) with departments (right) on emp_dept == dept_id.
right_condition = employee_df["emp_dept"] == dept_df["dept_id"]
employee_right_dept = employee_df.join(dept_df, on=right_condition, how="right")
employee_right_dept.show(truncate=False)

#### Outer / Full Join

```python
df1.join(df2, df1['key'] == df2['key'], 'outer').show()
(OR)
df1.join(df2, df1['key'] == df2['key'], 'full').show()
(OR)
df1.join(df2, df1['key'] == df2['key'], 'fullouter').show()
```

In [None]:
# Full outer join of employees with departments on emp_dept == dept_id.
(
    employee_df
    .join(dept_df, employee_df.emp_dept == dept_df.dept_id, how="outer")
    .show(truncate=False)
)

#### Cross Join

```python
df1.crossJoin(df2).show()
```

In [None]:
# Cross join: pair every employee row with every department row (no join key).
employee_x_dept = employee_df.crossJoin(dept_df)
employee_x_dept.show(truncate=False)

#### Left Anti Join
A left anti join in Spark SQL is a type of left join operation that returns only the rows from the left dataframe that do not have matching values in the right dataframe. It is used to find the rows in one dataframe that do not have corresponding values in another dataframe.

```python
df1.join(df2, df1['key'] == df2['key'], 'left_anti').show()
```

In [None]:
# Left anti join: employees whose emp_dept has no matching dept_id.
anti_condition = employee_df["emp_dept"] == dept_df["dept_id"]
employee_df.join(dept_df, on=anti_condition, how="left_anti").show(truncate=False)

#### Left Semi Join
A left semi join in Spark SQL is a type of join operation that returns only the rows from the left dataframe that have matching values in the right dataframe; the result contains only the left dataframe's columns. It is used to find the values in one dataframe that have corresponding values in another dataframe.

```python
df1.join(df2, df1['key'] == df2['key'], 'leftsemi').show()
```

In [None]:
# Left semi join: employees whose emp_dept matches some dept_id.
(
    employee_df
    .join(dept_df, employee_df.emp_dept == dept_df.dept_id, how="leftsemi")
    .show(truncate=False)
)

#### Self Join
A self join in Spark SQL is a join operation in which a dataframe is joined with itself. It is used to compare the values within a single dataframe and return the rows that match specified criteria.

```python
df.alias("df1").join(df.alias("df2"), col("df1.key") == col("df2.key")).show()
```

In [None]:
# Self join: join the employee dataframe with itself on emp_id.
# Each side is referenced through its alias so the join condition is
# unambiguous; comparing employee_df['emp_id'] with itself would ignore the
# aliases and build a predicate that is trivially true as written.
(
    employee_df.alias("employee_df1")
    .join(
        employee_df.alias("employee_df2"),
        col("employee_df1.emp_id") == col("employee_df2.emp_id"),
    )
    .show()
)