In [0]:
from pyspark.sql import SparkSession
sc = SparkSession.builder.getOrCreate()


In [0]:
%fs ls dbfs:///FileStore/tables


path,name,size,modificationTime
dbfs:/FileStore/tables/Detail-1.csv,Detail-1.csv,208476,1706073759000
dbfs:/FileStore/tables/Detail-2.csv,Detail-2.csv,208476,1706256531000
dbfs:/FileStore/tables/Detail.csv,Detail.csv,208476,1706073664000
dbfs:/FileStore/tables/Project_1-1.xlsx,Project_1-1.xlsx,422501,1706517974000
dbfs:/FileStore/tables/Project_1-2.xlsx,Project_1-2.xlsx,422501,1706604678000
dbfs:/FileStore/tables/Project_1-3.xlsx,Project_1-3.xlsx,422501,1706863787000
dbfs:/FileStore/tables/Project_1.xlsx,Project_1.xlsx,422501,1706504988000
dbfs:/FileStore/tables/Project_1__1_.xlsx,Project_1__1_.xlsx,419016,1706605058000
dbfs:/FileStore/tables/contactinfo.txt,contactinfo.txt,49969,1706076089000
dbfs:/FileStore/tables/csv/,csv/,0,0


In [0]:
df_csv = spark.read.format("csv").load("dbfs:///FileStore/tables/csv/batch.csv")

In [0]:
df_csv = spark.read.format("csv").option("header",True).load("dbfs:///FileStore/tables/csv/batch.csv")

In [0]:
type(df_csv)

Out[294]: pyspark.sql.dataframe.DataFrame

In [0]:
df_csv.show()

+---+-----+----------+----+------+----------+
| id| name|       dob| age|salary|department|
+---+-----+----------+----+------+----------+
|  1| John|1992-05-12|  30| 70000|        IT|
|  2|Alice|1997-02-28|  25| 60000|        HR|
|  3|  Bob|      null|null| 80000|        IT|
|  4|Emily|1994-11-22|  28| 65000|   Finance|
+---+-----+----------+----+------+----------+



In [0]:
df_csv.printSchema()

root
 |-- id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- dob: string (nullable = true)
 |-- age: string (nullable = true)
 |-- salary: string (nullable = true)
 |-- department: string (nullable = true)



In [0]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DateType

In [0]:
schema = StructType([
    StructField("id", IntegerType()),
    StructField("name", StringType()),
    StructField("dob", DateType()),
    StructField("age",IntegerType()),
    StructField("salary", IntegerType()),
    StructField("department", StringType()),
])

In [0]:
df_csv=spark.read.format("csv").schema(schema).option("header",True).load("dbfs:/FileStore/tables/csv/batch.csv")

In [0]:
df_csv.printSchema()

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- dob: date (nullable = true)
 |-- age: integer (nullable = true)
 |-- salary: integer (nullable = true)
 |-- department: string (nullable = true)



In [0]:
df_json = spark.read.format("json").load("dbfs:/FileStore/tables/json")

In [0]:
df_json.show()

+----+----------+----------+---+------+------+
| age|department|       dob| id|  name|salary|
+----+----------+----------+---+------+------+
|  30|        IT|1992-05-12|  1|  John| 70000|
|  25|        HR|1997-02-28|  2| Alice| 60000|
|null|        IT|      null|  3|   Bob| 80000|
|  28|   Finance|1994-11-22|  4| Emily| 65000|
|  41|        HR|1981-12-18|  5| David| 90000|
|  33|   Finance|1989-07-05|  6| Susan| 75000|
|  46|        IT|1976-03-15|  7|  Mike| 95000|
|  30|        IT|1992-05-12|  1|  John| 70000|
|  25|        HR|1997-02-28|  2| Alice| 60000|
|null|        IT|      null|  3|   Bob| 80000|
|  28|   Finance|1994-11-22|  4| Emily| 65000|
|  41|        HR|1981-12-18|  5| David| 90000|
|  33|   Finance|1989-07-05|  6| Susan| 75000|
|  46|        IT|1976-03-15|  7|  Mike| 95000|
|  30|   Finance|1992-06-30| 10|Sophie| 62000|
|  30|   Finance|1992-06-30| 10|Sophie| 62000|
|  25|   Finance|1997-02-28|  2| Alice| 90000|
|  25|   Finance|1997-02-28|  2| Alice| 90000|
|  28|   Fina

In [0]:
df_json.printSchema() 

root
 |-- age: long (nullable = true)
 |-- department: string (nullable = true)
 |-- dob: string (nullable = true)
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- salary: long (nullable = true)



In [0]:
#Union 
df_csv.printSchema()

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- dob: date (nullable = true)
 |-- age: integer (nullable = true)
 |-- salary: integer (nullable = true)
 |-- department: string (nullable = true)



In [0]:
df_csv.printSchema()

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- dob: date (nullable = true)
 |-- age: integer (nullable = true)
 |-- salary: integer (nullable = true)
 |-- department: string (nullable = true)



In [0]:
print(df_csv.columns,df_json.columns)

['id', 'name', 'dob', 'age', 'salary', 'department'] ['age', 'department', 'dob', 'id', 'name', 'salary']


In [0]:
# df.show()

In [0]:
df_json=df_json.select(df_csv.columns)

In [0]:
df_json=df_json.select(df_csv.columns)

In [0]:
df_json=spark.read.format("json").schema(schema).load("dbfs:/FileStore/tables/json").orderBy("id")

In [0]:
df_json.printSchema()

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- dob: date (nullable = true)
 |-- age: integer (nullable = true)
 |-- salary: integer (nullable = true)
 |-- department: string (nullable = true)



In [0]:
df=df_json.union(df_csv)

In [0]:
df.show()

+---+-----+----------+----+------+----------+
| id| name|       dob| age|salary|department|
+---+-----+----------+----+------+----------+
|  1| John|1992-05-12|  30| 70000|        IT|
|  1| John|1992-05-12|  30| 70000|        IT|
|  1| John|1992-05-12|  30| 70000|        IT|
|  1| John|1992-05-12|  30| 70000|        IT|
|  2|Alice|1997-02-28|  25| 60000|        HR|
|  2|Alice|1997-02-28|  25| 60000|        HR|
|  2|Alice|1997-02-28|  25| 90000|   Finance|
|  2|Alice|1997-02-28|  25| 90000|   Finance|
|  3|  Bob|      null|null| 80000|        IT|
|  3|  Bob|      null|null| 80000|        IT|
|  4|Emily|1994-11-22|  28| 65000|   Finance|
|  4|Emily|1994-11-22|  28| 65000|   Finance|
|  4|Emily|1994-11-22|  28| 70000|   Finance|
|  4|Emily|1994-11-22|  28| 70000|   Finance|
|  5|David|1981-12-18|  41| 90000|        HR|
|  5|David|1981-12-18|  41| 90000|        HR|
|  6|Susan|1989-07-05|  33| 75000|   Finance|
|  6|Susan|1989-07-05|  33| 75000|   Finance|
|  7| Mike|1976-03-15|  46| 95000|

In [0]:
df.display()

id,name,dob,age,salary,department
1,John,1992-05-12,30.0,70000,IT
1,John,1992-05-12,30.0,70000,IT
1,John,1992-05-12,30.0,70000,IT
1,John,1992-05-12,30.0,70000,IT
2,Alice,1997-02-28,25.0,60000,HR
2,Alice,1997-02-28,25.0,60000,HR
2,Alice,1997-02-28,25.0,90000,Finance
2,Alice,1997-02-28,25.0,90000,Finance
3,Bob,,,80000,IT
3,Bob,,,80000,IT


In [0]:
df=df.dropDuplicates()

In [0]:
df.show()

+---+------+----------+----+------+----------+
| id|  name|       dob| age|salary|department|
+---+------+----------+----+------+----------+
|  5| David|1981-12-18|  41| 90000|        HR|
|  3|   Bob|      null|null| 80000|        IT|
|  4| Emily|1994-11-22|  28| 65000|   Finance|
|  2| Alice|1997-02-28|  25| 60000|        HR|
|  1|  John|1992-05-12|  30| 70000|        IT|
|  6| Susan|1989-07-05|  33| 75000|   Finance|
|  7|  Mike|1976-03-15|  46| 95000|        IT|
| 10|Sophie|1992-06-30|  30| 62000|   Finance|
|  2| Alice|1997-02-28|  25| 90000|   Finance|
|  4| Emily|1994-11-22|  28| 70000|   Finance|
|  9| James|1983-10-14|  39| 87000|        IT|
|  8|  Lisa|1995-08-20|  27| 58000|        HR|
+---+------+----------+----+------+----------+



In [0]:
df_json.show()

+---+-----+----------+----+------+----------+
| id| name|       dob| age|salary|department|
+---+-----+----------+----+------+----------+
|  1| John|1992-05-12|  30| 70000|        IT|
|  1| John|1992-05-12|  30| 70000|        IT|
|  1| John|1992-05-12|  30| 70000|        IT|
|  1| John|1992-05-12|  30| 70000|        IT|
|  2|Alice|1997-02-28|  25| 90000|   Finance|
|  2|Alice|1997-02-28|  25| 60000|        HR|
|  2|Alice|1997-02-28|  25| 90000|   Finance|
|  2|Alice|1997-02-28|  25| 60000|        HR|
|  3|  Bob|      null|null| 80000|        IT|
|  3|  Bob|      null|null| 80000|        IT|
|  4|Emily|1994-11-22|  28| 70000|   Finance|
|  4|Emily|1994-11-22|  28| 65000|   Finance|
|  4|Emily|1994-11-22|  28| 70000|   Finance|
|  4|Emily|1994-11-22|  28| 65000|   Finance|
|  5|David|1981-12-18|  41| 90000|        HR|
|  5|David|1981-12-18|  41| 90000|        HR|
|  6|Susan|1989-07-05|  33| 75000|   Finance|
|  6|Susan|1989-07-05|  33| 75000|   Finance|
|  7| Mike|1976-03-15|  46| 95000|

In [0]:
#SELECT
from pyspark.sql import functions as F
# from pyspark.sql.functions import *
# from pyspark.sql.functions import col, max min

In [0]:
df.select(
    "salary"
).show()

+------+
|salary|
+------+
| 90000|
| 80000|
| 65000|
| 60000|
| 70000|
| 75000|
| 95000|
| 62000|
| 90000|
| 70000|
| 87000|
| 58000|
+------+



In [0]:
df.select(
    "salary","age"
).show()


+------+----+
|salary| age|
+------+----+
| 90000|  41|
| 80000|null|
| 65000|  28|
| 60000|  25|
| 70000|  30|
| 75000|  33|
| 95000|  46|
| 62000|  30|
| 90000|  25|
| 70000|  28|
| 87000|  39|
| 58000|  27|
+------+----+



In [0]:
df.select(
    df.salary + .05 * df.salary,
    F.year(F.current_timestamp()) - F.year("dob"),
    F.year(F.current_timestamp()) - F.year(F.col("dob"))
).show

Out[321]: <bound method DataFrame.show of DataFrame[(salary + (salary * 0.05)): double, (year(current_timestamp()) - year(dob)): int, (year(current_timestamp()) - year(dob)): int]>

In [0]:
#Alias
df.select(
    "*",
    df.salary + .05 * df.salary,
    (df.salary + .05 * df.salary),
    F.expr("salary + .05 * salary").alias("salary_raise"),
    (
        F.year(F.current_timestamp()) - F.year(F.col("dob"))
    ).alias("age")
).show()

+---+------+----------+----+------+----------+--------------------------+--------------------------+------------+----+
| id|  name|       dob| age|salary|department|(salary + (salary * 0.05))|(salary + (salary * 0.05))|salary_raise| age|
+---+------+----------+----+------+----------+--------------------------+--------------------------+------------+----+
|  5| David|1981-12-18|  41| 90000|        HR|                   94500.0|                   94500.0|    94500.00|  43|
|  3|   Bob|      null|null| 80000|        IT|                   84000.0|                   84000.0|    84000.00|null|
|  4| Emily|1994-11-22|  28| 65000|   Finance|                   68250.0|                   68250.0|    68250.00|  30|
|  2| Alice|1997-02-28|  25| 60000|        HR|                   63000.0|                   63000.0|    63000.00|  27|
|  1|  John|1992-05-12|  30| 70000|        IT|                   73500.0|                   73500.0|    73500.00|  32|
|  6| Susan|1989-07-05|  33| 75000|   Finance|  

In [0]:
df.select(
    ["salary","age"]
).show()

df.select(
    df.salary,
    df.age
).show()

df.select(
    F.col("salary"),
    F.col("age")
)

+------+----+
|salary| age|
+------+----+
| 90000|  41|
| 80000|null|
| 65000|  28|
| 60000|  25|
| 70000|  30|
| 75000|  33|
| 95000|  46|
| 62000|  30|
| 90000|  25|
| 70000|  28|
| 87000|  39|
| 58000|  27|
+------+----+

+------+----+
|salary| age|
+------+----+
| 90000|  41|
| 80000|null|
| 65000|  28|
| 60000|  25|
| 70000|  30|
| 75000|  33|
| 95000|  46|
| 62000|  30|
| 90000|  25|
| 70000|  28|
| 87000|  39|
| 58000|  27|
+------+----+

Out[323]: DataFrame[salary: int, age: int]

In [0]:
#WithColumn
df = df.withColumn(
    "salary_raise",
    (F.col("salary") + .05 * F.col("salary"))
)

df = df.withColumn(
    "salary_raise_10_percent",
    (F.col("salary") + .1 * F.col("salary"))
)



In [0]:
# or 


df = df.withColumn(
    "salary_raise",
    (F.col("salary") + .05 * F.col("salary"))
).withColumn(
    "salary_raise_10_percent",
    (F.col("salary") + .1 * F.col("salary"))
)

In [0]:
df = df.withColumns(
    {
        "salary_raise": F.col("salary") + .05 * F.col("salary"), 
        "age": F.year(F.current_timestamp()) - F.year(F.col("dob"))
    }
)

In [0]:
df.show()

+---+------+----------+----+------+----------+------------+-----------------------+
| id|  name|       dob| age|salary|department|salary_raise|salary_raise_10_percent|
+---+------+----------+----+------+----------+------------+-----------------------+
|  5| David|1981-12-18|  43| 90000|        HR|     94500.0|                99000.0|
|  3|   Bob|      null|null| 80000|        IT|     84000.0|                88000.0|
|  4| Emily|1994-11-22|  30| 65000|   Finance|     68250.0|                71500.0|
|  2| Alice|1997-02-28|  27| 60000|        HR|     63000.0|                66000.0|
|  1|  John|1992-05-12|  32| 70000|        IT|     73500.0|                77000.0|
|  6| Susan|1989-07-05|  35| 75000|   Finance|     78750.0|                82500.0|
|  7|  Mike|1976-03-15|  48| 95000|        IT|     99750.0|               104500.0|
| 10|Sophie|1992-06-30|  32| 62000|   Finance|     65100.0|                68200.0|
|  2| Alice|1997-02-28|  27| 90000|   Finance|     94500.0|                9

In [0]:
#filter
df.filter(
    F.col("salary_raise") >= 75000
).show()

df.where(
    F.col("salary_raise") >= 75000
).show()

+---+-----+----------+----+------+----------+------------+-----------------------+
| id| name|       dob| age|salary|department|salary_raise|salary_raise_10_percent|
+---+-----+----------+----+------+----------+------------+-----------------------+
|  5|David|1981-12-18|  43| 90000|        HR|     94500.0|                99000.0|
|  3|  Bob|      null|null| 80000|        IT|     84000.0|                88000.0|
|  6|Susan|1989-07-05|  35| 75000|   Finance|     78750.0|                82500.0|
|  7| Mike|1976-03-15|  48| 95000|        IT|     99750.0|               104500.0|
|  2|Alice|1997-02-28|  27| 90000|   Finance|     94500.0|                99000.0|
|  9|James|1983-10-14|  41| 87000|        IT|     91350.0|                95700.0|
+---+-----+----------+----+------+----------+------------+-----------------------+

+---+-----+----------+----+------+----------+------------+-----------------------+
| id| name|       dob| age|salary|department|salary_raise|salary_raise_10_percent|
+--

In [0]:
df = df.withColumn(
    "age_group",
    F.when(F.col("age") <= 20, "Upto 20")
    .when((F.col("age") > 20) & (F.col("age") <= 30), "21 to 30")
    .when((F.col("age") > 30) & (F.col("age") <= 40), "31 to 40")
    .otherwise("More than 40")
    )

In [0]:
df.show()

+---+------+----------+----+------+----------+------------+-----------------------+------------+
| id|  name|       dob| age|salary|department|salary_raise|salary_raise_10_percent|   age_group|
+---+------+----------+----+------+----------+------------+-----------------------+------------+
|  5| David|1981-12-18|  43| 90000|        HR|     94500.0|                99000.0|More than 40|
|  3|   Bob|      null|null| 80000|        IT|     84000.0|                88000.0|More than 40|
|  4| Emily|1994-11-22|  30| 65000|   Finance|     68250.0|                71500.0|    21 to 30|
|  2| Alice|1997-02-28|  27| 60000|        HR|     63000.0|                66000.0|    21 to 30|
|  1|  John|1992-05-12|  32| 70000|        IT|     73500.0|                77000.0|    31 to 40|
|  6| Susan|1989-07-05|  35| 75000|   Finance|     78750.0|                82500.0|    31 to 40|
|  7|  Mike|1976-03-15|  48| 95000|        IT|     99750.0|               104500.0|More than 40|
| 10|Sophie|1992-06-30|  32| 6

In [0]:
df = df.withColumn(
    "company",
    F.lit("Abacus Insights")
)

In [0]:
df.show()

+---+------+----------+----+------+----------+------------+-----------------------+------------+---------------+
| id|  name|       dob| age|salary|department|salary_raise|salary_raise_10_percent|   age_group|        company|
+---+------+----------+----+------+----------+------------+-----------------------+------------+---------------+
|  5| David|1981-12-18|  43| 90000|        HR|     94500.0|                99000.0|More than 40|Abacus Insights|
|  3|   Bob|      null|null| 80000|        IT|     84000.0|                88000.0|More than 40|Abacus Insights|
|  4| Emily|1994-11-22|  30| 65000|   Finance|     68250.0|                71500.0|    21 to 30|Abacus Insights|
|  2| Alice|1997-02-28|  27| 60000|        HR|     63000.0|                66000.0|    21 to 30|Abacus Insights|
|  1|  John|1992-05-12|  32| 70000|        IT|     73500.0|                77000.0|    31 to 40|Abacus Insights|
|  6| Susan|1989-07-05|  35| 75000|   Finance|     78750.0|                82500.0|    31 to 40|

In [0]:
#nulls
df =  df.withColumns(
    {
        "age": F.coalesce(
            F.year(F.current_timestamp()) - F.year(F.col("dob")),
            F.lit(-1),
        ),
        "has_dob_1" : ~F.isnull("dob"),
        "has_dob_2" : F.col("dob").isNotNull()
    }
)

In [0]:
df.show()

+---+------+----------+---+------+----------+------------+-----------------------+------------+---------------+---------+---------+
| id|  name|       dob|age|salary|department|salary_raise|salary_raise_10_percent|   age_group|        company|has_dob_1|has_dob_2|
+---+------+----------+---+------+----------+------------+-----------------------+------------+---------------+---------+---------+
|  5| David|1981-12-18| 43| 90000|        HR|     94500.0|                99000.0|More than 40|Abacus Insights|     true|     true|
|  3|   Bob|      null| -1| 80000|        IT|     84000.0|                88000.0|More than 40|Abacus Insights|    false|    false|
|  4| Emily|1994-11-22| 30| 65000|   Finance|     68250.0|                71500.0|    21 to 30|Abacus Insights|     true|     true|
|  2| Alice|1997-02-28| 27| 60000|        HR|     63000.0|                66000.0|    21 to 30|Abacus Insights|     true|     true|
|  1|  John|1992-05-12| 32| 70000|        IT|     73500.0|                77

In [0]:
df = df.drop("company", "has_dob_1", "has_dob_2" )

In [0]:
df.show()

+---+------+----------+---+------+----------+------------+-----------------------+------------+
| id|  name|       dob|age|salary|department|salary_raise|salary_raise_10_percent|   age_group|
+---+------+----------+---+------+----------+------------+-----------------------+------------+
|  5| David|1981-12-18| 43| 90000|        HR|     94500.0|                99000.0|More than 40|
|  3|   Bob|      null| -1| 80000|        IT|     84000.0|                88000.0|More than 40|
|  4| Emily|1994-11-22| 30| 65000|   Finance|     68250.0|                71500.0|    21 to 30|
|  2| Alice|1997-02-28| 27| 60000|        HR|     63000.0|                66000.0|    21 to 30|
|  1|  John|1992-05-12| 32| 70000|        IT|     73500.0|                77000.0|    31 to 40|
|  6| Susan|1989-07-05| 35| 75000|   Finance|     78750.0|                82500.0|    31 to 40|
|  7|  Mike|1976-03-15| 48| 95000|        IT|     99750.0|               104500.0|More than 40|
| 10|Sophie|1992-06-30| 32| 62000|   Fin

In [0]:
#WithColumnRenamed
df = df.withColumnRenamed("name", "first_name")

In [0]:
df.show()

+---+----------+----------+---+------+----------+------------+-----------------------+------------+
| id|first_name|       dob|age|salary|department|salary_raise|salary_raise_10_percent|   age_group|
+---+----------+----------+---+------+----------+------------+-----------------------+------------+
|  5|     David|1981-12-18| 43| 90000|        HR|     94500.0|                99000.0|More than 40|
|  3|       Bob|      null| -1| 80000|        IT|     84000.0|                88000.0|More than 40|
|  4|     Emily|1994-11-22| 30| 65000|   Finance|     68250.0|                71500.0|    21 to 30|
|  2|     Alice|1997-02-28| 27| 60000|        HR|     63000.0|                66000.0|    21 to 30|
|  1|      John|1992-05-12| 32| 70000|        IT|     73500.0|                77000.0|    31 to 40|
|  6|     Susan|1989-07-05| 35| 75000|   Finance|     78750.0|                82500.0|    31 to 40|
|  7|      Mike|1976-03-15| 48| 95000|        IT|     99750.0|               104500.0|More than 40|


In [0]:
 from pyspark.sql import Window

In [0]:
window = Window.partitionBy("age_group")

df.withColumn(
    "min_age_in_group",
    F.min("age").over(window)
).show()

+---+----------+----------+---+------+----------+------------+-----------------------+------------+----------------+
| id|first_name|       dob|age|salary|department|salary_raise|salary_raise_10_percent|   age_group|min_age_in_group|
+---+----------+----------+---+------+----------+------------+-----------------------+------------+----------------+
|  4|     Emily|1994-11-22| 30| 65000|   Finance|     68250.0|                71500.0|    21 to 30|              27|
|  2|     Alice|1997-02-28| 27| 60000|        HR|     63000.0|                66000.0|    21 to 30|              27|
|  2|     Alice|1997-02-28| 27| 90000|   Finance|     94500.0|                99000.0|    21 to 30|              27|
|  4|     Emily|1994-11-22| 30| 70000|   Finance|     73500.0|                77000.0|    21 to 30|              27|
|  8|      Lisa|1995-08-20| 29| 58000|        HR|     60900.0|                63800.0|    21 to 30|              27|
|  1|      John|1992-05-12| 32| 70000|        IT|     73500.0|  

#for SQL lowercase 

In [0]:
#Join
ref_df = spark.read.format("parquet").load("dbfs:///FileStore/tables/parquet")


In [0]:
ref_df.show()

+----------+-------+--------+
|department|manager|    lead|
+----------+-------+--------+
|   Finance|  Megan|   Molly|
|        HR|   Brad|   Brian|
|        IT|  Chris|Chandler|
|  Delivery|   Leon|  Louise|
|   Finance|  Megan|   Molly|
|        HR|   Brad|   Brian|
|        IT|  Chris|Chandler|
|  Delivery|   Leon|  Louise|
+----------+-------+--------+



In [0]:
df.join(
    ref_df,
    "department",
    "left",
).show()

+----------+---+----------+----------+---+------+------------+-----------------------+------------+-------+--------+
|department| id|first_name|       dob|age|salary|salary_raise|salary_raise_10_percent|   age_group|manager|    lead|
+----------+---+----------+----------+---+------+------------+-----------------------+------------+-------+--------+
|        HR|  5|     David|1981-12-18| 43| 90000|     94500.0|                99000.0|More than 40|   Brad|   Brian|
|        HR|  5|     David|1981-12-18| 43| 90000|     94500.0|                99000.0|More than 40|   Brad|   Brian|
|        IT|  3|       Bob|      null| -1| 80000|     84000.0|                88000.0|More than 40|  Chris|Chandler|
|        IT|  3|       Bob|      null| -1| 80000|     84000.0|                88000.0|More than 40|  Chris|Chandler|
|   Finance|  4|     Emily|1994-11-22| 30| 65000|     68250.0|                71500.0|    21 to 30|  Megan|   Molly|
|   Finance|  4|     Emily|1994-11-22| 30| 65000|     68250.0|  

In [0]:
#method2
df.join(
    ref_df,
    df.department==ref_df.department,
    "left"
).show()

+---+----------+----------+---+------+----------+------------+-----------------------+------------+----------+-------+--------+
| id|first_name|       dob|age|salary|department|salary_raise|salary_raise_10_percent|   age_group|department|manager|    lead|
+---+----------+----------+---+------+----------+------------+-----------------------+------------+----------+-------+--------+
|  5|     David|1981-12-18| 43| 90000|        HR|     94500.0|                99000.0|More than 40|        HR|   Brad|   Brian|
|  5|     David|1981-12-18| 43| 90000|        HR|     94500.0|                99000.0|More than 40|        HR|   Brad|   Brian|
|  3|       Bob|      null| -1| 80000|        IT|     84000.0|                88000.0|More than 40|        IT|  Chris|Chandler|
|  3|       Bob|      null| -1| 80000|        IT|     84000.0|                88000.0|More than 40|        IT|  Chris|Chandler|
|  4|     Emily|1994-11-22| 30| 65000|   Finance|     68250.0|                71500.0|    21 to 30|   Fi

In [0]:
df= df.alias("main")
ref_df= ref_df.alias("refrence")

In [0]:
df.join(
    ref_df,
    F.col("main.department") == F.col("refrence.department"),
    "left",
).show()

+---+----------+----------+---+------+----------+------------+-----------------------+------------+----------+-------+--------+
| id|first_name|       dob|age|salary|department|salary_raise|salary_raise_10_percent|   age_group|department|manager|    lead|
+---+----------+----------+---+------+----------+------------+-----------------------+------------+----------+-------+--------+
|  5|     David|1981-12-18| 43| 90000|        HR|     94500.0|                99000.0|More than 40|        HR|   Brad|   Brian|
|  5|     David|1981-12-18| 43| 90000|        HR|     94500.0|                99000.0|More than 40|        HR|   Brad|   Brian|
|  3|       Bob|      null| -1| 80000|        IT|     84000.0|                88000.0|More than 40|        IT|  Chris|Chandler|
|  3|       Bob|      null| -1| 80000|        IT|     84000.0|                88000.0|More than 40|        IT|  Chris|Chandler|
|  4|     Emily|1994-11-22| 30| 65000|   Finance|     68250.0|                71500.0|    21 to 30|   Fi

In [0]:
df= df.join(
    ref_df,
    F.col("main.department") == F.col("refrence.department"),
    "left",
)

In [0]:
df.show()

+---+----------+----------+---+------+----------+------------+-----------------------+------------+----------+-------+--------+
| id|first_name|       dob|age|salary|department|salary_raise|salary_raise_10_percent|   age_group|department|manager|    lead|
+---+----------+----------+---+------+----------+------------+-----------------------+------------+----------+-------+--------+
|  5|     David|1981-12-18| 43| 90000|        HR|     94500.0|                99000.0|More than 40|        HR|   Brad|   Brian|
|  5|     David|1981-12-18| 43| 90000|        HR|     94500.0|                99000.0|More than 40|        HR|   Brad|   Brian|
|  3|       Bob|      null| -1| 80000|        IT|     84000.0|                88000.0|More than 40|        IT|  Chris|Chandler|
|  3|       Bob|      null| -1| 80000|        IT|     84000.0|                88000.0|More than 40|        IT|  Chris|Chandler|
|  4|     Emily|1994-11-22| 30| 65000|   Finance|     68250.0|                71500.0|    21 to 30|   Fi

In [0]:
#Load(save)data
df.write.mode("overwrite").format("csv").save("dbfs:///FileStore/tables/final")

In [0]:
%fs ls dbfs:///FileStore/tables/final

path,name,size,modificationTime
dbfs:/FileStore/tables/final/_SUCCESS,_SUCCESS,0,1707804453000
dbfs:/FileStore/tables/final/_committed_1373865280067968330,_committed_1373865280067968330,114,1707804453000
dbfs:/FileStore/tables/final/_started_1373865280067968330,_started_1373865280067968330,0,1707804453000
dbfs:/FileStore/tables/final/part-00000-tid-1373865280067968330-844c7bd5-aebf-4fe0-8a42-7eb0e05160e9-1352-1-c000.csv,part-00000-tid-1373865280067968330-844c7bd5-aebf-4fe0-8a42-7eb0e05160e9-1352-1-c000.csv,701,1707804453000


In [0]:
df.join(
    ref_def
    F.col("main.department")==F.col("refrence.department"),"left",
).select(
   
).show()

[0;36m  File [0;32m<command-2845221430951445>:3[0;36m[0m
[0;31m    F.col("main.department")==F.col("refrence.department"),"left",[0m
[0m    ^[0m
[0;31mSyntaxError[0m[0;31m:[0m invalid syntax


In [0]:
df.alias("main").join(
    ref_df.alias("refrence"),
    F.col("main.department") == F.col("refrence.department"),
    "left",
).select(
    F.col("main.*"),
    F.col("refrence.manager"),
    F.col("lead")
).show()

+---+----------+----------+---+------+----------+------------+-----------------------+------------+-------+--------+
| id|first_name|       dob|age|salary|department|salary_raise|salary_raise_10_percent|   age_group|manager|    lead|
+---+----------+----------+---+------+----------+------------+-----------------------+------------+-------+--------+
|  5|     David|1981-12-18| 43| 90000|        HR|     94500.0|                99000.0|More than 40|   Brad|   Brian|
|  5|     David|1981-12-18| 43| 90000|        HR|     94500.0|                99000.0|More than 40|   Brad|   Brian|
|  3|       Bob|      null| -1| 80000|        IT|     84000.0|                88000.0|More than 40|  Chris|Chandler|
|  3|       Bob|      null| -1| 80000|        IT|     84000.0|                88000.0|More than 40|  Chris|Chandler|
|  4|     Emily|1994-11-22| 30| 65000|   Finance|     68250.0|                71500.0|    21 to 30|  Megan|   Molly|
|  4|     Emily|1994-11-22| 30| 65000|   Finance|     68250.0|  

In [0]:
#df.write.mode
df.write.mode("overwrite").format("delta").saveAsTable("final")

In [0]:
spark.sql("show tables").show()

+--------+---------+-----------+
|database|tableName|isTemporary|
+--------+---------+-----------+
| default|    final|      false|
+--------+---------+-----------+



In [0]:
%sql
select * from final

id,first_name,dob,age,salary,department,salary_raise,salary_raise_10_percent,age_group
5,David,1981-12-18,43,90000,HR,94500.0,99000.0,More than 40
3,Bob,,-1,80000,IT,84000.0,88000.0,More than 40
4,Emily,1994-11-22,30,65000,Finance,68250.0,71500.0,21 to 30
2,Alice,1997-02-28,27,60000,HR,63000.0,66000.0,21 to 30
1,John,1992-05-12,32,70000,IT,73500.0,77000.0,31 to 40
6,Susan,1989-07-05,35,75000,Finance,78750.0,82500.0,31 to 40
7,Mike,1976-03-15,48,95000,IT,99750.0,104500.0,More than 40
10,Sophie,1992-06-30,32,62000,Finance,65100.0,68200.0,31 to 40
2,Alice,1997-02-28,27,90000,Finance,94500.0,99000.0,21 to 30
4,Emily,1994-11-22,30,70000,Finance,73500.0,77000.0,21 to 30


In [0]:
%sql 
desc table default.final;

col_name,data_type,comment
id,int,
first_name,string,
dob,date,
age,int,
salary,int,
department,string,
salary_raise,double,
salary_raise_10_percent,double,
age_group,string,


In [0]:
%sql
select * from default.final

id,first_name,dob,age,salary,department,salary_raise,salary_raise_10_percent,age_group
5,David,1981-12-18,43,90000,HR,94500.0,99000.0,More than 40
3,Bob,,-1,80000,IT,84000.0,88000.0,More than 40
4,Emily,1994-11-22,30,65000,Finance,68250.0,71500.0,21 to 30
2,Alice,1997-02-28,27,60000,HR,63000.0,66000.0,21 to 30
1,John,1992-05-12,32,70000,IT,73500.0,77000.0,31 to 40
6,Susan,1989-07-05,35,75000,Finance,78750.0,82500.0,31 to 40
7,Mike,1976-03-15,48,95000,IT,99750.0,104500.0,More than 40
10,Sophie,1992-06-30,32,62000,Finance,65100.0,68200.0,31 to 40
2,Alice,1997-02-28,27,90000,Finance,94500.0,99000.0,21 to 30
4,Emily,1994-11-22,30,70000,Finance,73500.0,77000.0,21 to 30


In [0]:
table_df = spark.table("final")

In [0]:
table_df.show()

+---+----------+----------+---+------+----------+------------+-----------------------+------------+
| id|first_name|       dob|age|salary|department|salary_raise|salary_raise_10_percent|   age_group|
+---+----------+----------+---+------+----------+------------+-----------------------+------------+
|  5|     David|1981-12-18| 43| 90000|        HR|     94500.0|                99000.0|More than 40|
|  3|       Bob|      null| -1| 80000|        IT|     84000.0|                88000.0|More than 40|
|  4|     Emily|1994-11-22| 30| 65000|   Finance|     68250.0|                71500.0|    21 to 30|
|  2|     Alice|1997-02-28| 27| 60000|        HR|     63000.0|                66000.0|    21 to 30|
|  1|      John|1992-05-12| 32| 70000|        IT|     73500.0|                77000.0|    31 to 40|
|  6|     Susan|1989-07-05| 35| 75000|   Finance|     78750.0|                82500.0|    31 to 40|
|  7|      Mike|1976-03-15| 48| 95000|        IT|     99750.0|               104500.0|More than 40|


In [0]:
display(table_df)

id,first_name,dob,age,salary,department,salary_raise,salary_raise_10_percent,age_group
5,David,1981-12-18,43,90000,HR,94500.0,99000.0,More than 40
3,Bob,,-1,80000,IT,84000.0,88000.0,More than 40
4,Emily,1994-11-22,30,65000,Finance,68250.0,71500.0,21 to 30
2,Alice,1997-02-28,27,60000,HR,63000.0,66000.0,21 to 30
1,John,1992-05-12,32,70000,IT,73500.0,77000.0,31 to 40
6,Susan,1989-07-05,35,75000,Finance,78750.0,82500.0,31 to 40
7,Mike,1976-03-15,48,95000,IT,99750.0,104500.0,More than 40
10,Sophie,1992-06-30,32,62000,Finance,65100.0,68200.0,31 to 40
2,Alice,1997-02-28,27,90000,Finance,94500.0,99000.0,21 to 30
4,Emily,1994-11-22,30,70000,Finance,73500.0,77000.0,21 to 30
