In [0]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()

In [0]:
df_csv = spark.read.format('csv').load('dbfs:///FileStore/tables/csv/batch.csv')

In [0]:
df_csv.show()

+---+-----+----------+----+------+----------+
|_c0|  _c1|       _c2| _c3|   _c4|       _c5|
+---+-----+----------+----+------+----------+
| id| name|       dob| age|salary|department|
|  1| John|1992-05-12|  30| 70000|        IT|
|  2|Alice|1997-02-28|  25| 60000|        HR|
|  3|  Bob|      null|null| 80000|        IT|
|  4|Emily|1994-11-22|  28| 65000|   Finance|
+---+-----+----------+----+------+----------+



In [0]:
df_csv = spark.read.format('csv').option('header', True).load('dbfs:///FileStore/tables/csv/batch.csv')

In [0]:
df_csv.show()

+---+-----+----------+----+------+----------+
| id| name|       dob| age|salary|department|
+---+-----+----------+----+------+----------+
|  1| John|1992-05-12|  30| 70000|        IT|
|  2|Alice|1997-02-28|  25| 60000|        HR|
|  3|  Bob|      null|null| 80000|        IT|
|  4|Emily|1994-11-22|  28| 65000|   Finance|
+---+-----+----------+----+------+----------+



In [0]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DateType

In [0]:
schema = StructType([
    StructField('id', IntegerType()),
    StructField('name', StringType()),
    StructField('dob', DateType()),
    StructField('age', IntegerType()),
    StructField('salary', IntegerType()),
    StructField('department', StringType())
])

In [0]:
df_csv = spark.read.format('csv').schema(schema).option('header', True).load('dbfs:///FileStore/tables/csv/batch.csv')

In [0]:
df_csv.printSchema()

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- dob: date (nullable = true)
 |-- age: integer (nullable = true)
 |-- salary: integer (nullable = true)
 |-- department: string (nullable = true)



In [0]:
%fs ls dbfs:///FileStore/tables/json/

path,name,size,modificationTime
dbfs:/FileStore/tables/json/10_20220101.json,10_20220101.json,102,1707701629000
dbfs:/FileStore/tables/json/1_20220101.json,1_20220101.json,94,1707701629000
dbfs:/FileStore/tables/json/2_20220101.json,2_20220101.json,100,1707701630000
dbfs:/FileStore/tables/json/4_20220101.json,4_20220101.json,100,1707701630000
dbfs:/FileStore/tables/json/8_20220101.json,8_20220101.json,94,1707701630000
dbfs:/FileStore/tables/json/9_20220101.json,9_20220101.json,95,1707701630000
dbfs:/FileStore/tables/json/batch.jsonl,batch.jsonl,671,1707701631000


In [0]:
df_json = spark.read.format("json").load("dbfs:///FileStore/tables/json")

In [0]:
df_json.display()

age,department,dob,id,name,salary
30.0,IT,1992-05-12,1,John,70000
25.0,HR,1997-02-28,2,Alice,60000
,IT,,3,Bob,80000
28.0,Finance,1994-11-22,4,Emily,65000
41.0,HR,1981-12-18,5,David,90000
33.0,Finance,1989-07-05,6,Susan,75000
46.0,IT,1976-03-15,7,Mike,95000
30.0,Finance,1992-06-30,10,Sophie,62000
25.0,Finance,1997-02-28,2,Alice,90000
28.0,Finance,1994-11-22,4,Emily,70000


In [0]:
df_json.printSchema()

root
 |-- age: long (nullable = true)
 |-- department: string (nullable = true)
 |-- dob: string (nullable = true)
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- salary: long (nullable = true)



In [0]:
df = df_csv.union(df_json)

In [0]:
df.show()

+----+-------+----------+----+------+----------+
|  id|   name|       dob| age|salary|department|
+----+-------+----------+----+------+----------+
|   1|   John|1992-05-12|  30| 70000|        IT|
|   2|  Alice|1997-02-28|  25| 60000|        HR|
|   3|    Bob|      null|null| 80000|        IT|
|   4|  Emily|1994-11-22|  28| 65000|   Finance|
|  30|     IT|1992-05-12|   1|  John|     70000|
|  25|     HR|1997-02-28|   2| Alice|     60000|
|null|     IT|      null|   3|   Bob|     80000|
|  28|Finance|1994-11-22|   4| Emily|     65000|
|  41|     HR|1981-12-18|   5| David|     90000|
|  33|Finance|1989-07-05|   6| Susan|     75000|
|  46|     IT|1976-03-15|   7|  Mike|     95000|
|  30|Finance|1992-06-30|  10|Sophie|     62000|
|  25|Finance|1997-02-28|   2| Alice|     90000|
|  28|Finance|1994-11-22|   4| Emily|     70000|
|  39|     IT|1983-10-14|   9| James|     87000|
|  30|     IT|1992-05-12|   1|  John|     70000|
|  27|     HR|1995-08-20|   8|  Lisa|     58000|
+----+-------+------

In [0]:
df_json = df_json.select(df_csv.columns)

In [0]:
df_json.show()

+---+------+----------+----+------+----------+
| id|  name|       dob| age|salary|department|
+---+------+----------+----+------+----------+
|  1|  John|1992-05-12|  30| 70000|        IT|
|  2| Alice|1997-02-28|  25| 60000|        HR|
|  3|   Bob|      null|null| 80000|        IT|
|  4| Emily|1994-11-22|  28| 65000|   Finance|
|  5| David|1981-12-18|  41| 90000|        HR|
|  6| Susan|1989-07-05|  33| 75000|   Finance|
|  7|  Mike|1976-03-15|  46| 95000|        IT|
| 10|Sophie|1992-06-30|  30| 62000|   Finance|
|  2| Alice|1997-02-28|  25| 90000|   Finance|
|  4| Emily|1994-11-22|  28| 70000|   Finance|
|  9| James|1983-10-14|  39| 87000|        IT|
|  1|  John|1992-05-12|  30| 70000|        IT|
|  8|  Lisa|1995-08-20|  27| 58000|        HR|
+---+------+----------+----+------+----------+



In [0]:
print(df_json.columns, df_csv.columns)

['id', 'name', 'dob', 'age', 'salary', 'department'] ['id', 'name', 'dob', 'age', 'salary', 'department']


In [0]:
df_json.printSchema()

root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- dob: string (nullable = true)
 |-- age: long (nullable = true)
 |-- salary: long (nullable = true)
 |-- department: string (nullable = true)



In [0]:
df_csv.printSchema()

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- dob: date (nullable = true)
 |-- age: integer (nullable = true)
 |-- salary: integer (nullable = true)
 |-- department: string (nullable = true)



In [0]:
df = df_json.union(df_csv)

In [0]:
df.show()

+---+------+----------+----+------+----------+
| id|  name|       dob| age|salary|department|
+---+------+----------+----+------+----------+
|  1|  John|1992-05-12|  30| 70000|        IT|
|  2| Alice|1997-02-28|  25| 60000|        HR|
|  3|   Bob|      null|null| 80000|        IT|
|  4| Emily|1994-11-22|  28| 65000|   Finance|
|  5| David|1981-12-18|  41| 90000|        HR|
|  6| Susan|1989-07-05|  33| 75000|   Finance|
|  7|  Mike|1976-03-15|  46| 95000|        IT|
| 10|Sophie|1992-06-30|  30| 62000|   Finance|
|  2| Alice|1997-02-28|  25| 90000|   Finance|
|  4| Emily|1994-11-22|  28| 70000|   Finance|
|  9| James|1983-10-14|  39| 87000|        IT|
|  1|  John|1992-05-12|  30| 70000|        IT|
|  8|  Lisa|1995-08-20|  27| 58000|        HR|
|  1|  John|1992-05-12|  30| 70000|        IT|
|  2| Alice|1997-02-28|  25| 60000|        HR|
|  3|   Bob|      null|null| 80000|        IT|
|  4| Emily|1994-11-22|  28| 65000|   Finance|
+---+------+----------+----+------+----------+



In [0]:
df_json = spark.read.format('json').schema(schema).load('dbfs:///FileStore/tables/json')

In [0]:
df_json.printSchema()

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- dob: date (nullable = true)
 |-- age: integer (nullable = true)
 |-- salary: integer (nullable = true)
 |-- department: string (nullable = true)



In [0]:
df = df.dropDuplicates()

In [0]:
df.show()

+---+------+----------+----+------+----------+
| id|  name|       dob| age|salary|department|
+---+------+----------+----+------+----------+
|  1|  John|1992-05-12|  30| 70000|        IT|
|  4| Emily|1994-11-22|  28| 65000|   Finance|
|  2| Alice|1997-02-28|  25| 60000|        HR|
|  6| Susan|1989-07-05|  33| 75000|   Finance|
|  7|  Mike|1976-03-15|  46| 95000|        IT|
|  3|   Bob|      null|null| 80000|        IT|
|  5| David|1981-12-18|  41| 90000|        HR|
| 10|Sophie|1992-06-30|  30| 62000|   Finance|
|  2| Alice|1997-02-28|  25| 90000|   Finance|
|  4| Emily|1994-11-22|  28| 70000|   Finance|
|  9| James|1983-10-14|  39| 87000|        IT|
|  8|  Lisa|1995-08-20|  27| 58000|        HR|
+---+------+----------+----+------+----------+



In [0]:
df.display()

id,name,dob,age,salary,department
1,John,1992-05-12,30.0,70000,IT
4,Emily,1994-11-22,28.0,65000,Finance
2,Alice,1997-02-28,25.0,60000,HR
6,Susan,1989-07-05,33.0,75000,Finance
7,Mike,1976-03-15,46.0,95000,IT
3,Bob,,,80000,IT
5,David,1981-12-18,41.0,90000,HR
10,Sophie,1992-06-30,30.0,62000,Finance
2,Alice,1997-02-28,25.0,90000,Finance
4,Emily,1994-11-22,28.0,70000,Finance


In [0]:
df.select("salary", "age").show()
df.select(["salary", "age"]).show()
df.select(df.salary, df.age).show()


+------+----+
|salary| age|
+------+----+
| 70000|  30|
| 65000|  28|
| 60000|  25|
| 75000|  33|
| 95000|  46|
| 80000|null|
| 90000|  41|
| 62000|  30|
| 90000|  25|
| 70000|  28|
| 87000|  39|
| 58000|  27|
+------+----+

+------+----+
|salary| age|
+------+----+
| 70000|  30|
| 65000|  28|
| 60000|  25|
| 75000|  33|
| 95000|  46|
| 80000|null|
| 90000|  41|
| 62000|  30|
| 90000|  25|
| 70000|  28|
| 87000|  39|
| 58000|  27|
+------+----+

+------+----+
|salary| age|
+------+----+
| 70000|  30|
| 65000|  28|
| 60000|  25|
| 75000|  33|
| 95000|  46|
| 80000|null|
| 90000|  41|
| 62000|  30|
| 90000|  25|
| 70000|  28|
| 87000|  39|
| 58000|  27|
+------+----+



In [0]:
print(df.count())

12


In [0]:
from pyspark.sql import functions as F

In [0]:
df.select(F.col("salary"), F.col("age")).show()

+------+----+
|salary| age|
+------+----+
| 70000|  30|
| 65000|  28|
| 60000|  25|
| 75000|  33|
| 95000|  46|
| 80000|null|
| 90000|  41|
| 62000|  30|
| 90000|  25|
| 70000|  28|
| 87000|  39|
| 58000|  27|
+------+----+



In [0]:
df.select(
    df.salary + .05 * df.salary,
    F.year(F.current_timestamp()) - F.year("dob"),
    F.year(F.current_timestamp()) - F.year(F.col("dob"))
).show()

+--------------------------+---------------------------------------+---------------------------------------+
|(salary + (salary * 0.05))|(year(current_timestamp()) - year(dob))|(year(current_timestamp()) - year(dob))|
+--------------------------+---------------------------------------+---------------------------------------+
|                   73500.0|                                     32|                                     32|
|                   68250.0|                                     30|                                     30|
|                   63000.0|                                     27|                                     27|
|                   78750.0|                                     35|                                     35|
|                   99750.0|                                     48|                                     48|
|                   84000.0|                                   null|                                   null|
|                  

In [0]:
df.select(
    (df.salary + .05 * df.salary).alias("increased_salary"),
    (F.year(F.current_timestamp()) - F.year("dob")).alias('age')
).show()

+----------------+----+
|increased_salary| age|
+----------------+----+
|         73500.0|  32|
|         68250.0|  30|
|         63000.0|  27|
|         78750.0|  35|
|         99750.0|  48|
|         84000.0|null|
|         94500.0|  43|
|         65100.0|  32|
|         94500.0|  27|
|         73500.0|  30|
|         91350.0|  41|
|         60900.0|  29|
+----------------+----+



In [0]:
df = df.withColumn(
    "salary_raise",
    F.col('salary') + F.col('salary') * 0.05
)

In [0]:
df.show()

+---+------+----------+----+------+----------+------------+
| id|  name|       dob| age|salary|department|salary_raise|
+---+------+----------+----+------+----------+------------+
|  1|  John|1992-05-12|  30| 70000|        IT|     73500.0|
|  4| Emily|1994-11-22|  28| 65000|   Finance|     68250.0|
|  2| Alice|1997-02-28|  25| 60000|        HR|     63000.0|
|  6| Susan|1989-07-05|  33| 75000|   Finance|     78750.0|
|  7|  Mike|1976-03-15|  46| 95000|        IT|     99750.0|
|  3|   Bob|      null|null| 80000|        IT|     84000.0|
|  5| David|1981-12-18|  41| 90000|        HR|     94500.0|
| 10|Sophie|1992-06-30|  30| 62000|   Finance|     65100.0|
|  2| Alice|1997-02-28|  25| 90000|   Finance|     94500.0|
|  4| Emily|1994-11-22|  28| 70000|   Finance|     73500.0|
|  9| James|1983-10-14|  39| 87000|        IT|     91350.0|
|  8|  Lisa|1995-08-20|  27| 58000|        HR|     60900.0|
+---+------+----------+----+------+----------+------------+



We can chain to add multiple columns 
df = df.withColumn(
    column name,
    column value
).withColumn(
    column name,
    column value
)

In [0]:
df = df.withColumns(
    {
        "salary_raise": F.col('salary') + F.col('salary') * 0.07,
        "age": F.year(F.current_timestamp()) - F.year("dob")
    }
)

In [0]:
df.show()

+---+------+----------+----+------+----------+------------+
| id|  name|       dob| age|salary|department|salary_raise|
+---+------+----------+----+------+----------+------------+
|  1|  John|1992-05-12|  32| 70000|        IT|     74900.0|
|  4| Emily|1994-11-22|  30| 65000|   Finance|     69550.0|
|  2| Alice|1997-02-28|  27| 60000|        HR|     64200.0|
|  6| Susan|1989-07-05|  35| 75000|   Finance|     80250.0|
|  7|  Mike|1976-03-15|  48| 95000|        IT|    101650.0|
|  3|   Bob|      null|null| 80000|        IT|     85600.0|
|  5| David|1981-12-18|  43| 90000|        HR|     96300.0|
| 10|Sophie|1992-06-30|  32| 62000|   Finance|     66340.0|
|  2| Alice|1997-02-28|  27| 90000|   Finance|     96300.0|
|  4| Emily|1994-11-22|  30| 70000|   Finance|     74900.0|
|  9| James|1983-10-14|  41| 87000|        IT|     93090.0|
|  8|  Lisa|1995-08-20|  29| 58000|        HR|     62060.0|
+---+------+----------+----+------+----------+------------+



In [0]:
df.filter(
    F.col("salary_raise") >= 75000
).show()

+---+-----+----------+----+------+----------+------------+
| id| name|       dob| age|salary|department|salary_raise|
+---+-----+----------+----+------+----------+------------+
|  6|Susan|1989-07-05|  35| 75000|   Finance|     80250.0|
|  7| Mike|1976-03-15|  48| 95000|        IT|    101650.0|
|  3|  Bob|      null|null| 80000|        IT|     85600.0|
|  5|David|1981-12-18|  43| 90000|        HR|     96300.0|
|  2|Alice|1997-02-28|  27| 90000|   Finance|     96300.0|
|  9|James|1983-10-14|  41| 87000|        IT|     93090.0|
+---+-----+----------+----+------+----------+------------+



In [0]:
df.where(
    F.col("salary_raise") >= 75000
).show()

+---+-----+----------+----+------+----------+------------+
| id| name|       dob| age|salary|department|salary_raise|
+---+-----+----------+----+------+----------+------------+
|  6|Susan|1989-07-05|  35| 75000|   Finance|     80250.0|
|  7| Mike|1976-03-15|  48| 95000|        IT|    101650.0|
|  3|  Bob|      null|null| 80000|        IT|     85600.0|
|  5|David|1981-12-18|  43| 90000|        HR|     96300.0|
|  2|Alice|1997-02-28|  27| 90000|   Finance|     96300.0|
|  9|James|1983-10-14|  41| 87000|        IT|     93090.0|
+---+-----+----------+----+------+----------+------------+



In [0]:
df = df.withColumn(
    "age_group",
    F.when(F.col("age") <= 20, "Upto 20")
    .when((F.col("age") > 20) & (F.col("age") <= 30), "21 to 30")
    .when((F.col("age") > 30) & (F.col("age") <= 40), "31 to 40")
    .otherwise("More than 40")
    )

In [0]:
df.show()

+---+------+----------+----+------+----------+------------+------------+
| id|  name|       dob| age|salary|department|salary_raise|   age_group|
+---+------+----------+----+------+----------+------------+------------+
|  1|  John|1992-05-12|  32| 70000|        IT|     74900.0|    31 to 40|
|  4| Emily|1994-11-22|  30| 65000|   Finance|     69550.0|    21 to 30|
|  2| Alice|1997-02-28|  27| 60000|        HR|     64200.0|    21 to 30|
|  6| Susan|1989-07-05|  35| 75000|   Finance|     80250.0|    31 to 40|
|  7|  Mike|1976-03-15|  48| 95000|        IT|    101650.0|More than 40|
|  3|   Bob|      null|null| 80000|        IT|     85600.0|More than 40|
|  5| David|1981-12-18|  43| 90000|        HR|     96300.0|More than 40|
| 10|Sophie|1992-06-30|  32| 62000|   Finance|     66340.0|    31 to 40|
|  2| Alice|1997-02-28|  27| 90000|   Finance|     96300.0|    21 to 30|
|  4| Emily|1994-11-22|  30| 70000|   Finance|     74900.0|    21 to 30|
|  9| James|1983-10-14|  41| 87000|        IT|     

In [0]:
df = df.withColumn(
    "age_group",
    F.when(F.col("age") <= 20, "Upto 20")
    .when((F.col("age") > 20) & (F.col("age") <= 30), "21 to 30")
    .when((F.col("age") > 30) & (F.col("age") <= 40), "31 to 40")
    .otherwise(F.lit("More than 40"))
    )

In [0]:
df.show()

+---+------+----------+----+------+----------+------------+------------+
| id|  name|       dob| age|salary|department|salary_raise|   age_group|
+---+------+----------+----+------+----------+------------+------------+
|  1|  John|1992-05-12|  32| 70000|        IT|     74900.0|    31 to 40|
|  4| Emily|1994-11-22|  30| 65000|   Finance|     69550.0|    21 to 30|
|  2| Alice|1997-02-28|  27| 60000|        HR|     64200.0|    21 to 30|
|  6| Susan|1989-07-05|  35| 75000|   Finance|     80250.0|    31 to 40|
|  7|  Mike|1976-03-15|  48| 95000|        IT|    101650.0|More than 40|
|  3|   Bob|      null|null| 80000|        IT|     85600.0|More than 40|
|  5| David|1981-12-18|  43| 90000|        HR|     96300.0|More than 40|
| 10|Sophie|1992-06-30|  32| 62000|   Finance|     66340.0|    31 to 40|
|  2| Alice|1997-02-28|  27| 90000|   Finance|     96300.0|    21 to 30|
|  4| Emily|1994-11-22|  30| 70000|   Finance|     74900.0|    21 to 30|
|  9| James|1983-10-14|  41| 87000|        IT|     

In [0]:
df = df.withColumns(
    {
        "age": F.coalesce(
            F.year(F.current_timestamp()) - F.year(F.col("dob")),
            F.lit(-1)
        ),
        "has_dob_1": ~(F.isnull("dob")),
        "has_dob_2": F.col("dob").isNotNull()
    }
)

In [0]:
df.show()

+---+------+----------+---+------+----------+------------+------------+---------+---------+
| id|  name|       dob|age|salary|department|salary_raise|   age_group|has_dob_1|has_dob_2|
+---+------+----------+---+------+----------+------------+------------+---------+---------+
|  1|  John|1992-05-12| 32| 70000|        IT|     74900.0|    31 to 40|     true|     true|
|  4| Emily|1994-11-22| 30| 65000|   Finance|     69550.0|    21 to 30|     true|     true|
|  2| Alice|1997-02-28| 27| 60000|        HR|     64200.0|    21 to 30|     true|     true|
|  6| Susan|1989-07-05| 35| 75000|   Finance|     80250.0|    31 to 40|     true|     true|
|  7|  Mike|1976-03-15| 48| 95000|        IT|    101650.0|More than 40|     true|     true|
|  3|   Bob|      null| -1| 80000|        IT|     85600.0|More than 40|    false|    false|
|  5| David|1981-12-18| 43| 90000|        HR|     96300.0|More than 40|     true|     true|
| 10|Sophie|1992-06-30| 32| 62000|   Finance|     66340.0|    31 to 40|     true

In [0]:
df = df.drop("has_dob_1", "has_dob_2")

In [0]:
df.show()

+---+------+----------+---+------+----------+------------+------------+
| id|  name|       dob|age|salary|department|salary_raise|   age_group|
+---+------+----------+---+------+----------+------------+------------+
|  1|  John|1992-05-12| 32| 70000|        IT|     74900.0|    31 to 40|
|  4| Emily|1994-11-22| 30| 65000|   Finance|     69550.0|    21 to 30|
|  2| Alice|1997-02-28| 27| 60000|        HR|     64200.0|    21 to 30|
|  6| Susan|1989-07-05| 35| 75000|   Finance|     80250.0|    31 to 40|
|  7|  Mike|1976-03-15| 48| 95000|        IT|    101650.0|More than 40|
|  3|   Bob|      null| -1| 80000|        IT|     85600.0|More than 40|
|  5| David|1981-12-18| 43| 90000|        HR|     96300.0|More than 40|
| 10|Sophie|1992-06-30| 32| 62000|   Finance|     66340.0|    31 to 40|
|  2| Alice|1997-02-28| 27| 90000|   Finance|     96300.0|    21 to 30|
|  4| Emily|1994-11-22| 30| 70000|   Finance|     74900.0|    21 to 30|
|  9| James|1983-10-14| 41| 87000|        IT|     93090.0|More t

In [0]:
df = df.withColumnRenamed('name', 'first_name')

In [0]:
df.show()

+---+----------+----------+---+------+----------+------------+------------+
| id|first_name|       dob|age|salary|department|salary_raise|   age_group|
+---+----------+----------+---+------+----------+------------+------------+
|  1|      John|1992-05-12| 32| 70000|        IT|     74900.0|    31 to 40|
|  4|     Emily|1994-11-22| 30| 65000|   Finance|     69550.0|    21 to 30|
|  2|     Alice|1997-02-28| 27| 60000|        HR|     64200.0|    21 to 30|
|  6|     Susan|1989-07-05| 35| 75000|   Finance|     80250.0|    31 to 40|
|  7|      Mike|1976-03-15| 48| 95000|        IT|    101650.0|More than 40|
|  3|       Bob|      null| -1| 80000|        IT|     85600.0|More than 40|
|  5|     David|1981-12-18| 43| 90000|        HR|     96300.0|More than 40|
| 10|    Sophie|1992-06-30| 32| 62000|   Finance|     66340.0|    31 to 40|
|  2|     Alice|1997-02-28| 27| 90000|   Finance|     96300.0|    21 to 30|
|  4|     Emily|1994-11-22| 30| 70000|   Finance|     74900.0|    21 to 30|
|  9|     Ja

In [0]:
display(df)

id,first_name,dob,age,salary,department,salary_raise,age_group
1,John,1992-05-12,32,70000,IT,74900.0,31 to 40
4,Emily,1994-11-22,30,65000,Finance,69550.0,21 to 30
2,Alice,1997-02-28,27,60000,HR,64200.0,21 to 30
6,Susan,1989-07-05,35,75000,Finance,80250.0,31 to 40
7,Mike,1976-03-15,48,95000,IT,101650.0,More than 40
3,Bob,,-1,80000,IT,85600.0,More than 40
5,David,1981-12-18,43,90000,HR,96300.0,More than 40
10,Sophie,1992-06-30,32,62000,Finance,66340.0,31 to 40
2,Alice,1997-02-28,27,90000,Finance,96300.0,21 to 30
4,Emily,1994-11-22,30,70000,Finance,74900.0,21 to 30


In [0]:
from pyspark.sql import Window

In [0]:
window  = Window.partitionBy('age_group')

df.withColumn(
    "min_age_in_group",
    F.min('age').over(window)
).show()

+---+----------+----------+---+------+----------+------------+------------+----------------+
| id|first_name|       dob|age|salary|department|salary_raise|   age_group|min_age_in_group|
+---+----------+----------+---+------+----------+------------+------------+----------------+
|  4|     Emily|1994-11-22| 30| 65000|   Finance|     69550.0|    21 to 30|              27|
|  2|     Alice|1997-02-28| 27| 60000|        HR|     64200.0|    21 to 30|              27|
|  2|     Alice|1997-02-28| 27| 90000|   Finance|     96300.0|    21 to 30|              27|
|  4|     Emily|1994-11-22| 30| 70000|   Finance|     74900.0|    21 to 30|              27|
|  8|      Lisa|1995-08-20| 29| 58000|        HR|     62060.0|    21 to 30|              27|
|  1|      John|1992-05-12| 32| 70000|        IT|     74900.0|    31 to 40|              32|
|  6|     Susan|1989-07-05| 35| 75000|   Finance|     80250.0|    31 to 40|              32|
| 10|    Sophie|1992-06-30| 32| 62000|   Finance|     66340.0|    31 t

In [0]:
partition_dep = Window.partitionBy('department')

df = df.withColumns(
    {
        'average_salary_by_department': F.avg('salary').over(partition_dep),
        'is_above_avg_by_dep': F.col('salary') >= F.col('average_salary_by_department')
    }
)

In [0]:
df.show()

+---+----------+----------+---+------+----------+------------+------------+----------------------------+------------+-------------------+
| id|first_name|       dob|age|salary|department|salary_raise|   age_group|average_salary_by_department|is_above_avg|is_above_avg_by_dep|
+---+----------+----------+---+------+----------+------------+------------+----------------------------+------------+-------------------+
|  4|     Emily|1994-11-22| 30| 65000|   Finance|     69550.0|    21 to 30|                     72400.0|       false|              false|
|  6|     Susan|1989-07-05| 35| 75000|   Finance|     80250.0|    31 to 40|                     72400.0|        true|               true|
| 10|    Sophie|1992-06-30| 32| 62000|   Finance|     66340.0|    31 to 40|                     72400.0|       false|              false|
|  2|     Alice|1997-02-28| 27| 90000|   Finance|     96300.0|    21 to 30|                     72400.0|        true|               true|
|  4|     Emily|1994-11-22| 30| 70

In [0]:
average_salary = (df.select(F.avg('salary')).first()[0])

df.withColumns(
    {
    'avg_salary': F.lit(average_salary),
    'is_above_avg': F.col('salary') >= average_salary}
).show()

+---+----------+----------+---+------+----------+------------+------------+----------------------------+------------+-------------------+-----------------+
| id|first_name|       dob|age|salary|department|salary_raise|   age_group|average_salary_by_department|is_above_avg|is_above_avg_by_dep|       avg_salary|
+---+----------+----------+---+------+----------+------------+------------+----------------------------+------------+-------------------+-----------------+
|  4|     Emily|1994-11-22| 30| 65000|   Finance|     69550.0|    21 to 30|                     72400.0|       false|              false|75166.66666666667|
|  6|     Susan|1989-07-05| 35| 75000|   Finance|     80250.0|    31 to 40|                     72400.0|       false|               true|75166.66666666667|
| 10|    Sophie|1992-06-30| 32| 62000|   Finance|     66340.0|    31 to 40|                     72400.0|       false|              false|75166.66666666667|
|  2|     Alice|1997-02-28| 27| 90000|   Finance|     96300.0|  

In [0]:
df.printSchema()

root
 |-- id: long (nullable = true)
 |-- first_name: string (nullable = true)
 |-- dob: string (nullable = true)
 |-- age: integer (nullable = false)
 |-- salary: long (nullable = true)
 |-- department: string (nullable = true)
 |-- salary_raise: double (nullable = true)
 |-- age_group: string (nullable = false)
 |-- average_salary_by_department: double (nullable = true)
 |-- is_above_avg: boolean (nullable = true)
 |-- is_above_avg_by_dep: boolean (nullable = true)



In [0]:
ref_df =   spark.read.format('parquet').load("dbfs:///FileStore/tables/parquet")

In [0]:
ref_df.show()

+----------+-------+--------+
|department|manager|    lead|
+----------+-------+--------+
|   Finance|  Megan|   Molly|
|        HR|   Brad|   Brian|
|        IT|  Chris|Chandler|
|  Delivery|   Leon|  Louise|
+----------+-------+--------+



In [0]:
df.join(
    ref_df,
    df.department == ref_df.department,
    "left"
).display()

id,first_name,dob,age,salary,department,salary_raise,age_group,average_salary_by_department,is_above_avg,is_above_avg_by_dep,department.1,manager,lead
4,Emily,1994-11-22,30,65000,Finance,69550.0,21 to 30,72400.0,False,False,Finance,Megan,Molly
6,Susan,1989-07-05,35,75000,Finance,80250.0,31 to 40,72400.0,True,True,Finance,Megan,Molly
10,Sophie,1992-06-30,32,62000,Finance,66340.0,31 to 40,72400.0,False,False,Finance,Megan,Molly
2,Alice,1997-02-28,27,90000,Finance,96300.0,21 to 30,72400.0,True,True,Finance,Megan,Molly
4,Emily,1994-11-22,30,70000,Finance,74900.0,21 to 30,72400.0,False,False,Finance,Megan,Molly
2,Alice,1997-02-28,27,60000,HR,64200.0,21 to 30,69333.33333333333,False,False,HR,Brad,Brian
5,David,1981-12-18,43,90000,HR,96300.0,More than 40,69333.33333333333,True,True,HR,Brad,Brian
8,Lisa,1995-08-20,29,58000,HR,62060.0,21 to 30,69333.33333333333,False,False,HR,Brad,Brian
1,John,1992-05-12,32,70000,IT,74900.0,31 to 40,83000.0,False,False,IT,Chris,Chandler
7,Mike,1976-03-15,48,95000,IT,101650.0,More than 40,83000.0,True,True,IT,Chris,Chandler


In [0]:
df = df.alias('df')
ref_df = ref_df.alias('ref')

df.join(
    ref_df,
    F.col("df.department") == F.col("ref.department"),
    "left"
).show()

+---+----------+----------+---+------+----------+------------+------------+----------------------------+------------+-------------------+----------+-------+--------+
| id|first_name|       dob|age|salary|department|salary_raise|   age_group|average_salary_by_department|is_above_avg|is_above_avg_by_dep|department|manager|    lead|
+---+----------+----------+---+------+----------+------------+------------+----------------------------+------------+-------------------+----------+-------+--------+
|  4|     Emily|1994-11-22| 30| 65000|   Finance|     69550.0|    21 to 30|                     72400.0|       false|              false|   Finance|  Megan|   Molly|
|  6|     Susan|1989-07-05| 35| 75000|   Finance|     80250.0|    31 to 40|                     72400.0|        true|               true|   Finance|  Megan|   Molly|
| 10|    Sophie|1992-06-30| 32| 62000|   Finance|     66340.0|    31 to 40|                     72400.0|       false|              false|   Finance|  Megan|   Molly|
|  2

In [0]:
df.write.mode('overwrite').format('csv').save('dbfs:/FileStore/tables/final')

In [0]:
%fs ls dbfs:///FileStore/tables/final

path,name,size,modificationTime
dbfs:/FileStore/tables/final/_SUCCESS,_SUCCESS,0,1707806663000
dbfs:/FileStore/tables/final/_committed_1358792467138201331,_committed_1358792467138201331,113,1707804229000
dbfs:/FileStore/tables/final/_committed_317310594704327146,_committed_317310594704327146,211,1707806662000
dbfs:/FileStore/tables/final/_committed_vacuum7875302735787863697,_committed_vacuum7875302735787863697,96,1707806663000
dbfs:/FileStore/tables/final/_started_317310594704327146,_started_317310594704327146,0,1707806662000
dbfs:/FileStore/tables/final/part-00000-tid-317310594704327146-95280c5f-3806-478c-b7b4-098f0c9df0db-608-1-c000.csv,part-00000-tid-317310594704327146-95280c5f-3806-478c-b7b4-098f0c9df0db-608-1-c000.csv,865,1707806662000


In [0]:
spark.sql("show tables").show()

+--------+---------+-----------+
|database|tableName|isTemporary|
+--------+---------+-----------+
+--------+---------+-----------+



In [0]:
df.write.mode('overwrite').format('delta').saveAsTable('final')

In [0]:
spark.sql("show tables").show()

+--------+---------+-----------+
|database|tableName|isTemporary|
+--------+---------+-----------+
| default|    final|      false|
+--------+---------+-----------+



In [0]:
%sql

select * from final

id,first_name,dob,age,salary,department,salary_raise,age_group,average_salary_by_department,is_above_avg,is_above_avg_by_dep
4,Emily,1994-11-22,30,65000,Finance,69550.0,21 to 30,72400.0,False,False
6,Susan,1989-07-05,35,75000,Finance,80250.0,31 to 40,72400.0,True,True
10,Sophie,1992-06-30,32,62000,Finance,66340.0,31 to 40,72400.0,False,False
2,Alice,1997-02-28,27,90000,Finance,96300.0,21 to 30,72400.0,True,True
4,Emily,1994-11-22,30,70000,Finance,74900.0,21 to 30,72400.0,False,False
2,Alice,1997-02-28,27,60000,HR,64200.0,21 to 30,69333.33333333333,False,False
5,David,1981-12-18,43,90000,HR,96300.0,More than 40,69333.33333333333,True,True
8,Lisa,1995-08-20,29,58000,HR,62060.0,21 to 30,69333.33333333333,False,False
1,John,1992-05-12,32,70000,IT,74900.0,31 to 40,83000.0,False,False
7,Mike,1976-03-15,48,95000,IT,101650.0,More than 40,83000.0,True,True
