In [0]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()
spark

In [0]:
%fs ls dbfs:///FileStore/tables/data/

path,name,size,modificationTime
dbfs:/FileStore/tables/data/csv/,csv/,0,0
dbfs:/FileStore/tables/data/json/,json/,0,0
dbfs:/FileStore/tables/data/parquet/,parquet/,0,0


In [0]:
df_csv = spark.read.format("csv").load("dbfs:/FileStore/tables/data/csv/batch.csv")

In [0]:
df_csv.show()

+---+-----+----------+----+------+----------+
|_c0|  _c1|       _c2| _c3|   _c4|       _c5|
+---+-----+----------+----+------+----------+
| id| name|       dob| age|salary|department|
|  1| John|1992-05-12|  30| 70000|        IT|
|  2|Alice|1997-02-28|  25| 60000|        HR|
|  3|  Bob|      null|null| 80000|        IT|
|  4|Emily|1994-11-22|  28| 65000|   Finance|
+---+-----+----------+----+------+----------+



In [0]:
df_csv = spark.read.format("csv").option("header", "true").load("dbfs:/FileStore/tables/data/csv/batch.csv")

In [0]:
df_csv.show()

+---+-----+----------+----+------+----------+
| id| name|       dob| age|salary|department|
+---+-----+----------+----+------+----------+
|  1| John|1992-05-12|  30| 70000|        IT|
|  2|Alice|1997-02-28|  25| 60000|        HR|
|  3|  Bob|      null|null| 80000|        IT|
|  4|Emily|1994-11-22|  28| 65000|   Finance|
+---+-----+----------+----+------+----------+



In [0]:
df_csv.printSchema()

root
 |-- id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- dob: string (nullable = true)
 |-- age: string (nullable = true)
 |-- salary: string (nullable = true)
 |-- department: string (nullable = true)



In [0]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DateType

In [0]:
schema = StructType([
    StructField("id", IntegerType()),
    StructField("name", StringType()),
    StructField("dob", DateType()),
    StructField("age", IntegerType()),
    StructField("salary", IntegerType()),
    StructField("department", StringType())
])

In [0]:
df_csv = spark.read.format("csv").schema(schema).option("header", True).load("dbfs:/FileStore/tables/data/csv/batch.csv")

In [0]:
df_csv.show()

+---+-----+----------+----+------+----------+
| id| name|       dob| age|salary|department|
+---+-----+----------+----+------+----------+
|  1| John|1992-05-12|  30| 70000|        IT|
|  2|Alice|1997-02-28|  25| 60000|        HR|
|  3|  Bob|      null|null| 80000|        IT|
|  4|Emily|1994-11-22|  28| 65000|   Finance|
+---+-----+----------+----+------+----------+



In [0]:
df_csv.printSchema()

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- dob: date (nullable = true)
 |-- age: integer (nullable = true)
 |-- salary: integer (nullable = true)
 |-- department: string (nullable = true)



In [0]:
df_json = spark.read.format("json").schema(schema).load("dbfs:/FileStore/tables/data/json")

In [0]:
df_json.show()

+---+-----+----------+----+------+----------+
| id| name|       dob| age|salary|department|
+---+-----+----------+----+------+----------+
|  1| John|1992-05-12|  30| 70000|        IT|
|  2|Alice|1997-02-28|  25| 60000|        HR|
|  3|  Bob|      null|null| 80000|        IT|
|  4|Emily|1994-11-22|  28| 65000|   Finance|
|  5|David|1981-12-18|  41| 90000|        HR|
|  6|Susan|1989-07-05|  33| 75000|   Finance|
|  7| Mike|1976-03-15|  46| 95000|        IT|
|  1| John|1992-05-12|  30| 70000|        IT|
|  2|Alice|1997-02-28|  25| 60000|        HR|
|  3|  Bob|      null|null| 80000|        IT|
|  4|Emily|1994-11-22|  28| 65000|   Finance|
|  5|David|1981-12-18|  41| 90000|        HR|
|  6|Susan|1989-07-05|  33| 75000|   Finance|
|  7| Mike|1976-03-15|  46| 95000|        IT|
|  1| John|1992-05-12|  30| 70000|        IT|
|  2|Alice|1997-02-28|  25| 60000|        HR|
|  3|  Bob|      null|null| 80000|        IT|
|  4|Emily|1994-11-22|  28| 65000|   Finance|
|  5|David|1981-12-18|  41| 90000|

In [0]:
df_json.printSchema()

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- dob: date (nullable = true)
 |-- age: integer (nullable = true)
 |-- salary: integer (nullable = true)
 |-- department: string (nullable = true)



In [0]:
df_json = df_json.orderBy("id")

In [0]:
df_json.show()

+---+-----+----------+----+------+----------+
| id| name|       dob| age|salary|department|
+---+-----+----------+----+------+----------+
|  1| John|1992-05-12|  30| 70000|        IT|
|  1| John|1992-05-12|  30| 70000|        IT|
|  1| John|1992-05-12|  30| 70000|        IT|
|  1| John|1992-05-12|  30| 70000|        IT|
|  1| John|1992-05-12|  30| 70000|        IT|
|  1| John|1992-05-12|  30| 70000|        IT|
|  2|Alice|1997-02-28|  25| 90000|   Finance|
|  2|Alice|1997-02-28|  25| 60000|        HR|
|  2|Alice|1997-02-28|  25| 90000|   Finance|
|  2|Alice|1997-02-28|  25| 60000|        HR|
|  2|Alice|1997-02-28|  25| 90000|   Finance|
|  2|Alice|1997-02-28|  25| 60000|        HR|
|  3|  Bob|      null|null| 80000|        IT|
|  3|  Bob|      null|null| 80000|        IT|
|  3|  Bob|      null|null| 80000|        IT|
|  4|Emily|1994-11-22|  28| 70000|   Finance|
|  4|Emily|1994-11-22|  28| 65000|   Finance|
|  4|Emily|1994-11-22|  28| 70000|   Finance|
|  4|Emily|1994-11-22|  28| 65000|

In [0]:
df = df_csv.union(df_json)
df.show()

+---+-----+----------+----+------+----------+
| id| name|       dob| age|salary|department|
+---+-----+----------+----+------+----------+
|  1| John|1992-05-12|  30| 70000|        IT|
|  2|Alice|1997-02-28|  25| 60000|        HR|
|  3|  Bob|      null|null| 80000|        IT|
|  4|Emily|1994-11-22|  28| 65000|   Finance|
|  1| John|1992-05-12|  30| 70000|        IT|
|  1| John|1992-05-12|  30| 70000|        IT|
|  1| John|1992-05-12|  30| 70000|        IT|
|  1| John|1992-05-12|  30| 70000|        IT|
|  1| John|1992-05-12|  30| 70000|        IT|
|  1| John|1992-05-12|  30| 70000|        IT|
|  2|Alice|1997-02-28|  25| 60000|        HR|
|  2|Alice|1997-02-28|  25| 60000|        HR|
|  2|Alice|1997-02-28|  25| 60000|        HR|
|  2|Alice|1997-02-28|  25| 90000|   Finance|
|  2|Alice|1997-02-28|  25| 90000|   Finance|
|  2|Alice|1997-02-28|  25| 90000|   Finance|
|  3|  Bob|      null|null| 80000|        IT|
|  3|  Bob|      null|null| 80000|        IT|
|  3|  Bob|      null|null| 80000|

In [0]:
print(df_csv.columns, df_json.columns)

['id', 'name', 'dob', 'age', 'salary', 'department'] ['id', 'name', 'dob', 'age', 'salary', 'department']


In [0]:
df_json = df_json.select(df_csv.columns)

In [0]:
df_json.show()

+---+-----+----------+----+------+----------+
| id| name|       dob| age|salary|department|
+---+-----+----------+----+------+----------+
|  1| John|1992-05-12|  30| 70000|        IT|
|  1| John|1992-05-12|  30| 70000|        IT|
|  1| John|1992-05-12|  30| 70000|        IT|
|  1| John|1992-05-12|  30| 70000|        IT|
|  1| John|1992-05-12|  30| 70000|        IT|
|  1| John|1992-05-12|  30| 70000|        IT|
|  2|Alice|1997-02-28|  25| 90000|   Finance|
|  2|Alice|1997-02-28|  25| 60000|        HR|
|  2|Alice|1997-02-28|  25| 90000|   Finance|
|  2|Alice|1997-02-28|  25| 60000|        HR|
|  2|Alice|1997-02-28|  25| 90000|   Finance|
|  2|Alice|1997-02-28|  25| 60000|        HR|
|  3|  Bob|      null|null| 80000|        IT|
|  3|  Bob|      null|null| 80000|        IT|
|  3|  Bob|      null|null| 80000|        IT|
|  4|Emily|1994-11-22|  28| 70000|   Finance|
|  4|Emily|1994-11-22|  28| 65000|   Finance|
|  4|Emily|1994-11-22|  28| 70000|   Finance|
|  4|Emily|1994-11-22|  28| 65000|

In [0]:
df = df_csv.union(df_json)

In [0]:
df.show()

+---+-----+----------+----+------+----------+
| id| name|       dob| age|salary|department|
+---+-----+----------+----+------+----------+
|  1| John|1992-05-12|  30| 70000|        IT|
|  2|Alice|1997-02-28|  25| 60000|        HR|
|  3|  Bob|      null|null| 80000|        IT|
|  4|Emily|1994-11-22|  28| 65000|   Finance|
|  1| John|1992-05-12|  30| 70000|        IT|
|  1| John|1992-05-12|  30| 70000|        IT|
|  1| John|1992-05-12|  30| 70000|        IT|
|  1| John|1992-05-12|  30| 70000|        IT|
|  1| John|1992-05-12|  30| 70000|        IT|
|  1| John|1992-05-12|  30| 70000|        IT|
|  2|Alice|1997-02-28|  25| 60000|        HR|
|  2|Alice|1997-02-28|  25| 60000|        HR|
|  2|Alice|1997-02-28|  25| 60000|        HR|
|  2|Alice|1997-02-28|  25| 90000|   Finance|
|  2|Alice|1997-02-28|  25| 90000|   Finance|
|  2|Alice|1997-02-28|  25| 90000|   Finance|
|  3|  Bob|      null|null| 80000|        IT|
|  3|  Bob|      null|null| 80000|        IT|
|  3|  Bob|      null|null| 80000|

In [0]:
df_json.printSchema()

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- dob: date (nullable = true)
 |-- age: integer (nullable = true)
 |-- salary: integer (nullable = true)
 |-- department: string (nullable = true)



In [0]:
df = df.dropDuplicates()

In [0]:
df.show()

+---+------+----------+----+------+----------+
| id|  name|       dob| age|salary|department|
+---+------+----------+----+------+----------+
|  3|   Bob|      null|null| 80000|        IT|
|  4| Emily|1994-11-22|  28| 65000|   Finance|
|  2| Alice|1997-02-28|  25| 60000|        HR|
|  1|  John|1992-05-12|  30| 70000|        IT|
|  5| David|1981-12-18|  41| 90000|        HR|
|  6| Susan|1989-07-05|  33| 75000|   Finance|
|  7|  Mike|1976-03-15|  46| 95000|        IT|
| 10|Sophie|1992-06-30|  30| 62000|   Finance|
|  2| Alice|1997-02-28|  25| 90000|   Finance|
|  4| Emily|1994-11-22|  28| 70000|   Finance|
|  9| James|1983-10-14|  39| 87000|        IT|
|  8|  Lisa|1995-08-20|  27| 58000|        HR|
+---+------+----------+----+------+----------+



In [0]:
df.count()

Out[26]: 12

In [0]:
#Transformation
df.show()

+---+------+----------+----+------+----------+
| id|  name|       dob| age|salary|department|
+---+------+----------+----+------+----------+
|  3|   Bob|      null|null| 80000|        IT|
|  4| Emily|1994-11-22|  28| 65000|   Finance|
|  2| Alice|1997-02-28|  25| 60000|        HR|
|  1|  John|1992-05-12|  30| 70000|        IT|
|  5| David|1981-12-18|  41| 90000|        HR|
|  6| Susan|1989-07-05|  33| 75000|   Finance|
|  7|  Mike|1976-03-15|  46| 95000|        IT|
| 10|Sophie|1992-06-30|  30| 62000|   Finance|
|  2| Alice|1997-02-28|  25| 90000|   Finance|
|  4| Emily|1994-11-22|  28| 70000|   Finance|
|  9| James|1983-10-14|  39| 87000|        IT|
|  8|  Lisa|1995-08-20|  27| 58000|        HR|
+---+------+----------+----+------+----------+



In [0]:
df.select(
    "salary"
).show()

df.select(
    df.salary
).show()

df.select(
    ["salary"]
).show()



+------+
|salary|
+------+
| 80000|
| 65000|
| 60000|
| 70000|
| 90000|
| 75000|
| 95000|
| 62000|
| 90000|
| 70000|
| 87000|
| 58000|
+------+

+------+
|salary|
+------+
| 80000|
| 65000|
| 60000|
| 70000|
| 90000|
| 75000|
| 95000|
| 62000|
| 90000|
| 70000|
| 87000|
| 58000|
+------+

+------+
|salary|
+------+
| 80000|
| 65000|
| 60000|
| 70000|
| 90000|
| 75000|
| 95000|
| 62000|
| 90000|
| 70000|
| 87000|
| 58000|
+------+



In [0]:
df.select(
    "salary",
    "name",
    "age"
).show()


+------+------+----+
|salary|  name| age|
+------+------+----+
| 80000|   Bob|null|
| 65000| Emily|  28|
| 60000| Alice|  25|
| 70000|  John|  30|
| 90000| David|  41|
| 75000| Susan|  33|
| 95000|  Mike|  46|
| 62000|Sophie|  30|
| 90000| Alice|  25|
| 70000| Emily|  28|
| 87000| James|  39|
| 58000|  Lisa|  27|
+------+------+----+



In [0]:
from pyspark.sql import functions as F
# from pyspark.sql.functions import *   imports everything
# from pyspark.sql.functions import col,max,min can override python's max min functions

In [0]:
df.select(
    F.col("salary")
).show()

+------+
|salary|
+------+
| 80000|
| 65000|
| 60000|
| 70000|
| 90000|
| 75000|
| 95000|
| 62000|
| 90000|
| 70000|
| 87000|
| 58000|
+------+



In [0]:
df.select(
    df.salary + 0.05* df.salary.alias("salary_raised"),
    F.year(F.current_timestamp())-F.year("dob"),
    F.year(F.current_timestamp())-F.year(F.col("dob")),
).show()

+-------------------------------------------+---------------------------------------+---------------------------------------+
|(salary + (salary AS salary_raised * 0.05))|(year(current_timestamp()) - year(dob))|(year(current_timestamp()) - year(dob))|
+-------------------------------------------+---------------------------------------+---------------------------------------+
|                                    84000.0|                                   null|                                   null|
|                                    68250.0|                                     30|                                     30|
|                                    63000.0|                                     27|                                     27|
|                                    73500.0|                                     32|                                     32|
|                                    94500.0|                                     43|                                 

In [0]:
df.select(
    "*",
    df.salary + 0.05* df.salary,
    F.expr("salary + 0.05* salary").alias("raise"),
    (F.year(F.current_timestamp())-F.year(F.col("dob"))).alias("ageee"),
).show()

+---+------+----------+----+------+----------+--------------------------+--------+-----+
| id|  name|       dob| age|salary|department|(salary + (salary * 0.05))|   raise|ageee|
+---+------+----------+----+------+----------+--------------------------+--------+-----+
|  3|   Bob|      null|null| 80000|        IT|                   84000.0|84000.00| null|
|  4| Emily|1994-11-22|  28| 65000|   Finance|                   68250.0|68250.00|   30|
|  2| Alice|1997-02-28|  25| 60000|        HR|                   63000.0|63000.00|   27|
|  1|  John|1992-05-12|  30| 70000|        IT|                   73500.0|73500.00|   32|
|  5| David|1981-12-18|  41| 90000|        HR|                   94500.0|94500.00|   43|
|  6| Susan|1989-07-05|  33| 75000|   Finance|                   78750.0|78750.00|   35|
|  7|  Mike|1976-03-15|  46| 95000|        IT|                   99750.0|99750.00|   48|
| 10|Sophie|1992-06-30|  30| 62000|   Finance|                   65100.0|65100.00|   32|
|  2| Alice|1997-02-2

In [0]:
#with column

In [0]:
df = df.withColumn(
    "raise",
    (F.col("salary") + 0.05 *F.col("salary"))
)


df = df.withColumn(
    "raise_10percent",
    (F.col("salary") + .1 *F.col("salary"))
)
df.show()

+---+------+----------+----+------+----------+-------+---------------+
| id|  name|       dob| age|salary|department|  raise|raise_10percent|
+---+------+----------+----+------+----------+-------+---------------+
|  3|   Bob|      null|null| 80000|        IT|84000.0|        88000.0|
|  4| Emily|1994-11-22|  28| 65000|   Finance|68250.0|        71500.0|
|  2| Alice|1997-02-28|  25| 60000|        HR|63000.0|        66000.0|
|  1|  John|1992-05-12|  30| 70000|        IT|73500.0|        77000.0|
|  5| David|1981-12-18|  41| 90000|        HR|94500.0|        99000.0|
|  6| Susan|1989-07-05|  33| 75000|   Finance|78750.0|        82500.0|
|  7|  Mike|1976-03-15|  46| 95000|        IT|99750.0|       104500.0|
| 10|Sophie|1992-06-30|  30| 62000|   Finance|65100.0|        68200.0|
|  2| Alice|1997-02-28|  25| 90000|   Finance|94500.0|        99000.0|
|  4| Emily|1994-11-22|  28| 70000|   Finance|73500.0|        77000.0|
|  9| James|1983-10-14|  39| 87000|        IT|91350.0|        95700.0|
|  8| 

In [0]:
#or
df = df.withColumn(
    "raise",
    (F.col("salary") + 0.05 *F.col("salary"))
).withColumn(
    "raise_10percent",
    (F.col("salary") + .1 *F.col("salary"))
)
df.show()

+---+------+----------+----+------+----------+-------+---------------+
| id|  name|       dob| age|salary|department|  raise|raise_10percent|
+---+------+----------+----+------+----------+-------+---------------+
|  3|   Bob|      null|null| 80000|        IT|84000.0|        88000.0|
|  4| Emily|1994-11-22|  28| 65000|   Finance|68250.0|        71500.0|
|  2| Alice|1997-02-28|  25| 60000|        HR|63000.0|        66000.0|
|  1|  John|1992-05-12|  30| 70000|        IT|73500.0|        77000.0|
|  5| David|1981-12-18|  41| 90000|        HR|94500.0|        99000.0|
|  6| Susan|1989-07-05|  33| 75000|   Finance|78750.0|        82500.0|
|  7|  Mike|1976-03-15|  46| 95000|        IT|99750.0|       104500.0|
| 10|Sophie|1992-06-30|  30| 62000|   Finance|65100.0|        68200.0|
|  2| Alice|1997-02-28|  25| 90000|   Finance|94500.0|        99000.0|
|  4| Emily|1994-11-22|  28| 70000|   Finance|73500.0|        77000.0|
|  9| James|1983-10-14|  39| 87000|        IT|91350.0|        95700.0|
|  8| 

In [0]:
df = df.withColumns(
    {
        "raise": df.salary + 0.05 *F.col("salary"),
        "age": F.year(F.current_timestamp())-F.year(F.col("dob"))
    }
)

In [0]:
df.show()

+---+------+----------+----+------+----------+-------+---------------+
| id|  name|       dob| age|salary|department|  raise|raise_10percent|
+---+------+----------+----+------+----------+-------+---------------+
|  3|   Bob|      null|null| 80000|        IT|84000.0|        88000.0|
|  4| Emily|1994-11-22|  30| 65000|   Finance|68250.0|        71500.0|
|  2| Alice|1997-02-28|  27| 60000|        HR|63000.0|        66000.0|
|  1|  John|1992-05-12|  32| 70000|        IT|73500.0|        77000.0|
|  5| David|1981-12-18|  43| 90000|        HR|94500.0|        99000.0|
|  6| Susan|1989-07-05|  35| 75000|   Finance|78750.0|        82500.0|
|  7|  Mike|1976-03-15|  48| 95000|        IT|99750.0|       104500.0|
| 10|Sophie|1992-06-30|  32| 62000|   Finance|65100.0|        68200.0|
|  2| Alice|1997-02-28|  27| 90000|   Finance|94500.0|        99000.0|
|  4| Emily|1994-11-22|  30| 70000|   Finance|73500.0|        77000.0|
|  9| James|1983-10-14|  41| 87000|        IT|91350.0|        95700.0|
|  8| 

In [0]:
#filter
df.filter(
    F.col("raise")> 75000
).show()

+---+-----+----------+----+------+----------+-------+---------------+
| id| name|       dob| age|salary|department|  raise|raise_10percent|
+---+-----+----------+----+------+----------+-------+---------------+
|  3|  Bob|      null|null| 80000|        IT|84000.0|        88000.0|
|  5|David|1981-12-18|  43| 90000|        HR|94500.0|        99000.0|
|  6|Susan|1989-07-05|  35| 75000|   Finance|78750.0|        82500.0|
|  7| Mike|1976-03-15|  48| 95000|        IT|99750.0|       104500.0|
|  2|Alice|1997-02-28|  27| 90000|   Finance|94500.0|        99000.0|
|  9|James|1983-10-14|  41| 87000|        IT|91350.0|        95700.0|
+---+-----+----------+----+------+----------+-------+---------------+



In [0]:
#filter
df.where(
    F.col("raise")>= 75000
).show()

+---+-----+----------+----+------+----------+-------+---------------+
| id| name|       dob| age|salary|department|  raise|raise_10percent|
+---+-----+----------+----+------+----------+-------+---------------+
|  3|  Bob|      null|null| 80000|        IT|84000.0|        88000.0|
|  5|David|1981-12-18|  43| 90000|        HR|94500.0|        99000.0|
|  6|Susan|1989-07-05|  35| 75000|   Finance|78750.0|        82500.0|
|  7| Mike|1976-03-15|  48| 95000|        IT|99750.0|       104500.0|
|  2|Alice|1997-02-28|  27| 90000|   Finance|94500.0|        99000.0|
|  9|James|1983-10-14|  41| 87000|        IT|91350.0|        95700.0|
+---+-----+----------+----+------+----------+-------+---------------+



In [0]:
from pyspark.sql import functions as F

df = df.withColumn(
    "age-group",
    F.when(
        F.col("age") <= 20,
        "upto 20"
    ).when(
        (
            (F.col("age") > 20) &  # Corrected the logical AND operator
            (F.col("age") <= 30)
        ),
        "20 to 30"
    ).when(
        (
            (F.col("age") > 30) &  # Corrected the logical AND operator
            (F.col("age") <= 40)
        ),
        "30 to 40"
    ).otherwise(
        "More than 40"
    )
)


In [0]:
df.show()

+---+------+----------+----+------+----------+-------+---------------+------------+
| id|  name|       dob| age|salary|department|  raise|raise_10percent|   age-group|
+---+------+----------+----+------+----------+-------+---------------+------------+
|  3|   Bob|      null|null| 80000|        IT|84000.0|        88000.0|More than 40|
|  4| Emily|1994-11-22|  30| 65000|   Finance|68250.0|        71500.0|    20 to 30|
|  2| Alice|1997-02-28|  27| 60000|        HR|63000.0|        66000.0|    20 to 30|
|  1|  John|1992-05-12|  32| 70000|        IT|73500.0|        77000.0|    30 to 40|
|  5| David|1981-12-18|  43| 90000|        HR|94500.0|        99000.0|More than 40|
|  6| Susan|1989-07-05|  35| 75000|   Finance|78750.0|        82500.0|    30 to 40|
|  7|  Mike|1976-03-15|  48| 95000|        IT|99750.0|       104500.0|More than 40|
| 10|Sophie|1992-06-30|  32| 62000|   Finance|65100.0|        68200.0|    30 to 40|
|  2| Alice|1997-02-28|  27| 90000|   Finance|94500.0|        99000.0|    20

In [0]:
df = df.withColumn(
   "company" ,
     F.lit("Abacus Insights")
)

In [0]:
df.show()

+---+------+----------+----+------+----------+-------+---------------+------------+---------------+
| id|  name|       dob| age|salary|department|  raise|raise_10percent|   age-group|        company|
+---+------+----------+----+------+----------+-------+---------------+------------+---------------+
|  3|   Bob|      null|null| 80000|        IT|84000.0|        88000.0|More than 40|Abacus Insights|
|  4| Emily|1994-11-22|  30| 65000|   Finance|68250.0|        71500.0|    20 to 30|Abacus Insights|
|  2| Alice|1997-02-28|  27| 60000|        HR|63000.0|        66000.0|    20 to 30|Abacus Insights|
|  1|  John|1992-05-12|  32| 70000|        IT|73500.0|        77000.0|    30 to 40|Abacus Insights|
|  5| David|1981-12-18|  43| 90000|        HR|94500.0|        99000.0|More than 40|Abacus Insights|
|  6| Susan|1989-07-05|  35| 75000|   Finance|78750.0|        82500.0|    30 to 40|Abacus Insights|
|  7|  Mike|1976-03-15|  48| 95000|        IT|99750.0|       104500.0|More than 40|Abacus Insights|


In [0]:
df = df.withColumns(
    {
        "age": F.coalesce(
            F.year(F.current_timestamp())- F.year(F.col("dob")),
            F.lit(-1),
        ),
       
    })

In [0]:
df.show()

+---+------+----------+---+------+----------+-------+---------------+------------+---------------+
| id|  name|       dob|age|salary|department|  raise|raise_10percent|   age-group|        company|
+---+------+----------+---+------+----------+-------+---------------+------------+---------------+
|  3|   Bob|      null| -1| 80000|        IT|84000.0|        88000.0|More than 40|Abacus Insights|
|  4| Emily|1994-11-22| 30| 65000|   Finance|68250.0|        71500.0|    20 to 30|Abacus Insights|
|  2| Alice|1997-02-28| 27| 60000|        HR|63000.0|        66000.0|    20 to 30|Abacus Insights|
|  1|  John|1992-05-12| 32| 70000|        IT|73500.0|        77000.0|    30 to 40|Abacus Insights|
|  5| David|1981-12-18| 43| 90000|        HR|94500.0|        99000.0|More than 40|Abacus Insights|
|  6| Susan|1989-07-05| 35| 75000|   Finance|78750.0|        82500.0|    30 to 40|Abacus Insights|
|  7|  Mike|1976-03-15| 48| 95000|        IT|99750.0|       104500.0|More than 40|Abacus Insights|
| 10|Sophi

In [0]:
df = df.withColumns(
    {
        "age": F.coalesce(
            F.year(F.current_timestamp())- F.year(F.col("dob")),
            F.lit(-1),
        ),
        "has_dob_1" : ~F.isnull("dob"),
        "has_dob_2" : F.col("dob").isNotNull(),
    })

In [0]:
df.show()

+---+------+----------+---+------+----------+-------+---------------+------------+---------------+---------+---------+
| id|  name|       dob|age|salary|department|  raise|raise_10percent|   age-group|        company|has_dob_1|has_dob_2|
+---+------+----------+---+------+----------+-------+---------------+------------+---------------+---------+---------+
|  3|   Bob|      null| -1| 80000|        IT|84000.0|        88000.0|More than 40|Abacus Insights|    false|    false|
|  4| Emily|1994-11-22| 30| 65000|   Finance|68250.0|        71500.0|    20 to 30|Abacus Insights|     true|     true|
|  2| Alice|1997-02-28| 27| 60000|        HR|63000.0|        66000.0|    20 to 30|Abacus Insights|     true|     true|
|  1|  John|1992-05-12| 32| 70000|        IT|73500.0|        77000.0|    30 to 40|Abacus Insights|     true|     true|
|  5| David|1981-12-18| 43| 90000|        HR|94500.0|        99000.0|More than 40|Abacus Insights|     true|     true|
|  6| Susan|1989-07-05| 35| 75000|   Finance|787

In [0]:
df = df.drop("has_dob_2")

In [0]:
df = df.withColumnRenamed("name", "first_name")

In [0]:
print(df.columns)

['id', 'first_name', 'dob', 'age', 'salary', 'department', 'raise', 'raise_10percent', 'age-group', 'company', 'has_dob_1']


In [0]:
df

Out[52]: DataFrame[id: int, first_name: string, dob: date, age: int, salary: int, department: string, raise: double, raise_10percent: double, age-group: string, company: string, has_dob_1: boolean]

In [0]:
df.show()

+---+----------+----------+---+------+----------+-------+---------------+------------+---------------+---------+
| id|first_name|       dob|age|salary|department|  raise|raise_10percent|   age-group|        company|has_dob_1|
+---+----------+----------+---+------+----------+-------+---------------+------------+---------------+---------+
|  3|       Bob|      null| -1| 80000|        IT|84000.0|        88000.0|More than 40|Abacus Insights|    false|
|  4|     Emily|1994-11-22| 30| 65000|   Finance|68250.0|        71500.0|    20 to 30|Abacus Insights|     true|
|  2|     Alice|1997-02-28| 27| 60000|        HR|63000.0|        66000.0|    20 to 30|Abacus Insights|     true|
|  1|      John|1992-05-12| 32| 70000|        IT|73500.0|        77000.0|    30 to 40|Abacus Insights|     true|
|  5|     David|1981-12-18| 43| 90000|        HR|94500.0|        99000.0|More than 40|Abacus Insights|     true|
|  6|     Susan|1989-07-05| 35| 75000|   Finance|78750.0|        82500.0|    30 to 40|Abacus Ins

In [0]:
from pyspark.sql.window import Window

In [0]:
window = Window.partitionBy("department")

df.withColumn(
    "mean_salary",
    F.mean("salary").over(window)>=F.col("salary")
).show()

+---+----------+----------+---+------+----------+-------+---------------+------------+---------------+---------+-----------+
| id|first_name|       dob|age|salary|department|  raise|raise_10percent|   age-group|        company|has_dob_1|mean_salary|
+---+----------+----------+---+------+----------+-------+---------------+------------+---------------+---------+-----------+
|  4|     Emily|1994-11-22| 30| 65000|   Finance|68250.0|        71500.0|    20 to 30|Abacus Insights|     true|       true|
|  6|     Susan|1989-07-05| 35| 75000|   Finance|78750.0|        82500.0|    30 to 40|Abacus Insights|     true|      false|
| 10|    Sophie|1992-06-30| 32| 62000|   Finance|65100.0|        68200.0|    30 to 40|Abacus Insights|     true|       true|
|  2|     Alice|1997-02-28| 27| 90000|   Finance|94500.0|        99000.0|    20 to 30|Abacus Insights|     true|      false|
|  4|     Emily|1994-11-22| 30| 70000|   Finance|73500.0|        77000.0|    20 to 30|Abacus Insights|     true|       true|


In [0]:
# calculate mean salary and check if it is greater or equal to the salary of employees in each department.
# calculate mean salary and check if it is greater or equal to the salary of all employees

In [0]:
df_par = spark.read.format("parquet").schema(schema).load("dbfs:/FileStore/tables/data/parquet")

In [0]:
df_par.show()

+----+----+----+----+------+----------+
|  id|name| dob| age|salary|department|
+----+----+----+----+------+----------+
|null|null|null|null|  null|   Finance|
|null|null|null|null|  null|        HR|
|null|null|null|null|  null|        IT|
|null|null|null|null|  null|  Delivery|
|null|null|null|null|  null|   Finance|
|null|null|null|null|  null|        HR|
|null|null|null|null|  null|        IT|
|null|null|null|null|  null|  Delivery|
|null|null|null|null|  null|   Finance|
|null|null|null|null|  null|        HR|
|null|null|null|null|  null|        IT|
|null|null|null|null|  null|  Delivery|
+----+----+----+----+------+----------+



In [0]:
df.show()

+---+----------+----------+---+------+----------+-------+---------------+------------+---------------+---------+
| id|first_name|       dob|age|salary|department|  raise|raise_10percent|   age-group|        company|has_dob_1|
+---+----------+----------+---+------+----------+-------+---------------+------------+---------------+---------+
|  3|       Bob|      null| -1| 80000|        IT|84000.0|        88000.0|More than 40|Abacus Insights|    false|
|  4|     Emily|1994-11-22| 30| 65000|   Finance|68250.0|        71500.0|    20 to 30|Abacus Insights|     true|
|  2|     Alice|1997-02-28| 27| 60000|        HR|63000.0|        66000.0|    20 to 30|Abacus Insights|     true|
|  1|      John|1992-05-12| 32| 70000|        IT|73500.0|        77000.0|    30 to 40|Abacus Insights|     true|
|  5|     David|1981-12-18| 43| 90000|        HR|94500.0|        99000.0|More than 40|Abacus Insights|     true|
|  6|     Susan|1989-07-05| 35| 75000|   Finance|78750.0|        82500.0|    30 to 40|Abacus Ins

In [0]:
df.join(
    df_par,
    "department",
    "semi"
).show()

+----------+---+----------+----------+---+------+-------+---------------+------------+---------------+---------+
|department| id|first_name|       dob|age|salary|  raise|raise_10percent|   age-group|        company|has_dob_1|
+----------+---+----------+----------+---+------+-------+---------------+------------+---------------+---------+
|        IT|  3|       Bob|      null| -1| 80000|84000.0|        88000.0|More than 40|Abacus Insights|    false|
|   Finance|  4|     Emily|1994-11-22| 30| 65000|68250.0|        71500.0|    20 to 30|Abacus Insights|     true|
|        HR|  2|     Alice|1997-02-28| 27| 60000|63000.0|        66000.0|    20 to 30|Abacus Insights|     true|
|        IT|  1|      John|1992-05-12| 32| 70000|73500.0|        77000.0|    30 to 40|Abacus Insights|     true|
|        HR|  5|     David|1981-12-18| 43| 90000|94500.0|        99000.0|More than 40|Abacus Insights|     true|
|   Finance|  6|     Susan|1989-07-05| 35| 75000|78750.0|        82500.0|    30 to 40|Abacus Ins

In [0]:
df.join(
    df_par,
    df.department == df_par.department,
    "semi"
).show()

+---+----------+----------+---+------+----------+-------+---------------+------------+---------------+---------+
| id|first_name|       dob|age|salary|department|  raise|raise_10percent|   age-group|        company|has_dob_1|
+---+----------+----------+---+------+----------+-------+---------------+------------+---------------+---------+
|  3|       Bob|      null| -1| 80000|        IT|84000.0|        88000.0|More than 40|Abacus Insights|    false|
|  4|     Emily|1994-11-22| 30| 65000|   Finance|68250.0|        71500.0|    20 to 30|Abacus Insights|     true|
|  2|     Alice|1997-02-28| 27| 60000|        HR|63000.0|        66000.0|    20 to 30|Abacus Insights|     true|
|  1|      John|1992-05-12| 32| 70000|        IT|73500.0|        77000.0|    30 to 40|Abacus Insights|     true|
|  5|     David|1981-12-18| 43| 90000|        HR|94500.0|        99000.0|More than 40|Abacus Insights|     true|
|  6|     Susan|1989-07-05| 35| 75000|   Finance|78750.0|        82500.0|    30 to 40|Abacus Ins

In [0]:
df = df.alias("main")
df_par = df_par.alias("reference")

In [0]:
df =df.join(
    df_par,
    F.col("main.department") == F.col("reference.department"),
    "left"
)

In [0]:
df.show()

+---+----------+----------+---+------+----------+-------+---------------+------------+---------------+---------+----+----+----+----+------+----------+
| id|first_name|       dob|age|salary|department|  raise|raise_10percent|   age-group|        company|has_dob_1|  id|name| dob| age|salary|department|
+---+----------+----------+---+------+----------+-------+---------------+------------+---------------+---------+----+----+----+----+------+----------+
|  3|       Bob|      null| -1| 80000|        IT|84000.0|        88000.0|More than 40|Abacus Insights|    false|null|null|null|null|  null|        IT|
|  3|       Bob|      null| -1| 80000|        IT|84000.0|        88000.0|More than 40|Abacus Insights|    false|null|null|null|null|  null|        IT|
|  3|       Bob|      null| -1| 80000|        IT|84000.0|        88000.0|More than 40|Abacus Insights|    false|null|null|null|null|  null|        IT|
|  4|     Emily|1994-11-22| 30| 65000|   Finance|68250.0|        71500.0|    20 to 30|Abacus I

In [0]:
# load save data

In [0]:
df.write.mode("overwrite").format("csv").save("dbfs:/FileStore/tables/final")

[0;31m---------------------------------------------------------------------------[0m
[0;31mAnalysisException[0m                         Traceback (most recent call last)
File [0;32m<command-4005864246803051>:1[0m
[0;32m----> 1[0m [43mdf[49m[38;5;241;43m.[39;49m[43mwrite[49m[38;5;241;43m.[39;49m[43mmode[49m[43m([49m[38;5;124;43m"[39;49m[38;5;124;43moverwrite[39;49m[38;5;124;43m"[39;49m[43m)[49m[38;5;241;43m.[39;49m[43mformat[49m[43m([49m[38;5;124;43m"[39;49m[38;5;124;43mcsv[39;49m[38;5;124;43m"[39;49m[43m)[49m[38;5;241;43m.[39;49m[43msave[49m[43m([49m[38;5;124;43m"[39;49m[38;5;124;43mdbfs:/FileStore/tables/final[39;49m[38;5;124;43m"[39;49m[43m)[49m

File [0;32m/databricks/spark/python/pyspark/instrumentation_utils.py:48[0m, in [0;36m_wrap_function.<locals>.wrapper[0;34m(*args, **kwargs)[0m
[1;32m     46[0m start [38;5;241m=[39m time[38;5;241m.[39mperf_counter()
[1;32m     47[0m [38;5;28;01mtry[39;00m:
[0;32m---> 48

In [0]:
%fs ls dbfs:/FileStore/tables/final



In [0]:
spark.sql("show tables").show()



In [0]:
df.write.mode("overwrite").format("delta").saveAsTable("final")



In [0]:
spark.sql("show tables").show()



In [0]:
%fs ls dbfs:/user/hive/warehouse/final/_delta_log




In [0]:
%sql
desc default.final



In [0]:
%sql
desc detail default.final



In [0]:
%sql
desc extended default.final



In [0]:
%sql
select * except (raise, raise_10percent, age, company, dob)from final



In [0]:
table_df = spark.table("final")

#to directly load from delta unlike spark.read.format



In [0]:
display(table_df)

