In [0]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()
spark

In [0]:
%fs ls dbfs:///FileStore/tables/parquet

path,name,size,modificationTime
dbfs:/FileStore/tables/parquet/reference.parquet,reference.parquet,515,1707461332000


In [0]:
df_csv = spark.read.format("csv").load("dbfs:/FileStore/tables/csv/batch.csv")

In [0]:
df_csv = spark.read.format("csv").option("header", "true").load("dbfs:/FileStore/tables/csv/batch.csv")
df_csv.show()

+---+-----+----------+----+------+----------+
| id| name|       dob| age|salary|department|
+---+-----+----------+----+------+----------+
|  1| John|1992-05-12|  30| 70000|        IT|
|  2|Alice|1997-02-28|  25| 60000|        HR|
|  3|  Bob|      null|null| 80000|        IT|
|  4|Emily|1994-11-22|  28| 65000|   Finance|
+---+-----+----------+----+------+----------+



In [0]:
df_csv.printSchema()

root
 |-- id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- dob: string (nullable = true)
 |-- age: string (nullable = true)
 |-- salary: string (nullable = true)
 |-- department: string (nullable = true)



In [0]:
from pyspark.sql.types import StructType, StructField, IntegerType, DateType, StringType

In [0]:
sch = StructType([
    StructField("id", IntegerType()),
    StructField("name", StringType()),
    StructField("dob", DateType()),
    StructField("age", IntegerType()),
    StructField("salary", IntegerType()),
    StructField("department", StringType()),
])

In [0]:
df_csv = spark.read.format("csv").schema(sch).option("header", True).load("dbfs:////FileStore/tables/csv/batch.csv")

In [0]:
df_csv.printSchema()

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- dob: date (nullable = true)
 |-- age: integer (nullable = true)
 |-- salary: integer (nullable = true)
 |-- department: string (nullable = true)



In [0]:
df_json = spark.read.format("json").load("dbfs:/FileStore/tables/json")
df_json.show()

+----+----------+----------+---+------+------+
| age|department|       dob| id|  name|salary|
+----+----------+----------+---+------+------+
|  30|        IT|1992-05-12|  1|  John| 70000|
|  25|        HR|1997-02-28|  2| Alice| 60000|
|null|        IT|      null|  3|   Bob| 80000|
|  28|   Finance|1994-11-22|  4| Emily| 65000|
|  41|        HR|1981-12-18|  5| David| 90000|
|  33|   Finance|1989-07-05|  6| Susan| 75000|
|  46|        IT|1976-03-15|  7|  Mike| 95000|
|  30|   Finance|1992-06-30| 10|Sophie| 62000|
|  25|   Finance|1997-02-28|  2| Alice| 90000|
|  28|   Finance|1994-11-22|  4| Emily| 70000|
|  39|        IT|1983-10-14|  9| James| 87000|
|  30|        IT|1992-05-12|  1|  John| 70000|
|  27|        HR|1995-08-20|  8|  Lisa| 58000|
+----+----------+----------+---+------+------+



In [0]:
df_json.printSchema()

root
 |-- age: long (nullable = true)
 |-- department: string (nullable = true)
 |-- dob: string (nullable = true)
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- salary: long (nullable = true)



In [0]:
df_json = df_json.orderBy("id")
df_json.show()

+----+----------+----------+---+------+------+
| age|department|       dob| id|  name|salary|
+----+----------+----------+---+------+------+
|  30|        IT|1992-05-12|  1|  John| 70000|
|  30|        IT|1992-05-12|  1|  John| 70000|
|  25|   Finance|1997-02-28|  2| Alice| 90000|
|  25|        HR|1997-02-28|  2| Alice| 60000|
|null|        IT|      null|  3|   Bob| 80000|
|  28|   Finance|1994-11-22|  4| Emily| 70000|
|  28|   Finance|1994-11-22|  4| Emily| 65000|
|  41|        HR|1981-12-18|  5| David| 90000|
|  33|   Finance|1989-07-05|  6| Susan| 75000|
|  46|        IT|1976-03-15|  7|  Mike| 95000|
|  27|        HR|1995-08-20|  8|  Lisa| 58000|
|  39|        IT|1983-10-14|  9| James| 87000|
|  30|   Finance|1992-06-30| 10|Sophie| 62000|
+----+----------+----------+---+------+------+



In [0]:
df_json = df_json.select(df_csv.columns)

In [0]:
df = df_csv.union(df_json)
df.show()

+---+------+----------+----+------+----------+
| id|  name|       dob| age|salary|department|
+---+------+----------+----+------+----------+
|  1|  John|1992-05-12|  30| 70000|        IT|
|  2| Alice|1997-02-28|  25| 60000|        HR|
|  3|   Bob|      null|null| 80000|        IT|
|  4| Emily|1994-11-22|  28| 65000|   Finance|
|  1|  John|1992-05-12|  30| 70000|        IT|
|  1|  John|1992-05-12|  30| 70000|        IT|
|  2| Alice|1997-02-28|  25| 60000|        HR|
|  2| Alice|1997-02-28|  25| 90000|   Finance|
|  3|   Bob|      null|null| 80000|        IT|
|  4| Emily|1994-11-22|  28| 65000|   Finance|
|  4| Emily|1994-11-22|  28| 70000|   Finance|
|  5| David|1981-12-18|  41| 90000|        HR|
|  6| Susan|1989-07-05|  33| 75000|   Finance|
|  7|  Mike|1976-03-15|  46| 95000|        IT|
|  8|  Lisa|1995-08-20|  27| 58000|        HR|
|  9| James|1983-10-14|  39| 87000|        IT|
| 10|Sophie|1992-06-30|  30| 62000|   Finance|
+---+------+----------+----+------+----------+



In [0]:
print(df_csv.columns, df_json.columns)

['id', 'name', 'dob', 'age', 'salary', 'department'] ['id', 'name', 'dob', 'age', 'salary', 'department']


In [0]:
df_csv.printSchema()

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- dob: date (nullable = true)
 |-- age: integer (nullable = true)
 |-- salary: integer (nullable = true)
 |-- department: string (nullable = true)



In [0]:
df_json = spark.read.format("json").schema(sch).load("dbfs:///FileStore/tables/json")

In [0]:
df_json.show()

+---+------+----------+----+------+----------+
| id|  name|       dob| age|salary|department|
+---+------+----------+----+------+----------+
|  1|  John|1992-05-12|  30| 70000|        IT|
|  2| Alice|1997-02-28|  25| 60000|        HR|
|  3|   Bob|      null|null| 80000|        IT|
|  4| Emily|1994-11-22|  28| 65000|   Finance|
|  5| David|1981-12-18|  41| 90000|        HR|
|  6| Susan|1989-07-05|  33| 75000|   Finance|
|  7|  Mike|1976-03-15|  46| 95000|        IT|
| 10|Sophie|1992-06-30|  30| 62000|   Finance|
|  2| Alice|1997-02-28|  25| 90000|   Finance|
|  4| Emily|1994-11-22|  28| 70000|   Finance|
|  9| James|1983-10-14|  39| 87000|        IT|
|  1|  John|1992-05-12|  30| 70000|        IT|
|  8|  Lisa|1995-08-20|  27| 58000|        HR|
+---+------+----------+----+------+----------+



In [0]:
df = df_json.dropDuplicates()

In [0]:
df.show()

+---+------+----------+----+------+----------+
| id|  name|       dob| age|salary|department|
+---+------+----------+----+------+----------+
|  5| David|1981-12-18|  41| 90000|        HR|
|  3|   Bob|      null|null| 80000|        IT|
|  4| Emily|1994-11-22|  28| 65000|   Finance|
|  2| Alice|1997-02-28|  25| 60000|        HR|
|  1|  John|1992-05-12|  30| 70000|        IT|
|  6| Susan|1989-07-05|  33| 75000|   Finance|
|  7|  Mike|1976-03-15|  46| 95000|        IT|
| 10|Sophie|1992-06-30|  30| 62000|   Finance|
|  2| Alice|1997-02-28|  25| 90000|   Finance|
|  4| Emily|1994-11-22|  28| 70000|   Finance|
|  9| James|1983-10-14|  39| 87000|        IT|
|  8|  Lisa|1995-08-20|  27| 58000|        HR|
+---+------+----------+----+------+----------+



In [0]:
df.count()

Out[373]: 12

In [0]:
df.select(
    ['salary']
).show()
df.select(
    'salary'
).show()
df.select(
    df.salary,
    df.age
).show()

+------+
|salary|
+------+
| 90000|
| 80000|
| 65000|
| 60000|
| 70000|
| 75000|
| 95000|
| 62000|
| 90000|
| 70000|
| 87000|
| 58000|
+------+

+------+
|salary|
+------+
| 90000|
| 80000|
| 65000|
| 60000|
| 70000|
| 75000|
| 95000|
| 62000|
| 90000|
| 70000|
| 87000|
| 58000|
+------+

+------+----+
|salary| age|
+------+----+
| 90000|  41|
| 80000|null|
| 65000|  28|
| 60000|  25|
| 70000|  30|
| 75000|  33|
| 95000|  46|
| 62000|  30|
| 90000|  25|
| 70000|  28|
| 87000|  39|
| 58000|  27|
+------+----+



unionall() returns duplicate values

In [0]:
from pyspark.sql import functions as F

In [0]:
df.select(
    (df.salary + .05*df.salary).alias("adjusted_salary"),
    (F.year(F.current_timestamp()) - F.year("dob")).alias("age1"),
    (F.year(F.current_timestamp()) - F.year(F.col("dob"))).alias("age2"),
).show()

+---------------+----+----+
|adjusted_salary|age1|age2|
+---------------+----+----+
|        94500.0|  43|  43|
|        84000.0|null|null|
|        68250.0|  30|  30|
|        63000.0|  27|  27|
|        73500.0|  32|  32|
|        78750.0|  35|  35|
|        99750.0|  48|  48|
|        65100.0|  32|  32|
|        94500.0|  27|  27|
|        73500.0|  30|  30|
|        91350.0|  41|  41|
|        60900.0|  29|  29|
+---------------+----+----+



In [0]:
df.show()

+---+------+----------+----+------+----------+
| id|  name|       dob| age|salary|department|
+---+------+----------+----+------+----------+
|  5| David|1981-12-18|  41| 90000|        HR|
|  3|   Bob|      null|null| 80000|        IT|
|  4| Emily|1994-11-22|  28| 65000|   Finance|
|  2| Alice|1997-02-28|  25| 60000|        HR|
|  1|  John|1992-05-12|  30| 70000|        IT|
|  6| Susan|1989-07-05|  33| 75000|   Finance|
|  7|  Mike|1976-03-15|  46| 95000|        IT|
| 10|Sophie|1992-06-30|  30| 62000|   Finance|
|  2| Alice|1997-02-28|  25| 90000|   Finance|
|  4| Emily|1994-11-22|  28| 70000|   Finance|
|  9| James|1983-10-14|  39| 87000|        IT|
|  8|  Lisa|1995-08-20|  27| 58000|        HR|
+---+------+----------+----+------+----------+



In [0]:
df.show()

+---+------+----------+----+------+----------+
| id|  name|       dob| age|salary|department|
+---+------+----------+----+------+----------+
|  5| David|1981-12-18|  41| 90000|        HR|
|  3|   Bob|      null|null| 80000|        IT|
|  4| Emily|1994-11-22|  28| 65000|   Finance|
|  2| Alice|1997-02-28|  25| 60000|        HR|
|  1|  John|1992-05-12|  30| 70000|        IT|
|  6| Susan|1989-07-05|  33| 75000|   Finance|
|  7|  Mike|1976-03-15|  46| 95000|        IT|
| 10|Sophie|1992-06-30|  30| 62000|   Finance|
|  2| Alice|1997-02-28|  25| 90000|   Finance|
|  4| Emily|1994-11-22|  28| 70000|   Finance|
|  9| James|1983-10-14|  39| 87000|        IT|
|  8|  Lisa|1995-08-20|  27| 58000|        HR|
+---+------+----------+----+------+----------+



In [0]:
df = df.withColumn(
    "salary_raise",
    (F.col("salary")+.05*F.col("salary"))
)

In [0]:
df.show()

+---+------+----------+----+------+----------+------------+
| id|  name|       dob| age|salary|department|salary_raise|
+---+------+----------+----+------+----------+------------+
|  5| David|1981-12-18|  41| 90000|        HR|     94500.0|
|  3|   Bob|      null|null| 80000|        IT|     84000.0|
|  4| Emily|1994-11-22|  28| 65000|   Finance|     68250.0|
|  2| Alice|1997-02-28|  25| 60000|        HR|     63000.0|
|  1|  John|1992-05-12|  30| 70000|        IT|     73500.0|
|  6| Susan|1989-07-05|  33| 75000|   Finance|     78750.0|
|  7|  Mike|1976-03-15|  46| 95000|        IT|     99750.0|
| 10|Sophie|1992-06-30|  30| 62000|   Finance|     65100.0|
|  2| Alice|1997-02-28|  25| 90000|   Finance|     94500.0|
|  4| Emily|1994-11-22|  28| 70000|   Finance|     73500.0|
|  9| James|1983-10-14|  39| 87000|        IT|     91350.0|
|  8|  Lisa|1995-08-20|  27| 58000|        HR|     60900.0|
+---+------+----------+----+------+----------+------------+



In [0]:
# df.select(
#     (F.expr("salary + 0.5 * salary")).alias("salary_raise"),
#     (F.year(F.current_timestamp()) - F.year("dob")),
#     (F.year(F.current_timestamp()) - F.year(F.col("dob"))),
# )
# df.show()

In [0]:
df =df.withColumn(
    "salary_raise",
    (F.col("salary")+.05*F.col("salary"))
)
df.show()

+---+------+----------+----+------+----------+------------+
| id|  name|       dob| age|salary|department|salary_raise|
+---+------+----------+----+------+----------+------------+
|  5| David|1981-12-18|  41| 90000|        HR|     94500.0|
|  3|   Bob|      null|null| 80000|        IT|     84000.0|
|  4| Emily|1994-11-22|  28| 65000|   Finance|     68250.0|
|  2| Alice|1997-02-28|  25| 60000|        HR|     63000.0|
|  1|  John|1992-05-12|  30| 70000|        IT|     73500.0|
|  6| Susan|1989-07-05|  33| 75000|   Finance|     78750.0|
|  7|  Mike|1976-03-15|  46| 95000|        IT|     99750.0|
| 10|Sophie|1992-06-30|  30| 62000|   Finance|     65100.0|
|  2| Alice|1997-02-28|  25| 90000|   Finance|     94500.0|
|  4| Emily|1994-11-22|  28| 70000|   Finance|     73500.0|
|  9| James|1983-10-14|  39| 87000|        IT|     91350.0|
|  8|  Lisa|1995-08-20|  27| 58000|        HR|     60900.0|
+---+------+----------+----+------+----------+------------+



In [0]:
df = df.withColumn(
    "salary_rasie_10_perc",
    (F.col("salary")+.1*F.col("salary"))
)
df.show()

+---+------+----------+----+------+----------+------------+--------------------+
| id|  name|       dob| age|salary|department|salary_raise|salary_rasie_10_perc|
+---+------+----------+----+------+----------+------------+--------------------+
|  5| David|1981-12-18|  41| 90000|        HR|     94500.0|             99000.0|
|  3|   Bob|      null|null| 80000|        IT|     84000.0|             88000.0|
|  4| Emily|1994-11-22|  28| 65000|   Finance|     68250.0|             71500.0|
|  2| Alice|1997-02-28|  25| 60000|        HR|     63000.0|             66000.0|
|  1|  John|1992-05-12|  30| 70000|        IT|     73500.0|             77000.0|
|  6| Susan|1989-07-05|  33| 75000|   Finance|     78750.0|             82500.0|
|  7|  Mike|1976-03-15|  46| 95000|        IT|     99750.0|            104500.0|
| 10|Sophie|1992-06-30|  30| 62000|   Finance|     65100.0|             68200.0|
|  2| Alice|1997-02-28|  25| 90000|   Finance|     94500.0|             99000.0|
|  4| Emily|1994-11-22|  28|

In [0]:
df = df.withColumns(
    {
        "salary_raise": F.col("salary")+0.5*F.col("Salary")
        # "age": F.year(F.current_timestamp())-F.year(F.col("dob"))
        
    }
)

In [0]:
df.filter(
    F.col("salary_raise")>= 75000
).show()

df.where(
    F.col("salary_raise")>= 75000
)

+---+------+----------+----+------+----------+------------+--------------------+
| id|  name|       dob| age|salary|department|salary_raise|salary_rasie_10_perc|
+---+------+----------+----+------+----------+------------+--------------------+
|  5| David|1981-12-18|  41| 90000|        HR|    135000.0|             99000.0|
|  3|   Bob|      null|null| 80000|        IT|    120000.0|             88000.0|
|  4| Emily|1994-11-22|  28| 65000|   Finance|     97500.0|             71500.0|
|  2| Alice|1997-02-28|  25| 60000|        HR|     90000.0|             66000.0|
|  1|  John|1992-05-12|  30| 70000|        IT|    105000.0|             77000.0|
|  6| Susan|1989-07-05|  33| 75000|   Finance|    112500.0|             82500.0|
|  7|  Mike|1976-03-15|  46| 95000|        IT|    142500.0|            104500.0|
| 10|Sophie|1992-06-30|  30| 62000|   Finance|     93000.0|             68200.0|
|  2| Alice|1997-02-28|  25| 90000|   Finance|    135000.0|             99000.0|
|  4| Emily|1994-11-22|  28|

In [0]:
df = df.withColumn(
    "age_group",
    F.when(
        F.col("age")<= 20,
        "Upto 20"
    ).when(
        (
            (F.col("age")> 20)&
            (F.col("age")<= 40)
        ),
        "31 to 40"
    ).otherwise(
        "More than 40"
    )
)

In [0]:
df = df.withColumns(
    {
        "age": F.coalesce(
            F.year(F.current_timestamp()) - F.year(F.col("dob")),
            F.lit(-1),
        ),
        "has_dob_1": ~F.isnull("dob"),
        "has_dob_2": F.col("dob").isNotNull(),
    }
)

drop column


In [0]:
df = df.drop("company", "has_dob_1", "has_dob_2", "salary_raise_10_perc")

In [0]:
df.show()

+---+------+----------+---+------+----------+------------+--------------------+------------+
| id|  name|       dob|age|salary|department|salary_raise|salary_rasie_10_perc|   age_group|
+---+------+----------+---+------+----------+------------+--------------------+------------+
|  5| David|1981-12-18| 43| 90000|        HR|    135000.0|             99000.0|More than 40|
|  3|   Bob|      null| -1| 80000|        IT|    120000.0|             88000.0|More than 40|
|  4| Emily|1994-11-22| 30| 65000|   Finance|     97500.0|             71500.0|    31 to 40|
|  2| Alice|1997-02-28| 27| 60000|        HR|     90000.0|             66000.0|    31 to 40|
|  1|  John|1992-05-12| 32| 70000|        IT|    105000.0|             77000.0|    31 to 40|
|  6| Susan|1989-07-05| 35| 75000|   Finance|    112500.0|             82500.0|    31 to 40|
|  7|  Mike|1976-03-15| 48| 95000|        IT|    142500.0|            104500.0|More than 40|
| 10|Sophie|1992-06-30| 32| 62000|   Finance|     93000.0|            

withColumnRenamed

In [0]:
df = df.withColumnRenamed("name", "first_name")

In [0]:
df.show()

+---+----------+----------+---+------+----------+------------+--------------------+------------+
| id|first_name|       dob|age|salary|department|salary_raise|salary_rasie_10_perc|   age_group|
+---+----------+----------+---+------+----------+------------+--------------------+------------+
|  5|     David|1981-12-18| 43| 90000|        HR|    135000.0|             99000.0|More than 40|
|  3|       Bob|      null| -1| 80000|        IT|    120000.0|             88000.0|More than 40|
|  4|     Emily|1994-11-22| 30| 65000|   Finance|     97500.0|             71500.0|    31 to 40|
|  2|     Alice|1997-02-28| 27| 60000|        HR|     90000.0|             66000.0|    31 to 40|
|  1|      John|1992-05-12| 32| 70000|        IT|    105000.0|             77000.0|    31 to 40|
|  6|     Susan|1989-07-05| 35| 75000|   Finance|    112500.0|             82500.0|    31 to 40|
|  7|      Mike|1976-03-15| 48| 95000|        IT|    142500.0|            104500.0|More than 40|
| 10|    Sophie|1992-06-30| 32

partition by


In [0]:
from pyspark.sql import Window

In [0]:
window = Window.partitionBy("age_group")

df.withColumn(
    "min_age",
    F.min("age").over(window)
)

Out[393]: DataFrame[id: int, first_name: string, dob: date, age: int, salary: int, department: string, salary_raise: double, salary_rasie_10_perc: double, age_group: string, min_age: int]

Calculate mean salary and check if it is greater or equal to salary of employees in each department
Calculate mean salary and check if it is greater or equal to the salary of all employees.

Join

In [0]:
ref_df = spark.read.format("parquet").load("dbfs:///FileStore/tables/parquet")

In [0]:
ref_df.show()

+----------+-------+--------+
|department|manager|    lead|
+----------+-------+--------+
|   Finance|  Megan|   Molly|
|        HR|   Brad|   Brian|
|        IT|  Chris|Chandler|
|  Delivery|   Leon|  Louise|
+----------+-------+--------+



In [0]:
ref_df.printSchema()

root
 |-- department: string (nullable = true)
 |-- manager: string (nullable = true)
 |-- lead: string (nullable = true)



In [0]:
df.join(
    ref_df,
    "department",
    "left",
).show()

+----------+---+----------+----------+---+------+------------+--------------------+------------+-------+--------+
|department| id|first_name|       dob|age|salary|salary_raise|salary_rasie_10_perc|   age_group|manager|    lead|
+----------+---+----------+----------+---+------+------------+--------------------+------------+-------+--------+
|        HR|  5|     David|1981-12-18| 43| 90000|    135000.0|             99000.0|More than 40|   Brad|   Brian|
|        IT|  3|       Bob|      null| -1| 80000|    120000.0|             88000.0|More than 40|  Chris|Chandler|
|   Finance|  4|     Emily|1994-11-22| 30| 65000|     97500.0|             71500.0|    31 to 40|  Megan|   Molly|
|        HR|  2|     Alice|1997-02-28| 27| 60000|     90000.0|             66000.0|    31 to 40|   Brad|   Brian|
|        IT|  1|      John|1992-05-12| 32| 70000|    105000.0|             77000.0|    31 to 40|  Chris|Chandler|
|   Finance|  6|     Susan|1989-07-05| 35| 75000|    112500.0|             82500.0|    3

In [0]:
df.join(
    ref_df,
    "department",
    "semi",
).show()

+----------+---+----------+----------+---+------+------------+--------------------+------------+
|department| id|first_name|       dob|age|salary|salary_raise|salary_rasie_10_perc|   age_group|
+----------+---+----------+----------+---+------+------------+--------------------+------------+
|        HR|  5|     David|1981-12-18| 43| 90000|    135000.0|             99000.0|More than 40|
|        IT|  3|       Bob|      null| -1| 80000|    120000.0|             88000.0|More than 40|
|   Finance|  4|     Emily|1994-11-22| 30| 65000|     97500.0|             71500.0|    31 to 40|
|        HR|  2|     Alice|1997-02-28| 27| 60000|     90000.0|             66000.0|    31 to 40|
|        IT|  1|      John|1992-05-12| 32| 70000|    105000.0|             77000.0|    31 to 40|
|   Finance|  6|     Susan|1989-07-05| 35| 75000|    112500.0|             82500.0|    31 to 40|
|        IT|  7|      Mike|1976-03-15| 48| 95000|    142500.0|            104500.0|More than 40|
|   Finance| 10|    Sophie|199

In [0]:
# df.join(
#     ref_df,
#     "department",
#     "anti",
# ).show()

inner
left, leftouter, left_outer
right, rightouter, right_outer,
cross,
outer, full, full_outer,
semi, leftsemi, left_semi,
anti, leftanti, left_anti,


In [0]:
# df.join(
#     ref_df,
#     df.department == ref_df.department,
#     "left",
# ).show()

In [0]:
df = df.alias("main")
ref_df = ref_df.alias("reference")

In [0]:
# df = df.join(
#     ref_df,
#     F.col("main.department") == F.col("reference.department"),
#      "left",
#  )

In [0]:
df.show()

+---+----------+----------+---+------+----------+------------+--------------------+------------+
| id|first_name|       dob|age|salary|department|salary_raise|salary_rasie_10_perc|   age_group|
+---+----------+----------+---+------+----------+------------+--------------------+------------+
|  5|     David|1981-12-18| 43| 90000|        HR|    135000.0|             99000.0|More than 40|
|  3|       Bob|      null| -1| 80000|        IT|    120000.0|             88000.0|More than 40|
|  4|     Emily|1994-11-22| 30| 65000|   Finance|     97500.0|             71500.0|    31 to 40|
|  2|     Alice|1997-02-28| 27| 60000|        HR|     90000.0|             66000.0|    31 to 40|
|  1|      John|1992-05-12| 32| 70000|        IT|    105000.0|             77000.0|    31 to 40|
|  6|     Susan|1989-07-05| 35| 75000|   Finance|    112500.0|             82500.0|    31 to 40|
|  7|      Mike|1976-03-15| 48| 95000|        IT|    142500.0|            104500.0|More than 40|
| 10|    Sophie|1992-06-30| 32

load(save) data

In [0]:
df.write.mode("overwrite").format("csv").save("dbfs:///FileStore/tables/final")

In [0]:
%fs ls dbfs:///FileStore/tables/final

path,name,size,modificationTime
dbfs:/FileStore/tables/final/_SUCCESS,_SUCCESS,0,1707805031000
dbfs:/FileStore/tables/final/_committed_7145688290367327973,_committed_7145688290367327973,114,1707804628000
dbfs:/FileStore/tables/final/_committed_859607062971485087,_committed_859607062971485087,213,1707805031000
dbfs:/FileStore/tables/final/_started_7145688290367327973,_started_7145688290367327973,0,1707804628000
dbfs:/FileStore/tables/final/_started_859607062971485087,_started_859607062971485087,0,1707805031000
dbfs:/FileStore/tables/final/part-00000-tid-859607062971485087-e19fbdd7-22cf-4591-94f8-175368392d36-1619-1-c000.csv,part-00000-tid-859607062971485087-e19fbdd7-22cf-4591-94f8-175368392d36-1619-1-c000.csv,705,1707805031000


In [0]:
spark.sql("show tables").show()

+--------+---------+-----------+
|database|tableName|isTemporary|
+--------+---------+-----------+
+--------+---------+-----------+



In [0]:
df.write.mode("overwrite").format("delta").saveAsTable("final")

In [0]:
%fs ls dbfs:///user/hive/warehouse/final

path,name,size,modificationTime
dbfs:/user/hive/warehouse/final/_delta_log/,_delta_log/,0,0
dbfs:/user/hive/warehouse/final/part-00000-7d1d661f-70db-4458-ac19-0b9f49e1493d-c000.snappy.parquet,part-00000-7d1d661f-70db-4458-ac19-0b9f49e1493d-c000.snappy.parquet,3208,1707806857000


In [0]:
%sql
desc table extended default.final

col_name,data_type,comment
id,int,
first_name,string,
dob,date,
age,int,
salary,int,
department,string,
salary_raise,double,
salary_rasie_10_perc,double,
age_group,string,
,,


In [0]:
%sql
desc detail default.final;

format,id,name,description,location,createdAt,lastModified,partitionColumns,numFiles,sizeInBytes,properties,minReaderVersion,minWriterVersion,tableFeatures,statistics
delta,bcf29e98-e25d-45ee-8806-aba3abc914ba,spark_catalog.default.final,,dbfs:/user/hive/warehouse/final,2024-02-13T06:47:33.252+0000,2024-02-13T06:47:40.000+0000,List(),1,3208,Map(),1,2,"List(appendOnly, invariants)",Map()


In [0]:
%sql
select * from final

id,first_name,dob,age,salary,department,salary_raise,salary_rasie_10_perc,age_group
5,David,1981-12-18,43,90000,HR,135000.0,99000.0,More than 40
3,Bob,,-1,80000,IT,120000.0,88000.0,More than 40
4,Emily,1994-11-22,30,65000,Finance,97500.0,71500.0,31 to 40
2,Alice,1997-02-28,27,60000,HR,90000.0,66000.0,31 to 40
1,John,1992-05-12,32,70000,IT,105000.0,77000.0,31 to 40
6,Susan,1989-07-05,35,75000,Finance,112500.0,82500.0,31 to 40
7,Mike,1976-03-15,48,95000,IT,142500.0,104500.0,More than 40
10,Sophie,1992-06-30,32,62000,Finance,93000.0,68200.0,31 to 40
2,Alice,1997-02-28,27,90000,Finance,135000.0,99000.0,31 to 40
4,Emily,1994-11-22,30,70000,Finance,105000.0,77000.0,31 to 40


In [0]:
%sql
select * except(id, )

In [0]:
table_df = spark.table("final")

In [0]:
table_df.show()

+---+----------+----------+---+------+----------+------------+--------------------+------------+
| id|first_name|       dob|age|salary|department|salary_raise|salary_rasie_10_perc|   age_group|
+---+----------+----------+---+------+----------+------------+--------------------+------------+
|  5|     David|1981-12-18| 43| 90000|        HR|    135000.0|             99000.0|More than 40|
|  3|       Bob|      null| -1| 80000|        IT|    120000.0|             88000.0|More than 40|
|  4|     Emily|1994-11-22| 30| 65000|   Finance|     97500.0|             71500.0|    31 to 40|
|  2|     Alice|1997-02-28| 27| 60000|        HR|     90000.0|             66000.0|    31 to 40|
|  1|      John|1992-05-12| 32| 70000|        IT|    105000.0|             77000.0|    31 to 40|
|  6|     Susan|1989-07-05| 35| 75000|   Finance|    112500.0|             82500.0|    31 to 40|
|  7|      Mike|1976-03-15| 48| 95000|        IT|    142500.0|            104500.0|More than 40|
| 10|    Sophie|1992-06-30| 32

In [0]:
display(table_df)

id,first_name,dob,age,salary,department,salary_raise,salary_rasie_10_perc,age_group
5,David,1981-12-18,43,90000,HR,135000.0,99000.0,More than 40
3,Bob,,-1,80000,IT,120000.0,88000.0,More than 40
4,Emily,1994-11-22,30,65000,Finance,97500.0,71500.0,31 to 40
2,Alice,1997-02-28,27,60000,HR,90000.0,66000.0,31 to 40
1,John,1992-05-12,32,70000,IT,105000.0,77000.0,31 to 40
6,Susan,1989-07-05,35,75000,Finance,112500.0,82500.0,31 to 40
7,Mike,1976-03-15,48,95000,IT,142500.0,104500.0,More than 40
10,Sophie,1992-06-30,32,62000,Finance,93000.0,68200.0,31 to 40
2,Alice,1997-02-28,27,90000,Finance,135000.0,99000.0,31 to 40
4,Emily,1994-11-22,30,70000,Finance,105000.0,77000.0,31 to 40
