In [0]:
# Example only: obtain the SparkContext directly. This cell is not required
# when working through a SparkSession, which is what the cells below use.

from pyspark import SparkContext
sc = SparkContext.getOrCreate()
sc

In [0]:
from pyspark.sql import SparkSession
# getOrCreate() hands back the active session if one already exists, so this
# cell is safe to re-run.
spark = SparkSession.builder.getOrCreate()
spark

In [0]:
%fs ls dbfs:///FileStore/tables/csv

path,name,size,modificationTime
dbfs:/FileStore/tables/csv/batch.csv,batch.csv,151,1707460729000


### Goal

Load three types of data (items 1–3) and combine them (item 4):
1. CSV batch file.
2. JSONL batch file.
3. JSON incremental files.
4. Load the files into a single dataframe.

### Scenario

At the beginning, the client sent their entire system data as a CSV file.

Later, the client switched from CSV to JSON.

Instead of sending only the change data capture (CDC) records, they now send a JSONL batch file along with JSON incremental files.

In [0]:
from pyspark.sql import SparkSession

In [0]:
# Same call as in the earlier session cell; getOrCreate() returns the session
# that already exists rather than building a second one.
spark = SparkSession.builder.getOrCreate()

In [0]:
spark

In [0]:
type(spark)

Out[19]: pyspark.sql.session.SparkSession

# Extract Data

## Extract CSV batch

In [0]:
%fs ls dbfs:///FileStore/tables/csv

path,name,size,modificationTime
dbfs:/FileStore/tables/csv/batch.csv,batch.csv,151,1707460729000


In [0]:
# First attempt: read the CSV with the reader's default options.
df_csv = (
    spark.read
    .format("csv")
    .load("dbfs:/FileStore/tables/csv/batch.csv")
)

In [0]:
type(df_csv)

Out[17]: pyspark.sql.dataframe.DataFrame

In [0]:
df_csv.show()

+---+-----+----------+----+------+----------+
|_c0|  _c1|       _c2| _c3|   _c4|       _c5|
+---+-----+----------+----+------+----------+
| id| name|       dob| age|salary|department|
|  1| John|1992-05-12|  30| 70000|        IT|
|  2|Alice|1997-02-28|  25| 60000|        HR|
|  3|  Bob|      NULL|NULL| 80000|        IT|
|  4|Emily|1994-11-22|  28| 65000|   Finance|
+---+-----+----------+----+------+----------+



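By default, the CSV reader does not treat the first line as a header: the columns are named `_c0`, `_c1`, … and the header line shows up as a data row (see the output above). You have to set the `header` option explicitly.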

In [0]:
# Second attempt: tell the reader that the first line holds the column names.
df_csv = (
    spark.read
    .format("csv")
    .option("header", True)
    .load("dbfs:/FileStore/tables/csv/batch.csv")
)

In [0]:
# Third attempt: keep the header and also ask Spark to infer the column types
# from the data instead of reading everything as strings.
df_csv = (
    spark.read
    .format("csv")
    .option("header", True)
    .option("inferSchema", True)
    .load("dbfs:/FileStore/tables/csv/batch.csv")
)

In [0]:
df_csv.show()

+---+-----+----------+----+------+----------+
| id| name|       dob| age|salary|department|
+---+-----+----------+----+------+----------+
|  1| John|1992-05-12|  30| 70000|        IT|
|  2|Alice|1997-02-28|  25| 60000|        HR|
|  3|  Bob|      null|null| 80000|        IT|
|  4|Emily|1994-11-22|  28| 65000|   Finance|
+---+-----+----------+----+------+----------+



The rows appear in the same order as in the CSV file.

In [0]:
df_csv.printSchema()

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- dob: date (nullable = true)
 |-- age: integer (nullable = true)
 |-- salary: integer (nullable = true)
 |-- department: string (nullable = true)



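The columns appear in the same order as in the CSV file. With `inferSchema` enabled, Spark infers the column types shown above; without it, every column is read as a string. Inference costs an extra pass over the data and its result depends on the data, so an explicit schema is the more reliable choice.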

In [0]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DateType

In [0]:
# Explicit schema for the CSV: one (column name, Spark type) pair per field,
# listed in the same order as the columns in the file.
sch = StructType(
    [
        StructField(column, spark_type)
        for column, spark_type in (
            ("id", IntegerType()),
            ("name", StringType()),
            ("dob", DateType()),
            ("age", IntegerType()),
            ("salary", IntegerType()),
            ("department", StringType()),
        )
    ]
)

In [0]:
# Final version: apply the explicit schema `sch` and treat the first line of
# the file as the header.
df_csv = (
    spark.read
    .format("csv")
    .schema(sch)
    .option("header", True)
    .load("dbfs:/FileStore/tables/csv/batch.csv")
)

We fix the column types by defining the schema explicitly and passing it to the reader.

In [0]:
df_csv.printSchema()

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- dob: date (nullable = true)
 |-- age: integer (nullable = true)
 |-- salary: integer (nullable = true)
 |-- department: string (nullable = true)



In [0]:
df_csv.show()

+---+-----+----------+----+------+----------+
| id| name|       dob| age|salary|department|
+---+-----+----------+----+------+----------+
|  1| John|1992-05-12|  30| 70000|        IT|
|  2|Alice|1997-02-28|  25| 60000|        HR|
|  3|  Bob|      NULL|NULL| 80000|        IT|
|  4|Emily|1994-11-22|  28| 65000|   Finance|
+---+-----+----------+----+------+----------+



## Extract JSON

In [0]:
# Column names come from the JSON keys, so no header option is needed.
# The path is presumably a directory holding several files; the order of the
# rows in the result is not guaranteed.
df_json = (
    spark.read
    .format("json")
    .load("dbfs:/FileStore/tables/json")
)

Loading data from JSON files sets the column names from the keys. The columns are listed alphabetically (see the output below), and the order of the rows is not guaranteed.

In [0]:
df_json.show()

+----+----------+----------+---+------+------+
| age|department|       dob| id|  name|salary|
+----+----------+----------+---+------+------+
|  30|        IT|1992-05-12|  1|  John| 70000|
|  25|        HR|1997-02-28|  2| Alice| 60000|
|null|        IT|      null|  3|   Bob| 80000|
|  28|   Finance|1994-11-22|  4| Emily| 65000|
|  41|        HR|1981-12-18|  5| David| 90000|
|  33|   Finance|1989-07-05|  6| Susan| 75000|
|  46|        IT|1976-03-15|  7|  Mike| 95000|
|  30|   Finance|1992-06-30| 10|Sophie| 62000|
|  25|   Finance|1997-02-28|  2| Alice| 90000|
|  28|   Finance|1994-11-22|  4| Emily| 70000|
|  39|        IT|1983-10-14|  9| James| 87000|
|  30|        IT|1992-05-12|  1|  John| 70000|
|  27|        HR|1995-08-20|  8|  Lisa| 58000|
+----+----------+----------+---+------+------+



In [0]:
# Sort by id so that rows sharing an id (e.g. the two rows for id 1) end up
# next to each other.
df_json = df_json.orderBy("id")

In [0]:
df_json.show()

+----+----------+----------+---+------+------+
| age|department|       dob| id|  name|salary|
+----+----------+----------+---+------+------+
|  30|        IT|1992-05-12|  1|  John| 70000|
|  30|        IT|1992-05-12|  1|  John| 70000|
|  25|   Finance|1997-02-28|  2| Alice| 90000|
|  25|        HR|1997-02-28|  2| Alice| 60000|
|null|        IT|      null|  3|   Bob| 80000|
|  28|   Finance|1994-11-22|  4| Emily| 70000|
|  28|   Finance|1994-11-22|  4| Emily| 65000|
|  41|        HR|1981-12-18|  5| David| 90000|
|  33|   Finance|1989-07-05|  6| Susan| 75000|
|  46|        IT|1976-03-15|  7|  Mike| 95000|
|  27|        HR|1995-08-20|  8|  Lisa| 58000|
|  39|        IT|1983-10-14|  9| James| 87000|
|  30|   Finance|1992-06-30| 10|Sophie| 62000|
+----+----------+----------+---+------+------+



In [0]:
df_json.printSchema() # Infers schema

root
 |-- age: long (nullable = true)
 |-- department: string (nullable = true)
 |-- dob: string (nullable = true)
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- salary: long (nullable = true)



## Nulls

In [0]:
# Difference between the current calendar year and the birth year.
# NOTE(review): this is a year difference, not an exact age — it is one too
# high until the person's birthday has passed in the current year.
years_since_birth = F.year(F.current_timestamp()) - F.year(F.col("dob"))

df = df.withColumns(
    {
        # A missing dob makes the difference null; coalesce turns that into 0.
        "age": F.coalesce(years_since_birth, F.lit(0)),
        # Two equivalent ways to flag the rows that have a dob.
        "has_dob_1": ~(F.isnull("dob")),
        "has_dob_2": F.col("dob").isNotNull(),
    }
)

In [0]:
df.show()

+---+------+----------+---+------+----------+------------+------------+---------------+---------+---------+
| id|  name|       dob|age|salary|department|salary_raise|   age_group|        company|has_dob_1|has_dob_2|
+---+------+----------+---+------+----------+------------+------------+---------------+---------+---------+
|  3|   Bob|      NULL|  0| 80000|        IT|     84000.0|More than 40|Abacus Insights|    false|    false|
|  4| Emily|1994-11-22| 30| 65000|   Finance|     68250.0|    21 to 30|Abacus Insights|     true|     true|
|  2| Alice|1997-02-28| 27| 60000|        HR|     63000.0|    21 to 30|Abacus Insights|     true|     true|
|  1|  John|1992-05-12| 32| 70000|        IT|     73500.0|    31 to 40|Abacus Insights|     true|     true|
|  5| David|1981-12-18| 43| 90000|        HR|     94500.0|More than 40|Abacus Insights|     true|     true|
|  6| Susan|1989-07-05| 35| 75000|   Finance|     78750.0|    31 to 40|Abacus Insights|     true|     true|
|  7|  Mike|1976-03-15| 48| 

## Drop Col

##### Method 1

In [0]:
# "Dropping" by projection: list only the columns to keep. "company" is left
# out on purpose; note that "age" and the has_dob_* flags are not listed
# either, so they are missing from this view as well.
kept_columns = [
    "id",
    "name",
    "dob",
    "salary",
    "department",
    "salary_raise",
    "age_group",
]
df.select(*kept_columns).show()

+---+------+----------+------+----------+------------+------------+
| id|  name|       dob|salary|department|salary_raise|   age_group|
+---+------+----------+------+----------+------------+------------+
|  3|   Bob|      NULL| 80000|        IT|     84000.0|More than 40|
|  4| Emily|1994-11-22| 65000|   Finance|     68250.0|    21 to 30|
|  2| Alice|1997-02-28| 60000|        HR|     63000.0|    21 to 30|
|  1|  John|1992-05-12| 70000|        IT|     73500.0|    31 to 40|
|  5| David|1981-12-18| 90000|        HR|     94500.0|More than 40|
|  6| Susan|1989-07-05| 75000|   Finance|     78750.0|    31 to 40|
|  7|  Mike|1976-03-15| 95000|        IT|     99750.0|More than 40|
| 10|Sophie|1992-06-30| 62000|   Finance|     65100.0|    31 to 40|
|  2| Alice|1997-02-28| 90000|   Finance|     94500.0|    21 to 30|
|  4| Emily|1994-11-22| 70000|   Finance|     73500.0|    21 to 30|
|  9| James|1983-10-14| 87000|        IT|     91350.0|More than 40|
|  8|  Lisa|1995-08-20| 58000|        HR|     60

##### Method 2

In [0]:
# Remove the listed columns directly; every other column is kept as is.
df = df.drop("company", "has_dob_1", "has_dob_2")

In [0]:
df.show()

+---+------+----------+---+------+----------+------------+------------+
| id|  name|       dob|age|salary|department|salary_raise|   age_group|
+---+------+----------+---+------+----------+------------+------------+
|  3|   Bob|      NULL|  0| 80000|        IT|     84000.0|More than 40|
|  4| Emily|1994-11-22| 30| 65000|   Finance|     68250.0|    21 to 30|
|  2| Alice|1997-02-28| 27| 60000|        HR|     63000.0|    21 to 30|
|  1|  John|1992-05-12| 32| 70000|        IT|     73500.0|    31 to 40|
|  5| David|1981-12-18| 43| 90000|        HR|     94500.0|More than 40|
|  6| Susan|1989-07-05| 35| 75000|   Finance|     78750.0|    31 to 40|
|  7|  Mike|1976-03-15| 48| 95000|        IT|     99750.0|More than 40|
| 10|Sophie|1992-06-30| 32| 62000|   Finance|     65100.0|    31 to 40|
|  2| Alice|1997-02-28| 27| 90000|   Finance|     94500.0|    21 to 30|
|  4| Emily|1994-11-22| 30| 70000|   Finance|     73500.0|    21 to 30|
|  9| James|1983-10-14| 41| 87000|        IT|     91350.0|More t

## withColumnRenamed

In [0]:
# Rename "name" to "first_name"; the column keeps its position and its data.
df = df.withColumnRenamed("name", "first_name")

In [0]:
df.show()

+---+----------+----------+---+------+----------+------------+------------+
| id|first_name|       dob|age|salary|department|salary_raise|   age_group|
+---+----------+----------+---+------+----------+------------+------------+
|  3|       Bob|      NULL|  0| 80000|        IT|     84000.0|More than 40|
|  4|     Emily|1994-11-22| 30| 65000|   Finance|     68250.0|    21 to 30|
|  2|     Alice|1997-02-28| 27| 60000|        HR|     63000.0|    21 to 30|
|  1|      John|1992-05-12| 32| 70000|        IT|     73500.0|    31 to 40|
|  5|     David|1981-12-18| 43| 90000|        HR|     94500.0|More than 40|
|  6|     Susan|1989-07-05| 35| 75000|   Finance|     78750.0|    31 to 40|
|  7|      Mike|1976-03-15| 48| 95000|        IT|     99750.0|More than 40|
| 10|    Sophie|1992-06-30| 32| 62000|   Finance|     65100.0|    31 to 40|
|  2|     Alice|1997-02-28| 27| 90000|   Finance|     94500.0|    21 to 30|
|  4|     Emily|1994-11-22| 30| 70000|   Finance|     73500.0|    21 to 30|
|  9|     Ja

## Partition By

In [0]:
from pyspark.sql import Window

In [0]:
# One window per age_group. No ordering is defined, so the aggregate is
# computed over the whole group and repeated on each of its rows.
window = Window.partitionBy("age_group")

youngest_in_group = F.min("age").over(window)
df.withColumn("min_age_in_group", youngest_in_group).show()

+---+----------+----------+---+------+----------+------------+------------+----------------+
| id|first_name|       dob|age|salary|department|salary_raise|   age_group|min_age_in_group|
+---+----------+----------+---+------+----------+------------+------------+----------------+
|  4|     Emily|1994-11-22| 30| 65000|   Finance|     68250.0|    21 to 30|              27|
|  2|     Alice|1997-02-28| 27| 60000|        HR|     63000.0|    21 to 30|              27|
|  2|     Alice|1997-02-28| 27| 90000|   Finance|     94500.0|    21 to 30|              27|
|  4|     Emily|1994-11-22| 30| 70000|   Finance|     73500.0|    21 to 30|              27|
|  8|      Lisa|1995-08-20| 29| 58000|        HR|     60900.0|    21 to 30|              27|
|  1|      John|1992-05-12| 32| 70000|        IT|     73500.0|    31 to 40|              32|
|  6|     Susan|1989-07-05| 35| 75000|   Finance|     78750.0|    31 to 40|              32|
| 10|    Sophie|1992-06-30| 32| 62000|   Finance|     65100.0|    31 t

## Task
Calculate the mean salary of each department so that each employee's salary can be compared against it.

Calculate the mean salary across all employees and check whether each employee's salary is greater than or equal to it.

In [0]:
# One window per department, so every row sees its own department's mean.
window_spec = Window.partitionBy("department")

department_mean = F.mean("salary").over(window_spec)
df.withColumn("mean_salary_department", department_mean).show()

+---+----------+----------+---+------+----------+------------+------------+----------------------+
| id|first_name|       dob|age|salary|department|salary_raise|   age_group|mean_salary_department|
+---+----------+----------+---+------+----------+------------+------------+----------------------+
|  4|     Emily|1994-11-22| 30| 65000|   Finance|     68250.0|    21 to 30|               72400.0|
|  6|     Susan|1989-07-05| 35| 75000|   Finance|     78750.0|    31 to 40|               72400.0|
| 10|    Sophie|1992-06-30| 32| 62000|   Finance|     65100.0|    31 to 40|               72400.0|
|  2|     Alice|1997-02-28| 27| 90000|   Finance|     94500.0|    21 to 30|               72400.0|
|  4|     Emily|1994-11-22| 30| 70000|   Finance|     73500.0|    21 to 30|               72400.0|
|  2|     Alice|1997-02-28| 27| 60000|        HR|     63000.0|    21 to 30|     69333.33333333333|
|  5|     David|1981-12-18| 43| 90000|        HR|     94500.0|More than 40|     69333.33333333333|
|  8|     

In [0]:
# A window with no partition columns and no ordering spans the whole
# DataFrame, so the mean below is the mean over all employees.
# Window.orderBy() and Window.partitionBy().orderBy() (both without columns)
# describe the same window, so a single definition is enough.
# Caveat: an unpartitioned window moves all rows into a single partition,
# which is fine for this small dataset but does not scale.
window_spec = Window.partitionBy()

df.withColumn(
    "is_salary_above_mean",
    # True when the salary is greater than or equal to the overall mean.
    F.col("salary") >= F.mean("salary").over(window_spec),
).show()

+---+----------+----------+---+------+----------+------------+------------+--------------------+
| id|first_name|       dob|age|salary|department|salary_raise|   age_group|is_salary_above_mean|
+---+----------+----------+---+------+----------+------------+------------+--------------------+
|  3|       Bob|      NULL|  0| 80000|        IT|     84000.0|More than 40|                true|
|  4|     Emily|1994-11-22| 30| 65000|   Finance|     68250.0|    21 to 30|               false|
|  2|     Alice|1997-02-28| 27| 60000|        HR|     63000.0|    21 to 30|               false|
|  1|      John|1992-05-12| 32| 70000|        IT|     73500.0|    31 to 40|               false|
|  5|     David|1981-12-18| 43| 90000|        HR|     94500.0|More than 40|                true|
|  6|     Susan|1989-07-05| 35| 75000|   Finance|     78750.0|    31 to 40|               false|
|  7|      Mike|1976-03-15| 48| 95000|        IT|     99750.0|More than 40|                true|
| 10|    Sophie|1992-06-30| 32

## Join

In [0]:
# Reference data (one row per department) stored as parquet.
ref = (
    spark.read
    .format("parquet")
    .load("dbfs:///FileStore/tables/data/parquet")
)

In [0]:
ref.show()

+----------+-------+--------+
|department|manager|    lead|
+----------+-------+--------+
|   Finance|  Megan|   Molly|
|        HR|   Brad|   Brian|
|        IT|  Chris|Chandler|
|  Delivery|   Leon|  Louise|
+----------+-------+--------+



In [0]:
# Left join on the shared "department" column: every row of df is kept and
# the manager/lead columns from ref are attached where the department matches.
df.join(ref, on="department", how="left").show()

+----------+---+----------+----------+---+------+------------+------------+-------+--------+
|department| id|first_name|       dob|age|salary|salary_raise|   age_group|manager|    lead|
+----------+---+----------+----------+---+------+------------+------------+-------+--------+
|        IT|  3|       Bob|      NULL|  0| 80000|     84000.0|More than 40|  Chris|Chandler|
|   Finance|  4|     Emily|1994-11-22| 30| 65000|     68250.0|    21 to 30|  Megan|   Molly|
|        HR|  2|     Alice|1997-02-28| 27| 60000|     63000.0|    21 to 30|   Brad|   Brian|
|        IT|  1|      John|1992-05-12| 32| 70000|     73500.0|    31 to 40|  Chris|Chandler|
|        HR|  5|     David|1981-12-18| 43| 90000|     94500.0|More than 40|   Brad|   Brian|
|   Finance|  6|     Susan|1989-07-05| 35| 75000|     78750.0|    31 to 40|  Megan|   Molly|
|        IT|  7|      Mike|1976-03-15| 48| 95000|     99750.0|More than 40|  Chris|Chandler|
|   Finance| 10|    Sophie|1992-06-30| 32| 62000|     65100.0|    31 t

The join type (the third argument of `join`) must be one of the following:

inner,

left, leftouter, left_outer,

right, rightouter, right_outer,

cross,

outer, full, fullouter, full_outer,

semi, leftsemi, left_semi,

anti, leftanti and left_anti.

##### Method 1

In [0]:
# Method 1: name the join column as a string. The key column then appears
# only once in the result.
df.join(ref, on="department", how="left").show()

+----------+---+----------+----------+---+------+------------+------------+-------+--------+
|department| id|first_name|       dob|age|salary|salary_raise|   age_group|manager|    lead|
+----------+---+----------+----------+---+------+------------+------------+-------+--------+
|        IT|  3|       Bob|      NULL|  0| 80000|     84000.0|More than 40|  Chris|Chandler|
|   Finance|  4|     Emily|1994-11-22| 30| 65000|     68250.0|    21 to 30|  Megan|   Molly|
|        HR|  2|     Alice|1997-02-28| 27| 60000|     63000.0|    21 to 30|   Brad|   Brian|
|        IT|  1|      John|1992-05-12| 32| 70000|     73500.0|    31 to 40|  Chris|Chandler|
|        HR|  5|     David|1981-12-18| 43| 90000|     94500.0|More than 40|   Brad|   Brian|
|   Finance|  6|     Susan|1989-07-05| 35| 75000|     78750.0|    31 to 40|  Megan|   Molly|
|        IT|  7|      Mike|1976-03-15| 48| 95000|     99750.0|More than 40|  Chris|Chandler|
|   Finance| 10|    Sophie|1992-06-30| 32| 62000|     65100.0|    31 t

##### Method 2

In [0]:
# Method 2: pass a column-equality expression as the join condition. Both
# DataFrames' "department" columns are kept, so the name appears twice in the
# result.
df.join(ref, on=df.department == ref.department, how="left").show()

+---+----------+----------+---+------+----------+------------+------------+----------+-------+--------+
| id|first_name|       dob|age|salary|department|salary_raise|   age_group|department|manager|    lead|
+---+----------+----------+---+------+----------+------------+------------+----------+-------+--------+
|  3|       Bob|      NULL|  0| 80000|        IT|     84000.0|More than 40|        IT|  Chris|Chandler|
|  4|     Emily|1994-11-22| 30| 65000|   Finance|     68250.0|    21 to 30|   Finance|  Megan|   Molly|
|  2|     Alice|1997-02-28| 27| 60000|        HR|     63000.0|    21 to 30|        HR|   Brad|   Brian|
|  1|      John|1992-05-12| 32| 70000|        IT|     73500.0|    31 to 40|        IT|  Chris|Chandler|
|  5|     David|1981-12-18| 43| 90000|        HR|     94500.0|More than 40|        HR|   Brad|   Brian|
|  6|     Susan|1989-07-05| 35| 75000|   Finance|     78750.0|    31 to 40|   Finance|  Megan|   Molly|
|  7|      Mike|1976-03-15| 48| 95000|        IT|     99750.0|Mo

##### Method 3

In [0]:
# Give each DataFrame an alias so its columns can be referenced by qualified
# name (e.g. "df.department") in the join condition that follows.
df = df.alias("df")
ref = ref.alias("ref")

In [0]:
# Method 3: build the join condition from alias-qualified column names.
# NOTE(review): this cell uses an inner join while the other join cells use
# "left" — confirm that the difference is intentional.
departments_match = F.col("df.department") == F.col("ref.department")
df.join(ref, on=departments_match, how="inner").show()

+---+----------+----------+---+------+----------+------------+------------+----------+-------+--------+
| id|first_name|       dob|age|salary|department|salary_raise|   age_group|department|manager|    lead|
+---+----------+----------+---+------+----------+------------+------------+----------+-------+--------+
|  3|       Bob|      NULL|  0| 80000|        IT|     84000.0|More than 40|        IT|  Chris|Chandler|
|  4|     Emily|1994-11-22| 30| 65000|   Finance|     68250.0|    21 to 30|   Finance|  Megan|   Molly|
|  2|     Alice|1997-02-28| 27| 60000|        HR|     63000.0|    21 to 30|        HR|   Brad|   Brian|
|  1|      John|1992-05-12| 32| 70000|        IT|     73500.0|    31 to 40|        IT|  Chris|Chandler|
|  5|     David|1981-12-18| 43| 90000|        HR|     94500.0|More than 40|        HR|   Brad|   Brian|
|  6|     Susan|1989-07-05| 35| 75000|   Finance|     78750.0|    31 to 40|   Finance|  Megan|   Molly|
|  7|      Mike|1976-03-15| 48| 95000|        IT|     99750.0|Mo

# Load(Save) Data

In [0]:
# Write the column names as the first line of each part file so the output can
# be read back with option("header", True); without it the names are lost.
(
    df.write
    .mode("overwrite")
    .format("csv")
    .option("header", True)
    .save("dbfs:///FileStore/tables/final")
)

In [0]:
%fs ls dbfs:///FileStore/tables/final

path,name,size,modificationTime
dbfs:/FileStore/tables/final/_SUCCESS,_SUCCESS,0,1706683097000
dbfs:/FileStore/tables/final/_committed_6028123568704318576,_committed_6028123568704318576,113,1706682076000
dbfs:/FileStore/tables/final/_committed_7426842490311099765,_committed_7426842490311099765,212,1706683097000
dbfs:/FileStore/tables/final/_started_6028123568704318576,_started_6028123568704318576,0,1706682075000
dbfs:/FileStore/tables/final/_started_7426842490311099765,_started_7426842490311099765,0,1706683097000
dbfs:/FileStore/tables/final/part-00000-tid-7426842490311099765-bc80ccc9-57a7-4d40-8214-ceac47396506-641-1-c000.csv,part-00000-tid-7426842490311099765-bc80ccc9-57a7-4d40-8214-ceac47396506-641-1-c000.csv,603,1706683097000


In [0]:
# Persist the DataFrame as a table named "final", replacing any existing one.
# The format is spelled out as Delta here; the same call without .format(...)
# would use the environment's default table format instead.
df.write.mode("overwrite").format("delta").saveAsTable("final")

In [0]:
spark.sql("show tables").show()

+--------+---------+-----------+
|database|tableName|isTemporary|
+--------+---------+-----------+
| default|    final|      false|
+--------+---------+-----------+



In [0]:
%sql
desc table default.final;

col_name,data_type,comment
id,int,
first_name,string,
dob,date,
age,int,
salary,int,
department,string,
salary_raise,double,
age_group,string,


In [0]:
%sql
desc table extended default.final;

col_name,data_type,comment
id,int,
first_name,string,
dob,date,
age,int,
salary,int,
department,string,
salary_raise,double,
age_group,string,
,,
# Delta Statistics Columns,,


In [0]:
%sql
desc detail default.final;

format,id,name,description,location,createdAt,lastModified,partitionColumns,clusteringColumns,numFiles,sizeInBytes,properties,minReaderVersion,minWriterVersion,tableFeatures,statistics
delta,d7700751-af48-403c-974e-4e3914d5e7cd,spark_catalog.default.final,,dbfs:/user/hive/warehouse/final,2024-01-31T06:31:21.865Z,2024-01-31T06:38:22Z,List(),List(),1,2820,Map(),1,2,"List(appendOnly, invariants)",Map()


In [0]:
%sql

select * from default.final

id,first_name,dob,age,salary,department,salary_raise,age_group
3,Bob,,0,80000,IT,84000.0,More than 40
4,Emily,1994-11-22,30,65000,Finance,68250.0,21 to 30
2,Alice,1997-02-28,27,60000,HR,63000.0,21 to 30
1,John,1992-05-12,32,70000,IT,73500.0,31 to 40
5,David,1981-12-18,43,90000,HR,94500.0,More than 40
6,Susan,1989-07-05,35,75000,Finance,78750.0,31 to 40
7,Mike,1976-03-15,48,95000,IT,99750.0,More than 40
10,Sophie,1992-06-30,32,62000,Finance,65100.0,31 to 40
2,Alice,1997-02-28,27,90000,Finance,94500.0,21 to 30
4,Emily,1994-11-22,30,70000,Finance,73500.0,21 to 30


In [0]:
%sql

select * except (id, dob, age) from default.final

first_name,salary,department,salary_raise,age_group
Bob,80000,IT,84000.0,More than 40
Emily,65000,Finance,68250.0,21 to 30
Alice,60000,HR,63000.0,21 to 30
John,70000,IT,73500.0,31 to 40
David,90000,HR,94500.0,More than 40
Susan,75000,Finance,78750.0,31 to 40
Mike,95000,IT,99750.0,More than 40
Sophie,62000,Finance,65100.0,31 to 40
Alice,90000,Finance,94500.0,21 to 30
Emily,70000,Finance,73500.0,21 to 30


In [0]:
%sql

-- "or replace" keeps this cell re-runnable: a plain "create view" fails once
-- the view already exists.
create or replace view default.final_view as
select * except (id, dob, age) from default.final

In [0]:
%sql

desc default.final_view

col_name,data_type,comment
first_name,string,
salary,int,
department,string,
salary_raise,double,
age_group,string,


In [0]:
# Read the saved table back as a DataFrame. The unqualified name "final" is
# presumably resolved against the current database ("default" in the table
# listing above).
table_df = spark.table("final")

In [0]:
display(table_df)

id,first_name,dob,age,salary,department,salary_raise,age_group
3,Bob,,0,80000,IT,84000.0,More than 40
4,Emily,1994-11-22,30,65000,Finance,68250.0,21 to 30
2,Alice,1997-02-28,27,60000,HR,63000.0,21 to 30
1,John,1992-05-12,32,70000,IT,73500.0,31 to 40
5,David,1981-12-18,43,90000,HR,94500.0,More than 40
6,Susan,1989-07-05,35,75000,Finance,78750.0,31 to 40
7,Mike,1976-03-15,48,95000,IT,99750.0,More than 40
10,Sophie,1992-06-30,32,62000,Finance,65100.0,31 to 40
2,Alice,1997-02-28,27,90000,Finance,94500.0,21 to 30
4,Emily,1994-11-22,30,70000,Finance,73500.0,21 to 30
