**Creating a Spark Session**

In [0]:
from pyspark.sql import SparkSession as ss
spark = ss.builder.getOrCreate()
spark

**Checking for Upload**

In [0]:
%fs ls dbfs:////FileStore/tables

path,name,size,modificationTime
dbfs:/FileStore/tables/Address-1.xlsx,Address-1.xlsx,151315,1706076168000
dbfs:/FileStore/tables/Address-10.xlsx,Address-10.xlsx,151315,1706170593000
dbfs:/FileStore/tables/Address-11.xlsx,Address-11.xlsx,151315,1706174664000
dbfs:/FileStore/tables/Address-2.xlsx,Address-2.xlsx,151315,1706076328000
dbfs:/FileStore/tables/Address-3.xlsx,Address-3.xlsx,151315,1706076419000
dbfs:/FileStore/tables/Address-4.xlsx,Address-4.xlsx,151315,1706077911000
dbfs:/FileStore/tables/Address-5.xlsx,Address-5.xlsx,151315,1706079849000
dbfs:/FileStore/tables/Address-6.xlsx,Address-6.xlsx,151315,1706079981000
dbfs:/FileStore/tables/Address-7.xlsx,Address-7.xlsx,151315,1706159285000
dbfs:/FileStore/tables/Address-8.xlsx,Address-8.xlsx,151315,1706159471000


In [0]:
"""
dbfs:/FileStore/tables/csv/batch.csv
dbfs:/FileStore/tables/json/1_20220101.json
dbfs:/FileStore/tables/json/10_20220101.json
dbfs:/FileStore/tables/json/2_20220101.json
dbfs:/FileStore/tables/json/4_20220101.json
dbfs:/FileStore/tables/json/8_20220101.json
dbfs:/FileStore/tables/json/9_20220101.json
dbfs:/FileStore/tables/json/batch.jsonl
dbfs:/FileStore/tables/parquet/reference.parquet
"""

Out[172]: '\ndbfs:/FileStore/tables/csv/batch.csv\ndbfs:/FileStore/tables/json/1_20220101.json\ndbfs:/FileStore/tables/json/10_20220101.json\ndbfs:/FileStore/tables/json/2_20220101.json\ndbfs:/FileStore/tables/json/4_20220101.json\ndbfs:/FileStore/tables/json/8_20220101.json\ndbfs:/FileStore/tables/json/9_20220101.json\ndbfs:/FileStore/tables/json/batch.jsonl\ndbfs:/FileStore/tables/parquet/reference.parquet\n'

**Loading Files into SIngle DF**

CSV

In [0]:
#df_csv = spark.read.format("csv").load("dbfs:/FileStore/tables/csv/batch.csv")

#For Headers
df_csv = spark.read.format("csv").option("Header", True).load("dbfs:/FileStore/tables/csv/batch.csv")

In [0]:
type(df_csv)

Out[174]: pyspark.sql.dataframe.DataFrame

In [0]:
df_csv.show()

+---+-----+----------+----+------+----------+
| id| name|       dob| age|salary|department|
+---+-----+----------+----+------+----------+
|  1| John|1992-05-12|  30| 70000|        IT|
|  2|Alice|1997-02-28|  25| 60000|        HR|
|  3|  Bob|      null|null| 80000|        IT|
|  4|Emily|1994-11-22|  28| 65000|   Finance|
+---+-----+----------+----+------+----------+



In [0]:
df_csv.printSchema()

root
 |-- id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- dob: string (nullable = true)
 |-- age: string (nullable = true)
 |-- salary: string (nullable = true)
 |-- department: string (nullable = true)



Instead of loading all data string, loading them as desired data type

In [0]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DateType

In [0]:
sch = StructType([
    StructField("id", IntegerType()),
    StructField("name", StringType()),
    StructField("dob", DateType()),
    StructField("age", IntegerType()),
    StructField("salary", IntegerType()),
    StructField("department", StringType()),
])

In [0]:
df_csv = spark.read.format("csv").schema(sch).option("Header", True).load("dbfs:/FileStore/tables/csv/batch.csv")

In [0]:
df_csv.show()

+---+-----+----------+----+------+----------+
| id| name|       dob| age|salary|department|
+---+-----+----------+----+------+----------+
|  1| John|1992-05-12|  30| 70000|        IT|
|  2|Alice|1997-02-28|  25| 60000|        HR|
|  3|  Bob|      null|null| 80000|        IT|
|  4|Emily|1994-11-22|  28| 65000|   Finance|
+---+-----+----------+----+------+----------+



In [0]:
df_csv.printSchema()

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- dob: date (nullable = true)
 |-- age: integer (nullable = true)
 |-- salary: integer (nullable = true)
 |-- department: string (nullable = true)



JSON

In [0]:
#df_json = spark.read.format("json").schema(sch).option("Header", True).load("dbfs:/FileStore/tables/json")
#no need for headers because uses key of json
df_json = spark.read.format("json").schema(sch).load("dbfs:/FileStore/tables/json")

In [0]:
df_json.show()

+---+------+----------+----+------+----------+
| id|  name|       dob| age|salary|department|
+---+------+----------+----+------+----------+
|  1|  John|1992-05-12|  30| 70000|        IT|
|  2| Alice|1997-02-28|  25| 60000|        HR|
|  3|   Bob|      null|null| 80000|        IT|
|  4| Emily|1994-11-22|  28| 65000|   Finance|
|  5| David|1981-12-18|  41| 90000|        HR|
|  6| Susan|1989-07-05|  33| 75000|   Finance|
|  7|  Mike|1976-03-15|  46| 95000|        IT|
| 10|Sophie|1992-06-30|  30| 62000|   Finance|
|  2| Alice|1997-02-28|  25| 90000|   Finance|
|  4| Emily|1994-11-22|  28| 70000|   Finance|
|  9| James|1983-10-14|  39| 87000|        IT|
|  1|  John|1992-05-12|  30| 70000|        IT|
|  8|  Lisa|1995-08-20|  27| 58000|        HR|
+---+------+----------+----+------+----------+



In [0]:
df_json_order = df_json.orderBy("id")
df_json_order.show()

+---+------+----------+----+------+----------+
| id|  name|       dob| age|salary|department|
+---+------+----------+----+------+----------+
|  1|  John|1992-05-12|  30| 70000|        IT|
|  1|  John|1992-05-12|  30| 70000|        IT|
|  2| Alice|1997-02-28|  25| 90000|   Finance|
|  2| Alice|1997-02-28|  25| 60000|        HR|
|  3|   Bob|      null|null| 80000|        IT|
|  4| Emily|1994-11-22|  28| 70000|   Finance|
|  4| Emily|1994-11-22|  28| 65000|   Finance|
|  5| David|1981-12-18|  41| 90000|        HR|
|  6| Susan|1989-07-05|  33| 75000|   Finance|
|  7|  Mike|1976-03-15|  46| 95000|        IT|
|  8|  Lisa|1995-08-20|  27| 58000|        HR|
|  9| James|1983-10-14|  39| 87000|        IT|
| 10|Sophie|1992-06-30|  30| 62000|   Finance|
+---+------+----------+----+------+----------+



In [0]:
df_json_order.printSchema()

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- dob: date (nullable = true)
 |-- age: integer (nullable = true)
 |-- salary: integer (nullable = true)
 |-- department: string (nullable = true)



**Union**

In [0]:
df = df_csv.union(df_json_order)
df.show()
print(df.count())

+---+------+----------+----+------+----------+
| id|  name|       dob| age|salary|department|
+---+------+----------+----+------+----------+
|  1|  John|1992-05-12|  30| 70000|        IT|
|  2| Alice|1997-02-28|  25| 60000|        HR|
|  3|   Bob|      null|null| 80000|        IT|
|  4| Emily|1994-11-22|  28| 65000|   Finance|
|  1|  John|1992-05-12|  30| 70000|        IT|
|  1|  John|1992-05-12|  30| 70000|        IT|
|  2| Alice|1997-02-28|  25| 60000|        HR|
|  2| Alice|1997-02-28|  25| 90000|   Finance|
|  3|   Bob|      null|null| 80000|        IT|
|  4| Emily|1994-11-22|  28| 65000|   Finance|
|  4| Emily|1994-11-22|  28| 70000|   Finance|
|  5| David|1981-12-18|  41| 90000|        HR|
|  6| Susan|1989-07-05|  33| 75000|   Finance|
|  7|  Mike|1976-03-15|  46| 95000|        IT|
|  8|  Lisa|1995-08-20|  27| 58000|        HR|
|  9| James|1983-10-14|  39| 87000|        IT|
| 10|Sophie|1992-06-30|  30| 62000|   Finance|
+---+------+----------+----+------+----------+

17


In [0]:
print(df_csv.columns, df_json_order.columns)

['id', 'name', 'dob', 'age', 'salary', 'department'] ['id', 'name', 'dob', 'age', 'salary', 'department']


Columns of both datasets are not in same order

In [0]:
#df_json_order = df_json_order.select(df_csv) 

In [0]:
df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- dob: date (nullable = true)
 |-- age: integer (nullable = true)
 |-- salary: integer (nullable = true)
 |-- department: string (nullable = true)



UnionAll is depricated in PySPark. Union() and UnionAll() both act as UnionAll(). Meaning the duplicates are not trucated.

In [0]:
df = df.dropDuplicates()

In [0]:
print(df.count())

12


**TRANSFORMATION**

In [0]:
df = df.orderBy("id")
df.show()

+---+------+----------+----+------+----------+
| id|  name|       dob| age|salary|department|
+---+------+----------+----+------+----------+
|  1|  John|1992-05-12|  30| 70000|        IT|
|  2| Alice|1997-02-28|  25| 60000|        HR|
|  2| Alice|1997-02-28|  25| 90000|   Finance|
|  3|   Bob|      null|null| 80000|        IT|
|  4| Emily|1994-11-22|  28| 65000|   Finance|
|  4| Emily|1994-11-22|  28| 70000|   Finance|
|  5| David|1981-12-18|  41| 90000|        HR|
|  6| Susan|1989-07-05|  33| 75000|   Finance|
|  7|  Mike|1976-03-15|  46| 95000|        IT|
|  8|  Lisa|1995-08-20|  27| 58000|        HR|
|  9| James|1983-10-14|  39| 87000|        IT|
| 10|Sophie|1992-06-30|  30| 62000|   Finance|
+---+------+----------+----+------+----------+



In [0]:
from pyspark.sql import functions as F
#from pyspark.sql.functions import *
#from pyspark.sql.functions import col,max,min

Ways to select

In [0]:
df.select(
    "salary",
    "age"
).show()

df.select(
    ["salary","age"]
).show()

df.select(
    df.salary,
    df.age
).show()

df.select(
    F.col("salary"),
    F.col("age")
).show()

+------+----+
|salary| age|
+------+----+
| 70000|  30|
| 60000|  25|
| 90000|  25|
| 80000|null|
| 65000|  28|
| 70000|  28|
| 90000|  41|
| 75000|  33|
| 95000|  46|
| 58000|  27|
| 87000|  39|
| 62000|  30|
+------+----+

+------+----+
|salary| age|
+------+----+
| 70000|  30|
| 60000|  25|
| 90000|  25|
| 80000|null|
| 65000|  28|
| 70000|  28|
| 90000|  41|
| 75000|  33|
| 95000|  46|
| 58000|  27|
| 87000|  39|
| 62000|  30|
+------+----+

+------+----+
|salary| age|
+------+----+
| 70000|  30|
| 60000|  25|
| 90000|  25|
| 80000|null|
| 65000|  28|
| 70000|  28|
| 90000|  41|
| 75000|  33|
| 95000|  46|
| 58000|  27|
| 87000|  39|
| 62000|  30|
+------+----+

+------+----+
|salary| age|
+------+----+
| 70000|  30|
| 60000|  25|
| 90000|  25|
| 80000|null|
| 65000|  28|
| 70000|  28|
| 90000|  41|
| 75000|  33|
| 95000|  46|
| 58000|  27|
| 87000|  39|
| 62000|  30|
+------+----+



In [0]:
df.select(
    df.salary + 0.05 *df.salary,
    F.year(F.current_timestamp()) - F.year("dob"),
    F.year(F.current_timestamp()) - F.year(F.col("dob"))
).show()

+--------------------------+---------------------------------------+---------------------------------------+
|(salary + (salary * 0.05))|(year(current_timestamp()) - year(dob))|(year(current_timestamp()) - year(dob))|
+--------------------------+---------------------------------------+---------------------------------------+
|                   73500.0|                                     32|                                     32|
|                   63000.0|                                     27|                                     27|
|                   94500.0|                                     27|                                     27|
|                   84000.0|                                   null|                                   null|
|                   68250.0|                                     30|                                     30|
|                   73500.0|                                     30|                                     30|
|                  

In [0]:
df.select(
    F.col("*")
).show()

+---+------+----------+----+------+----------+
| id|  name|       dob| age|salary|department|
+---+------+----------+----+------+----------+
|  1|  John|1992-05-12|  30| 70000|        IT|
|  2| Alice|1997-02-28|  25| 60000|        HR|
|  2| Alice|1997-02-28|  25| 90000|   Finance|
|  3|   Bob|      null|null| 80000|        IT|
|  4| Emily|1994-11-22|  28| 65000|   Finance|
|  4| Emily|1994-11-22|  28| 70000|   Finance|
|  5| David|1981-12-18|  41| 90000|        HR|
|  6| Susan|1989-07-05|  33| 75000|   Finance|
|  7|  Mike|1976-03-15|  46| 95000|        IT|
|  8|  Lisa|1995-08-20|  27| 58000|        HR|
|  9| James|1983-10-14|  39| 87000|        IT|
| 10|Sophie|1992-06-30|  30| 62000|   Finance|
+---+------+----------+----+------+----------+



**Alias, Select and Expr**

In [0]:
df.select(
    df.salary + 0.05 *df.salary,
    F.expr('salary + 0.05 * salary').alias("salary_raise"),
    F.year(F.current_timestamp()) - F.year("dob"),
    (F.year(F.current_timestamp()) - F.year(F.col("dob"))).alias("age")
).show()

+--------------------------+------------+---------------------------------------+----+
|(salary + (salary * 0.05))|salary_raise|(year(current_timestamp()) - year(dob))| age|
+--------------------------+------------+---------------------------------------+----+
|                   73500.0|    73500.00|                                     32|  32|
|                   63000.0|    63000.00|                                     27|  27|
|                   94500.0|    94500.00|                                     27|  27|
|                   84000.0|    84000.00|                                   null|null|
|                   68250.0|    68250.00|                                     30|  30|
|                   73500.0|    73500.00|                                     30|  30|
|                   94500.0|    94500.00|                                     43|  43|
|                   78750.0|    78750.00|                                     35|  35|
|                   99750.0|    99750.00|  

**WithColumn**

In [0]:
"""
df = df.withColumn(
  "salary_raise",
  (F.col("salary") + 0.5 * F.col("salary"))
)
df = df.withColumn(
  "salary_raise_10_perc"
  (F.col("salary") + 0.5 * F.col("salary"))
)
"""

Out[198]: '\ndf = df.withColumn(\n  "salary_raise",\n  (F.col("salary") + 0.5 * F.col("salary"))\n)\ndf = df.withColumn(\n  "salary_raise_10_perc"\n  (F.col("salary") + 0.5 * F.col("salary"))\n)\n'

In [0]:
df = df.withColumn(
  "salary_raise",
  (F.col("salary") + 0.5 * F.col("salary"))
).withColumn(
  "salary_raise_10_perc",
  (F.col("salary") + 0.5 * F.col("salary"))
)
df.show()

+---+------+----------+----+------+----------+------------+--------------------+
| id|  name|       dob| age|salary|department|salary_raise|salary_raise_10_perc|
+---+------+----------+----+------+----------+------------+--------------------+
|  1|  John|1992-05-12|  30| 70000|        IT|    105000.0|            105000.0|
|  2| Alice|1997-02-28|  25| 60000|        HR|     90000.0|             90000.0|
|  2| Alice|1997-02-28|  25| 90000|   Finance|    135000.0|            135000.0|
|  3|   Bob|      null|null| 80000|        IT|    120000.0|            120000.0|
|  4| Emily|1994-11-22|  28| 65000|   Finance|     97500.0|             97500.0|
|  4| Emily|1994-11-22|  28| 70000|   Finance|    105000.0|            105000.0|
|  5| David|1981-12-18|  41| 90000|        HR|    135000.0|            135000.0|
|  6| Susan|1989-07-05|  33| 75000|   Finance|    112500.0|            112500.0|
|  7|  Mike|1976-03-15|  46| 95000|        IT|    142500.0|            142500.0|
|  8|  Lisa|1995-08-20|  27|

In [0]:
df = df.withColumns(
    {
        "salary_raise": F.col("salary") + 0.05 * F.col("salary"),
        "age": F.year(F.current_timestamp()) - F.year(F.col("dob"))
    }
)
df.show()

+---+------+----------+----+------+----------+------------+--------------------+
| id|  name|       dob| age|salary|department|salary_raise|salary_raise_10_perc|
+---+------+----------+----+------+----------+------------+--------------------+
|  1|  John|1992-05-12|  32| 70000|        IT|     73500.0|            105000.0|
|  2| Alice|1997-02-28|  27| 60000|        HR|     63000.0|             90000.0|
|  2| Alice|1997-02-28|  27| 90000|   Finance|     94500.0|            135000.0|
|  3|   Bob|      null|null| 80000|        IT|     84000.0|            120000.0|
|  4| Emily|1994-11-22|  30| 65000|   Finance|     68250.0|             97500.0|
|  4| Emily|1994-11-22|  30| 70000|   Finance|     73500.0|            105000.0|
|  5| David|1981-12-18|  43| 90000|        HR|     94500.0|            135000.0|
|  6| Susan|1989-07-05|  35| 75000|   Finance|     78750.0|            112500.0|
|  7|  Mike|1976-03-15|  48| 95000|        IT|     99750.0|            142500.0|
|  8|  Lisa|1995-08-20|  29|

Using Filter

In [0]:
df.filter(
    F.col("salary_raise")>=75000
).show()

+---+-----+----------+----+------+----------+------------+--------------------+
| id| name|       dob| age|salary|department|salary_raise|salary_raise_10_perc|
+---+-----+----------+----+------+----------+------------+--------------------+
|  2|Alice|1997-02-28|  27| 90000|   Finance|     94500.0|            135000.0|
|  3|  Bob|      null|null| 80000|        IT|     84000.0|            120000.0|
|  5|David|1981-12-18|  43| 90000|        HR|     94500.0|            135000.0|
|  6|Susan|1989-07-05|  35| 75000|   Finance|     78750.0|            112500.0|
|  7| Mike|1976-03-15|  48| 95000|        IT|     99750.0|            142500.0|
|  9|James|1983-10-14|  41| 87000|        IT|     91350.0|            130500.0|
+---+-----+----------+----+------+----------+------------+--------------------+



In [0]:
df.where(
    F.col("salary_raise")>=75000
).show()

+---+-----+----------+----+------+----------+------------+--------------------+
| id| name|       dob| age|salary|department|salary_raise|salary_raise_10_perc|
+---+-----+----------+----+------+----------+------------+--------------------+
|  2|Alice|1997-02-28|  27| 90000|   Finance|     94500.0|            135000.0|
|  3|  Bob|      null|null| 80000|        IT|     84000.0|            120000.0|
|  5|David|1981-12-18|  43| 90000|        HR|     94500.0|            135000.0|
|  6|Susan|1989-07-05|  35| 75000|   Finance|     78750.0|            112500.0|
|  7| Mike|1976-03-15|  48| 95000|        IT|     99750.0|            142500.0|
|  9|James|1983-10-14|  41| 87000|        IT|     91350.0|            130500.0|
+---+-----+----------+----+------+----------+------------+--------------------+



In [0]:
df = df.withColumn(
    "age_group",
    F.when(
        F.col("age") <=20,
        "Upto 20"
    ).when(
        (
            (F.col("age") >20) &
            (F.col("age") <=30)
        ),
        "21 to 30"
    ).when (
        (
            (F.col("age") > 30) &
            (F.col("age") <= 40)
        ),
        "31 to 40"
    ).otherwise(
        "More than 40"
    )
)

df.show()

+---+------+----------+----+------+----------+------------+--------------------+------------+
| id|  name|       dob| age|salary|department|salary_raise|salary_raise_10_perc|   age_group|
+---+------+----------+----+------+----------+------------+--------------------+------------+
|  1|  John|1992-05-12|  32| 70000|        IT|     73500.0|            105000.0|    31 to 40|
|  2| Alice|1997-02-28|  27| 60000|        HR|     63000.0|             90000.0|    21 to 30|
|  2| Alice|1997-02-28|  27| 90000|   Finance|     94500.0|            135000.0|    21 to 30|
|  3|   Bob|      null|null| 80000|        IT|     84000.0|            120000.0|More than 40|
|  4| Emily|1994-11-22|  30| 65000|   Finance|     68250.0|             97500.0|    21 to 30|
|  4| Emily|1994-11-22|  30| 70000|   Finance|     73500.0|            105000.0|    21 to 30|
|  5| David|1981-12-18|  43| 90000|        HR|     94500.0|            135000.0|More than 40|
|  6| Susan|1989-07-05|  35| 75000|   Finance|     78750.0| 

In [0]:
df = df.withColumn(
    "age_group",
    F.when(
        F.col("age") <=20,
        "Upto 20"
    ).when(
        (
            (F.col("age") >20) &
            (F.col("age") <=30)
        ),
        "21 to 30"
    ).when (
        (
            (F.col("age") > 30) &
            (F.col("age") <= 40)
        ),
        F.col("name") #col value can be assign instead of contant value
    ).otherwise(
        "More than 40"
    )
)
df.show()

+---+------+----------+----+------+----------+------------+--------------------+------------+
| id|  name|       dob| age|salary|department|salary_raise|salary_raise_10_perc|   age_group|
+---+------+----------+----+------+----------+------------+--------------------+------------+
|  1|  John|1992-05-12|  32| 70000|        IT|     73500.0|            105000.0|        John|
|  2| Alice|1997-02-28|  27| 60000|        HR|     63000.0|             90000.0|    21 to 30|
|  2| Alice|1997-02-28|  27| 90000|   Finance|     94500.0|            135000.0|    21 to 30|
|  3|   Bob|      null|null| 80000|        IT|     84000.0|            120000.0|More than 40|
|  4| Emily|1994-11-22|  30| 65000|   Finance|     68250.0|             97500.0|    21 to 30|
|  4| Emily|1994-11-22|  30| 70000|   Finance|     73500.0|            105000.0|    21 to 30|
|  5| David|1981-12-18|  43| 90000|        HR|     94500.0|            135000.0|More than 40|
|  6| Susan|1989-07-05|  35| 75000|   Finance|     78750.0| 

In [0]:
df =df.withColumns(
    {
        "company": F.lit("Abascus Insights")
    }
)

df.show()

+---+------+----------+----+------+----------+------------+--------------------+------------+----------------+
| id|  name|       dob| age|salary|department|salary_raise|salary_raise_10_perc|   age_group|         company|
+---+------+----------+----+------+----------+------------+--------------------+------------+----------------+
|  1|  John|1992-05-12|  32| 70000|        IT|     73500.0|            105000.0|        John|Abascus Insights|
|  2| Alice|1997-02-28|  27| 60000|        HR|     63000.0|             90000.0|    21 to 30|Abascus Insights|
|  2| Alice|1997-02-28|  27| 90000|   Finance|     94500.0|            135000.0|    21 to 30|Abascus Insights|
|  3|   Bob|      null|null| 80000|        IT|     84000.0|            120000.0|More than 40|Abascus Insights|
|  4| Emily|1994-11-22|  30| 65000|   Finance|     68250.0|             97500.0|    21 to 30|Abascus Insights|
|  4| Emily|1994-11-22|  30| 70000|   Finance|     73500.0|            105000.0|    21 to 30|Abascus Insights|
|

*Nulls*

In [0]:
df = df.withColumns(
  {
    "age" : F.coalesce(
      F.year(F.current_timestamp()) - F.year(F.col("dob")),
      F.lit(-1),
    ),
    "has_dob_1": ~(F.isnull("dob")),
    "has_dob_2": F.col("dob").isNotNull()
  }
)
df.show()

+---+------+----------+---+------+----------+------------+--------------------+------------+----------------+---------+---------+
| id|  name|       dob|age|salary|department|salary_raise|salary_raise_10_perc|   age_group|         company|has_dob_1|has_dob_2|
+---+------+----------+---+------+----------+------------+--------------------+------------+----------------+---------+---------+
|  1|  John|1992-05-12| 32| 70000|        IT|     73500.0|            105000.0|        John|Abascus Insights|     true|     true|
|  2| Alice|1997-02-28| 27| 60000|        HR|     63000.0|             90000.0|    21 to 30|Abascus Insights|     true|     true|
|  2| Alice|1997-02-28| 27| 90000|   Finance|     94500.0|            135000.0|    21 to 30|Abascus Insights|     true|     true|
|  3|   Bob|      null| -1| 80000|        IT|     84000.0|            120000.0|More than 40|Abascus Insights|    false|    false|
|  4| Emily|1994-11-22| 30| 65000|   Finance|     68250.0|             97500.0|    21 to 3

**Dropping Columns**

In [0]:
df = df.drop("company", "has_dob_1", "has_dob_2", "salary_raise_10_perc")
df.show()

+---+------+----------+---+------+----------+------------+------------+
| id|  name|       dob|age|salary|department|salary_raise|   age_group|
+---+------+----------+---+------+----------+------------+------------+
|  1|  John|1992-05-12| 32| 70000|        IT|     73500.0|        John|
|  2| Alice|1997-02-28| 27| 60000|        HR|     63000.0|    21 to 30|
|  2| Alice|1997-02-28| 27| 90000|   Finance|     94500.0|    21 to 30|
|  3|   Bob|      null| -1| 80000|        IT|     84000.0|More than 40|
|  4| Emily|1994-11-22| 30| 65000|   Finance|     68250.0|    21 to 30|
|  4| Emily|1994-11-22| 30| 70000|   Finance|     73500.0|    21 to 30|
|  5| David|1981-12-18| 43| 90000|        HR|     94500.0|More than 40|
|  6| Susan|1989-07-05| 35| 75000|   Finance|     78750.0|       Susan|
|  7|  Mike|1976-03-15| 48| 95000|        IT|     99750.0|More than 40|
|  8|  Lisa|1995-08-20| 29| 58000|        HR|     60900.0|    21 to 30|
|  9| James|1983-10-14| 41| 87000|        IT|     91350.0|More t

withColumnRenamed

In [0]:
df = df.withColumnRenamed("name", "first_name")
df.show()

+---+----------+----------+---+------+----------+------------+------------+
| id|first_name|       dob|age|salary|department|salary_raise|   age_group|
+---+----------+----------+---+------+----------+------------+------------+
|  1|      John|1992-05-12| 32| 70000|        IT|     73500.0|        John|
|  2|     Alice|1997-02-28| 27| 60000|        HR|     63000.0|    21 to 30|
|  2|     Alice|1997-02-28| 27| 90000|   Finance|     94500.0|    21 to 30|
|  3|       Bob|      null| -1| 80000|        IT|     84000.0|More than 40|
|  4|     Emily|1994-11-22| 30| 65000|   Finance|     68250.0|    21 to 30|
|  4|     Emily|1994-11-22| 30| 70000|   Finance|     73500.0|    21 to 30|
|  5|     David|1981-12-18| 43| 90000|        HR|     94500.0|More than 40|
|  6|     Susan|1989-07-05| 35| 75000|   Finance|     78750.0|       Susan|
|  7|      Mike|1976-03-15| 48| 95000|        IT|     99750.0|More than 40|
|  8|      Lisa|1995-08-20| 29| 58000|        HR|     60900.0|    21 to 30|
|  9|     Ja

**Partition By**

In [0]:
from pyspark.sql import window

In [0]:
window = window.partitionBy("age_group")

df.withColumn(
    "min_age_in_group"
)