In [0]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()
spark 

In [0]:
%fs ls dbfs:////FileStore/tables


path,name,size,modificationTime
dbfs:/FileStore/tables/Address-1.xlsx,Address-1.xlsx,151315,1706935437000
dbfs:/FileStore/tables/Address.xlsx,Address.xlsx,151315,1706076818000
dbfs:/FileStore/tables/Detail.csv,Detail.csv,208476,1706073616000
dbfs:/FileStore/tables/Project_1-1.xlsx,Project_1-1.xlsx,422501,1706605367000
dbfs:/FileStore/tables/Project_1.xlsx,Project_1.xlsx,422501,1706605307000
dbfs:/FileStore/tables/contactinfo-1.txt,contactinfo-1.txt,49969,1706935632000
dbfs:/FileStore/tables/contactinfo.txt,contactinfo.txt,49969,1706076166000
dbfs:/FileStore/tables/csv/,csv/,0,0
dbfs:/FileStore/tables/dummy_data_2.sql,dummy_data_2.sql,574,1706609575000
dbfs:/FileStore/tables/header.json,header.json,115560,1706518881000


In [0]:
df_csv=spark.read.format("csv").load("dbfs:////FileStore/tables/csv/batch.csv")

In [0]:
type(df_csv)

Out[62]: pyspark.sql.dataframe.DataFrame

In [0]:
df_csv.show()

+---+-----+----------+----+------+----------+
|_c0|  _c1|       _c2| _c3|   _c4|       _c5|
+---+-----+----------+----+------+----------+
| id| name|       dob| age|salary|department|
|  1| John|1992-05-12|  30| 70000|        IT|
|  2|Alice|1997-02-28|  25| 60000|        HR|
|  3|  Bob|      null|null| 80000|        IT|
|  4|Emily|1994-11-22|  28| 65000|   Finance|
+---+-----+----------+----+------+----------+



In [0]:
df_csv =spark.read.format("csv").option("header",True).load("dbfs:////FileStore/tables/csv/batch.csv")
df_csv.printSchema()

root
 |-- id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- dob: string (nullable = true)
 |-- age: string (nullable = true)
 |-- salary: string (nullable = true)
 |-- department: string (nullable = true)



In [0]:
from pyspark.sql.types import StructType,StructField,IntegerType,StringType,DateType

In [0]:
schema =StructType([
    StructField("id",IntegerType()),
    StructField("name",StringType()),
    StructField("dob",DateType()),
    StructField("age",IntegerType()),
    StructField("salary",IntegerType()),
    StructField("department",StringType()),

])

In [0]:
df_csv=spark.read.format("csv").schema(schema).option("header",True).load("dbfs:////FileStore/tables/csv/batch.csv")

In [0]:
df_csv.printSchema()

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- dob: date (nullable = true)
 |-- age: integer (nullable = true)
 |-- salary: integer (nullable = true)
 |-- department: string (nullable = true)



In [0]:
df_json=spark.read.format("json").load("dbfs:/FileStore/tables/json")


In [0]:
df_json.show()

+----+----------+----------+---+------+------+
| age|department|       dob| id|  name|salary|
+----+----------+----------+---+------+------+
|  30|        IT|1992-05-12|  1|  John| 70000|
|  25|        HR|1997-02-28|  2| Alice| 60000|
|null|        IT|      null|  3|   Bob| 80000|
|  28|   Finance|1994-11-22|  4| Emily| 65000|
|  41|        HR|1981-12-18|  5| David| 90000|
|  33|   Finance|1989-07-05|  6| Susan| 75000|
|  46|        IT|1976-03-15|  7|  Mike| 95000|
|  30|   Finance|1992-06-30| 10|Sophie| 62000|
|  25|   Finance|1997-02-28|  2| Alice| 90000|
|  28|   Finance|1994-11-22|  4| Emily| 70000|
|  39|        IT|1983-10-14|  9| James| 87000|
|  30|        IT|1992-05-12|  1|  John| 70000|
|  27|        HR|1995-08-20|  8|  Lisa| 58000|
+----+----------+----------+---+------+------+



In [0]:
df_json.printSchema()

root
 |-- age: long (nullable = true)
 |-- department: string (nullable = true)
 |-- dob: string (nullable = true)
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- salary: long (nullable = true)



In [0]:
df=df_csv.union(df_json)

In [0]:
df.show()

+----+-------+----------+----+------+----------+
|  id|   name|       dob| age|salary|department|
+----+-------+----------+----+------+----------+
|   1|   John|1992-05-12|  30| 70000|        IT|
|   2|  Alice|1997-02-28|  25| 60000|        HR|
|   3|    Bob|      null|null| 80000|        IT|
|   4|  Emily|1994-11-22|  28| 65000|   Finance|
|  30|     IT|1992-05-12|   1|  John|     70000|
|  25|     HR|1997-02-28|   2| Alice|     60000|
|null|     IT|      null|   3|   Bob|     80000|
|  28|Finance|1994-11-22|   4| Emily|     65000|
|  41|     HR|1981-12-18|   5| David|     90000|
|  33|Finance|1989-07-05|   6| Susan|     75000|
|  46|     IT|1976-03-15|   7|  Mike|     95000|
|  30|Finance|1992-06-30|  10|Sophie|     62000|
|  25|Finance|1997-02-28|   2| Alice|     90000|
|  28|Finance|1994-11-22|   4| Emily|     70000|
|  39|     IT|1983-10-14|   9| James|     87000|
|  30|     IT|1992-05-12|   1|  John|     70000|
|  27|     HR|1995-08-20|   8|  Lisa|     58000|
+----+-------+------

In [0]:
print(df_csv.columns,df_json.columns)

['id', 'name', 'dob', 'age', 'salary', 'department'] ['age', 'department', 'dob', 'id', 'name', 'salary']


In [0]:
df_json=df_json.select(df_csv.columns)
print(df_csv.columns,df_json.columns)

['id', 'name', 'dob', 'age', 'salary', 'department'] ['id', 'name', 'dob', 'age', 'salary', 'department']


In [0]:
df_json=spark.read.format("json").schema(schema).load("dbfs:/FileStore/tables/json").orderBy("id")
df_json.printSchema()

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- dob: date (nullable = true)
 |-- age: integer (nullable = true)
 |-- salary: integer (nullable = true)
 |-- department: string (nullable = true)



In [0]:
df=df.dropDuplicates()

In [0]:
df.show()

+----+-------+----------+----+------+----------+
|  id|   name|       dob| age|salary|department|
+----+-------+----------+----+------+----------+
|   2|  Alice|1997-02-28|  25| 60000|        HR|
|   3|    Bob|      null|null| 80000|        IT|
|   4|  Emily|1994-11-22|  28| 65000|   Finance|
|   1|   John|1992-05-12|  30| 70000|        IT|
|null|     IT|      null|   3|   Bob|     80000|
|  33|Finance|1989-07-05|   6| Susan|     75000|
|  41|     HR|1981-12-18|   5| David|     90000|
|  25|     HR|1997-02-28|   2| Alice|     60000|
|  30|     IT|1992-05-12|   1|  John|     70000|
|  28|Finance|1994-11-22|   4| Emily|     65000|
|  46|     IT|1976-03-15|   7|  Mike|     95000|
|  30|Finance|1992-06-30|  10|Sophie|     62000|
|  25|Finance|1997-02-28|   2| Alice|     90000|
|  28|Finance|1994-11-22|   4| Emily|     70000|
|  39|     IT|1983-10-14|   9| James|     87000|
|  27|     HR|1995-08-20|   8|  Lisa|     58000|
+----+-------+----------+----+------+----------+



In [0]:
from pyspark.sql import functions as F
from pyspark.sql.functions import *
from pyspark.sql.functions import col,max,min
df.show()


+----+-------+----------+----+------+----------+
|  id|   name|       dob| age|salary|department|
+----+-------+----------+----+------+----------+
|   2|  Alice|1997-02-28|  25| 60000|        HR|
|   3|    Bob|      null|null| 80000|        IT|
|   4|  Emily|1994-11-22|  28| 65000|   Finance|
|   1|   John|1992-05-12|  30| 70000|        IT|
|null|     IT|      null|   3|   Bob|     80000|
|  33|Finance|1989-07-05|   6| Susan|     75000|
|  41|     HR|1981-12-18|   5| David|     90000|
|  25|     HR|1997-02-28|   2| Alice|     60000|
|  30|     IT|1992-05-12|   1|  John|     70000|
|  28|Finance|1994-11-22|   4| Emily|     65000|
|  46|     IT|1976-03-15|   7|  Mike|     95000|
|  30|Finance|1992-06-30|  10|Sophie|     62000|
|  25|Finance|1997-02-28|   2| Alice|     90000|
|  28|Finance|1994-11-22|   4| Emily|     70000|
|  39|     IT|1983-10-14|   9| James|     87000|
|  27|     HR|1995-08-20|   8|  Lisa|     58000|
+----+-------+----------+----+------+----------+



In [0]:
df.select(
    df.name,df.salary
).show()

df.select(
    F.col("salary")
).show()

+-------+------+
|   name|salary|
+-------+------+
|  Alice| 60000|
|    Bob| 80000|
|  Emily| 65000|
|   John| 70000|
|     IT|   Bob|
|Finance| Susan|
|     HR| David|
|     HR| Alice|
|     IT|  John|
|Finance| Emily|
|     IT|  Mike|
|Finance|Sophie|
|Finance| Alice|
|Finance| Emily|
|     IT| James|
|     HR|  Lisa|
+-------+------+

+------+
|salary|
+------+
| 60000|
| 80000|
| 65000|
| 70000|
|   Bob|
| Susan|
| David|
| Alice|
|  John|
| Emily|
|  Mike|
|Sophie|
| Alice|
| Emily|
| James|
|  Lisa|
+------+



In [0]:
df.select(
    (df.salary + .05 * df.salary).alias("SAALRYY"),
    (F.year(F.current_timestamp()) -F.year(F.col("dob"))).alias("age"),
    F.year(F.current_timestamp()) - F.year("dob")

     ).show()

+-------+----+---------------------------------------+
|SAALRYY| age|(year(current_timestamp()) - year(dob))|
+-------+----+---------------------------------------+
|63000.0|  27|                                     27|
|84000.0|null|                                   null|
|68250.0|  30|                                     30|
|73500.0|  32|                                     32|
|   null|null|                                   null|
|   null|  35|                                     35|
|   null|  43|                                     43|
|   null|  27|                                     27|
|   null|  32|                                     32|
|   null|  30|                                     30|
|   null|  48|                                     48|
|   null|  32|                                     32|
|   null|  27|                                     27|
|   null|  30|                                     30|
|   null|  41|                                     41|
|   null| 

In [0]:
df.select(
    (df.salary + .05 * df.salary).alias("SAALRYY"),
    (F.year(F.current_timestamp()) -F.year(F.col("dob"))).alias("age"),
    (F.year(F.current_timestamp()) - F.year("dob")).alias("another age"),
     F.expr("salary + 0.5 * salary").alias("salary raize")

     ).show()

+-------+----+-----------+------------+
|SAALRYY| age|another age|salary raize|
+-------+----+-----------+------------+
|63000.0|  27|         27|     90000.0|
|84000.0|null|       null|    120000.0|
|68250.0|  30|         30|     97500.0|
|73500.0|  32|         32|    105000.0|
|   null|null|       null|        null|
|   null|  35|         35|        null|
|   null|  43|         43|        null|
|   null|  27|         27|        null|
|   null|  32|         32|        null|
|   null|  30|         30|        null|
|   null|  48|         48|        null|
|   null|  32|         32|        null|
|   null|  27|         27|        null|
|   null|  30|         30|        null|
|   null|  41|         41|        null|
|   null|  29|         29|        null|
+-------+----+-----------+------------+



In [0]:
df=df.withColumn(
    "salary_raise",(F.col("salary")+ 0.05 * F.col("salary")) 
)

df=df.withColumn(
    "salary_raise_10_perc",
    (F.col("salary")+.1 * F.col("salary"))
)

In [0]:
df.show()

+----+-------+----------+----+------+----------+------------+--------------------+
|  id|   name|       dob| age|salary|department|salary_raise|salary_raise_10_perc|
+----+-------+----------+----+------+----------+------------+--------------------+
|   2|  Alice|1997-02-28|  25| 60000|        HR|     63000.0|             66000.0|
|   3|    Bob|      null|null| 80000|        IT|     84000.0|             88000.0|
|   4|  Emily|1994-11-22|  28| 65000|   Finance|     68250.0|             71500.0|
|   1|   John|1992-05-12|  30| 70000|        IT|     73500.0|             77000.0|
|null|     IT|      null|   3|   Bob|     80000|        null|                null|
|  33|Finance|1989-07-05|   6| Susan|     75000|        null|                null|
|  41|     HR|1981-12-18|   5| David|     90000|        null|                null|
|  25|     HR|1997-02-28|   2| Alice|     60000|        null|                null|
|  30|     IT|1992-05-12|   1|  John|     70000|        null|                null|
|  2

In [0]:

df = df.withColumn(
    "age_group",
    F.when(
        F.col("age") <=20,
        "Upto 20"
    ).when(
        (
            (F.col("age") >20) &
            (F.col("age") <=30)
        ),
        "21 to 30"
    ).when (
        (
            (F.col("age") > 30) &
            (F.col("age") <= 40)
        ),
        "31 to 40"
    ).otherwise(
        "More than 40"
    )
)

In [0]:
df.show()

+----+-------+----------+----+------+----------+------------+--------------------+------------+
|  id|   name|       dob| age|salary|department|salary_raise|salary_raise_10_perc|   age_group|
+----+-------+----------+----+------+----------+------------+--------------------+------------+
|   2|  Alice|1997-02-28|  25| 60000|        HR|     63000.0|             66000.0|    21 to 30|
|   3|    Bob|      null|null| 80000|        IT|     84000.0|             88000.0|More than 40|
|   4|  Emily|1994-11-22|  28| 65000|   Finance|     68250.0|             71500.0|    21 to 30|
|   1|   John|1992-05-12|  30| 70000|        IT|     73500.0|             77000.0|    21 to 30|
|null|     IT|      null|   3|   Bob|     80000|        null|                null|     Upto 20|
|  33|Finance|1989-07-05|   6| Susan|     75000|        null|                null|     Upto 20|
|  41|     HR|1981-12-18|   5| David|     90000|        null|                null|     Upto 20|
|  25|     HR|1997-02-28|   2| Alice|   

In [0]:
df = df.withColumns(
    {
        "age": F.coalesce(
            F.year (F.current_timestamp()) - F.year(F.col("dob")),
            F.lit(-1),
        ) ,
        "has_dob_1": ~(F.isnull("dob")),
        "has_dob_2": F.col("dob").isNotNull(),
    }

)

In [0]:
df.show()

+----+-------+----------+---+------+----------+------------+--------------------+------------+---------+---------+
|  id|   name|       dob|age|salary|department|salary_raise|salary_raise_10_perc|   age_group|has_dob_1|has_dob_2|
+----+-------+----------+---+------+----------+------------+--------------------+------------+---------+---------+
|   2|  Alice|1997-02-28| 27| 60000|        HR|     63000.0|             66000.0|    21 to 30|     true|     true|
|   3|    Bob|      null| -1| 80000|        IT|     84000.0|             88000.0|More than 40|    false|    false|
|   4|  Emily|1994-11-22| 30| 65000|   Finance|     68250.0|             71500.0|    21 to 30|     true|     true|
|   1|   John|1992-05-12| 32| 70000|        IT|     73500.0|             77000.0|    21 to 30|     true|     true|
|null|     IT|      null| -1|   Bob|     80000|        null|                null|     Upto 20|    false|    false|
|  33|Finance|1989-07-05| 35| Susan|     75000|        null|                null

In [0]:
df=df.withColumn(
    "company",F.lit("ABACUS INSIGHTS")
)

In [0]:
df.show()

+----+-------+----------+---+------+----------+------------+--------------------+------------+---------+---------+---------------+
|  id|   name|       dob|age|salary|department|salary_raise|salary_raise_10_perc|   age_group|has_dob_1|has_dob_2|        company|
+----+-------+----------+---+------+----------+------------+--------------------+------------+---------+---------+---------------+
|   2|  Alice|1997-02-28| 27| 60000|        HR|     63000.0|             66000.0|    21 to 30|     true|     true|ABACUS INSIGHTS|
|   3|    Bob|      null| -1| 80000|        IT|     84000.0|             88000.0|More than 40|    false|    false|ABACUS INSIGHTS|
|   4|  Emily|1994-11-22| 30| 65000|   Finance|     68250.0|             71500.0|    21 to 30|     true|     true|ABACUS INSIGHTS|
|   1|   John|1992-05-12| 32| 70000|        IT|     73500.0|             77000.0|    21 to 30|     true|     true|ABACUS INSIGHTS|
|null|     IT|      null| -1|   Bob|     80000|        null|                null|  

In [0]:
df = df.drop( "company", "has_dob_1", "has_dob_2", "salary_raise_10_perc")

In [0]:
df=df.withColumnRenamed("name","firstName")

In [0]:
df.show()

+----+---------+----------+---+------+----------+------------+------------+
|  id|firstName|       dob|age|salary|department|salary_raise|   age_group|
+----+---------+----------+---+------+----------+------------+------------+
|   2|    Alice|1997-02-28| 27| 60000|        HR|     63000.0|    21 to 30|
|   3|      Bob|      null| -1| 80000|        IT|     84000.0|More than 40|
|   4|    Emily|1994-11-22| 30| 65000|   Finance|     68250.0|    21 to 30|
|   1|     John|1992-05-12| 32| 70000|        IT|     73500.0|    21 to 30|
|null|       IT|      null| -1|   Bob|     80000|        null|     Upto 20|
|  33|  Finance|1989-07-05| 35| Susan|     75000|        null|     Upto 20|
|  41|       HR|1981-12-18| 43| David|     90000|        null|     Upto 20|
|  25|       HR|1997-02-28| 27| Alice|     60000|        null|     Upto 20|
|  30|       IT|1992-05-12| 32|  John|     70000|        null|     Upto 20|
|  28|  Finance|1994-11-22| 30| Emily|     65000|        null|     Upto 20|
|  46|      