In [0]:
from pyspark.sql import SparkSession, functions as F
from pyspark.sql import Window
from pyspark.sql.types import StructType,StructField,IntegerType,StringType,DateType
spark=SparkSession.builder.getOrCreate()


In [0]:
schema=StructType([
    StructField("id",IntegerType()),
    StructField("name",StringType()),
    StructField("dob",DateType()),
    StructField("age",IntegerType()),
    StructField("salary",IntegerType()),
    StructField("department",StringType()),

])

In [0]:
df_csv=spark.read.format("csv").schema(schema).option("header",True).load("dbfs:///FileStore/tables/data/csv/batch.csv")

In [0]:
df_json = spark.read.format("json").load("dbfs:///FileStore/tables/data/json"
)

In [0]:
df_json=df_json.select(df_csv.columns)

In [0]:
df=df_json.union(df_csv)
# print(df.disp())
# df.drop()

In [0]:
# # repeated 2 times then first value  should be true and remaining rows should be false
window_spec = Window.partitionBy("id").orderBy("dob")

df = df.withColumn(
    "is_duplicate",
    F.row_number().over(window_spec) > 1
)
df.display()

id,name,dob,age,salary,department,is_duplicate
1,John,1992-05-12,30.0,70000,IT,False
1,John,1992-05-12,30.0,70000,IT,True
1,John,1992-05-12,30.0,70000,IT,True
1,John,1992-05-12,30.0,70000,IT,True
1,John,1992-05-12,30.0,70000,IT,True
2,Alice,1997-02-28,25.0,60000,HR,False
2,Alice,1997-02-28,25.0,60000,HR,True
2,Alice,1997-02-28,25.0,90000,Finance,True
2,Alice,1997-02-28,25.0,90000,Finance,True
2,Alice,1997-02-28,25.0,60000,HR,True


In [0]:
df=df.dropDuplicates(["id"])


In [0]:
# Calculate mean salary and check if it is greater or equal to the salary of employees in each department.
window_spec = Window.partitionBy("department")
df=df.withColumns({
    "mean_salary_department":F.avg("salary").over(window_spec),
    "is_above_mean_department":    F.col("salary") >= F.col("mean_salary_department")
})

df.display()

id,name,dob,age,salary,department,is_duplicate,mean_salary_department,is_above_mean_department
4,Emily,1994-11-22,28.0,65000,Finance,True,67333.33333333333,False
6,Susan,1989-07-05,33.0,75000,Finance,True,67333.33333333333,True
10,Sophie,1992-06-30,30.0,62000,Finance,True,67333.33333333333,False
2,Alice,1997-02-28,25.0,60000,HR,True,69333.33333333333,False
5,David,1981-12-18,41.0,90000,HR,True,69333.33333333333,True
8,Lisa,1995-08-20,27.0,58000,HR,True,69333.33333333333,False
1,John,1992-05-12,30.0,70000,IT,True,83000.0,False
3,Bob,,,80000,IT,True,83000.0,False
7,Mike,1976-03-15,46.0,95000,IT,True,83000.0,True
9,James,1983-10-14,39.0,87000,IT,True,83000.0,True


In [0]:
# Calculate mean salary and check if it is greater or equal to the salary of all employees.

mean_salary = df.select(F.avg('salary')).first()[0]

df.withColumns(
    {
        'avg_salary':F.lit(mean_salary),
        'is_above_avg':F.col("salary")>=mean_salary
    }
).show()

+---+------+----------+----+------+----------+------------+----------------------+------------------------+----------+------------+
| id|  name|       dob| age|salary|department|is_duplicate|mean_salary_department|is_above_mean_department|avg_salary|is_above_avg|
+---+------+----------+----+------+----------+------------+----------------------+------------------------+----------+------------+
|  1|  John|1992-05-12|  30| 70000|        IT|        true|               83000.0|                   false|   74200.0|       false|
|  2| Alice|1997-02-28|  25| 60000|        HR|        true|     69333.33333333333|                   false|   74200.0|       false|
|  3|   Bob|      null|null| 80000|        IT|        true|               83000.0|                   false|   74200.0|        true|
|  4| Emily|1994-11-22|  28| 65000|   Finance|        true|     67333.33333333333|                   false|   74200.0|       false|
|  5| David|1981-12-18|  41| 90000|        HR|        true|     69333.333333

In [0]:
df.show()



In [0]:
df=df.withColumns({
    "age":F.coalesce(
        F.year(F.current_timestamp())-F.year(F.col("dob")),
        F.lit(-1)
    ),
    "has_dob_1":~(F.isnull("dob")),
    "has_dob_2":F.col("dob").isNotNull(),
})



In [0]:
df.show()



In [0]:
df.withColumnRenamed("name","first_name").show()



In [0]:
df = df.withColumn(
    "age_group",
    F.when(F.col("age") <= 20, "Upto 20")
    .when((F.col("age") > 20) & (F.col("age") <= 30), "21 to 30")
    .when((F.col("age") > 30) & (F.col("age") <= 40), "31 to 40")
    .otherwise("More than 40")
    )



In [0]:
from pyspark.sql import Window

window=Window.partitionBy("age_group")

df.withColumn(
    "min_age_by_group",
    F.min("age").over(window)

).show()



In [0]:
# Define a window specification
window_spec = Window.partitionBy("id")

# Add a new column is_duplicate
df = df.withColumn("is_duplicate", F.count("id").over(window_spec) > 1)

# Show the result
df.show()

