In [0]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
spark




In [0]:
%fs ls dbfs:///FileStore/tables/

path,name,size,modificationTime
dbfs:/FileStore/tables/csv/,csv/,0,0
dbfs:/FileStore/tables/json/,json/,0,0
dbfs:/FileStore/tables/parquet/,parquet/,0,0


In [0]:
df_csv = spark.read.format("csv").option('header' , True).load("dbfs:/FileStore/tables/csv/batch.csv")

In [0]:
df_csv.printSchema()

root
 |-- id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- dob: string (nullable = true)
 |-- age: string (nullable = true)
 |-- salary: string (nullable = true)
 |-- department: string (nullable = true)



In [0]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType, DateType


In [0]:
schema = StructType([
    StructField("id", IntegerType()),
    StructField("name", StringType()),
    StructField("dob", DateType()),
    StructField("age", IntegerType()),
    StructField("salary", IntegerType()),
    StructField("department", StringType()),
])

In [0]:
from pyspark.sql.functions import col

In [0]:
df1 = df_csv.select(
    col("id").cast(IntegerType()),
    col("name").cast(StringType()),
    col("dob").cast(DateType()),
    col("age").cast(IntegerType()),
    col("salary").cast(IntegerType()),
    col("department").cast(StringType())
)


In [0]:
df2 = df_json.select(
    col('id').cast(IntegerType()),
    col("name").cast(StringType()),
    col("dob").cast(DateType()),
    col("age").cast(IntegerType()),
    col("salary").cast(IntegerType()),
    col("department").cast(StringType())
)

In [0]:
df1.printSchema()

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- dob: date (nullable = true)
 |-- age: integer (nullable = true)
 |-- salary: integer (nullable = true)
 |-- department: string (nullable = true)



In [0]:
df2.printSchema()

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- dob: date (nullable = true)
 |-- age: integer (nullable = true)
 |-- salary: integer (nullable = true)
 |-- department: string (nullable = true)



In [0]:
df_csv.show()

+---+-----+----------+----+------+----------+
| id| name|       dob| age|salary|department|
+---+-----+----------+----+------+----------+
|  1| John|1992-05-12|  30| 70000|        IT|
|  2|Alice|1997-02-28|  25| 60000|        HR|
|  3|  Bob|      null|null| 80000|        IT|
|  4|Emily|1994-11-22|  28| 65000|   Finance|
+---+-----+----------+----+------+----------+



In [0]:
df_json = spark.read.format('json').option('header',True).load("dbfs:/FileStore/tables/json")

In [0]:
df_json.show()

+----+----------+----------+---+------+------+
| age|department|       dob| id|  name|salary|
+----+----------+----------+---+------+------+
|  30|        IT|1992-05-12|  1|  John| 70000|
|  25|        HR|1997-02-28|  2| Alice| 60000|
|null|        IT|      null|  3|   Bob| 80000|
|  28|   Finance|1994-11-22|  4| Emily| 65000|
|  41|        HR|1981-12-18|  5| David| 90000|
|  33|   Finance|1989-07-05|  6| Susan| 75000|
|  46|        IT|1976-03-15|  7|  Mike| 95000|
|  30|   Finance|1992-06-30| 10|Sophie| 62000|
|  25|   Finance|1997-02-28|  2| Alice| 90000|
|  28|   Finance|1994-11-22|  4| Emily| 70000|
|  39|        IT|1983-10-14|  9| James| 87000|
|  30|        IT|1992-05-12|  1|  John| 70000|
|  27|        HR|1995-08-20|  8|  Lisa| 58000|
+----+----------+----------+---+------+------+



In [0]:
df_json.printSchema()

root
 |-- age: long (nullable = true)
 |-- department: string (nullable = true)
 |-- dob: string (nullable = true)
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- salary: long (nullable = true)



In [0]:
df_csv.printSchema()

root
 |-- id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- dob: string (nullable = true)
 |-- age: string (nullable = true)
 |-- salary: string (nullable = true)
 |-- department: string (nullable = true)



In [0]:
df = df1.union(df2)

In [0]:
df

Out[241]: DataFrame[id: int, name: string, dob: date, age: int, salary: int, department: string]

In [0]:
df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- dob: date (nullable = true)
 |-- age: integer (nullable = true)
 |-- salary: integer (nullable = true)
 |-- department: string (nullable = true)



In [0]:
df.show()

+---+------+----------+----+------+----------+
| id|  name|       dob| age|salary|department|
+---+------+----------+----+------+----------+
|  1|  John|1992-05-12|  30| 70000|        IT|
|  2| Alice|1997-02-28|  25| 60000|        HR|
|  3|   Bob|      null|null| 80000|        IT|
|  4| Emily|1994-11-22|  28| 65000|   Finance|
|  1|  John|1992-05-12|  30| 70000|        IT|
|  2| Alice|1997-02-28|  25| 60000|        HR|
|  3|   Bob|      null|null| 80000|        IT|
|  4| Emily|1994-11-22|  28| 65000|   Finance|
|  5| David|1981-12-18|  41| 90000|        HR|
|  6| Susan|1989-07-05|  33| 75000|   Finance|
|  7|  Mike|1976-03-15|  46| 95000|        IT|
| 10|Sophie|1992-06-30|  30| 62000|   Finance|
|  2| Alice|1997-02-28|  25| 90000|   Finance|
|  4| Emily|1994-11-22|  28| 70000|   Finance|
|  9| James|1983-10-14|  39| 87000|        IT|
|  1|  John|1992-05-12|  30| 70000|        IT|
|  8|  Lisa|1995-08-20|  27| 58000|        HR|
+---+------+----------+----+------+----------+



In [0]:
df = df.withColumn ("salary_raise", df.salary + df.salary * .05) 

In [0]:
df.show()

+---+------+----------+----+------+----------+------------+
| id|  name|       dob| age|salary|department|salary_raise|
+---+------+----------+----+------+----------+------------+
|  1|  John|1992-05-12|  30| 70000|        IT|     73500.0|
|  2| Alice|1997-02-28|  25| 60000|        HR|     63000.0|
|  3|   Bob|      null|null| 80000|        IT|     84000.0|
|  4| Emily|1994-11-22|  28| 65000|   Finance|     68250.0|
|  1|  John|1992-05-12|  30| 70000|        IT|     73500.0|
|  2| Alice|1997-02-28|  25| 60000|        HR|     63000.0|
|  3|   Bob|      null|null| 80000|        IT|     84000.0|
|  4| Emily|1994-11-22|  28| 65000|   Finance|     68250.0|
|  5| David|1981-12-18|  41| 90000|        HR|     94500.0|
|  6| Susan|1989-07-05|  33| 75000|   Finance|     78750.0|
|  7|  Mike|1976-03-15|  46| 95000|        IT|     99750.0|
| 10|Sophie|1992-06-30|  30| 62000|   Finance|     65100.0|
|  2| Alice|1997-02-28|  25| 90000|   Finance|     94500.0|
|  4| Emily|1994-11-22|  28| 70000|   Fi

In [0]:
df = df.withColumn('salary_10_per',df.salary + df.salary * 0.1)

In [0]:
df.show()

+---+------+----------+----+------+----------+------------+-------------+
| id|  name|       dob| age|salary|department|salary_raise|salary_10_per|
+---+------+----------+----+------+----------+------------+-------------+
|  1|  John|1992-05-12|  30| 70000|        IT|     73500.0|      77000.0|
|  2| Alice|1997-02-28|  25| 60000|        HR|     63000.0|      66000.0|
|  3|   Bob|      null|null| 80000|        IT|     84000.0|      88000.0|
|  4| Emily|1994-11-22|  28| 65000|   Finance|     68250.0|      71500.0|
|  1|  John|1992-05-12|  30| 70000|        IT|     73500.0|      77000.0|
|  2| Alice|1997-02-28|  25| 60000|        HR|     63000.0|      66000.0|
|  3|   Bob|      null|null| 80000|        IT|     84000.0|      88000.0|
|  4| Emily|1994-11-22|  28| 65000|   Finance|     68250.0|      71500.0|
|  5| David|1981-12-18|  41| 90000|        HR|     94500.0|      99000.0|
|  6| Susan|1989-07-05|  33| 75000|   Finance|     78750.0|      82500.0|
|  7|  Mike|1976-03-15|  46| 95000|   

Load both csv and json union them properly. Instead of dropping duplicates, create a boolean column is_duplicate and set False to only one row and True to rest of them

In [0]:
from pyspark.sql.window import Window

In [0]:
from pyspark.sql.functions import row_number

In [0]:
windowspec = Window.partitionBy('id','name','dob','age','salary','department','salary_raise').orderBy('id')

In [0]:
df = df.withColumn("is_duplicate",row_number().over(windowspec))

In [0]:
df.show()

+---+------+----------+----+------+----------+------------+-------------+------------+
| id|  name|       dob| age|salary|department|salary_raise|salary_10_per|is_duplicate|
+---+------+----------+----+------+----------+------------+-------------+------------+
|  1|  John|1992-05-12|  30| 70000|        IT|     73500.0|      77000.0|           1|
|  1|  John|1992-05-12|  30| 70000|        IT|     73500.0|      77000.0|           2|
|  1|  John|1992-05-12|  30| 70000|        IT|     73500.0|      77000.0|           3|
|  2| Alice|1997-02-28|  25| 60000|        HR|     63000.0|      66000.0|           1|
|  2| Alice|1997-02-28|  25| 60000|        HR|     63000.0|      66000.0|           2|
|  2| Alice|1997-02-28|  25| 90000|   Finance|     94500.0|      99000.0|           1|
|  3|   Bob|      null|null| 80000|        IT|     84000.0|      88000.0|           1|
|  3|   Bob|      null|null| 80000|        IT|     84000.0|      88000.0|           2|
|  4| Emily|1994-11-22|  28| 65000|   Finan

In [0]:
df = df.drop("d_no")

In [0]:
df.show()

+---+------+----------+----+------+----------+------------+-------------+------------+
| id|  name|       dob| age|salary|department|salary_raise|salary_10_per|is_duplicate|
+---+------+----------+----+------+----------+------------+-------------+------------+
|  1|  John|1992-05-12|  30| 70000|        IT|     73500.0|      77000.0|           1|
|  1|  John|1992-05-12|  30| 70000|        IT|     73500.0|      77000.0|           2|
|  1|  John|1992-05-12|  30| 70000|        IT|     73500.0|      77000.0|           3|
|  2| Alice|1997-02-28|  25| 60000|        HR|     63000.0|      66000.0|           1|
|  2| Alice|1997-02-28|  25| 60000|        HR|     63000.0|      66000.0|           2|
|  2| Alice|1997-02-28|  25| 90000|   Finance|     94500.0|      99000.0|           1|
|  3|   Bob|      null|null| 80000|        IT|     84000.0|      88000.0|           1|
|  3|   Bob|      null|null| 80000|        IT|     84000.0|      88000.0|           2|
|  4| Emily|1994-11-22|  28| 65000|   Finan

In [0]:
df = df.drop("salary_10_per")

In [0]:
df.show()

+---+------+----------+----+------+----------+------------+------------+
| id|  name|       dob| age|salary|department|salary_raise|is_duplicate|
+---+------+----------+----+------+----------+------------+------------+
|  1|  John|1992-05-12|  30| 70000|        IT|     73500.0|           1|
|  1|  John|1992-05-12|  30| 70000|        IT|     73500.0|           2|
|  1|  John|1992-05-12|  30| 70000|        IT|     73500.0|           3|
|  2| Alice|1997-02-28|  25| 60000|        HR|     63000.0|           1|
|  2| Alice|1997-02-28|  25| 60000|        HR|     63000.0|           2|
|  2| Alice|1997-02-28|  25| 90000|   Finance|     94500.0|           1|
|  3|   Bob|      null|null| 80000|        IT|     84000.0|           1|
|  3|   Bob|      null|null| 80000|        IT|     84000.0|           2|
|  4| Emily|1994-11-22|  28| 65000|   Finance|     68250.0|           1|
|  4| Emily|1994-11-22|  28| 65000|   Finance|     68250.0|           2|
|  4| Emily|1994-11-22|  28| 70000|   Finance|     

In [0]:
from pyspark.sql.functions import when
from pyspark.sql import functions as f


In [0]:
df = df.withColumn('is_Duplicate',when(df["is_duplicate"] > 1,"True").otherwise("False"))

In [0]:
df.show()

+---+------+----------+----+------+----------+------------+------------+
| id|  name|       dob| age|salary|department|salary_raise|is_Duplicate|
+---+------+----------+----+------+----------+------------+------------+
|  1|  John|1992-05-12|  30| 70000|        IT|     73500.0|       False|
|  1|  John|1992-05-12|  30| 70000|        IT|     73500.0|        True|
|  1|  John|1992-05-12|  30| 70000|        IT|     73500.0|        True|
|  2| Alice|1997-02-28|  25| 60000|        HR|     63000.0|       False|
|  2| Alice|1997-02-28|  25| 60000|        HR|     63000.0|        True|
|  2| Alice|1997-02-28|  25| 90000|   Finance|     94500.0|       False|
|  3|   Bob|      null|null| 80000|        IT|     84000.0|       False|
|  3|   Bob|      null|null| 80000|        IT|     84000.0|        True|
|  4| Emily|1994-11-22|  28| 65000|   Finance|     68250.0|       False|
|  4| Emily|1994-11-22|  28| 65000|   Finance|     68250.0|        True|
|  4| Emily|1994-11-22|  28| 70000|   Finance|     

Task-2.1
Calculate mean salary and check if it is greater or equal to the salary of employees in each department.

In [0]:
df5=df.withColumn("mean_sal_dept",f.avg("salary").over(Window.partitionBy(df.department)))

In [0]:
df5.show()

+---+------+----------+----+------+----------+------------+------------+-----------------+
| id|  name|       dob| age|salary|department|salary_raise|is_Duplicate|    mean_sal_dept|
+---+------+----------+----+------+----------+------------+------------+-----------------+
|  2| Alice|1997-02-28|  25| 90000|   Finance|     94500.0|       False|71166.66666666667|
|  4| Emily|1994-11-22|  28| 65000|   Finance|     68250.0|       False|71166.66666666667|
|  4| Emily|1994-11-22|  28| 65000|   Finance|     68250.0|        True|71166.66666666667|
|  4| Emily|1994-11-22|  28| 70000|   Finance|     73500.0|       False|71166.66666666667|
|  6| Susan|1989-07-05|  33| 75000|   Finance|     78750.0|       False|71166.66666666667|
| 10|Sophie|1992-06-30|  30| 62000|   Finance|     65100.0|       False|71166.66666666667|
|  2| Alice|1997-02-28|  25| 60000|        HR|     63000.0|       False|          67000.0|
|  2| Alice|1997-02-28|  25| 60000|        HR|     63000.0|        True|          67000.0|

In [0]:
df5=df.withColumn("mean_sal_dept",f.avg("salary").over(Window.partitionBy(df.department)) <= df.salary)

In [0]:
df5.show()

+---+------+----------+----+------+----------+------------+------------+-------------+
| id|  name|       dob| age|salary|department|salary_raise|is_Duplicate|mean_sal_dept|
+---+------+----------+----+------+----------+------------+------------+-------------+
|  2| Alice|1997-02-28|  25| 90000|   Finance|     94500.0|       False|         true|
|  4| Emily|1994-11-22|  28| 65000|   Finance|     68250.0|       False|        false|
|  4| Emily|1994-11-22|  28| 65000|   Finance|     68250.0|        True|        false|
|  4| Emily|1994-11-22|  28| 70000|   Finance|     73500.0|       False|        false|
|  6| Susan|1989-07-05|  33| 75000|   Finance|     78750.0|       False|         true|
| 10|Sophie|1992-06-30|  30| 62000|   Finance|     65100.0|       False|        false|
|  2| Alice|1997-02-28|  25| 60000|        HR|     63000.0|       False|        false|
|  2| Alice|1997-02-28|  25| 60000|        HR|     63000.0|        True|        false|
|  5| David|1981-12-18|  41| 90000|        

Calculate mean salary and check if it is greater or equal to the salary of all employees

In [0]:
avg_sal = df.select(f.avg('salary')).first()[0]
print(avg_sal)

73352.94117647059


In [0]:
df7 = df.withColumn("avg_sal", df.salary >= avg_sal)

In [0]:
df7.show()

+---+------+----------+----+------+----------+------------+------------+-------+
| id|  name|       dob| age|salary|department|salary_raise|is_Duplicate|avg_sal|
+---+------+----------+----+------+----------+------------+------------+-------+
|  1|  John|1992-05-12|  30| 70000|        IT|     73500.0|       False|  false|
|  1|  John|1992-05-12|  30| 70000|        IT|     73500.0|        True|  false|
|  1|  John|1992-05-12|  30| 70000|        IT|     73500.0|        True|  false|
|  2| Alice|1997-02-28|  25| 60000|        HR|     63000.0|       False|  false|
|  2| Alice|1997-02-28|  25| 60000|        HR|     63000.0|        True|  false|
|  2| Alice|1997-02-28|  25| 90000|   Finance|     94500.0|       False|   true|
|  3|   Bob|      null|null| 80000|        IT|     84000.0|       False|   true|
|  3|   Bob|      null|null| 80000|        IT|     84000.0|        True|   true|
|  4| Emily|1994-11-22|  28| 65000|   Finance|     68250.0|       False|  false|
|  4| Emily|1994-11-22|  28|

In [0]:
df7 = df.withColumn("avg_sal", col("salary") <= avg_sal)

In [0]:
df7.show()

+---+------+----------+----+------+----------+------------+------------+-------+
| id|  name|       dob| age|salary|department|salary_raise|is_Duplicate|avg_sal|
+---+------+----------+----+------+----------+------------+------------+-------+
|  1|  John|1992-05-12|  30| 70000|        IT|     73500.0|       False|  false|
|  1|  John|1992-05-12|  30| 70000|        IT|     73500.0|        True|  false|
|  1|  John|1992-05-12|  30| 70000|        IT|     73500.0|        True|  false|
|  2| Alice|1997-02-28|  25| 60000|        HR|     63000.0|       False|  false|
|  2| Alice|1997-02-28|  25| 60000|        HR|     63000.0|        True|  false|
|  2| Alice|1997-02-28|  25| 90000|   Finance|     94500.0|       False|   true|
|  3|   Bob|      null|null| 80000|        IT|     84000.0|       False|   true|
|  3|   Bob|      null|null| 80000|        IT|     84000.0|        True|   true|
|  4| Emily|1994-11-22|  28| 65000|   Finance|     68250.0|       False|  false|
|  4| Emily|1994-11-22|  28|