# Initializing PySpark

In [50]:
!apt-get update
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://archive.apache.org/dist/spark/spark-3.1.1/spark-3.1.1-bin-hadoop3.2.tgz # Download Apache Sparks.
!tar xf spark-3.1.1-bin-hadoop3.2.tgz # Unzip the tgz file.
!pip install -q findspark # Install findspark. Adds PySpark to the System path during runtime.
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.1.1-bin-hadoop3.2"

!ls
# Initialize findspark
import findspark
findspark.init()

0% [Working]            Hit:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease
0% [Connecting to archive.ubuntu.com (91.189.91.82)] [Connecting to security.ubuntu.com (91.189.91.8                                                                                                    Hit:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
                                                                                                    0% [Waiting for headers] [Waiting for headers] [Connecting to ppa.launchpadcontent.net]                                                                                       Hit:3 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:4 http://security.ubuntu.com/ubuntu jammy-security InRelease [110 kB]
Get:5 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [119 kB]
Hit:6 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
Get:7 http://security.ubuntu.com/ubuntu jammy-security/rest

In [51]:
from pyspark.sql import SparkSession

# Build (or reuse) a session with the "local[*]" master, i.e. Spark running
# inside this machine rather than on a cluster.
session_builder = SparkSession.builder.master("local[*]")
spark = session_builder.getOrCreate()
# Bare expression so the notebook renders the session summary.
spark

# Mounting Google Drive

In [53]:
from google.colab import drive

# Mount Google Drive; the data paths used below live under this mount point.
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Loading the CSV File

In [71]:
# Location of the batch CSV file on the mounted Drive.
csv_path = '/content/drive/MyDrive/data/csv/batch.csv'

In [72]:
# First look at the CSV without an explicit schema: the first line supplies the
# column names and every column comes back as a string.
df = spark.read.option("header", True).csv(csv_path)
df.printSchema()
df.show(10)

root
 |-- id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- dob: string (nullable = true)
 |-- age: string (nullable = true)
 |-- salary: string (nullable = true)
 |-- department: string (nullable = true)

+---+-----+----------+----+------+----------+
| id| name|       dob| age|salary|department|
+---+-----+----------+----+------+----------+
|  1| John|1992-05-12|  30| 70000|        IT|
|  2|Alice|1997-02-28|  25| 60000|        HR|
|  3|  Bob|      null|null| 80000|        IT|
|  4|Emily|1994-11-22|  28| 65000|   Finance|
+---+-----+----------+----+------+----------+



# Designing the Final Schema

In [73]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DateType

In [74]:
# Typed schema applied to the CSV and JSON loads in place of all-string columns.
# Every field keeps the default nullable=True.
schema = (
    StructType()
    .add("id", IntegerType())
    .add("name", StringType())
    .add("dob", DateType())
    .add("age", IntegerType())
    .add("salary", IntegerType())
    .add("department", StringType())
)

In [75]:
# Re-read the CSV with the explicit schema so columns are typed
# (integer / date / string) rather than all strings.
csv_reader = spark.read.schema(schema).option("header", True)
df_csv = csv_reader.csv(csv_path)
df_csv.printSchema()
df_csv.show()

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- dob: date (nullable = true)
 |-- age: integer (nullable = true)
 |-- salary: integer (nullable = true)
 |-- department: string (nullable = true)

+---+-----+----------+----+------+----------+
| id| name|       dob| age|salary|department|
+---+-----+----------+----+------+----------+
|  1| John|1992-05-12|  30| 70000|        IT|
|  2|Alice|1997-02-28|  25| 60000|        HR|
|  3|  Bob|      null|null| 80000|        IT|
|  4|Emily|1994-11-22|  28| 65000|   Finance|
+---+-----+----------+----+------+----------+



# Loading JSON Data

In [76]:
# JSON input location on the mounted Drive (presumably a directory of JSON
# files, since the path has no file name — confirm against the Drive contents).
json_path = r'/content/drive/MyDrive/data/json'
# Read with the same explicit schema as the CSV so both frames have identical
# columns and types. "header" is a CSV-only option and has no meaning for the
# JSON source, so it is not set here.
df_json = spark.read.format("json").schema(schema).load(json_path)
df_json.printSchema()
df_json.show()

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- dob: date (nullable = true)
 |-- age: integer (nullable = true)
 |-- salary: integer (nullable = true)
 |-- department: string (nullable = true)

+---+------+----------+----+------+----------+
| id|  name|       dob| age|salary|department|
+---+------+----------+----+------+----------+
|  1|  John|1992-05-12|  30| 70000|        IT|
|  2| Alice|1997-02-28|  25| 60000|        HR|
|  3|   Bob|      null|null| 80000|        IT|
|  4| Emily|1994-11-22|  28| 65000|   Finance|
|  5| David|1981-12-18|  41| 90000|        HR|
|  6| Susan|1989-07-05|  33| 75000|   Finance|
|  7|  Mike|1976-03-15|  46| 95000|        IT|
| 10|Sophie|1992-06-30|  30| 62000|   Finance|
|  4| Emily|1994-11-22|  28| 70000|   Finance|
|  2| Alice|1997-02-28|  25| 90000|   Finance|
|  9| James|1983-10-14|  39| 87000|        IT|
|  1|  John|1992-05-12|  30| 70000|        IT|
|  8|  Lisa|1995-08-20|  27| 58000|        HR|
+---+------+---------

# Union of CSV and JSON Data

In [77]:
# Stack the CSV rows and the JSON rows into a single frame.
# unionByName matches columns by name; union() matches them by position and
# would silently misalign values if the two sources' column order ever differed.
# Like union(), it keeps duplicate rows.
df = df_csv.unionByName(df_json)
df.printSchema()
df.show()

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- dob: date (nullable = true)
 |-- age: integer (nullable = true)
 |-- salary: integer (nullable = true)
 |-- department: string (nullable = true)

+---+------+----------+----+------+----------+
| id|  name|       dob| age|salary|department|
+---+------+----------+----+------+----------+
|  1|  John|1992-05-12|  30| 70000|        IT|
|  2| Alice|1997-02-28|  25| 60000|        HR|
|  3|   Bob|      null|null| 80000|        IT|
|  4| Emily|1994-11-22|  28| 65000|   Finance|
|  1|  John|1992-05-12|  30| 70000|        IT|
|  2| Alice|1997-02-28|  25| 60000|        HR|
|  3|   Bob|      null|null| 80000|        IT|
|  4| Emily|1994-11-22|  28| 65000|   Finance|
|  5| David|1981-12-18|  41| 90000|        HR|
|  6| Susan|1989-07-05|  33| 75000|   Finance|
|  7|  Mike|1976-03-15|  46| 95000|        IT|
| 10|Sophie|1992-06-30|  30| 62000|   Finance|
|  4| Emily|1994-11-22|  28| 70000|   Finance|
|  2| Alice|1997-02-2

In [78]:
# Total number of rows in the combined frame (duplicates included).
df.count()

17

In [79]:
# Rebuild the combined frame from the two typed sources, so the cells that
# follow start from a frame holding only the six schema columns.
# unionByName aligns columns by name rather than by position, which keeps the
# result correct even if the sources' column order diverges; duplicates are kept.
df = df_csv.unionByName(df_json)
df.printSchema()
df.show()

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- dob: date (nullable = true)
 |-- age: integer (nullable = true)
 |-- salary: integer (nullable = true)
 |-- department: string (nullable = true)

+---+------+----------+----+------+----------+
| id|  name|       dob| age|salary|department|
+---+------+----------+----+------+----------+
|  1|  John|1992-05-12|  30| 70000|        IT|
|  2| Alice|1997-02-28|  25| 60000|        HR|
|  3|   Bob|      null|null| 80000|        IT|
|  4| Emily|1994-11-22|  28| 65000|   Finance|
|  1|  John|1992-05-12|  30| 70000|        IT|
|  2| Alice|1997-02-28|  25| 60000|        HR|
|  3|   Bob|      null|null| 80000|        IT|
|  4| Emily|1994-11-22|  28| 65000|   Finance|
|  5| David|1981-12-18|  41| 90000|        HR|
|  6| Susan|1989-07-05|  33| 75000|   Finance|
|  7|  Mike|1976-03-15|  46| 95000|        IT|
| 10|Sophie|1992-06-30|  30| 62000|   Finance|
|  4| Emily|1994-11-22|  28| 70000|   Finance|
|  2| Alice|1997-02-2

# Task 1

# Finding Duplicate Rows

In [80]:

from pyspark.sql.window import Window
from pyspark.sql.functions import col, count, avg


In [81]:

# Flag rows that occur more than once. Partitioning on every column puts fully
# identical rows in the same window, so the per-window row count tells how many
# copies of a row exist.
windowSpec = Window.partitionBy(*df.columns)
df = (
    df.withColumn("count", count("*").over(windowSpec))
    .withColumn("isDuplicate", col("count") > 1)
    .drop("count")  # helper column is only needed to derive the flag
)
df.show()

+---+------+----------+----+------+----------+-----------+
| id|  name|       dob| age|salary|department|isDuplicate|
+---+------+----------+----+------+----------+-----------+
|  7|  Mike|1976-03-15|  46| 95000|        IT|      false|
| 10|Sophie|1992-06-30|  30| 62000|   Finance|      false|
|  2| Alice|1997-02-28|  25| 60000|        HR|       true|
|  2| Alice|1997-02-28|  25| 60000|        HR|       true|
|  3|   Bob|      null|null| 80000|        IT|       true|
|  3|   Bob|      null|null| 80000|        IT|       true|
|  5| David|1981-12-18|  41| 90000|        HR|      false|
|  8|  Lisa|1995-08-20|  27| 58000|        HR|      false|
|  2| Alice|1997-02-28|  25| 90000|   Finance|      false|
|  9| James|1983-10-14|  39| 87000|        IT|      false|
|  1|  John|1992-05-12|  30| 70000|        IT|       true|
|  1|  John|1992-05-12|  30| 70000|        IT|       true|
|  1|  John|1992-05-12|  30| 70000|        IT|       true|
|  4| Emily|1994-11-22|  28| 65000|   Finance|       tru

In [82]:
# Sort by id so rows sharing an id are listed together.
df = df.sort("id")
df.show()

+---+------+----------+----+------+----------+-----------+
| id|  name|       dob| age|salary|department|isDuplicate|
+---+------+----------+----+------+----------+-----------+
|  1|  John|1992-05-12|  30| 70000|        IT|       true|
|  1|  John|1992-05-12|  30| 70000|        IT|       true|
|  1|  John|1992-05-12|  30| 70000|        IT|       true|
|  2| Alice|1997-02-28|  25| 90000|   Finance|      false|
|  2| Alice|1997-02-28|  25| 60000|        HR|       true|
|  2| Alice|1997-02-28|  25| 60000|        HR|       true|
|  3|   Bob|      null|null| 80000|        IT|       true|
|  3|   Bob|      null|null| 80000|        IT|       true|
|  4| Emily|1994-11-22|  28| 65000|   Finance|       true|
|  4| Emily|1994-11-22|  28| 65000|   Finance|       true|
|  4| Emily|1994-11-22|  28| 70000|   Finance|      false|
|  5| David|1981-12-18|  41| 90000|        HR|      false|
|  6| Susan|1989-07-05|  33| 75000|   Finance|      false|
|  7|  Mike|1976-03-15|  46| 95000|        IT|      fals

# Task 2

## Checking whether each salary is greater than the mean salary of its department

In [83]:
# Keep a single copy of each fully identical row (all columns compared,
# including the isDuplicate flag).
df = df.distinct()
df.show()

+---+------+----------+----+------+----------+-----------+
| id|  name|       dob| age|salary|department|isDuplicate|
+---+------+----------+----+------+----------+-----------+
|  7|  Mike|1976-03-15|  46| 95000|        IT|      false|
| 10|Sophie|1992-06-30|  30| 62000|   Finance|      false|
|  2| Alice|1997-02-28|  25| 60000|        HR|       true|
|  3|   Bob|      null|null| 80000|        IT|       true|
|  5| David|1981-12-18|  41| 90000|        HR|      false|
|  8|  Lisa|1995-08-20|  27| 58000|        HR|      false|
|  2| Alice|1997-02-28|  25| 90000|   Finance|      false|
|  9| James|1983-10-14|  39| 87000|        IT|      false|
|  1|  John|1992-05-12|  30| 70000|        IT|       true|
|  4| Emily|1994-11-22|  28| 65000|   Finance|       true|
|  4| Emily|1994-11-22|  28| 70000|   Finance|      false|
|  6| Susan|1989-07-05|  33| 75000|   Finance|      false|
+---+------+----------+----+------+----------+-----------+



In [84]:
# Attach each department's average salary to every row of that department.
# A window is used instead of groupBy so the individual rows are preserved.
windowSpec = Window.partitionBy("department")
department_mean = avg(col("salary")).over(windowSpec)
df = df.withColumn("Mean salary", department_mean)
df.show()

+---+------+----------+----+------+----------+-----------+-----------------+
| id|  name|       dob| age|salary|department|isDuplicate|      Mean salary|
+---+------+----------+----+------+----------+-----------+-----------------+
|  2| Alice|1997-02-28|  25| 60000|        HR|       true|69333.33333333333|
|  5| David|1981-12-18|  41| 90000|        HR|      false|69333.33333333333|
|  8|  Lisa|1995-08-20|  27| 58000|        HR|      false|69333.33333333333|
| 10|Sophie|1992-06-30|  30| 62000|   Finance|      false|          72400.0|
|  2| Alice|1997-02-28|  25| 90000|   Finance|      false|          72400.0|
|  4| Emily|1994-11-22|  28| 65000|   Finance|       true|          72400.0|
|  4| Emily|1994-11-22|  28| 70000|   Finance|      false|          72400.0|
|  6| Susan|1989-07-05|  33| 75000|   Finance|      false|          72400.0|
|  7|  Mike|1976-03-15|  46| 95000|        IT|      false|          83000.0|
|  3|   Bob|      null|null| 80000|        IT|       true|          83000.0|

In [85]:
# True where the row's salary is strictly greater than its department mean.
is_above_mean = col("salary") > col("Mean salary")
df = df.withColumn("Is Salary greater than mean Salary", is_above_mean)
# Sort by id for display.
df = df.orderBy("id")
df.show()

+---+------+----------+----+------+----------+-----------+-----------------+----------------------------------+
| id|  name|       dob| age|salary|department|isDuplicate|      Mean salary|Is Salary greater than mean Salary|
+---+------+----------+----+------+----------+-----------+-----------------+----------------------------------+
|  1|  John|1992-05-12|  30| 70000|        IT|       true|          83000.0|                             false|
|  2| Alice|1997-02-28|  25| 90000|   Finance|      false|          72400.0|                              true|
|  2| Alice|1997-02-28|  25| 60000|        HR|       true|69333.33333333333|                             false|
|  3|   Bob|      null|null| 80000|        IT|       true|          83000.0|                             false|
|  4| Emily|1994-11-22|  28| 70000|   Finance|      false|          72400.0|                             false|
|  4| Emily|1994-11-22|  28| 65000|   Finance|       true|          72400.0|                            