# Agenda:

- Dropping Columns

- Dropping Rows

- Various Parameters of the Dropping Functions

- Handling Missing Values with the Mean, Median, and Mode

In [1]:
# Import SparkSession, the entry point used to start a PySpark session
from pyspark.sql import SparkSession

In [2]:
# Create the Spark session for this notebook (or reuse an existing one).
spark = (
    SparkSession.builder
    .appName('Practise')
    .getOrCreate()
)

In [3]:
# Load the CSV: the first row supplies the column names and Spark infers
# each column's type from the data instead of reading everything as strings.
df_pyspark = spark.read.csv(
    'test1.csv',
    header=True,
    inferSchema=True,
)

In [4]:
# Display the loaded rows, including those that contain null entries.
df_pyspark.show()

+---------+----+----------+------+
|     Name| age|Experience|Salary|
+---------+----+----------+------+
|    Krish|  31|        10| 30000|
|Sudhanshu|  30|         8| 25000|
|    Sunny|  29|         4| 20000|
|     Paul|  24|         1| 20000|
|   Harsha|  21|         2| 15000|
|  Shubham|  23|      null| 18000|
|   Mahesh|null|      null| 40000|
|     null|  34|        10| 38000|
|     null|  36|      null|  null|
+---------+----+----------+------+



## Drop the Columns

In [11]:
# drop() returns a new DataFrame without the 'Name' column; the result is
# only displayed here, so df_pyspark itself is left unchanged.
df_pyspark.drop('Name').show()

+----+----------+------+
| age|Experience|Salary|
+----+----------+------+
|  31|        10| 30000|
|  30|         8| 25000|
|  29|         4| 20000|
|  24|         1| 20000|
|  21|         2| 15000|
|  23|      null| 18000|
|null|      null| 40000|
|  34|        10| 38000|
|  36|      null|  null|
+----+----------+------+



In [6]:
# Display df_pyspark again: it still contains every original column.
df_pyspark.show()

+---------+----+----------+------+
|     Name| age|Experience|Salary|
+---------+----+----------+------+
|    Krish|  31|        10| 30000|
|Sudhanshu|  30|         8| 25000|
|    Sunny|  29|         4| 20000|
|     Paul|  24|         1| 20000|
|   Harsha|  21|         2| 15000|
|  Shubham|  23|      null| 18000|
|   Mahesh|null|      null| 40000|
|     null|  34|        10| 38000|
|     null|  36|      null|  null|
+---------+----+----------+------+



In [7]:
# Drop every row that contains at least one null value (the default behaviour).
df_pyspark.na.drop().show()

+---------+---+----------+------+
|     Name|age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         1| 20000|
|   Harsha| 21|         2| 15000|
+---------+---+----------+------+



In [10]:
# how="any": drop a row if any of its columns is null.
# The result matches the plain na.drop() call, since "any" is the default.
df_pyspark.na.drop(how = "any").show()

+---------+---+----------+------+
|     Name|age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         1| 20000|
|   Harsha| 21|         2| 15000|
+---------+---+----------+------+



In [13]:
# thresh=3: keep only rows that have at least 3 non-null values. With the
# 4 columns in this frame, that means rows with 2 or more nulls are dropped.
# `how` is intentionally omitted: when `thresh` is given it overrides `how`,
# so passing how="any" alongside it has no effect and only misleads readers.
df_pyspark.na.drop(thresh=3).show()

+---------+---+----------+------+
|     Name|age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         1| 20000|
|   Harsha| 21|         2| 15000|
|  Shubham| 23|      null| 18000|
|     null| 34|        10| 38000|
+---------+---+----------+------+



In [14]:
# subset: only look for nulls in the listed columns. Rows with a null
# 'Experience' are dropped; nulls in other columns (e.g. 'Name') are kept.
df_pyspark.na.drop(how="any", subset=['Experience']).show()

+---------+---+----------+------+
|     Name|age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         1| 20000|
|   Harsha| 21|         2| 15000|
|     null| 34|        10| 38000|
+---------+---+----------+------+



## Filling the Missing Values

In [15]:
# Replace nulls with the given string. A string fill value is applied only to
# string columns, so just 'Name' is filled; the numeric columns keep their nulls.
df_pyspark.na.fill('Missing Values').show()

+--------------+----+----------+------+
|          Name| age|Experience|Salary|
+--------------+----+----------+------+
|         Krish|  31|        10| 30000|
|     Sudhanshu|  30|         8| 25000|
|         Sunny|  29|         4| 20000|
|          Paul|  24|         1| 20000|
|        Harsha|  21|         2| 15000|
|       Shubham|  23|      null| 18000|
|        Mahesh|null|      null| 40000|
|Missing Values|  34|        10| 38000|
|Missing Values|  36|      null|  null|
+--------------+----+----------+------+



In [16]:
# NOTE(review): this call does not fill anything - the nulls in 'Experience'
# and 'age' are still present in the output. Both columns were read with
# inferSchema=True and hold numeric data, and a string fill value is ignored
# for non-string columns. Pass a numeric value (or use the Imputer) to
# actually fill these columns.
df_pyspark.na.fill('Missing Values', ['Experience', 'age']).show()

+---------+----+----------+------+
|     Name| age|Experience|Salary|
+---------+----+----------+------+
|    Krish|  31|        10| 30000|
|Sudhanshu|  30|         8| 25000|
|    Sunny|  29|         4| 20000|
|     Paul|  24|         1| 20000|
|   Harsha|  21|         2| 15000|
|  Shubham|  23|      null| 18000|
|   Mahesh|null|      null| 40000|
|     null|  34|        10| 38000|
|     null|  36|      null|  null|
+---------+----+----------+------+



In [17]:
# Filling with the mean: first display the original frame, which still has its nulls.
df_pyspark.show()

+---------+----+----------+------+
|     Name| age|Experience|Salary|
+---------+----+----------+------+
|    Krish|  31|        10| 30000|
|Sudhanshu|  30|         8| 25000|
|    Sunny|  29|         4| 20000|
|     Paul|  24|         1| 20000|
|   Harsha|  21|         2| 15000|
|  Shubham|  23|      null| 18000|
|   Mahesh|null|      null| 40000|
|     null|  34|        10| 38000|
|     null|  36|      null|  null|
+---------+----+----------+------+



In [18]:
# Use the Imputer transformer to replace missing values with the column mean
from pyspark.ml.feature import Imputer

In [19]:
# Configure mean imputation for the three numeric columns. Each input column
# gets a matching "<column>_imputed" output column, so the originals are kept.
imputer = Imputer(
    strategy="mean",
    inputCols=['age', 'Experience', 'Salary'],
    outputCols=[
        "{}_imputed".format(column_name)
        for column_name in ['age', 'Experience', 'Salary']
    ],
)

In [20]:
# Fit the imputer on the data, then append the *_imputed columns to the
# frame and display the result.
(
    imputer
    .fit(df_pyspark)
    .transform(df_pyspark)
    .show()
)

+---------+----+----------+------+-----------+------------------+--------------+
|     Name| age|Experience|Salary|age_imputed|Experience_imputed|Salary_imputed|
+---------+----+----------+------+-----------+------------------+--------------+
|    Krish|  31|        10| 30000|         31|                10|         30000|
|Sudhanshu|  30|         8| 25000|         30|                 8|         25000|
|    Sunny|  29|         4| 20000|         29|                 4|         20000|
|     Paul|  24|         1| 20000|         24|                 1|         20000|
|   Harsha|  21|         2| 15000|         21|                 2|         15000|
|  Shubham|  23|      null| 18000|         23|                 5|         18000|
|   Mahesh|null|      null| 40000|         28|                 5|         40000|
|     null|  34|        10| 38000|         34|                10|         38000|
|     null|  36|      null|  null|         36|                 5|         25750|
+---------+----+----------+-