# pyspark Handling Missing Values

- Dropping Columns.
- Dropping Rows.
- Various parameters in dropping functionalities.
- Handling missing values by Mean, Median and Mode.

In [1]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('Practise').getOrCreate()

In [2]:
import pandas as pd
import numpy as np

# Create a sample DataFrame
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Emma', 'Frank', 'Grace', 'Henry', 'Ivy', 'Jack'],
    'Age': [25, 30, 22, np.nan, 28, 35, 29, 40, 32, 27],
    'Salary': [50000, 60000, 45000, np.nan, 55000, 70000, 60000, 80000, np.nan, 58000],
    'City': ['New York', 'Los Angeles', 'Chicago', 'Houston', 'San Francisco', 'Boston', 'Seattle', 'Denver', 'Austin', 'Miami'],
    'Gender': ['Female', 'Male', 'Male', 'Male', 'Female', 'Male', 'Female', 'Male', 'Female', 'Male']
}

df = pd.DataFrame(data)

# Introduce some missing values
df.loc[2, 'Age'] = np.nan
df.loc[3, 'Salary'] = np.nan
df.loc[5, 'Age'] = np.nan
df.loc[8, 'Salary'] = np.nan

# Display the DataFrame
print("Original DataFrame:")
print(df)

# Function to save DataFrame to CSV file
def save_to_csv(dataframe, file_path):
    dataframe.to_csv(file_path, index=False)
    print(f"DataFrame saved to {file_path}")

# Save the DataFrame to a CSV file
csv_file_path = 'sample_data.csv'
save_to_csv(df, csv_file_path)


Original DataFrame:
      Name   Age   Salary           City  Gender
0    Alice  25.0  50000.0       New York  Female
1      Bob  30.0  60000.0    Los Angeles    Male
2  Charlie   NaN  45000.0        Chicago    Male
3    David   NaN      NaN        Houston    Male
4     Emma  28.0  55000.0  San Francisco  Female
5    Frank   NaN  70000.0         Boston    Male
6    Grace  29.0  60000.0        Seattle  Female
7    Henry  40.0  80000.0         Denver    Male
8      Ivy  32.0      NaN         Austin  Female
9     Jack  27.0  58000.0          Miami    Male
DataFrame saved to sample_data.csv


In [6]:
df_pyspark = spark.read.csv('sample_data.csv', header=True, inferSchema=True)

In [7]:
df_pyspark.show()

+-------+----+-------+-------------+------+
|   Name| Age| Salary|         City|Gender|
+-------+----+-------+-------------+------+
|  Alice|25.0|50000.0|     New York|Female|
|    Bob|30.0|60000.0|  Los Angeles|  Male|
|Charlie|NULL|45000.0|      Chicago|  Male|
|  David|NULL|   NULL|      Houston|  Male|
|   Emma|28.0|55000.0|San Francisco|Female|
|  Frank|NULL|70000.0|       Boston|  Male|
|  Grace|29.0|60000.0|      Seattle|Female|
|  Henry|40.0|80000.0|       Denver|  Male|
|    Ivy|32.0|   NULL|       Austin|Female|
|   Jack|27.0|58000.0|        Miami|  Male|
+-------+----+-------+-------------+------+



In [8]:
df_pyspark.drop('Name').show()

+----+-------+-------------+------+
| Age| Salary|         City|Gender|
+----+-------+-------------+------+
|25.0|50000.0|     New York|Female|
|30.0|60000.0|  Los Angeles|  Male|
|NULL|45000.0|      Chicago|  Male|
|NULL|   NULL|      Houston|  Male|
|28.0|55000.0|San Francisco|Female|
|NULL|70000.0|       Boston|  Male|
|29.0|60000.0|      Seattle|Female|
|40.0|80000.0|       Denver|  Male|
|32.0|   NULL|       Austin|Female|
|27.0|58000.0|        Miami|  Male|
+----+-------+-------------+------+



In [9]:
df_pyspark.show()

+-------+----+-------+-------------+------+
|   Name| Age| Salary|         City|Gender|
+-------+----+-------+-------------+------+
|  Alice|25.0|50000.0|     New York|Female|
|    Bob|30.0|60000.0|  Los Angeles|  Male|
|Charlie|NULL|45000.0|      Chicago|  Male|
|  David|NULL|   NULL|      Houston|  Male|
|   Emma|28.0|55000.0|San Francisco|Female|
|  Frank|NULL|70000.0|       Boston|  Male|
|  Grace|29.0|60000.0|      Seattle|Female|
|  Henry|40.0|80000.0|       Denver|  Male|
|    Ivy|32.0|   NULL|       Austin|Female|
|   Jack|27.0|58000.0|        Miami|  Male|
+-------+----+-------+-------------+------+



## Dropping all the Null Rows.

In [11]:
df_pyspark.na.drop().show()

+-----+----+-------+-------------+------+
| Name| Age| Salary|         City|Gender|
+-----+----+-------+-------------+------+
|Alice|25.0|50000.0|     New York|Female|
|  Bob|30.0|60000.0|  Los Angeles|  Male|
| Emma|28.0|55000.0|San Francisco|Female|
|Grace|29.0|60000.0|      Seattle|Female|
|Henry|40.0|80000.0|       Denver|  Male|
| Jack|27.0|58000.0|        Miami|  Male|
+-----+----+-------+-------------+------+



In [12]:
df_pyspark.na.drop(how='all').show()

+-------+----+-------+-------------+------+
|   Name| Age| Salary|         City|Gender|
+-------+----+-------+-------------+------+
|  Alice|25.0|50000.0|     New York|Female|
|    Bob|30.0|60000.0|  Los Angeles|  Male|
|Charlie|NULL|45000.0|      Chicago|  Male|
|  David|NULL|   NULL|      Houston|  Male|
|   Emma|28.0|55000.0|San Francisco|Female|
|  Frank|NULL|70000.0|       Boston|  Male|
|  Grace|29.0|60000.0|      Seattle|Female|
|  Henry|40.0|80000.0|       Denver|  Male|
|    Ivy|32.0|   NULL|       Austin|Female|
|   Jack|27.0|58000.0|        Miami|  Male|
+-------+----+-------+-------------+------+



In [16]:
##hreshold.

df_pyspark.na.drop(how='any', thresh=4).show()

+-------+----+-------+-------------+------+
|   Name| Age| Salary|         City|Gender|
+-------+----+-------+-------------+------+
|  Alice|25.0|50000.0|     New York|Female|
|    Bob|30.0|60000.0|  Los Angeles|  Male|
|Charlie|NULL|45000.0|      Chicago|  Male|
|   Emma|28.0|55000.0|San Francisco|Female|
|  Frank|NULL|70000.0|       Boston|  Male|
|  Grace|29.0|60000.0|      Seattle|Female|
|  Henry|40.0|80000.0|       Denver|  Male|
|    Ivy|32.0|   NULL|       Austin|Female|
|   Jack|27.0|58000.0|        Miami|  Male|
+-------+----+-------+-------------+------+



In [17]:
## subset

df_pyspark.na.drop(how='any', subset=['Age']).show()

+-----+----+-------+-------------+------+
| Name| Age| Salary|         City|Gender|
+-----+----+-------+-------------+------+
|Alice|25.0|50000.0|     New York|Female|
|  Bob|30.0|60000.0|  Los Angeles|  Male|
| Emma|28.0|55000.0|San Francisco|Female|
|Grace|29.0|60000.0|      Seattle|Female|
|Henry|40.0|80000.0|       Denver|  Male|
|  Ivy|32.0|   NULL|       Austin|Female|
| Jack|27.0|58000.0|        Miami|  Male|
+-----+----+-------+-------------+------+



In [20]:
## Filing the missing values.

df_pyspark.na.fill('Missing Value',['Salary','Name']).show()

+-------+----+-------+-------------+------+
|   Name| Age| Salary|         City|Gender|
+-------+----+-------+-------------+------+
|  Alice|25.0|50000.0|     New York|Female|
|    Bob|30.0|60000.0|  Los Angeles|  Male|
|Charlie|NULL|45000.0|      Chicago|  Male|
|  David|NULL|   NULL|      Houston|  Male|
|   Emma|28.0|55000.0|San Francisco|Female|
|  Frank|NULL|70000.0|       Boston|  Male|
|  Grace|29.0|60000.0|      Seattle|Female|
|  Henry|40.0|80000.0|       Denver|  Male|
|    Ivy|32.0|   NULL|       Austin|Female|
|   Jack|27.0|58000.0|        Miami|  Male|
+-------+----+-------+-------------+------+



In [25]:
from pyspark.ml.feature import Imputer

imputer = Imputer(
    inputCols=['Age','Salary'],
    outputCols=["{}_imputed".format(c) for c in ['Age','Salary']]
    ).setStrategy("mean")

In [26]:
imputer.fit(df_pyspark).transform(df_pyspark).show()

+-------+----+-------+-------------+------+------------------+--------------+
|   Name| Age| Salary|         City|Gender|       Age_imputed|Salary_imputed|
+-------+----+-------+-------------+------+------------------+--------------+
|  Alice|25.0|50000.0|     New York|Female|              25.0|       50000.0|
|    Bob|30.0|60000.0|  Los Angeles|  Male|              30.0|       60000.0|
|Charlie|NULL|45000.0|      Chicago|  Male|30.142857142857142|       45000.0|
|  David|NULL|   NULL|      Houston|  Male|30.142857142857142|       59750.0|
|   Emma|28.0|55000.0|San Francisco|Female|              28.0|       55000.0|
|  Frank|NULL|70000.0|       Boston|  Male|30.142857142857142|       70000.0|
|  Grace|29.0|60000.0|      Seattle|Female|              29.0|       60000.0|
|  Henry|40.0|80000.0|       Denver|  Male|              40.0|       80000.0|
|    Ivy|32.0|   NULL|       Austin|Female|              32.0|       59750.0|
|   Jack|27.0|58000.0|        Miami|  Male|              27.0|  