# Data Preprocessing for Water Potability Dataset

This notebook preprocesses the water potability dataset: it loads the raw data, checks for and imputes missing values, and scales the features for modeling.

In [3]:
# Import libraries
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

In [4]:
# Read the raw water potability CSV into a DataFrame and preview its first rows
file_path = "../data/water_potability.csv"
df = pd.read_csv(filepath_or_buffer=file_path)
df.head(n=5)

Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity,Potability
0,,204.890455,20791.318981,7.300212,368.516441,564.308654,10.379783,86.99097,2.963135,0
1,3.71608,129.422921,18630.057858,6.635246,,592.885359,15.180013,56.329076,4.500656,0
2,8.099124,224.236259,19909.541732,9.275884,,418.606213,16.868637,66.420093,3.055934,0
3,8.316766,214.373394,22018.417441,8.059332,356.886136,363.266516,18.436524,100.341674,4.628771,0
4,9.092223,181.101509,17978.986339,6.5466,310.135738,398.410813,11.558279,31.997993,4.075075,0


In [5]:
# Count the NaN entries in every column (one total per column)
missing = df.isna().sum(axis=0)
missing

ph                 491
Hardness             0
Solids               0
Chloramines          0
Sulfate            781
Conductivity         0
Organic_carbon       0
Trihalomethanes    162
Turbidity            0
Potability           0
dtype: int64

In [6]:
# Impute missing feature values with each column's median.
# The medians are computed once, over the numeric feature columns only, so the
# Potability target is never imputed and a non-numeric column cannot raise.
# fillna() with a Series fills each column by matching name and returns a new
# frame, which keeps this cell idempotent on re-run.
feature_medians = df.drop(columns="Potability").median(numeric_only=True)
df = df.fillna(feature_medians)

# Verify no missing values remain
df.isnull().sum()

ph                 0
Hardness           0
Solids             0
Chloramines        0
Sulfate            0
Conductivity       0
Organic_carbon     0
Trihalomethanes    0
Turbidity          0
Potability         0
dtype: int64

In [7]:
# Feature scaling (optional, for modeling)
# Standardize every feature column to zero mean and unit variance; the
# Potability target is left out of the scaler and re-attached unchanged below.
# NOTE(review): the scaler is fit on all rows. If a train/test split follows,
# fit it on the training rows only to avoid leaking test statistics.
scaler = StandardScaler()
features = df.drop('Potability', axis=1)
features_scaled = scaler.fit_transform(features)

# Combine scaled features with target.
# fit_transform returns a bare ndarray, so the column labels and the original
# row index are restored explicitly to keep df_scaled aligned with df.
df_scaled = pd.DataFrame(
    features_scaled,
    columns=features.columns,
    index=features.index,
)
# Positional assignment: the rows are in the same order as df.
df_scaled['Potability'] = df['Potability'].to_numpy()
df_scaled.head()

Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity,Potability
0,-0.025474,0.259195,-0.139471,0.112415,0.965957,1.708954,-1.180651,1.305434,-1.286298,0
1,-2.284717,-2.036414,-0.385987,-0.307694,-0.014799,2.062575,0.270597,-0.639186,0.684218,0
2,0.697319,0.847665,-0.240047,1.360594,-0.014799,-0.094032,0.781117,0.0008,-1.167365,0
3,0.845393,0.547651,0.000493,0.592008,0.64413,-0.77883,1.255134,2.152154,0.848412,0
4,1.372982,-0.464429,-0.460249,-0.363698,-0.649522,-0.343939,-0.824357,-2.182297,0.138786,0
