In [1]:
#import commands
import altair as alt
import numpy as np
import pandas as pd
from sklearn import set_config
from sklearn.compose import make_column_transformer
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.model_selection import (
    GridSearchCV,
    RandomizedSearchCV,
    cross_validate,
    train_test_split,
)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.utils import resample
from sklearn.impute import SimpleImputer
from sklearn.compose import make_column_selector

# Disable Altair's maximum-row check so charts can be built from large datasets
alt.data_transformers.disable_max_rows()

# Make scikit-learn transformers return DataFrames (with column names) instead
# of NumPy arrays; later cells index the transformed data by column name
set_config(transform_output="pandas")

In [4]:
# Direct-download link to the water potability CSV hosted on Google Drive
WATER_DATA_URL = 'https://drive.google.com/uc?id=13N4nBi8cZCQUQambCexi0-XArwSghdrj'

# Load the raw data; later cells derive new frames from it rather than modifying it.
# NOTE(review): the file appears to carry a saved row index as an "Unnamed: 0"
# column — confirm it is excluded before it can be used as a model feature.
raw_water_data = pd.read_csv(WATER_DATA_URL)

Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity,Potability
0,,204.890455,20791.318981,7.300212,368.516441,564.308654,10.379783,86.990970,2.963135,0
1,3.716080,129.422921,18630.057858,6.635246,,592.885359,15.180013,56.329076,4.500656,0
2,8.099124,224.236259,19909.541732,9.275884,,418.606213,16.868637,66.420093,3.055934,0
3,8.316766,214.373394,22018.417441,8.059332,356.886136,363.266516,18.436524,100.341674,4.628771,0
4,9.092223,181.101509,17978.986339,6.546600,310.135738,398.410813,11.558279,31.997993,4.075075,0
...,...,...,...,...,...,...,...,...,...,...
3271,4.668102,193.681735,47580.991603,7.166639,359.948574,526.424171,13.894419,66.687695,4.435821,1
3272,7.808856,193.553212,17329.802160,8.061362,,392.449580,19.903225,,2.798243,1
3273,9.419510,175.762646,33155.578218,7.350233,,432.044783,11.039070,69.845400,3.298875,1
3274,5.126763,230.603758,11983.869376,6.303357,,402.883113,11.168946,77.488213,4.708658,1


Summarize the data set (exploratory data analysis)

In [24]:
# Flag every row that contains at least one missing (NaN) value ...
rows_with_nan = raw_water_data.isna().any(axis=1)

# ... and count the flagged rows
missing_sum = rows_with_nan.sum()
missing_sum

1265

In [26]:
# Total number of observations (rows) in the raw dataset
total_rows = len(raw_water_data)
total_rows

3276

In [27]:
# Share of rows with at least one missing value, first as a fraction ...
missing_fraction = missing_sum / total_rows

# ... then expressed as a percentage
percent_missing = missing_fraction * 100
percent_missing

38.614163614163616

In [33]:
# Columns whose missing values are filled in; all other columns pass through unchanged
columns_to_impute = ["ph", "Sulfate", "Trihalomethanes"]

# Human-readable names for the two target classes
potability_labels = {
    0: "Not Potable",
    1: "Potable",
}

# Impute the listed columns with SimpleImputer's default strategy (column mean).
# The transformed frame lists the imputed columns first, then the passthrough ones.
# NOTE(review): the imputer is fit on the full dataset — if a train/test split
# follows later, confirm that fitting on all rows is intended.
preprocessor_missing = make_column_transformer(
    (SimpleImputer(), columns_to_impute),
    remainder='passthrough',
    verbose_feature_names_out=False,
)

# Learn the fill values from the raw data and apply them in a single step
water_data = preprocessor_missing.fit_transform(raw_water_data)

# Swap the numeric class codes for descriptive labels
water_data["Potability"] = water_data["Potability"].replace(potability_labels)

# Number of observations in each class
water_data["Potability"].value_counts()

Not Potable    1998
Potable        1278
Name: Potability, dtype: int64

In [35]:
# Fixed seed so the random draw below is identical on every Restart & Run All
UPSAMPLE_SEED = 42

# Rows labelled "Not Potable" (the larger class)
np_water = water_data[water_data["Potability"] == "Not Potable"]

# Rows labelled "Potable" (the smaller class)
p_water = water_data[water_data["Potability"] == "Potable"]

# Upsample "Potable" by drawing rows with replacement until it has as many
# observations as "Not Potable". Without random_state the sample (and every
# downstream result) would change from run to run.
p_water_upsampled = resample(
    p_water,
    replace=True,
    n_samples=np_water.shape[0],
    random_state=UPSAMPLE_SEED,
)

# Concatenate the upsampled "Potable" rows with the "Not Potable" rows.
# Resampled rows keep their original index labels, so the index of
# upsampled_water contains duplicates.
# NOTE(review): if a train/test split is done after this step, copies of the
# same row can land on both sides — confirm the intended order of operations.
upsampled_water = pd.concat((p_water_upsampled, np_water))

# Number of observations in each class after balancing
upsampled_water['Potability'].value_counts()

Potable        1998
Not Potable    1998
Name: Potability, dtype: int64

Create a visualization of the dataset that is relevant to the exploratory data analysis for the planned analysis