<a href="https://colab.research.google.com/github/aerionator/Water-Quality-Potability/blob/main/Fix_Water_Quality_and_Potability.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# <center>Water Quality and Potability Case Study</center>

### This dataset is valuable for water quality assessment, water treatment planning, and ensuring the safety of drinking water supplies. It can be utilized by water treatment plants, environmental agencies, and researchers to make data-driven decisions regarding water quality and potability.


# Required libraries

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from google.colab import drive
from keras.callbacks import EarlyStopping
from keras.layers import Dense
from keras.metrics import BinaryAccuracy
from keras.models import Sequential
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier


# Loading Data

In [None]:
# Mount Google Drive *before* reading from it: on a fresh kernel the path under
# /content/drive does not exist until the mount has completed, so the original
# order (read first, mount afterwards) fails under Restart & Run All.
drive.mount('/content/drive')

# Raw water-potability table; the later cells derive their imputed, balanced
# and scaled copies from this frame.
df_water_quality = pd.read_csv('/content/drive/MyDrive/archive/water_potability.csv')

# Data Wrangling

In [None]:
# Preview the first five rows of the raw table.
df_water_quality.head(n = 5)

In [None]:
# Summary statistics of the raw table, used to eyeball value ranges and scale differences.
df_water_quality.describe()

In [None]:
# Number of missing values in each column of the raw table.
df_water_quality.isnull().sum()

In [None]:
# (rows, columns) of the raw table.
df_water_quality.shape

In [None]:
# Replace every NaN with the mean of its column.
# NOTE(review): the imputer is fitted on the whole table, i.e. before the
# train/test split performed further down, so held-out rows contribute to the
# fill values. The target column is passed through the imputer as well.
imputer = SimpleImputer(strategy = 'mean', missing_values = np.nan)
imputer.fit(df_water_quality)
imputed_values = imputer.transform(df_water_quality)

In [None]:
# Wrap the imputer output in a DataFrame that carries the raw table's column names.
df_water_quality_imputed = pd.DataFrame(imputed_values, columns = df_water_quality.columns.tolist())

In [None]:
# Preview the first five rows after imputation.
df_water_quality_imputed.head(n = 5)

In [None]:
# Per-column missing-value count after imputation (a check that no NaN is left).
df_water_quality_imputed.isnull().sum()

# Exploratory Data Analysis

In [None]:
# Bar chart of the number of samples in each Potability class.
df_water_quality_imputed['Potability'].value_counts().plot.bar()

## Class Balancing

In [None]:
# Balance the two Potability classes by down-sampling each one to the size of
# the smaller class.
# `min_ratio` keeps its original name for compatibility; it is a row count
# (size of the smallest class), not a ratio.
min_ratio = df_water_quality_imputed['Potability'].value_counts().min()

# replace = False: down-sampling with replacement would duplicate rows, and the
# same row could then end up on both sides of the later train/test split.
# Sampling both classes (instead of assuming class 1 is the minority) yields a
# balanced frame whichever class is smaller; a class that already has
# `min_ratio` rows keeps all of them.
# random_state pins the draw so a re-run reproduces the same balanced frame.
df_water_quality_imputed_0 = df_water_quality_imputed[df_water_quality_imputed['Potability'] == 0].sample(
    n = min_ratio, replace = False, random_state = 42
)
df_water_quality_imputed_1 = df_water_quality_imputed[df_water_quality_imputed['Potability'] == 1].sample(
    n = min_ratio, replace = False, random_state = 42
)
df_water_quality_imputed_balanced = pd.concat([df_water_quality_imputed_0, df_water_quality_imputed_1])

In [None]:
# Bar chart of the per-class sample counts after balancing.
df_water_quality_imputed_balanced['Potability'].value_counts().plot.bar()

In [None]:
# Annotated heatmap of the pairwise column correlations in the balanced frame.
plt.figure(figsize = (10, 5))
feature_corr = df_water_quality_imputed_balanced.corr()
sns.heatmap(feature_corr, annot = True)

In [None]:
# Hex-binned joint distribution of ph against Hardness.
sns.jointplot(
    data = df_water_quality_imputed_balanced,
    x = 'ph',
    y = 'Hardness',
    kind = 'hex',
)

In [None]:
# Hex-binned joint distribution of ph against Sulfate.
sns.jointplot(
    data = df_water_quality_imputed_balanced,
    x = 'ph',
    y = 'Sulfate',
    kind = 'hex',
)

In [None]:
# Organic_carbon against ph, with points coloured by Potability class.
sns.scatterplot(
    data = df_water_quality_imputed_balanced,
    x = 'Organic_carbon',
    y = 'ph',
    hue = 'Potability',
)

# Data Preprocessing

In [None]:
# Min-max scale every column of the balanced frame and rebuild a DataFrame
# using the raw table's column names.
# NOTE(review): the scaler is fitted on all rows (before the train/test split
# below) and on all columns, the Potability target included.
scaler = MinMaxScaler()
df_scaled_vales = scaler.fit(df_water_quality_imputed_balanced).transform(df_water_quality_imputed_balanced)
df_water_quality_scaled = pd.DataFrame(df_scaled_vales, columns = df_water_quality.columns.tolist())

In [None]:
# Preview the first five rows of the scaled frame.
df_water_quality_scaled.head(n = 5)

In [None]:
# Feature matrix: every column except the target. Target vector: Potability.
X = df_water_quality_scaled.drop(columns = ['Potability'])
y = df_water_quality_scaled.loc[:, 'Potability']

In [None]:
# Hold out 20% of the rows for testing.
# random_state makes the split reproducible across kernel restarts;
# stratify = y keeps the class proportions the same in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size = 0.2, random_state = 42, stratify = y
)

# Modeling

In [None]:
# Fully connected binary classifier: four ReLU hidden layers of decreasing
# width followed by a single sigmoid output unit.
Ann_model = Sequential(
    [Dense(units, activation = 'relu') for units in (256, 128, 64, 16)]
    + [Dense(1, activation = 'sigmoid')]
)

In [None]:
# Configure training: Adam optimizer with binary cross-entropy loss.
# The metric is given as an explicit BinaryAccuracy object instead of the
# string 'Accuracy': depending on the Keras version, that capitalised string
# can resolve to the exact-match `Accuracy` metric, which compares the raw
# sigmoid outputs to the 0/1 labels for equality and is therefore meaningless
# here. Naming the metric 'Accuracy' keeps the history keys
# ('Accuracy' / 'val_Accuracy') that the plotting cells read.
Ann_model.compile(
    optimizer = 'Adam',
    loss = 'binary_crossentropy',
    metrics = [BinaryAccuracy(name = 'Accuracy')]
)

In [None]:
# Stop training once val_loss has gone 7 epochs without improving.
early_stopping = EarlyStopping(
    patience = 7,
    monitor = 'val_loss',
)

In [None]:
# Train for at most 50 epochs with early stopping on the validation loss.
# The validation data is carved out of the training set (last 20% of the rows)
# rather than being the test set: using (X_test, y_test) to decide when to stop
# and then reporting the score on the same rows makes that score optimistic.
# The frames are passed as NumPy arrays so `validation_split` can slice them.
Ann_history = Ann_model.fit(
    X_train.to_numpy(),
    y_train.to_numpy(),
    validation_split = 0.2,
    epochs = 50,
    callbacks = [early_stopping],
)

In [None]:
# Training vs. validation loss per epoch. Labels, axis titles and a legend are
# added so the two curves can be told apart when the figure is read on its own.
plt.plot(Ann_history.history['loss'], label = 'Training loss')
plt.plot(Ann_history.history['val_loss'], label = 'Validation loss')
plt.xlabel('Epoch')
plt.ylabel('Binary cross-entropy loss')
plt.title('ANN loss per epoch')
plt.legend()
plt.show()

In [None]:
# Training vs. validation accuracy per epoch. Labels, axis titles and a legend
# are added so the two curves can be told apart when the figure is read on its own.
plt.plot(Ann_history.history['Accuracy'], label = 'Training accuracy')
plt.plot(Ann_history.history['val_Accuracy'], label = 'Validation accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.title('ANN accuracy per epoch')
plt.legend()
plt.show()

In [None]:
# Score the trained network on the held-out test split.
Ann_model.evaluate(x = X_test, y = y_test)