# Water Quality and Potability Classification
# 1. Problem statement:
  - The main aim of this project is to classify a water sample as potable (1) or not potable (0).

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        file_path = os.path.join(folder, fname)
        print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## sklearn imports 

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler

# machine learning algorithms
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB


# 2. Data Assessing

In [None]:
# Load the water-potability CSV from the Kaggle input directory into `data`
# and preview its first rows.
data = pd.read_csv('/kaggle/input/water-quality-and-potability/water_potability.csv')
data.head()

## data columns description
1. `ph`: The pH level of the water.
2. `Hardness`: Water hardness, a measure of mineral content.
3. `Solids`: Total dissolved solids in the water.
4. `Chloramines`: Chloramines concentration in the water.
5. `Sulfate`: Sulfate concentration in the water.
6. `Conductivity`: Electrical conductivity of the water.
7. `Organic_carbon`: Organic carbon content in the water.
8. `Trihalomethanes`: Trihalomethanes concentration in the water.
9. `Turbidity`: Turbidity level, a measure of water clarity.
10. `Potability`: Target variable; indicates water potability with values 1 (potable) and 0 (not potable).

In [None]:
# Preview the last 5 rows of the raw data
data.tail()

In [None]:
# Report the dataset dimensions: the full shape tuple, then rows and columns separately.
n_rows, n_cols = data.shape
print('Shape of  Dataset is: ', data.shape)
print('Total number of rows in data: ', n_rows)
print('Total number of columns in data: ', n_cols)

In [None]:
# Summary of the frame: column names, non-null counts and dtypes
data.info()

In [None]:
# Count the missing (null) values in each column of the raw data
data.isnull().sum()

In [None]:
# Count the rows that are exact duplicates of an earlier row
data.duplicated().sum()

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the numeric columns
data.describe()

## Conclusion 1: based on Data assessing
1. There are 10 columns in total in the dataset.
2. Nine columns hold float data and one column (the target column) holds int data.
3. Three columns [ph, Sulfate, Trihalomethanes] have many missing values.
4. The dataset contains no duplicated rows.


# 3. Data Cleaning
  - Now we clean the original data and store the result in a new dataframe.

In [None]:
# Work on a copy (`data1`) so the raw `data` frame stays untouched during cleaning
data1 = data.copy()


In [None]:
# Visualize the null mask of data1 as a heatmap: rows are samples, columns are
# features, and the cells flagged True by isnull() mark missing entries.
sns.heatmap(data1.isnull())

In [None]:
# Missing values per column of the working copy (`data1`), which is the frame
# this cleaning section actually modifies.
data1.isnull().sum()

In [None]:
# Columns inspected for missing values; describe them before imputation so the
# statistics can be compared with the imputed frame later.
null_cols = ['ph','Sulfate','Trihalomethanes']
data1[null_cols].describe()

In [None]:
#### Fill null values with the column mean (SimpleImputer)
from sklearn.impute import SimpleImputer

# Impute the feature columns only. Passing the whole frame through the imputer
# would also rewrite the target column and coerce its int labels to float.
feature_cols = data1.columns.drop('Potability')
imputer = SimpleImputer(strategy='mean')

# NOTE(review): the means are computed on the full dataset, before the
# train/test split, so test-set information leaks into the imputed values.
# Consider fitting the imputer on the training split only.
data1_filled = imputer.fit_transform(data1[feature_cols])
data2 = data1.copy()
data2[feature_cols] = data1_filled

# check null values in data2
data2.isnull().sum()

In [None]:
# Statistics of the same columns after imputation, taken from data2
data2[null_cols].describe()

## Conclusion 2: about data cleaning
- We replaced the null values with the column mean and stored the result in a new dataframe, `data2`.

# 4. Exploratory Data Analysis

### 1. Column types
 - All columns are numerical, with two data types (float and int).
 - 9 columns hold float data and 1 column (the target column) holds int data.
 

### 2. Univariate analysis

In [None]:

# create function for univariate analysis
def univariate(df, col):
    """Show the distribution of one column as a histogram and a box plot.

    Draws a 1x2 figure: a 20-bin histogram with a KDE curve on the left and a
    box plot of the same column on the right, then displays the figure.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the column to plot.
    col : str
        Name of the column in ``df``.

    Returns
    -------
    None
    """
    fig, ax = plt.subplots(1, 2, figsize=(8, 4))
    # histplot
    sns.histplot(x=df[col], bins=20, kde=True, ax=ax[0])
    # boxplot -- target the right-hand axes explicitly instead of relying on
    # whichever axes matplotlib currently considers "current"
    sns.boxplot(x=df[col], ax=ax[1])
    plt.show()

In [None]:
# Run the univariate plots for every column, numbering the sections from 1.
for a, col in enumerate(data2.columns, start=1):
    print(f'{a}. Univariate analysis for {col} :')
    univariate(data2, col)
    print('=='*50)

### target column 'Potability' univariate analysis

In [None]:
# Class balance of the target: number of samples for each Potability label
data2['Potability'].value_counts()


In [None]:
# Cast the target column to the pandas `category` dtype, printing the dtype
# before and after the conversion.
dtype_before = data2['Potability'].dtype
print('Before convert data type: ', dtype_before)
data2['Potability'] = data2['Potability'].astype('category')
dtype_after = data2['Potability'].dtype
print('After convert data type: ', dtype_after)

In [None]:
# Bar chart of how many samples fall into each Potability class
count_ax = sns.countplot(x=data2['Potability'])
count_ax.set_title('Countplot for Potability')
plt.show()

## Conclusion 3: about Univariate analysis
- All independent columns are approximately normally distributed.
- All independent columns contain outliers.
- In the dataset, there are fewer potable water samples than non-potable water samples.

### 3. Bivariate analysis

In [None]:
# Bivariate analysis helper: relates one independent feature (column) to the
# dependent target column (Potability).
def bivariate(df , col):
    """Plot ``col`` against 'Potability' in a 2x2 grid of seaborn charts.

    The panels are, row by row: bar plot, box plot, violin plot and point
    plot, each with 'Potability' on the x-axis and ``col`` on the y-axis.
    The figure is laid out with ``tight_layout`` and displayed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding both ``col`` and the 'Potability' column.
    col : str
        Name of the feature column to compare against the target.

    Returns
    -------
    None
    """
    average_title = f'Average {col} by Potability.'
    spread_title = f'{col} distribution by Potability'

    # (plotting function, title) for each panel, in row-major grid order
    panels = (
        (sns.barplot, average_title),
        (sns.boxplot, spread_title),
        (sns.violinplot, spread_title),
        (sns.pointplot, average_title),
    )

    fig, ax = plt.subplots(2, 2, figsize=(10, 10))
    for axis, (draw, title) in zip(ax.flat, panels):
        draw(x='Potability', y=col, data=df, ax=axis)
        axis.set_title(title)

    plt.tight_layout()
    plt.show()

In [None]:
# Run the bivariate plots for every feature column, numbering the sections from 1.
# The target is excluded by name rather than by position, so the loop keeps
# working if the column order ever changes.
for a, col in enumerate(data2.columns.drop('Potability'), start=1):
    print(f'{a}. Bivariate analysis between {col} and Potability:')
    bivariate(data2, col)
    print('=='*50)

## Conclusion 4: about bivariate analysis
- Average ph values for potable and non-potable water samples are quite similar.
- Average Hardness values for potable and non-potable water samples are quite similar.
- Average Solids values for potable and non-potable water samples are quite similar.
- Average Chloramines values for potable and non-potable water samples are quite similar.
- Similarly, the average values of ['Sulfate', 'Conductivity', 'Organic_carbon', 'Trihalomethanes', 'Turbidity'] for potable and non-potable water samples are quite similar.



In [None]:
# List the column names of the cleaned frame
data2.columns

### 4. Multivariate analysis

In [None]:
### correlation between independent and dependent variables
# 'Potability' has been cast to `category`, and whether DataFrame.corr()
# keeps or drops non-numeric columns depends on the pandas version
# (the `numeric_only` default changed). Cast everything to float first so
# the target is always part of the correlation table.
data2.astype(float).corr().round(2)

In [None]:
# heatmap for corr
# Cast to float first: 'Potability' is categorical at this point and could
# otherwise be dropped from (or rejected by) corr(), depending on the pandas
# version's `numeric_only` default.
plt.figure(figsize=(10,8))
sns.heatmap(data2.astype(float).corr(), annot=True)

In [None]:
# Pairwise relationships between the columns of data2 (seaborn pairplot grid)
sns.pairplot(data2)

In [None]:
# Re-check dtypes and non-null counts of the cleaned frame
data2.info()

In [None]:
# Box plots of every column except 'Solids' on one shared axis
# (presumably 'Solids' is left out because of its much larger scale -- TODO confirm).
without_solids = data2.drop(columns='Solids')
sns.boxplot(data=without_solids)
plt.xticks(rotation=70)
plt.show()

In [None]:
# Descriptive statistics of the cleaned frame, before outlier removal
data2.describe()

In [None]:
# function for removing outliers
def remove_outlier(data, column, factor=1.5):
    """Return the rows of ``data`` whose ``column`` value lies inside the IQR fences.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR`` (both
    inclusive), where Q1 and Q3 are the 25th and 75th percentiles of
    ``data[column]``. Rows whose value is NaN fail the range check and are
    dropped as well. The input frame is not modified.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to filter.
    column : str
        Name of the numeric column used to detect outliers.
    factor : float, default 1.5
        Multiplier applied to the IQR when building the fences.

    Returns
    -------
    pandas.DataFrame
        The subset of ``data`` within the fences, keeping the original index.

    Raises
    ------
    ValueError
        If ``factor`` is negative.
    """
    if factor < 0:
        raise ValueError('factor must be non-negative')

    # quartiles and IQR for the specified column
    values = data[column]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1

    # lower and upper bounds for outliers
    lower_bound = q1 - factor * iqr
    upper_bound = q3 + factor * iqr

    # keep only the rows inside the (inclusive) bounds
    return data[values.between(lower_bound, upper_bound)]



In [None]:
# Columns to screen for outliers: every column except the last one
# (assumes the target 'Potability' is the last column of data2).
cols= data2.columns[:-1]
cols

In [None]:
# remove outliers from data2
# Chain the filters: every pass must start from the previous pass's result.
# Filtering `data2` afresh on each iteration would overwrite the earlier
# results and leave only the last column's outliers removed.
# Note that the quartiles of each column are computed on the rows that
# survived the earlier passes.
cleaned_data = data2
for col in cols:
    cleaned_data = remove_outlier(cleaned_data, col)

print('DAta2 shape: ',data2.shape)
print('Cleaned data shape: ',cleaned_data.shape)

In [None]:
# Descriptive statistics after outlier removal
cleaned_data.describe()

In [None]:
# remove outliers from data2
# Chain the filters: every pass must start from the previous pass's result.
# Filtering `data2` afresh on each iteration would overwrite the earlier
# results and leave only the last column's outliers removed.
cleaned_data = data2
for col in cols:
    cleaned_data = remove_outlier(cleaned_data, col)

print('DAta2 shape: ',data2.shape)
print('Cleaned data shape: ',cleaned_data.shape)

# 5. Feature Scaling

### Split data as dependent and Independent features

In [None]:
# Separate the target column (y) from the feature matrix (X)
target_col = 'Potability'
y = cleaned_data[target_col]
X = cleaned_data.drop(columns=[target_col])

### Train test split

In [None]:
# Hold out 20% of the rows for testing.
# random_state makes the split (and every score derived from it) reproducible;
# stratify=y keeps the potable / non-potable ratio the same in both splits,
# which matters because the classes are imbalanced.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
print('Shape of X_train: ', X_train.shape)
print('Shape of X_test: ', X_test.shape)
print('Shape of y_train: ', y_train.shape)
print('Shape of y_test: ', y_test.shape)

## 1. Standard Scaler

In [None]:
# Standardize the features; the scaler is fitted on the training split only
# and then applied unchanged to the test split.
std = StandardScaler()
X_train_std = std.fit_transform(X_train)
X_test_std = std.transform(X_test)

# Keep the original row index so the scaled frames stay aligned with
# y_train / y_test (the scaler returns a bare array with no index).
X_train_scaled = pd.DataFrame(X_train_std, columns=X_train.columns, index=X_train.index)
X_test_scaled = pd.DataFrame(X_test_std, columns=X_train.columns, index=X_test.index)

round(X_train_scaled.describe(),2)

## 2. MinMax Scaler

In [None]:
# Min-max scale the features; the scaler is fitted on the training split only
# and then applied unchanged to the test split.
minmax = MinMaxScaler()
X_train_mm = minmax.fit_transform(X_train)
X_test_mm = minmax.transform(X_test)

# Keep the original row index so the scaled frames stay aligned with
# y_train / y_test (the scaler returns a bare array with no index).
X_train_minmax = pd.DataFrame(X_train_mm, columns=X_train.columns, index=X_train.index)
X_test_minmax = pd.DataFrame(X_test_mm, columns=X_train.columns, index=X_test.index)

round(X_train_minmax.describe(),2)

## 3. Robust Scaler

In [None]:
# Robust-scale the features; the scaler is fitted on the training split only
# and then applied unchanged to the test split.
rbst = RobustScaler()
X_train_rb = rbst.fit_transform(X_train)
X_test_rb = rbst.transform(X_test)

# Keep the original row index so the scaled frames stay aligned with
# y_train / y_test (the scaler returns a bare array with no index).
X_train_rbst = pd.DataFrame(X_train_rb, columns=X_train.columns, index=X_train.index)
X_test_rbst = pd.DataFrame(X_test_rb, columns=X_train.columns, index=X_test.index)

round(X_train_rbst.describe(),2)

# 6. Machine Learning Models

In [None]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, precision_score, f1_score, recall_score

In [None]:
# Baseline: logistic regression trained on the standard-scaled features,
# evaluated on the held-out test split.
lgr = LogisticRegression()
lgr.fit(X_train_scaled, y_train)
y_pred_lg = lgr.predict(X_test_scaled)

# headline metrics, one line each
for metric_name, metric_fn in (('Accuracy', accuracy_score), ('Precision', precision_score)):
    print(f'{metric_name} score for Logistic regression on standard scaled data: ', metric_fn(y_test, y_pred_lg))

# blank line, confusion matrix, blank line, per-class report
print(
    '',
    confusion_matrix(y_test, y_pred_lg),
    '',
    classification_report(y_test, y_pred_lg),
    sep='\n',
)

In [None]:
# Candidate classifiers, keyed by the display name used when printing results.
# The tree-based models are seeded so repeated runs give the same scores.
models = {
    'Support vector machine': SVC(),
    'K-Nearest Neighbors': KNeighborsClassifier(),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Random Forest Classifier': RandomForestClassifier(random_state=42),
    'Naive Bayes': GaussianNB(),
}


### 1. with Standard scaled data

In [None]:
# Fit every candidate model on the standard-scaled training data and print
# its accuracy, F1, precision and recall on the test split.
for name, model in models.items():
    model.fit(X_train_scaled, y_train)
    pred = model.predict(X_test_scaled)

    # check model performance
    accuracy = accuracy_score(y_test, pred)
    f1 = f1_score(y_test, pred)
    precision = precision_score(y_test, pred)
    recall = recall_score(y_test, pred)

    print(name)

    print('Model Performance:')
    print('Accuracy: ',accuracy)
    print('F1_score: ', f1)
    print('Precision: ', precision)
    print('REcall: ', recall)

    print('='*40)
    print('\n')

### 2. with Minmax scaled data

In [None]:
# Fit every candidate model on the min-max-scaled training data and print
# its accuracy, F1, precision and recall on the test split.
for name, model in models.items():
    model.fit(X_train_minmax, y_train)
    pred = model.predict(X_test_minmax)

    # check model performance
    accuracy = accuracy_score(y_test, pred)
    f1 = f1_score(y_test, pred)
    precision = precision_score(y_test, pred)
    recall = recall_score(y_test, pred)

    print(name)

    print('Model Performance:')
    print('Accuracy: ',accuracy)
    print('F1_score: ', f1)
    print('Precision: ', precision)
    print('REcall: ', recall)

    print('='*40)
    print('\n')

### 3. with Robust scaled data

In [None]:
# Fit every candidate model on the robust-scaled training data and print
# its accuracy, F1, precision and recall on the test split.
for name, model in models.items():
    model.fit(X_train_rbst, y_train)
    pred = model.predict(X_test_rbst)

    # check model performance
    accuracy = accuracy_score(y_test, pred)
    f1 = f1_score(y_test, pred)
    precision = precision_score(y_test, pred)
    recall = recall_score(y_test, pred)

    print(name)

    print('Model Performance:')
    print('Accuracy: ',accuracy)
    print('F1_score: ', f1)
    print('Precision: ', precision)
    print('REcall: ', recall)

    print('='*40)
    print('\n')

## Next, we have to retrain our models to improve their performance, then select the model with the best accuracy and store it in a .pkl file.