In [114]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm_notebook

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found,
# one path per line.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = [os.path.join(dirname, name) for name in filenames]
    if full_paths:
        print('\n'.join(full_paths))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [115]:
# Read the water-potability CSV into `df` and preview its first rows.
DATA_PATH = '../input/water-potability/water_potability.csv'
df = pd.read_csv(DATA_PATH)
df.head()

In [116]:
# (rows, columns) of the loaded frame
df.shape

In [117]:
# Per-column dtype and non-null count summary
df.info()

In [118]:
# Summary statistics for the columns of df
df.describe()

In [119]:
# removing duplicated values
# Mutates df in place (inplace=True), so the call itself displays nothing.
df.drop_duplicates(inplace=True)

In [120]:
# check for null values
# Count the missing entries per column. A bare df.isna() only echoes a
# row-by-row boolean frame that is truncated on display, so it cannot show
# whether (or where) values are missing.
df.isna().sum()

In [121]:
# replace null values with the mean
# Fill every column's missing entries with that column's own mean in a single
# call. The previous per-column `df[col].fillna(..., inplace=True)` is chained
# in-place assignment: it acts on an intermediate Series, triggers a
# FutureWarning in pandas 2.x and leaves df untouched under Copy-on-Write.
df = df.fillna(df.mean(numeric_only=True))

# Confirm the result: every per-column missing count should now be 0.
df.isna().sum()

In [122]:
# Split the rows by label value: Potability == 1 goes to `drinkable`,
# Potability == 0 goes to `undrinkable`; then preview the drinkable subset.
potability = df['Potability']
drinkable = df.loc[potability == 1]
undrinkable = df.loc[potability == 0]
drinkable.head()

In [123]:
# Summary statistics for the Potability == 1 subset
drinkable.describe()

In [124]:
# Summary statistics for the Potability == 0 subset
undrinkable.describe()

In [133]:
# (rows, columns) of each class subset, displayed as a pair
drinkable.shape, undrinkable.shape

In [149]:
# detecting outliers
def detect_outliers(data, columns):
    """Return the index labels of rows lying outside the 1.5 * IQR fences.

    For each column in ``columns`` the first and third quartiles (Q1, Q3) are
    computed, and a row is flagged when its value is strictly below
    ``Q1 - 1.5 * IQR`` or strictly above ``Q3 + 1.5 * IQR``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the columns to screen.
    columns : iterable
        Labels of the columns of ``data`` to check.

    Returns
    -------
    list
        Index labels of the flagged rows, collected column by column
        (low-side outliers first, then high-side). A row that is out of range
        in several columns appears once per such column.
    """
    outliers_indices = []
    for col in columns:
        values = data[col]

        # Series.quantile ignores NaN, so a column with missing entries still
        # gets usable quartiles (np.percentile would return NaN and silently
        # flag nothing). Both use linear interpolation by default.
        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        outlier_step = 1.5 * (q3 - q1)

        # Strict comparisons: a value sitting exactly on a fence is kept, so a
        # column with zero IQR (e.g. constant) no longer flags every row.
        lower = values[values < q1 - outlier_step].index.tolist()
        upper = values[values > q3 + outlier_step].index.tolist()

        outliers_indices.extend(lower)
        outliers_indices.extend(upper)

    return outliers_indices

In [150]:
# Columns to screen for outliers: every column of df except the label.
# Defined in this cell so it runs on a fresh kernel instead of relying on a
# `columns` list that is only created further down the notebook.
columns = [name for name in df.columns if name != 'Potability']

# Index labels of the rows flagged as outliers, computed separately per class.
drinkable_outliers = detect_outliers(drinkable, columns)
undrinkable_outliers = detect_outliers(undrinkable, columns)


In [154]:
# Remove the flagged rows and keep the result. DataFrame.drop returns a new
# frame, so calling it without rebinding left every outlier in `drinkable`.
# Filtering on index membership (rather than .drop) also makes the cell safe
# to re-run: labels that are already gone are simply not matched.
drinkable = drinkable.loc[~drinkable.index.isin(drinkable_outliers)]
drinkable.shape

In [155]:
# Remove the flagged rows and keep the result. DataFrame.drop returns a new
# frame, so calling it without rebinding left every outlier in `undrinkable`.
# Filtering on index membership (rather than .drop) also makes the cell safe
# to re-run: labels that are already gone are simply not matched.
undrinkable = undrinkable.loc[~undrinkable.index.isin(undrinkable_outliers)]
undrinkable.shape

In [157]:
# Settings for the repeated-sampling estimate of each column's mean.
N_RESAMPLES = 1000   # number of random samples drawn from each class
SAMPLE_SIZE = 1000   # rows per sample (capped at the class size below)
SEED = 42            # fixed seed so the printed means are reproducible

# A single RandomState shared by all draws: each call to .sample advances it,
# so the samples differ from one another but the whole run is repeatable.
rng = np.random.RandomState(SEED)

# column name -> list of per-sample means. Keys are taken from df.columns
# rather than typed by hand, so they cannot drift from the data.
drinkable_means = {col: [] for col in df.columns}
undrinkable_means = {col: [] for col in df.columns}

# DataFrame.sample draws without replacement and raises when asked for more
# rows than exist, so never request more than the class actually holds.
drinkable_size = min(SAMPLE_SIZE, len(drinkable))
undrinkable_size = min(SAMPLE_SIZE, len(undrinkable))

for i in range(N_RESAMPLES):
    drinkable_sample = drinkable.sample(drinkable_size, random_state=rng)
    undrinkable_sample = undrinkable.sample(undrinkable_size, random_state=rng)
    for col in df.columns:
        drinkable_means[col].append(drinkable_sample[col].mean())
        undrinkable_means[col].append(undrinkable_sample[col].mean())

# Report, per column, the average of the per-sample means.
print("Drinkable water means: ")
for col, means in drinkable_means.items():
    print("{}: {}".format(col, np.mean(means)))

print("\n Undrinkable water means: ")
for col, means in undrinkable_means.items():
    print("{}: {}".format(col, np.mean(means)))


In [158]:
# One histogram per column for the drinkable subset. The figure title tells
# this grid apart from the undrinkable one; the trailing semicolon suppresses
# the array-of-Axes repr that would otherwise be echoed under the cell.
drinkable.hist(figsize=(15, 15))
plt.suptitle('Drinkable water: column distributions');

In [159]:
# One histogram per column for the undrinkable subset. The figure title tells
# this grid apart from the drinkable one; the trailing semicolon suppresses
# the array-of-Axes repr that would otherwise be echoed under the cell.
undrinkable.hist(figsize=(15, 15))
plt.suptitle('Undrinkable water: column distributions');

In [160]:
# Bar chart of the number of rows in each Potability class, drawn on an
# explicitly created 12x6 axes.
fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(x='Potability', data=df, palette='husl', ax=ax)

In [161]:
# Columns that are each plotted against Potability in the box plots below.
columns = ['ph', 'Hardness', 'Solids', 'Chloramines', 'Sulfate', 'Conductivity',
       'Organic_carbon', 'Trihalomethanes', 'Turbidity']

def boxPlot(data, colName):
    """Draw a box plot of ``colName`` grouped by the ``Potability`` column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing both ``colName`` and ``Potability``.
    colName : str
        Column shown on the y-axis.
    """
    # Plot the frame that was passed in. The `data` argument used to be ignored
    # in favour of the global df, so calling this with any other frame still
    # plotted df.
    sns.catplot(x="Potability", y=colName, data=data, kind="box");

# Draw one box plot per listed column, passing the full frame df each time.
for col in columns:
    boxPlot(df, col)

In [162]:
# Annotated heatmap of the pairwise column correlations. The correlation table
# is computed once and reused for both the plot and its mask; the mask is the
# table's upper triangle (np.triu), and cells where it is non-zero are hidden.
corr = df.corr()
matrix = np.triu(corr)
plt.figure(figsize=(12, 12))
sns.heatmap(corr, annot=True, linewidth=.8, mask=matrix, cmap="rocket");