In [1]:
pip install altair>=5

Note: you may need to restart the kernel to use updated packages.


In [2]:
import altair as alt
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

## Water Quality

### Introduction 

[Kaggle Link](https://www.kaggle.com/datasets/mssmartypants/water-quality)

Data Attributes:

- aluminium - dangerous if greater than 2.8
- ammonia - dangerous if greater than 32.5
- arsenic - dangerous if greater than 0.01
- barium - dangerous if greater than 2
- cadmium - dangerous if greater than 0.005
- chloramine - dangerous if greater than 4
- chromium - dangerous if greater than 0.1
- copper - dangerous if greater than 1.3
- flouride - dangerous if greater than 1.5
- bacteria - dangerous if greater than 0
- viruses - dangerous if greater than 0
- lead - dangerous if greater than 0.015
- nitrates - dangerous if greater than 10
- nitrites - dangerous if greater than 1
- mercury - dangerous if greater than 0.002
- perchlorate - dangerous if greater than 56
- radium - dangerous if greater than 5
- selenium - dangerous if greater than 0.5
- silver - dangerous if greater than 0.1
- uranium - dangerous if greater than 0.3
- is_safe - class attribute {0 - not safe, 1 - safe}

## Preliminary exploratory data analysis:

#### Importing data:

In [13]:
# Read the whole CSV, then keep only its first 5000 rows.
water_quality = pd.read_csv("data/water_quality.csv").head(5000)
water_quality.shape

(5000, 21)

#### Making all values floats:

In [4]:
# Inspect each column's dtype to see which columns are not yet float.
water_quality.dtypes

aluminium      float64
ammonia         object
arsenic        float64
barium         float64
cadmium        float64
chloramine     float64
chromium       float64
copper         float64
flouride       float64
bacteria       float64
viruses        float64
lead           float64
nitrates       float64
nitrites       float64
mercury        float64
perchlorate    float64
radium         float64
selenium       float64
silver         float64
uranium        float64
is_safe         object
dtype: object

In [5]:
# Show the rows whose ammonia value is the literal string "#NUM!".
is_num_error = water_quality['ammonia'] == "#NUM!"
water_quality[is_num_error]

Unnamed: 0,aluminium,ammonia,arsenic,barium,cadmium,chloramine,chromium,copper,flouride,bacteria,...,lead,nitrates,nitrites,mercury,perchlorate,radium,selenium,silver,uranium,is_safe


In [6]:
# Keep only rows whose ammonia value is not the string "#NUM!", then cast
# every column to float.
has_usable_ammonia = water_quality['ammonia'] != "#NUM!"
water_quality = water_quality[has_usable_ammonia].astype(float)
water_quality.dtypes

aluminium      float64
ammonia        float64
arsenic        float64
barium         float64
cadmium        float64
chloramine     float64
chromium       float64
copper         float64
flouride       float64
bacteria       float64
viruses        float64
lead           float64
nitrates       float64
nitrites       float64
mercury        float64
perchlorate    float64
radium         float64
selenium       float64
silver         float64
uranium        float64
is_safe        float64
dtype: object

#### Information about the FULL dataset (all loaded rows, before the train/test split):

In [7]:
# Per-column summary: name, non-null count, null count, and dtype.
null_counts = water_quality.isnull().sum().values
water_quality_info = pd.DataFrame({
    "name": water_quality.columns,
    "non-nulls": len(water_quality) - null_counts,
    "nulls": null_counts,
    "type": water_quality.dtypes.values,
})
water_quality_info

Unnamed: 0,name,non-nulls,nulls,type
0,aluminium,5000,0,float64
1,ammonia,5000,0,float64
2,arsenic,5000,0,float64
3,barium,5000,0,float64
4,cadmium,5000,0,float64
5,chloramine,5000,0,float64
6,chromium,5000,0,float64
7,copper,5000,0,float64
8,flouride,5000,0,float64
9,bacteria,5000,0,float64


#### Distribution of Safe vs. Not Safe Water Samples:

In [8]:
# Count how many samples fall into each is_safe class.
water_quality['is_safe'].value_counts()

0.0    4137
1.0     863
Name: is_safe, dtype: int64

In [9]:
# Percentage of samples in each is_safe class, rounded to two decimals.
class_counts = water_quality['is_safe'].value_counts()
labelled_total = water_quality['is_safe'].count()

unsafe_dist = round((class_counts[0.0] / labelled_total) * 100, 2)
safe_dist = round((class_counts[1.0] / labelled_total) * 100, 2)

print("Unsafe Samples:", str(unsafe_dist) + "%")
print("Safe Samples:", str(safe_dist) + "%")

Unsafe Samples: 82.74%
Safe Samples: 17.26%


### Splitting into Training and Testing Data

In [10]:
# Hold out 25% of the rows for testing; random_state is fixed so the same
# split is produced on every run.
water_training, water_testing = train_test_split(
    water_quality,
    test_size=0.25,
    random_state=64,
)
water_training.shape

(3750, 21)

### Training Data Information:

In [11]:
# Per-column summary of the training split: name, non-null count, null count, dtype.
training_null_counts = water_training.isnull().sum().values
water_training_info = pd.DataFrame({
    "name": water_training.columns,
    "non-nulls": len(water_training) - training_null_counts,
    "nulls": training_null_counts,
    "type": water_training.dtypes.values,
})
water_training_info

Unnamed: 0,name,non-nulls,nulls,type
0,aluminium,3750,0,float64
1,ammonia,3750,0,float64
2,arsenic,3750,0,float64
3,barium,3750,0,float64
4,cadmium,3750,0,float64
5,chloramine,3750,0,float64
6,chromium,3750,0,float64
7,copper,3750,0,float64
8,flouride,3750,0,float64
9,bacteria,3750,0,float64


In [25]:
def _count_bars(column):
    """Return a bar chart of water_training with `column` on x and count(column) on y."""
    return alt.Chart(water_training).mark_bar().encode(
        alt.X(column),
        alt.Y(f'count({column})'),
    )


lead_chart = _count_bars('lead')
mercury_chart = _count_bars('mercury')

# `|` concatenates the charts horizontally: mercury on the left, lead on the right.
dist_chart = mercury_chart | lead_chart
dist_chart

## Methods

## Expected Outcomes:

- What do you expect to find?
- What impact could such findings have?
- What future questions could this lead to?