Data Cell

Imports dependencies
Sets the CSV file paths
Reads the CSVs into DataFrames

The cleaning process is intended to find the most relevant data. Here, relevance means that a metric has been tracked consistently throughout the entire CSV. The hypothesis is that if a metric has been measured throughout the entire dataset, that metric must hold some importance.

In [1]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
import numpy as np
from scipy.stats import linregress

# Paths to the water-quality CSVs (relative to the notebook's working directory).
# Kept under their own names so the path strings are not overwritten by the
# DataFrames below and the read step can be re-run on its own.
arsenic_path = "Resources/arsenic.csv"
algae_path = "Resources/algae.csv"
metals_path = "Resources/quarterlyMetals.csv"

# Load each CSV into its own DataFrame
arsenic = pd.read_csv(arsenic_path)
algae = pd.read_csv(algae_path)
metals = pd.read_csv(metals_path)

# Preview the last rows of the algae data as a quick check on the load
algae.tail()


Unnamed: 0,site number,site location,Cluster,site acronym,date,sample type,conductance (uS/cm),chl A (ug/L),phaeophytin (ug/L),phaeophytin chl A (ug/L),chlorophyta (organisms/ml),cyanophyta (organisms/ml),bacillariophyta (organisms/ml),total (organisms/ml),algae comments
9700,Roos In,Roosevelt at Salt River Inlet,salt,SRNR,2016-11-07,,,,,,,,,,
9701,Salt Gila,Salt Gila Pump Station,,SGPS,2016-11-07,,973.0,,,,,,,,
9702,SOCA,South Canal below CAP Cross-conect,srp,South canal below CAP,2016-11-07,,626.0,,,,,,,,
9703,SPT In,Tempe Canal - Inlet to Tempe's South Plant,tempe,STP In,2016-11-07,,,,,,,,,,
9704,UH In,Union Hills Inlet,cap,Union Hills Inlet,2016-11-07,,985.0,,,,,,,,


Arsenic CSV - measuring arsenic in a body of water. 

In [2]:
# Non-null count per column, to see which measurements were tracked consistently
arsenic.count()

Site Number           3428
Site location         3428
Cluster               3193
Site Acronym          3428
Arsenic (ug/L)        2801
Perchlorate (ug/L)      66
date                  3428
dtype: int64

In [3]:
# Keep the identifying columns plus the arsenic reading, then drop every row that
# is missing any of them (this also discards readings whose Cluster is blank).
just_arsenic = arsenic[
    ["Site Number", "Site location", "Cluster", "Arsenic (ug/L)", "date"]
].dropna()

# Non-null count per column; after dropna() every column has the same count
just_arsenic.count()

Site Number       2637
Site location     2637
Cluster           2637
Arsenic (ug/L)    2637
date              2637
dtype: int64

Algae CSV - measuring conductance in a body of water. 

In [4]:
# Keep the identifying columns plus the conductance reading, then drop every row
# that is missing any of them.
conductance = algae[
    ["site number", "site location", "Cluster", "date", "conductance (uS/cm)"]
].dropna()

# Non-null count per column; after dropna() every column has the same count
conductance.count()

site number            5610
site location          5610
Cluster                5610
date                   5610
conductance (uS/cm)    5610
dtype: int64

In [5]:
# Preview the first rows of the cleaned conductance data
conductance.head()

Unnamed: 0,site number,site location,Cluster,date,conductance (uS/cm)
0,R5,Verde River btwn Horseshoe and Bartlett,verde,1999-08-16,500.0
1,R6A,Bartlett Reservoir near outlet,verde,1999-08-16,490.0
2,R6B,Bartlett Reservoir near outlet,verde,1999-08-16,490.0
3,R7,Verde River below Bartlett Reservoir,verde,1999-08-16,500.0
4,R10,Salt River below Saguaro (Blue Point Bridge),salt,1999-08-17,1200.0


Quarterly Metals CSV - measuring the amount of heavy metals in a body of water. 

In [9]:
# Group the metals data by Cluster. head(50) on a GroupBy returns up to the first
# 50 rows of *each* group, not 50 rows overall, so the displayed frame can be large.
grouped_metals = metals.groupby(["Cluster"])
grouped_metals.head(50)
# Alternative check (disabled): list the distinct cluster labels
#metals["Cluster"].unique()

Unnamed: 0,Site Name,Site Location,Cluster,Site Acronym,7Li,9Be,23Na,24Mg,27Al,39K,...,95Mo,107Ag,111Cd,115In,121Sb,138Ba,202Hg,208Pb,238U,date
0,APA1A,"Apache Lake, upper",srp,"Apache, eppi, upper",111.60,0.017,163700.0,13520.0,32.22,11550.0,...,1.486,0.004,0.111,,0.219,145.800,0.138,0.126,1.128,2012-11-04
1,APA1B,"Apache Lake, upper",srp,"Apache, hypo, upper",111.10,0.012,160600.0,15710.0,27.67,11480.0,...,1.449,0.006,0.034,,0.196,123.000,0.112,0.120,1.113,2012-11-04
2,APA2A,"Apache Lake, lower",srp,"Apache, eppi, lower",109.20,0.008,157000.0,8994.0,22.21,11610.0,...,1.420,0.008,0.085,,0.187,121.700,0.090,0.072,1.102,2012-11-04
3,APA2B,"Apache Lake, lower",srp,"Apache, hypo, lower",109.30,0.002,156200.0,13160.0,11.89,11330.0,...,1.427,0.009,0.023,,0.171,102.200,0.081,0.044,1.109,2012-11-04
4,Can1A,"Canyon Lake, upper",srp,"Canyon, eppi, upper",106.30,0.002,229000.0,16930.0,19.56,13820.0,...,1.362,0.001,0.046,,0.164,153.200,0.071,0.081,1.063,2012-11-04
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
205,Hav,Havasu Lake,,Havasu,36.52,-0.051,79680.0,23320.0,,4599.0,...,4.326,-0.037,-0.029,0.257,168.700,0.083,5.675,,,2015-11-04
206,R10,Salt River below Saguaro (Blue Point Bridge),salt,salt blw saguaro,167.80,0.002,243500.0,19690.0,,7476.0,...,1.974,0.000,0.010,0.097,82.970,0.113,1.326,,,2015-11-04
207,R25,verde river at beeline highway,verde,verde @ beeline,22.20,0.042,26600.0,24640.0,,3005.0,...,1.811,-0.001,0.012,0.062,52.800,1.101,1.159,,,2015-11-04
208,R2A,Lake Pleasant integrated sample,cap,pleasant-epi,36.61,-0.049,79780.0,24740.0,,5017.0,...,4.452,-0.038,-0.027,0.302,152.000,-0.011,5.520,,,2015-11-04


In [7]:
# Distinct sampling locations in the quarterly metals data, in order of first
# appearance.
site_loc = metals.loc[:, "Site Location"]
pd.unique(site_loc)

array(['Apache Lake, upper', 'Apache Lake, lower', 'Canyon Lake, upper',
       'Canyon Lake, lower',
       'Salt River below Saguaro (Blue Point Bridge)',
       'verde river at beeline highway',
       'Lake Pleasant integrated sample', 'Waddell Canal',
       'Roosevelt Lake, upper', 'Roosevelt Lake, lower', 'Havasu Lake'],
      dtype=object)