In [226]:
# Import Dependencies
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os
from pylab import *

In [227]:
# Load the raw measurement and station CSVs into pandas DataFrames.
# Both files are referenced by bare relative names, so they are resolved
# against the notebook's current working directory.

measurements_csv = "hawaii_measurements.csv"
stations_csv = "hawaii_stations.csv"

# low_memory=False makes pandas read each file in one pass instead of in
# chunks when inferring column dtypes.
measurements_df = pd.read_csv(filepath_or_buffer=measurements_csv, low_memory=False)
stations_df = pd.read_csv(filepath_or_buffer=stations_csv, low_memory=False)

In [228]:
# measurements_df.head()
# measurements_df.describe()
# print(measurements_df["prcp"].value_counts())
# print(measurements_df["tobs"].value_counts())

In [229]:
# Is there a missing value in any cell of stations_df?
# isna() flags every cell, and any() over the flattened array collapses the
# flags into a single boolean.
stations_df.isna().to_numpy().any()

# A result of False means stations_df needs no NaN cleaning.

False

In [230]:
# Display the whole stations table for a visual sanity check; the frame is
# small enough to review row by row.
stations_df

# Nothing unusual observed, so stations_df is left unchanged.

Unnamed: 0,station,name,latitude,longitude,elevation
0,USC00519397,"WAIKIKI 717.2, HI US",21.2716,-157.8168,3.0
1,USC00513117,"KANEOHE 838.1, HI US",21.4234,-157.8015,14.6
2,USC00514830,"KUALOA RANCH HEADQUARTERS 886.9, HI US",21.5213,-157.8374,7.0
3,USC00517948,"PEARL CITY, HI US",21.3934,-157.9751,11.9
4,USC00518838,"UPPER WAHIAWA 874.3, HI US",21.4992,-158.0111,306.6
5,USC00519523,"WAIMANALO EXPERIMENTAL FARM, HI US",21.33556,-157.71139,19.5
6,USC00519281,"WAIHEE 837.5, HI US",21.45167,-157.84889,32.9
7,USC00511918,"HONOLULU OBSERVATORY 702.2, HI US",21.3152,-157.9992,0.9
8,USC00516128,"MANOA LYON ARBO 785.2, HI US",21.3331,-157.8025,152.4


In [231]:
# Flag, per column, whether measurements_df holds at least one missing value
measurements_df.isna().any()

station    False
date       False
prcp        True
tobs       False
dtype: bool

In [232]:
# Number of missing values in each measurements_df column
measurements_df.isna().sum()

# To inspect the affected rows themselves:
# measurements_nan_rows = measurements_df[measurements_df['prcp'].isnull()]

station       0
date          0
prcp       1447
tobs          0
dtype: int64

In [233]:
# Non-null entries per column: 'prcp' should fall short of the other columns
# by the 1447 NaNs counted above
measurements_df.notna().sum()

station    19550
date       19550
prcp       18103
tobs       19550
dtype: int64

In [234]:
# Summary statistics of 'prcp' for each station (baseline before imputation)

measurements_df.groupby("station")["prcp"].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
station,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
USC00511918,1932.0,0.047971,0.244826,0.0,0.0,0.0,0.01,4.0
USC00513117,2696.0,0.141921,0.433905,0.0,0.0,0.02,0.1,7.65
USC00514830,1937.0,0.121058,0.413812,0.0,0.0,0.02,0.1,11.53
USC00516128,2484.0,0.429988,0.712999,0.0,0.01,0.16,0.54,8.06
USC00517948,683.0,0.063602,0.243931,0.0,0.0,0.0,0.02,2.8
USC00518838,342.0,0.207222,0.508305,0.0,0.0025,0.03,0.1975,6.3
USC00519281,2772.0,0.212352,0.543312,0.0,0.0,0.04,0.19,9.64
USC00519397,2685.0,0.04902,0.210583,0.0,0.0,0.0,0.02,4.2
USC00519523,2572.0,0.114961,0.410237,0.0,0.0,0.0,0.07,6.38


In [235]:
# Impute missing 'prcp' values with the mean 'prcp' of the same station.
#
# Only the 'prcp' column is selected before transform(). Transforming the
# whole grouped frame would apply fillna(mean) to every non-key column
# (including 'date', presumably read as text) and would return a multi-column
# DataFrame; recent pandas versions reject both the mean of a text column and
# the assignment of a multi-column frame to the single 'prcp' column.
measurements_df["prcp"] = (
    measurements_df
    .groupby("station")["prcp"]
    .transform(lambda station_prcp: station_prcp.fillna(station_prcp.mean()))
)

# Confirm missing values were substituted (should return False)
# measurements_df['prcp'].isnull().any()

# We can prove means by station were maintained
# measurements_df.groupby("station")["prcp"].mean()

# Per-station means are unchanged in the descriptive statistics table.
# Note that filling with the mean does shift the other statistics: compare
# std and the 25%/50%/75% quantiles with the pre-imputation table.
measurements_df['prcp'].groupby(measurements_df['station']).describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
station,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
USC00511918,1979.0,0.047971,0.241899,0.0,0.0,0.0,0.01,4.0
USC00513117,2709.0,0.141921,0.432863,0.0,0.0,0.02,0.1,7.65
USC00514830,2202.0,0.121058,0.388102,0.0,0.0,0.03,0.121058,11.53
USC00516128,2612.0,0.429988,0.695302,0.0,0.02,0.19,0.5125,8.06
USC00517948,1372.0,0.063602,0.172044,0.0,0.0,0.063602,0.063602,2.8
USC00518838,511.0,0.207222,0.41564,0.0,0.02,0.19,0.207222,6.3
USC00519281,2772.0,0.212352,0.543312,0.0,0.0,0.04,0.19,9.64
USC00519397,2724.0,0.04902,0.20907,0.0,0.0,0.0,0.02,4.2
USC00519523,2669.0,0.114961,0.402711,0.0,0.0,0.0,0.09,6.38


In [236]:
# The non-null count for 'prcp' has increased by the 1447 imputed values and
# now matches the totals of the other columns
measurements_df.count(axis="index")

station    19550
date       19550
prcp       19550
tobs       19550
dtype: int64

In [237]:
# Write both DataFrames to UTF-8 CSV files in the working directory.
# index=False keeps the DataFrame row index out of the output files.

measurements_df.to_csv(path_or_buf="clean_hawaii_measurements.csv", index=False, encoding="utf-8")
stations_df.to_csv(path_or_buf="clean_hawaii_stations.csv", index=False, encoding="utf-8")