# Data engineering Jupyter Notebook to import, clean and export Surfs-Up! data

In [1]:
# Importing Dependencies
import pandas as pd
import numpy as np
import datetime as dt
from collections import OrderedDict, defaultdict

In [2]:
# Relative locations of the raw Hawaii CSV extracts read by this notebook
stations_csv = "data_files/hawaii_stations.csv"
measurements_csv = "data_files/hawaii_measurements.csv"

In [3]:
# Load both raw CSV extracts into DataFrames; pandas infers the column names
# from the first row of each file (its default header handling).
hawaii_meas_df = pd.read_csv(measurements_csv)
hawaii_stat_df = pd.read_csv(stations_csv)

In [4]:
# Summary statistics for every measurements column (numeric and non-numeric),
# used to spot cleaning needs. In the recorded output the prcp count (18103) is
# below the row count (19550), i.e. prcp has missing values; station, date and
# tobs are complete.
hawaii_meas_df.describe(include="all")

Unnamed: 0,station,date,prcp,tobs
count,19550,19550,18103.0,19550.0
unique,9,2792,,
top,USC00519281,2012-01-20,,
freq,2772,9,,
mean,,,0.160644,73.097954
std,,,0.468746,4.523527
min,,,0.0,53.0
25%,,,0.0,70.0
50%,,,0.01,73.0
75%,,,0.11,76.0


In [5]:
# Summary statistics for every stations column (numeric and non-numeric).
# In the recorded output every column reports a count of 9 and the station and
# name columns each have 9 unique values, so no values are missing.
hawaii_stat_df.describe(include="all")

Unnamed: 0,station,name,latitude,longitude,elevation
count,9,9,9.0,9.0,9.0
unique,9,9,,,
top,USC00519397,"KUALOA RANCH HEADQUARTERS 886.9, HI US",,,
freq,1,1,,,
mean,,,21.393826,-157.867098,60.977778
std,,,0.086442,0.103873,103.465547
min,,,21.2716,-158.0111,0.9
25%,,,21.3331,-157.9751,7.0
50%,,,21.3934,-157.8374,14.6
75%,,,21.45167,-157.8025,32.9


In [6]:
# Average precipitation for every station/month combination. The resulting
# Series (indexed by "<station>_<month>") is the lookup used to fill missing
# prcp readings with the typical value for that station in that month.
hawaii_meas_avg_df = hawaii_meas_df.loc[:, ["station", "date", "prcp"]].copy()
hawaii_meas_avg_df["date"] = pd.to_datetime(hawaii_meas_avg_df["date"], format="%Y-%m-%d", errors="coerce")
hawaii_meas_avg_df["month"] = hawaii_meas_avg_df["date"].dt.month
hawaii_meas_avg_df["station_month"] = hawaii_meas_avg_df["station"] + "_" + hawaii_meas_avg_df["month"].astype(str)
# groupby already yields exactly one row per station_month key, so no
# de-duplication is needed. Calling Series.drop_duplicates() here would compare
# the mean *values* and silently discard every station/month whose average
# happens to equal another one's, leaving those keys out of the lookup.
avg_prcp_by_month = hawaii_meas_avg_df.groupby("station_month")["prcp"].mean()

In [7]:
# Convert the station_month -> mean prcp Series into a plain dictionary that is
# used as the replacement map for NaN precipitation values (nominally
# 9 stations x 12 months = 108 station/month keys).
# NOTE: this rebinds avg_prcp_by_month from a Series to a dict, so the cell
# cannot be re-run on its own without first re-running the cell that builds
# the Series.
avg_prcp_by_month = avg_prcp_by_month.to_dict()

In [8]:
# Parse the date column once, then derive the calendar parts and the
# "<station>_<month>" key that lines up with the precipitation lookup.
meas_dates = pd.to_datetime(hawaii_meas_df["date"], format="%Y-%m-%d", errors="coerce")
hawaii_meas_df["date"] = meas_dates
hawaii_meas_df["year"] = meas_dates.dt.year
hawaii_meas_df["month"] = meas_dates.dt.month
hawaii_meas_df["day"] = meas_dates.dt.day
month_label = hawaii_meas_df["month"].astype(str)
hawaii_meas_df["station_month"] = hawaii_meas_df["station"] + "_" + month_label

In [9]:
# Replace missing precipitation readings with the average for the same
# station and month; rows that already have a reading are left untouched.
station_month_avg = hawaii_meas_df["station_month"].map(avg_prcp_by_month)
hawaii_meas_df["prcp"] = hawaii_meas_df["prcp"].fillna(station_month_avg)

In [10]:
# Use the station id as the row index of the measurements frame
hawaii_meas_df = hawaii_meas_df.set_index("station")

In [11]:
# Keep only the columns wanted in the cleaned output, in this order
output_columns = ["station_month", "date", "year", "month", "day", "prcp", "tobs"]
hawaii_meas_df = hawaii_meas_df.loc[:, output_columns]

In [12]:
# Write each dataframe next to its source file, with the file name prefixed
# by "CLEAN_" (e.g. data_files/CLEAN_hawaii_measurements.csv).
clean_measurements_csv = measurements_csv.replace("data_files/", "data_files/CLEAN_")
clean_stations_csv = stations_csv.replace("data_files/", "data_files/CLEAN_")
hawaii_meas_df.to_csv(clean_measurements_csv)
# NOTE(review): the stations frame is written with its default integer index,
# which becomes an unnamed first column in the CSV - confirm consumers expect it.
hawaii_stat_df.to_csv(clean_stations_csv)