# Data engineering Jupyter Notebook to import, clean and export Surfs-Up! data

In [1]:
# Importing Dependencies
# ----------------------------------------------------------------------------------
import pandas as pd
import numpy as np
import datetime as dt
from collections import OrderedDict, defaultdict

In [2]:
# Path to data files
# ----------------------------------------------------------------------------------
folder_path = "data_files/"
clean_prfx = "CLEAN_"
measurements_csv = "hawaii_measurements.csv"
stations_csv = "hawaii_stations.csv"

In [3]:
# Reading files into Pandas Dataframe
# ----------------------------------------------------------------------------------
hawaii_meas_df = pd.read_csv(folder_path+measurements_csv, header="infer")
hawaii_stat_df = pd.read_csv(folder_path+stations_csv, header="infer")

In [4]:
# Verifying Data and potential cleaning actions in Measurements CSV
# ----------------------------------------------------------------------------------
hawaii_meas_df.describe(include="all")

Unnamed: 0,station,date,prcp,tobs
count,19550,19550,18103.0,19550.0
unique,9,2792,,
top,USC00519281,2010-11-01,,
freq,2772,9,,
mean,,,0.160644,73.097954
std,,,0.468746,4.523527
min,,,0.0,53.0
25%,,,0.0,70.0
50%,,,0.01,73.0
75%,,,0.11,76.0


In [5]:
# Verifying Data and potential cleaning actions in Stations CSV
# ----------------------------------------------------------------------------------
hawaii_stat_df.describe(include="all")

Unnamed: 0,station,name,latitude,longitude,elevation
count,9,9,9.0,9.0,9.0
unique,9,9,,,
top,USC00519523,"PEARL CITY, HI US",,,
freq,1,1,,,
mean,,,21.393826,-157.867098,60.977778
std,,,0.086442,0.103873,103.465547
min,,,21.2716,-158.0111,0.9
25%,,,21.3331,-157.9751,7.0
50%,,,21.3934,-157.8374,14.6
75%,,,21.45167,-157.8025,32.9


In [6]:
# Creating a set of data that finds the average precipitation by station and month
# ----------------------------------------------------------------------------------


# creating a dataframe to consolidate the columns needed for average & dropping NaN
hawaii_meas_avg_df = pd.DataFrame(hawaii_meas_df.loc[:,["station","date","prcp"]])
hawaii_meas_avg_df = hawaii_meas_avg_df.dropna()

# assigning the date column to a datetime data type
hawaii_meas_avg_df["date"]= pd.to_datetime(hawaii_meas_avg_df["date"],
                                           format="%Y-%m-%d", errors="coerce")

# creating a month column for finding averages
hawaii_meas_avg_df["month"] = hawaii_meas_avg_df["date"].dt.month

# creating the lookup dictionary key values for Station/Month
hawaii_meas_avg_df["station_month"] = hawaii_meas_avg_df["station"]+"_"+\
                                            hawaii_meas_avg_df["month"].astype(str)


In [7]:
avg_prcp_by_month = round(hawaii_meas_avg_df.groupby(["station_month"])["prcp"].mean(),2)
grouped_map_df = avg_prcp_by_month.drop_duplicates(keep="first")

In [8]:
# Creating a new map dictionary for each of the 108 Station/Month values
# to be used in replacing NaNs
# ----------------------------------------------------------------------------------
grouped_map_df = grouped_map_df.to_dict()

In [9]:
# Adding attributes to the Measurements dataframe to capture the 
# dictionary mapping
# ----------------------------------------------------------------------------------
hawaii_meas_df["date"] = pd.to_datetime(hawaii_meas_df["date"], format="%Y-%m-%d", errors="coerce")
hawaii_meas_df["year"] = hawaii_meas_df["date"].dt.year
hawaii_meas_df["month"] = hawaii_meas_df["date"].dt.month
hawaii_meas_df["day"] = hawaii_meas_df["date"].dt.day
hawaii_meas_df["station_month"] = hawaii_meas_df["station"]+"_"+hawaii_meas_df["month"].astype(str)

In [10]:
hawaii_meas_df.prcp = hawaii_meas_df.prcp.fillna(hawaii_meas_df["station_month"].map(grouped_map_df))
analysis_df = hawaii_meas_df

In [11]:
hawaii_meas_df.reset_index()

Unnamed: 0,index,station,date,prcp,tobs,year,month,day,station_month
0,0,USC00519397,2010-01-01,0.08,65,2010,1,1,USC00519397_1
1,1,USC00519397,2010-01-02,0.00,63,2010,1,2,USC00519397_1
2,2,USC00519397,2010-01-03,0.00,74,2010,1,3,USC00519397_1
3,3,USC00519397,2010-01-04,0.00,76,2010,1,4,USC00519397_1
4,4,USC00519397,2010-01-06,,73,2010,1,6,USC00519397_1
5,5,USC00519397,2010-01-07,0.06,70,2010,1,7,USC00519397_1
6,6,USC00519397,2010-01-08,0.00,64,2010,1,8,USC00519397_1
7,7,USC00519397,2010-01-09,0.00,68,2010,1,9,USC00519397_1
8,8,USC00519397,2010-01-10,0.00,73,2010,1,10,USC00519397_1
9,9,USC00519397,2010-01-11,0.01,64,2010,1,11,USC00519397_1


In [12]:
hawaii_meas_df = hawaii_meas_df.loc[:,["station","date","prcp","tobs"]]
hawaii_meas_df.set_index("station", inplace=True)

In [13]:
hawaii_stat_df.set_index("station", inplace=True)

In [14]:
# Outputting cleaned versions of each dataframe
# ----------------------------------------------------------------------------------
hawaii_meas_df.to_csv(folder_path+clean_prfx+measurements_csv)
hawaii_stat_df.to_csv(folder_path+clean_prfx+stations_csv)

In [15]:
# Outputting copy of Measurement DF with Dates separately 
# ----------------------------------------------------------------------------------
wdates_hawaii_meas_df = analysis_df

In [16]:
wdates_hawaii_meas_df.to_csv(folder_path+"wDates_"+measurements_csv)