### Goal:
Automate cleaning for five years of tide data.
Cleaning includes:
- making a UTC datetime column
- dropping unnecessary columns

In [11]:
import pandas as pd
import numpy as np

In [8]:
#df is tide dataframe to clean
#csv is file path name, a string
def cleanTide(df, csv):
    """Write a cleaned copy of a raw tide dataframe to a csv file.

    The written file contains every column of ``df`` except the five raw
    columns listed in ``to_drop``, plus two new columns:

    - ``UTC``: timezone-aware UTC datetime built from ``Date`` and
      ``Time (GMT)``
    - ``Tide``: a copy of ``Verified (ft)``

    ``df`` itself is not modified, so the raw frame stays usable by other
    cells and the call can be repeated safely.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw tide data. Must contain the columns 'Date', 'Time (GMT)',
        'Predicted (ft)', 'Preliminary (ft)' and 'Verified (ft)';
        'Date' and 'Time (GMT)' must hold strings.
    csv : str
        Path of the csv file to write (written without the index).

    Returns
    -------
    None

    Raises
    ------
    KeyError
        If any of the required columns is missing from ``df``.
    """
    to_drop = ['Date', 'Time (GMT)', 'Predicted (ft)', 'Preliminary (ft)', 'Verified (ft)']

    #making string for use in to_datetime ('/' is replaced literally, not as a regex)
    stamp = df['Date'].str.replace('/', '-', regex=False) + ' ' + df['Time (GMT)']

    #dropping first returns a new frame, so the new columns never touch the caller's df
    cleaned = df.drop(columns=to_drop).assign(
        UTC=pd.to_datetime(stamp, utc=True),
        Tide=df['Verified (ft)'],
    )

    cleaned.to_csv(csv, index=False)

    return

In [9]:
#one raw csv per year, read in order and unpacked into the yearly frames
raw_paths = [
    "data/raw_tide2017.csv",
    "data/raw_tide2018.csv",
    "data/raw_tide2019.csv",
    "data/raw_tide2020.csv",
    "data/raw_tide2021.csv",
]
t17, t18, t19, t20, t21 = [pd.read_csv(path) for path in raw_paths]

In [10]:
#clean each raw year and write it to its own csv
yearly_jobs = [
    (t17, 'data/00-tide2017.csv'),
    (t18, 'data/00-tide2018.csv'),
    (t19, 'data/00-tide2019.csv'),
    (t20, 'data/00-tide2020.csv'),
    (t21, 'data/00-tide2021.csv'),
]
for raw_frame, out_csv in yearly_jobs:
    cleanTide(raw_frame, out_csv)

## New Goal:
append all years together for one dataset

### CSV files cannot store datetime objects, so the UTC column needs to be converted back to datetime while reading each file into a dataframe.

In [12]:
#parse_dates alone turns the UTC strings back into datetimes on read;
#the date_parser argument of read_csv is deprecated since pandas 2.0, so it is not used
t17 = pd.read_csv('data/00-tide2017.csv', parse_dates=['UTC'])
t18 = pd.read_csv('data/00-tide2018.csv', parse_dates=['UTC'])
t19 = pd.read_csv('data/00-tide2019.csv', parse_dates=['UTC'])
t20 = pd.read_csv('data/00-tide2020.csv', parse_dates=['UTC'])
t21 = pd.read_csv('data/00-tide2021.csv', parse_dates=['UTC'])

In [13]:
#stacking the yearly frames into one big frame with a fresh 0..n-1 index
yearly_frames = [t17, t18, t19, t20, t21]
bigt = pd.concat(yearly_frames, ignore_index=True)
bigt.shape

(43824, 2)

No missing hours!! The row count above matches the number of hours in the five years, computed below.

In [14]:
#hours expected for 2017-2021: five 365-day years plus one extra day (2020 is a leap year)
24 * (365 * 5 + 1)

43824

In [15]:
#checking to make sure there aren't duplicate dates:
#True only when every row has its own distinct, non-missing UTC value
bigt['UTC'].nunique() == len(bigt)

True

In [16]:
#saving the combined dataset without the index column
combined_csv = 'data/00-tide.csv'
bigt.to_csv(combined_csv, index=False)