# Goal: 
### Merge swell, period, wind, and tide data into a single dataframe
### Convert UTC to PST and filter for daylight hours
#### This file is the clean version of ```the_big_merge.ipynb```

In [78]:
import pandas as pd
import numpy as np
import random
from pathlib import Path

from datetime import datetime, timezone, timedelta
from zoneinfo import ZoneInfo
from astral import LocationInfo
from astral.sun import sun

In [79]:
# Project root: two directory levels above the directory the notebook is run from.
working_dir = Path.cwd()
root_folder = working_dir.parents[1]

In [80]:
# Read the cleaned interim datasets, one CSV per measurement type.
swell, tide, wind, period = (
    pd.read_csv(root_folder/csv_path)
    for csv_path in (
        'data/interim/00-swell.csv',
        'data/interim/00-tide.csv',
        'data/interim/00-wind.csv',
        'data/interim/01-period.csv',
    )
)

# Parse each frame's 'UTC' column into timezone-aware (UTC) datetimes so the
# frames can later be joined on it.
for frame in (swell, tide, wind, period):
    frame['UTC'] = pd.to_datetime(frame['UTC'], utc=True)

In [81]:
# Left-join wind, swell and period onto the tide table, keyed on the 'UTC'
# timestamp. The 'UTC' columns must already be datetimes in every frame
# before joining.
tw = pd.merge(tide, wind, how='left', on='UTC')
tws = pd.merge(tw, swell, how='left', on='UTC')
twsp = pd.merge(tws, period, how='left', on='UTC')

# Per-column count of missing values in the merged frame.
twsp.isnull().sum()

UTC                  0
Tide                 0
Wind Speed          63
Wind Direction     115
Height            2266
Deg               2270
Period            2269
dtype: int64

In [82]:
#fixing duplicate value
# twsp.iloc[21452,6]=10
# twsp = twsp.drop(index=21453)

In [83]:
# Add a local-time copy of the UTC timestamps.
# NOTE: 'America/Los_Angeles' observes daylight saving time, so despite the
# name the 'PST' column holds PST in winter and PDT in summer.
PST = ZoneInfo('America/Los_Angeles')

# Convert the whole column in one vectorized call instead of looping over
# every timestamp in Python; the resulting values are the same instants
# expressed in the target zone.
twsp['PST'] = twsp['UTC'].dt.tz_convert(PST)

In [84]:
# Create the Dawn and Dusk columns: for every row, the dawn and dusk times
# (as returned by astral's sun() with its default settings) on that row's
# local calendar date, expressed in the PST zone object.
iv = LocationInfo("Isla Vista", "California", "America/Los_Angeles", 34.41302853802114, -119.8615254859206)

# Rows on the same local date share the same dawn/dusk, so solve the sun
# position once per distinct date instead of twice per row. Passing a plain
# date (rather than the full timestamp) makes it explicit that only the
# calendar day is used.
local_dates = [ts.date() for ts in twsp['PST']]
sun_by_date = {d: sun(iv.observer, date=d, tzinfo=PST) for d in set(local_dates)}

dawn = [sun_by_date[d]['dawn'] for d in local_dates]
dusk = [sun_by_date[d]['dusk'] for d in local_dates]
twsp['Dawn'] = dawn
twsp['Dusk'] = dusk

In [85]:
# Daylight mask: True where the local timestamp lies within [Dawn, Dusk],
# both ends inclusive. Series.between compares the whole columns at once
# instead of looking up three values per row in a Python loop; the result is
# converted to a plain list of bools so `day` keeps its previous type.
day = twsp['PST'].between(twsp['Dawn'], twsp['Dusk']).to_list()

In [86]:
# Count missing values per column, restricted to the daylight rows.
daylight_rows = twsp.loc[day]
daylight_rows.isna().sum()

UTC                  0
Tide                 0
Wind Speed          33
Wind Direction      54
Height            1329
Deg               1331
Period            1330
PST                  0
Dawn                 0
Dusk                 0
dtype: int64

Instead of dropping rows that contain any NaN values, I'll keep them in the dataset and classify good days based on how many parameters met ideal conditions. That way, some of these hours with NaN values could still be classified as, for example, "4/6 ideal". I might have to change this later, depending on how the NaN values behave in the analysis and on whether I change my measuring method.

To the best of my knowledge, these NaN values are missing from the data source itself rather than being caused by a mistake I made while cleaning, so there's not much to be done about them.

In [87]:
# Scratch estimate: presumably 61 = days in April + May (30 + 31) and 12 = rows
# per day within daylight hours, i.e. the expected number of missing rows
# -- TODO confirm
61*12

732

April and May of 2016 are missing swell and period data, which explains the Height and Deg NaN counts above. The Period NaN count, however, still looks far too high.

In [88]:
# Keep only the daylight rows and the columns needed downstream.
output_columns = ['PST','Tide','Height','Deg','Period','Wind Speed', 'Wind Direction']
df = twsp.loc[day, output_columns]

# Final clean-up before saving: remove the rows labelled 0 and 1, renumber the
# index, and write the CSV without the index column.
# NOTE(review): the drop is by index label and raises KeyError if either row
# is not part of the daylight selection; `df` itself is left unchanged because
# the result is written straight to disk.
(
    df
    .drop(index=[0,1])
    .reset_index(drop=True)
    .to_csv(root_folder/'data/processed/00-final.csv', index=False)
)

In [89]:
#writing to csv
# twsp[day][['PST','Tide','Height','Deg','Period','Wind Speed', 'Wind Direction']].to_csv(root_folder/'data/processed/final.csv', index=False)