# Goal: 
### Merge swell, period, wind, and tide data into a single dataframe
### Convert UTC to PST and filter for daylight hours
#### This file is the clean version of ```the_big_merge.ipynb```

In [2]:
import pandas as pd
import numpy as np
import random
from pathlib import Path

from datetime import datetime, timezone, timedelta
from zoneinfo import ZoneInfo
from astral import LocationInfo
from astral.sun import sun

In [3]:
# Project root, taken as the grandparent of the current working directory
# (parents[0] is the parent, parents[1] the directory above that).
# Every data path in this notebook is built relative to this folder, so the
# notebook presumably has to be run from two levels below the root — TODO confirm.
root_folder = Path.cwd().parents[1]

In [4]:
# Load the four cleaned interim tables (paths are relative to the project root).
# Order of the paths matches the order of the names being unpacked.
swell, tide, wind, period = [
    pd.read_csv(root_folder/csv_path)
    for csv_path in (
        'data/interim/00-swell.csv',
        'data/interim/00-tide.csv',
        'data/interim/00-wind.csv',
        'data/interim/01-period.csv',
    )
]

In [5]:
# Left-join swell, period and wind onto the tide table using the shared 'UTC'
# key. Tide is the base table, so every tide row is kept; where another table
# has no matching 'UTC' value its columns are filled with NaN.
ts = pd.merge(tide, swell, on='UTC', how='left')    # tide + swell
tsp = pd.merge(ts, period, on='UTC', how='left')    # ... + period
tspw = pd.merge(tsp, wind, on='UTC', how='left')    # ... + wind

In [6]:
# Manual repair of one duplicated entry in the merged table.
# NOTE(review): both numbers are hard-coded to the current input files and will
# point at the wrong rows if the interim CSVs change — confirm before re-use.
#
# Row position 21452, column position 6 is overwritten with 10. Column 6 appears
# to be 'Wind Direction', going by the column order in the NaN summary further
# down — TODO confirm.
tspw.iat[21452, 6] = 10
# The row labelled 21453 (presumably the duplicate of the row above) is removed.
# The index is not reset here, so it keeps a gap at 21453.
tspw = tspw.drop(labels=21453, axis='index')

In [7]:
# The CSVs were read without date parsing, so 'UTC' has to be converted back
# into datetime values; utc=True makes the result timezone-aware (UTC).
tspw = tspw.assign(UTC=pd.to_datetime(tspw['UTC'], utc=True))

In [8]:
# Add a local-time copy of the 'UTC' column.
# 'America/Los_Angeles' follows daylight saving time, so despite the column
# name the values are PDT during summer and PST during winter.
PST = ZoneInfo('America/Los_Angeles')

# Vectorised timezone conversion: one call on the whole column instead of a
# Python-level astimezone() call per row. Requires 'UTC' to already be a
# timezone-aware datetime column (done in the previous cell).
tspw['PST'] = tspw['UTC'].dt.tz_convert(PST)

In [9]:
# Create the Dawn and Dusk columns: for every row, the dawn and dusk times
# (in the local timezone) of that row's local calendar date at Isla Vista.
iv = LocationInfo("Isla Vista", "California", "America/Los_Angeles", 34.41302853802114, -119.8615254859206)

# sun() works per calendar day, so pass the local *date* of each row rather
# than the full timestamp; this keeps the result independent of the row's hour.
local_dates = [local_time.date() for local_time in tspw['PST']]

# One sun() call per distinct date. A single call returns both 'dawn' and
# 'dusk', so there is no need to call it twice per row.
sun_by_date = {
    local_date: sun(iv.observer, date=local_date, tzinfo=PST)
    for local_date in set(local_dates)
}

dawn = [sun_by_date[local_date]['dawn'] for local_date in local_dates]
dusk = [sun_by_date[local_date]['dusk'] for local_date in local_dates]

# Lists are assigned positionally, so the gap in tspw's index does not matter.
tspw['Dawn'] = dawn
tspw['Dusk'] = dusk

In [10]:
# Daylight mask: True for rows whose local time lies between that day's dawn
# and dusk, both ends inclusive.
# The comparison is done on whole columns instead of looking up three cells
# per row in a Python loop.
is_daylight = (tspw['Dawn'] <= tspw['PST']) & (tspw['PST'] <= tspw['Dusk'])

# Kept as a plain list of bools (one per row, in row order), as before, so
# that `tspw[day]` keeps working unchanged in the following cells.
day = is_daylight.to_list()

In [11]:
# Count the missing values in each column, looking at daylight rows only
tspw.loc[day].isna().sum()

UTC                0
Tide               0
Height            36
Deg               37
Period            37
Wind Speed        31
Wind Direction    46
PST                0
Dawn               0
Dusk               0
dtype: int64

Instead of dropping rows that contain any NaN values, I'll keep them in the dataset and classify good days based on how many parameters meet ideal conditions. That way, some of these hours with NaN values could still be classified as, for example, "4/6 ideal". I might have to change this later, depending on how the NaN values behave in the analysis and on whether I change my measuring method.

In [15]:
# Keep only the daylight rows and the columns needed for the analysis
# (local time plus the six measured parameters).
df = tspw.loc[
    day,
    ['PST','Tide','Height','Deg','Period','Wind Speed', 'Wind Direction'],
]

# Final clean-up, then save as CSV.
# NOTE(review): the rows labelled 0 and 1 are dropped without a stated reason —
# confirm why. drop() raises KeyError if either label is not among the
# daylight rows. `df` itself is left untouched; only the written file lacks
# these two rows and has a fresh 0..n-1 index (not written, index=False).
(
    df.drop(index=[0,1])
      .reset_index(drop=True)
      .to_csv(root_folder/'data/processed/final.csv', index=False)
)

In [13]:
# Superseded export, kept commented out: it would write the same daylight rows
# and columns to the same file, but without dropping rows 0 and 1 first.
# The active export is the cell above. This cell is dead code and can be deleted.
# tspw[day][['PST','Tide','Height','Deg','Period','Wind Speed', 'Wind Direction']].to_csv(root_folder/'data/processed/final.csv', index=False)