# Goal: 
### Merge swell, period, wind, and tide data into a single dataframe

In [210]:
import pandas as pd
import numpy as np
import random

from datetime import datetime, timezone, timedelta
from zoneinfo import ZoneInfo
from astral import LocationInfo
from astral.sun import sun

In [211]:
# Load the four source tables; the merges below join them on their shared 'UTC' column.
tide = pd.read_csv('data/tide.csv')
swell = pd.read_csv('data/swell.csv')
period = pd.read_csv('data/period.csv')
wind = pd.read_csv('data/wind.csv')

In [212]:
# Tide is the base table: the left merge keeps every tide row and attaches the
# swell columns of the row with the same 'UTC' value (NaN where none matches).
ts = tide.merge(swell, how='left', on='UTC')

# Sanity check: a duplicated key on either side would fan rows out in the
# merge, so every 'UTC' value should still occur exactly once.
ts['UTC'].is_unique

True

In [213]:
# Attach the period columns to the tide+swell frame, again keyed on 'UTC'.
tsp = ts.merge(period, how='left', on='UTC')

# Sanity check: the merge must not have duplicated any timestamp.
tsp['UTC'].is_unique

True

In [214]:
# Count the missing values in each column of the tide+swell+period frame.
tsp.isna().sum()

UTC        0
Tide       0
Height    55
Deg       57
Period    57
dtype: int64

In [215]:
# Attach the wind columns to the tide+swell+period frame, keyed on 'UTC'.
tspw = tsp.merge(wind, how='left', on='UTC')

# Sanity check: False here means some timestamp now occurs more than once,
# i.e. the wind table holds several rows for the same 'UTC' value.
tspw['UTC'].is_unique

False

In [216]:
# Count occurrences of each timestamp (most frequent first) to find the duplicated one.
tspw['UTC'].value_counts()

2019-06-13 20:00:00+00:00    2
2017-01-01 00:00:00+00:00    1
2020-05-02 04:00:00+00:00    1
2020-05-02 05:00:00+00:00    1
2020-05-02 06:00:00+00:00    1
                            ..
2018-09-01 17:00:00+00:00    1
2018-09-01 18:00:00+00:00    1
2018-09-01 19:00:00+00:00    1
2018-09-01 20:00:00+00:00    1
2021-12-31 23:00:00+00:00    1
Name: UTC, Length: 43824, dtype: int64

In [217]:
# Show the rows that share the duplicated timestamp.
tspw[tspw['UTC']=='2019-06-13 20:00:00+00:00']

Unnamed: 0,UTC,Tide,Height,Deg,Period,Wind Speed,Wind Direction
21452,2019-06-13 20:00:00+00:00,1.51,1.07,200.0,4.0,9.0,150
21453,2019-06-13 20:00:00+00:00,1.51,1.07,200.0,4.0,11.0,150


Only a little bit of weirdness as far as I can tell: the wind data has two readings (9 and 11) for this hour. I'll drop one of these observations and set the wind speed of the remaining one to 10, the average of the two.

In [218]:
# Rows 21452 and 21453 are the same hour with two wind-speed readings (9.0 and
# 11.0). Keep the first row and give it the mean of the two readings.
# The column is selected by label: the frame's columns are UTC, Tide, Height,
# Deg, Period, Wind Speed, Wind Direction, so position 6 is 'Wind Direction'
# and a positional write there would overwrite the wrong column.
tspw.loc[21452, 'Wind Speed'] = 10

# Remove the second copy of the duplicated hour. This leaves a gap in the
# index labels (21453 no longer exists).
tspw = tspw.drop(index=21453)

In [219]:
# Re-run the sanity check after the repair: every 'UTC' value should now
# occur exactly once.
tspw['UTC'].is_unique

True

In [220]:
# Count the missing values in each column of the fully merged frame.
tspw.isna().sum()

UTC                0
Tide               0
Height            55
Deg               57
Period            57
Wind Speed        57
Wind Direction    91
dtype: int64

## Got extremely sidetracked
because the ZoneInfo module didn't exist on the version of python I was using, went to update python, remembered the thing about virtual environments Alan told me about, was sitting here updating python and realized I should practice using virtual environments....before one minute ago my conda base was the only thing I had altered/downloaded stuff on, which is exactly what you're not supposed to do. So I uninstalled and reinstalled anaconda in order to start from scratch, created a virtual environment called ivsurf-env, and now will be doing my project out of here. Along the way, learned I should be using github as I go and also could potentially be using vscode, so will start to do that 
## New plan:
finish up with the data cleaning and everything, make sure I have a nice pretty dataset I can work with. Then try and reproduce everything I've done and put parts of it into vscode/github according to [this video](https://youtu.be/cn7CnFIQUBo). From there, continue those practices in the analysis/visualization. **Main thing is to have a really nice looking project at the end of this, with a respectable looking github and everything**
<br>kinda fun to get organized :)


Now the mission is to convert everything to PST, hopefully can use the timezone package built into datetime 

### Need to convert the UTC column to datetime objects, because a CSV only stores timestamps as plain text

In [221]:
# Parse the 'UTC' text column into timezone-aware timestamps (utc=True).
utc_parsed = pd.to_datetime(tspw['UTC'], utc=True)
tspw['UTC'] = utc_parsed

# Display the resulting dtype to confirm the conversion.
tspw['UTC'].dtype

datetime64[ns, UTC]

In [222]:
# Inspect the timezone object attached to the timestamp labelled 0.
tspw['UTC'][0].tzinfo

<UTC>

In [223]:
# Ask that timezone for its UTC offset at the same timestamp; getting a
# timedelta back (rather than None) means the datetime is timezone-aware.
tspw['UTC'][0].tzinfo.utcoffset(tspw['UTC'][0])

datetime.timedelta(0)

My datetime objects are *aware*

In [224]:
LA = ZoneInfo('America/Los_Angeles')
# Only the ZoneInfo class is imported (`from zoneinfo import ZoneInfo`), so the
# module-qualified `zoneinfo.ZoneInfo(...)` raises NameError on a fresh kernel.
# NOTE(review): the name `timezone` is kept for compatibility, but it shadows
# the `datetime.timezone` import from the top of the notebook.
timezone = LA
iv = LocationInfo("Isla Vista", "California", "America/Los_Angeles", 34.41302853802114, -119.8615254859206) #lat and long from google maps

# Sun times for the local date of the row labelled 4128, reported in Pacific time.
s = sun(iv.observer, date=tspw['UTC'][4128].astimezone(tz=LA), tzinfo=LA)

In [225]:
# Build the report first, then print it: one labelled line per sun event.
sun_report = (
    f'Dawn:    {s["dawn"]}\n'
    f'Sunrise: {s["sunrise"]}\n'
    f'Noon:    {s["noon"]}\n'
    f'Sunset:  {s["sunset"]}\n'
    f'Dusk:    {s["dusk"]}\n'
)
print(sun_report)

Dawn:    2017-06-21 05:17:49.291788-07:00
Sunrise: 2017-06-21 05:47:49.250841-07:00
Noon:    2017-06-21 13:01:20-07:00
Sunset:  2017-06-21 20:14:53.486789-07:00
Dusk:    2017-06-21 20:44:53.332313-07:00



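#### Above times look right. Note the -07:00 offset: that is Pacific Daylight Time, so daylight saving time *is* reflected in these values (revisited further down), which is fine because astral handles it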

#### Method: 
- convert all times to PST with ```.astimezone(tz=ZoneInfo('America/Los_Angeles'))```
- create ```Dawn``` and ```Dusk``` columns for each entry with ```s['dawn']``` and ```s['dusk']```
- drop all entries that do not occur between dawn and dusk
- Daylight Savings Time isn't an issue bc astral takes care of it

In [226]:
# 'PST' is used loosely here: America/Los_Angeles is Pacific local time, so the
# converted values carry whichever offset that zone assigns to each timestamp.
PST = ZoneInfo('America/Los_Angeles')

# Convert every UTC timestamp to Pacific local time, one value at a time.
pstlist = []
for utc_stamp in tspw['UTC']:
    pstlist.append(utc_stamp.astimezone(tz=PST))

tspw['PST'] = pstlist

In [228]:
# Preview the first 25 rows to compare the 'UTC' and 'PST' columns side by side.
tspw.head(25)

Unnamed: 0,UTC,Tide,Height,Deg,Period,Wind Speed,Wind Direction,PST
0,2017-01-01 00:00:00+00:00,0.31,2.79,300.0,11.0,,,2016-12-31 16:00:00-08:00
1,2017-01-01 01:00:00+00:00,-0.19,2.82,313.0,11.0,,,2016-12-31 17:00:00-08:00
2,2017-01-01 02:00:00+00:00,-0.09,2.97,309.0,11.0,,,2016-12-31 18:00:00-08:00
3,2017-01-01 03:00:00+00:00,0.62,2.97,312.0,11.0,,,2016-12-31 19:00:00-08:00
4,2017-01-01 04:00:00+00:00,1.44,2.67,310.0,11.0,,,2016-12-31 20:00:00-08:00
5,2017-01-01 05:00:00+00:00,2.63,3.18,309.0,11.0,,,2016-12-31 21:00:00-08:00
6,2017-01-01 06:00:00+00:00,3.38,3.01,314.0,11.0,,,2016-12-31 22:00:00-08:00
7,2017-01-01 07:00:00+00:00,4.03,3.04,310.0,11.0,,,2016-12-31 23:00:00-08:00
8,2017-01-01 08:00:00+00:00,4.32,2.79,299.0,11.0,,,2017-01-01 00:00:00-08:00
9,2017-01-01 09:00:00+00:00,3.83,3.08,312.0,13.0,3.0,50,2017-01-01 01:00:00-08:00


In [229]:
#check random time
i = random.randrange(tspw.iloc[-1].name)
tspw['UTC'][i] == tspw['PST'][i].astimezone(tz=ZoneInfo('UTC'))

True

In [230]:
# Recompute the sun times for the row labelled 4128, this time reading the
# date from the already-converted 'PST' column.
s = sun(iv.observer, date=tspw['PST'][4128], tzinfo=PST)

In [231]:
# Build the report first, then print it: one labelled line per sun event.
sun_report = (
    f'Dawn:    {s["dawn"]}\n'
    f'Sunrise: {s["sunrise"]}\n'
    f'Noon:    {s["noon"]}\n'
    f'Sunset:  {s["sunset"]}\n'
    f'Dusk:    {s["dusk"]}\n'
)
print(sun_report)

Dawn:    2017-06-21 05:17:49.291788-07:00
Sunrise: 2017-06-21 05:47:49.250841-07:00
Noon:    2017-06-21 13:01:20-07:00
Sunset:  2017-06-21 20:14:53.486789-07:00
Dusk:    2017-06-21 20:44:53.332313-07:00



### ^^^ with this -7 thing instead of -8, there might actually be daylight savings info in there....looks like it's taken care of though?

In [232]:
# Look at the raw dawn value returned by astral.
s['dawn']

datetime.datetime(2017, 6, 21, 5, 17, 49, 291788, tzinfo=zoneinfo.ZoneInfo(key='America/Los_Angeles'))

In [233]:
# Number of rows in the merged frame.
len(tspw.index)

43824

In [234]:
# Inspect the last row; its `Name` is the final index label.
tspw.iloc[-1]

UTC               2021-12-31 23:00:00+00:00
Tide                                  -0.71
Height                                  2.9
Deg                                   306.0
Period                                  7.0
Wind Speed                             18.0
Wind Direction                          270
PST               2021-12-31 15:00:00-08:00
Name: 43824, dtype: object

In [235]:
# Create the 'Dawn' and 'Dusk' columns.
# astral's sun() returns both values in one result, so it is evaluated once per
# row and both are read from that result instead of looping over the frame twice.
iv = LocationInfo("Isla Vista", "California", "America/Los_Angeles", 34.41302853802114, -119.8615254859206)

dawn = []
dusk = []
for local_time in tspw['PST']:
    s = sun(iv.observer, date=local_time, tzinfo=PST)
    dawn.append(s['dawn'])
    dusk.append(s['dusk'])

# The lists are in row order, so they line up with the frame positionally.
tspw['Dawn'] = dawn
tspw['Dusk'] = dusk

Daylight savings time is built into astral, and the dawn/dusk times take this into account, so no need to worry

In [237]:
# Preview the new 'Dawn' and 'Dusk' columns.
tspw.head()

Unnamed: 0,UTC,Tide,Height,Deg,Period,Wind Speed,Wind Direction,PST,Dawn,Dusk
0,2017-01-01 00:00:00+00:00,0.31,2.79,300.0,11.0,,,2016-12-31 16:00:00-08:00,2016-12-31 06:37:44.271510-08:00,2016-12-31 17:27:59.126429-08:00
1,2017-01-01 01:00:00+00:00,-0.19,2.82,313.0,11.0,,,2016-12-31 17:00:00-08:00,2016-12-31 06:37:44.271510-08:00,2016-12-31 17:27:59.126429-08:00
2,2017-01-01 02:00:00+00:00,-0.09,2.97,309.0,11.0,,,2016-12-31 18:00:00-08:00,2016-12-31 06:37:44.271510-08:00,2016-12-31 17:27:59.126429-08:00
3,2017-01-01 03:00:00+00:00,0.62,2.97,312.0,11.0,,,2016-12-31 19:00:00-08:00,2016-12-31 06:37:44.271510-08:00,2016-12-31 17:27:59.126429-08:00
4,2017-01-01 04:00:00+00:00,1.44,2.67,310.0,11.0,,,2016-12-31 20:00:00-08:00,2016-12-31 06:37:44.271510-08:00,2016-12-31 17:27:59.126429-08:00


In [238]:
# Boolean flag per row: True when the local time lies between that row's dawn
# and dusk (both bounds inclusive).
day = [
    dawn_time <= local_time <= dusk_time
    for local_time, dawn_time, dusk_time in zip(tspw['PST'], tspw['Dawn'], tspw['Dusk'])
]

In [242]:
# Missing values per column, counted over the daylight rows only (mask `day`).
tspw[day][['PST','Tide','Height','Deg','Period','Wind Speed', 'Wind Direction']].isna().sum()

PST                0
Tide               0
Height            36
Deg               37
Period            37
Wind Speed        31
Wind Direction    46
dtype: int64

In [245]:
# Per-row missing-value flags for 'Height' among the daylight rows.
tspw[day]['Height'].isna()

0        False
1        False
15       False
16       False
17       False
         ...  
43820    False
43821    False
43822    False
43823    False
43824    False
Name: Height, Length: 23852, dtype: bool

In [248]:
# Daylight-only table with the analysis columns.
keep_cols = ['PST','Tide','Height','Deg','Period','Wind Speed', 'Wind Direction']
daylight_rows = tspw.loc[day, keep_cols]

# Rows 0 and 1 are removed by label.
# NOTE(review): presumably because they fall on 2016-12-31 in local time,
# before the period of interest; confirm.
dl = daylight_rows.drop(index=[0, 1])

In [250]:
# Missing values per column in the daylight table.
dl.isna().sum()

PST                0
Tide               0
Height            36
Deg               37
Period            37
Wind Speed        29
Wind Direction    44
dtype: int64

In [256]:
# List the daylight rows that have no wind speed.
dl[dl['Wind Speed'].isna()]

Unnamed: 0,PST,Tide,Height,Deg,Period,Wind Speed,Wind Direction
4430,2017-07-04 07:00:00-07:00,3.18,1.36,191.0,15.0,,
6255,2017-09-18 08:00:00-07:00,4.47,1.67,313.0,7.0,,
6256,2017-09-18 09:00:00-07:00,5.15,1.78,313.0,7.0,,
6257,2017-09-18 10:00:00-07:00,5.22,1.69,310.0,7.0,,
6258,2017-09-18 11:00:00-07:00,4.43,1.46,307.0,4.0,,
10915,2018-03-31 12:00:00-07:00,4.21,1.04,193.0,13.0,,
13573,2018-07-20 06:00:00-07:00,3.56,1.74,321.0,9.0,,
13574,2018-07-20 07:00:00-07:00,3.4,1.74,319.0,11.0,,
13575,2018-07-20 08:00:00-07:00,2.81,1.72,317.0,9.0,,
13576,2018-07-20 09:00:00-07:00,2.33,1.82,321.0,9.0,,


Instead of chucking rows with any NaN values, I'll keep them in the dataset and classify good days based on how many parameters met ideal conditions. That way, some of these hours with NaN values could still be classified as "4/6 ideal" or something similar. I might have to change this later, depending on how the NaN values behave in the analysis and on whether I change my measuring method.

### not sure why I have so many nan values for period...my original file has 0 nan values, maybe something weird happened with the merging
look at file ```missing_period``` for exploration