<h1>Getting structured tides from hydrographic data</h1>
<hr>
<h2>Requirement</h2>
Starting with hydrographic html for a given location, build structured tide data for a set of dates.    

<hr>
<h2>Entities</h2>
<hr>
<h3>TideForDay</h3>
<h4>Definition</h4> Gathers data for a tide on a given calendar day
<h4>Key</h4> t_date 
<h4>Cardinalities</h4> 1-M TideMark
<h4>Data attributes</h4>
<p></p>

|Name|aka|Definition| 
|:-------------|----|:-------------| 
|
|t_date| Tide Date | UID of the tide for a given day. Strictly an alternate key, but no need for surrogates here. Not "date" due to keyword conflict |
|tidal_range|Tidal Range |The distance between the minimum low water mark, and the maximum high water mark, for that day |





<hr>
<h3>TideMark</h3>
<h4>Definition</h4> An instance of a high or low water mark
<h4>Key</h4> 1. t_date 2. t_seq 
<h4>Cardinalities</h4> M->1 with parent TideForDay
<h4>Data attributes</h4>
<p></p>

|Name|aka|Definition| 
|:-------------|:----|:-------------| 
|
|t_date| Date | foreign key onto the parent TideForDay |
|t_seq| Sequence in Day | Where this tide mark falls in the sequence of tides after midnight. Eg - first tide after midnight has sequence 1|
|t_type| Type | Values are "High" or "Low", referring to whether this is a high or low water mark |
|t_height| Height | the height of the high or low water mark. float to 1 decimal point. The incoming value may be negative, which is to be resolved to zero  |
|t_time| Time | the time of the high or low water mark |






In [1]:
# pip install regex
import re
from dataclasses import dataclass
from datetime import datetime

@dataclass
class TideMark:
    t_date: int
    t_seq: int
    t_type: str # High or Low
    t_height: float
    t_time: datetime

@dataclass
class TideForDay:
    t_date: int
    tidal_range: float # distance between min and max tide_marks


tide_marks = []
tides = []

with open("./hydro_2024.08.04.txt","r") as infile:
    lines = infile.readlines()



In [2]:
cleaned_lines = [line.strip() for line in lines]
high_lows = [line for line in cleaned_lines if line.startswith(("Low","High","Fri","Sat","Sun","Mon","Tue","Wed","Thur"))]

In [3]:
formatted_tides = []
for line in high_lows:
    words = line.split()
    print(words)
    # If the sentence starts with Sun, Mon, etc, then keep words 1 (base 0)
    # (date, but not day). Month can be inferred downstream
    if words[0] in ("Sun","Mon","Tues","Weds","Thurs","Fri","Sat"):
        formatted_tides.append("----------------")
        formatted_tides.append(f"{words[1]},")
        curr_date = words[1]
        curr_seq = 0
        # new record required
        
        continue
    # After previous block, 7 or more words means a High or Low water sentence. 
    # Words 0 (base zero), 3 and 6 are respectively High/Low, tide-height, and 
    # time. Other words and characters are discarded
    if len(words) >= 7:
        formatted_tides.append(f"{words[0]},{words[6].rstrip('.')},{words[3]},")
        
        curr_type = words[0]
        curr_time = words[6].rstrip('.')
        curr_height = words[3]
        # dataclass populates here:
        tide_mark = TideMark(curr_date, curr_seq, curr_type, curr_height, curr_time)
        tide_marks.append(tide_mark)
        print(tide_marks)
        curr_seq += 1
        continue     
for line in formatted_tides:
    print(line)

for tide in tides:
    print(tide)

print(tides)

['Sun', '04', 'AugNew', 'moon', 'on', 'this', 'day']
['Low', 'Water', 'of', '0.7', 'metres,', 'at', '01:08.']
[TideMark(t_date='04', t_seq=0, t_type='Low', t_height='0.7', t_time='01:08')]
['High', 'Water', 'of', '3.4', 'metres,', 'at', '07:46.']
[TideMark(t_date='04', t_seq=0, t_type='Low', t_height='0.7', t_time='01:08'), TideMark(t_date='04', t_seq=1, t_type='High', t_height='3.4', t_time='07:46')]
['Low', 'Water', 'of', '0.7', 'metres,', 'at', '13:21.']
[TideMark(t_date='04', t_seq=0, t_type='Low', t_height='0.7', t_time='01:08'), TideMark(t_date='04', t_seq=1, t_type='High', t_height='3.4', t_time='07:46'), TideMark(t_date='04', t_seq=2, t_type='Low', t_height='0.7', t_time='13:21')]
['High', 'Water', 'of', '3.7', 'metres,', 'at', '19:59.']
[TideMark(t_date='04', t_seq=0, t_type='Low', t_height='0.7', t_time='01:08'), TideMark(t_date='04', t_seq=1, t_type='High', t_height='3.4', t_time='07:46'), TideMark(t_date='04', t_seq=2, t_type='Low', t_height='0.7', t_time='13:21'), TideMark

In [4]:
lines2 = [line for line in formatted_tides if not line.startswith('-')]
print(lines2)

['04,', 'Low,01:08,0.7,', 'High,07:46,3.4,', 'Low,13:21,0.7,', 'High,19:59,3.7,', '05,', 'Low,01:53,0.6,', 'High,08:28,3.5,', 'Low,14:03,0.6,', 'High,20:37,3.8,', '06,', 'Low,02:33,0.6,', 'High,09:01,3.5,', 'Low,14:39,0.6,', 'High,21:08,3.8,', '07,', 'Low,03:06,0.6,', 'High,09:28,3.5,', 'Low,15:10,0.6,', 'High,21:33,3.7,', '08,', 'Low,03:33,0.6,', 'High,09:49,3.5,', 'Low,15:36,0.6,', 'High,21:55,3.7,', '09,', 'Low,03:56,0.6,', 'High,10:09,3.4,', 'Low,16:01,0.6,', 'High,22:18,3.6,', '10,', 'Low,04:19,0.6,', 'High,10:34,3.3,', 'Low,16:26,0.7,', 'High,22:45,3.4,']


In [5]:
tide_days = []
data = lines2
day_tides = []
current_day_tide = ""

# Iterate through the data
for item in data:
    # Check if the item has 2 characters (indicating the start of a record)
    if len(item) == 3:
        # If we have a current record, add it to the grouped_records list
        if current_day_tide:
            day_tides.append(current_day_tide)
        # Start a new record
        current_day_tide = item
    else:
        # Concatenate the item to the current record
        current_day_tide += item

# Add the last record (if any) to the grouped_records list
if current_day_tide:
    day_tides.append(current_day_tide)

# Print the grouped records
for tide in day_tides:
    print(tide)

04,Low,01:08,0.7,High,07:46,3.4,Low,13:21,0.7,High,19:59,3.7,
05,Low,01:53,0.6,High,08:28,3.5,Low,14:03,0.6,High,20:37,3.8,
06,Low,02:33,0.6,High,09:01,3.5,Low,14:39,0.6,High,21:08,3.8,
07,Low,03:06,0.6,High,09:28,3.5,Low,15:10,0.6,High,21:33,3.7,
08,Low,03:33,0.6,High,09:49,3.5,Low,15:36,0.6,High,21:55,3.7,
09,Low,03:56,0.6,High,10:09,3.4,Low,16:01,0.6,High,22:18,3.6,
10,Low,04:19,0.6,High,10:34,3.3,Low,16:26,0.7,High,22:45,3.4,


In [6]:
def insert_tidal_range(record):
    # Split the record by commas
    parts = record.split(',')
    
    # Extract the decimal values (skipping the first word)
    values = [float(part) for part in parts[1:] if part.replace('.', '', 1).isdigit()]
    
    # Calculate the difference between the highest and lowest values
    difference = max(values) - min(values)
    
    # Insert the difference between "Low" and "01:53"
    parts.insert(1, f"{difference:.1f}")
    
    # Join the modified parts back into a string
    modified_record = ','.join(parts)
    
    return modified_record

# Example usage:
#record = "05,Low,01:53,0.6,High,08:28,3.5,Low,14:03,0.6,High,20:37,3.8"
#result = insert_tidal_range(record)
#print(result)

In [7]:
# insert the tidal range - this is the difference between the max and
# min tidal heights for the day
tides_with_range = [insert_tidal_range(line) for line in day_tides]
for tide in tides_with_range:
    print(tide)

04,3.0,Low,01:08,0.7,High,07:46,3.4,Low,13:21,0.7,High,19:59,3.7,
05,3.2,Low,01:53,0.6,High,08:28,3.5,Low,14:03,0.6,High,20:37,3.8,
06,3.2,Low,02:33,0.6,High,09:01,3.5,Low,14:39,0.6,High,21:08,3.8,
07,3.1,Low,03:06,0.6,High,09:28,3.5,Low,15:10,0.6,High,21:33,3.7,
08,3.1,Low,03:33,0.6,High,09:49,3.5,Low,15:36,0.6,High,21:55,3.7,
09,3.0,Low,03:56,0.6,High,10:09,3.4,Low,16:01,0.6,High,22:18,3.6,
10,2.8,Low,04:19,0.6,High,10:34,3.3,Low,16:26,0.7,High,22:45,3.4,


In [8]:
# return the usable tides for the day. Assume there are always 2,
# from a set of 4, and sometimes 3. 
# For the 4-set case, I keep the middle two tides.
# More work needed for the 3-set case, as needs more judgment about
# which end-tide to keep.


def get_usable_tides_for_day(record):
     # Split the record by commas
    parts = record.split(',')
    # remove the last tide of the day, as typically, this is not
    # usable:
    parts = parts[:-4] 
    # for a 4-tide day, remove the first tide
    if len(parts) == 11:
        parts = parts[:2] + parts[5:]
    modified_record = ','.join(parts)
    return modified_record

In [9]:
usable_tides = [get_usable_tides_for_day(line) for line in tides_with_range]
for tide in usable_tides:
    print(tide)

04,3.0,High,07:46,3.4,Low,13:21,0.7
05,3.2,High,08:28,3.5,Low,14:03,0.6
06,3.2,High,09:01,3.5,Low,14:39,0.6
07,3.1,High,09:28,3.5,Low,15:10,0.6
08,3.1,High,09:49,3.5,Low,15:36,0.6
09,3.0,High,10:09,3.4,Low,16:01,0.6
10,2.8,High,10:34,3.3,Low,16:26,0.7


In [10]:
def set_month_year(record, month_year):
    parts = record.split(',')
    # word 0 is the date - add the month and year
    parts[0] = f"{parts[0]}/{month_year}"
    modified_record = ','.join(parts)
    return modified_record
    

In [11]:
full_tides = [set_month_year(line,"08/2024") for line in usable_tides]
for tide in full_tides:
    print(tide)

04/08/2024,3.0,High,07:46,3.4,Low,13:21,0.7
05/08/2024,3.2,High,08:28,3.5,Low,14:03,0.6
06/08/2024,3.2,High,09:01,3.5,Low,14:39,0.6
07/08/2024,3.1,High,09:28,3.5,Low,15:10,0.6
08/08/2024,3.1,High,09:49,3.5,Low,15:36,0.6
09/08/2024,3.0,High,10:09,3.4,Low,16:01,0.6
10/08/2024,2.8,High,10:34,3.3,Low,16:26,0.7
