<h1>Getting structured tides from hydrographic data</h1>
<hr>
<h2>Requirement</h2>
Starting with hydrographic html for a given location, build structured tide data for a set of dates.    

<hr>
<h2>Entities</h2>
<hr>
<h3>TideForDay</h3>
<h4>Definition</h4> Gathers data for a tide on a given calendar day
<h4>Key</h4> t_date 
<h4>Cardinalities</h4> 1-M TideMark
<h4>Data attributes</h4>
<p></p>

|Name|aka|Definition| 
|:-------------|----|:-------------| 
|
|t_date| Tide Date | UID of the tide for a given day. Strictly an alternate key, but no need for surrogates here. Not "date" due to keyword conflict |
|tidal_range|Tidal Range |The distance between the minimum low water mark, and the maximum high water mark, for that day |





<hr>
<h3>TideMark</h3>
<h4>Definition</h4> An instance of a high or low water mark
<h4>Key</h4> 1. t_date 2. t_seq 
<h4>Cardinalities</h4> M->1 with parent TideForDay
<h4>Data attributes</h4>
<p></p>

|Name|aka|Definition| 
|:-------------|:----|:-------------| 
|
|t_date| Date | foreign key onto the parent TideForDay |
|t_seq| Sequence in Day | Where this tide mark falls in the sequence of tides after midnight. Eg - first tide after midnight has sequence 1|
|t_type| Type | Values are "High" or "Low", referring to whether this is a high or low water mark |
|t_height| Height | the height of the high or low water mark. float to 1 decimal point. The incoming value may be negative, which is to be resolved to zero  |
|t_time| Time | the time of the high or low water mark |






In [9]:
# pip install regex
import re
from dataclasses import dataclass
from datetime import datetime

@dataclass
class TideMark:
    t_date: int
    t_seq: int
    t_type: str # High or Low
    t_height: float
    t_time: datetime

@dataclass
class TideForDay:
    t_date: int
    tidal_range: float # distance between min and max tide_marks


tide_marks = []
tides = []

with open("./hydro_2024.08.04.txt","r") as infile:
    lines = infile.readlines()



In [10]:
# Data example:
# Sun 04 AugNew moon on this day
# High Water of 3.7 metres, at 19:59.

# Remove any blank lines, then I only want lines that start with the values in ok_search_terms
# Where a value is Sun, Mon, etc, subsequent records refer to the low or high-water times for
# that day... until you hit a new day. Etc.
ok_search_terms = ("Low","High","Fri","Sat","Sun","Mon","Tue","Wed","Thur")
#remove non-printing chars - typically tab etc
cleaned_lines = [line.strip() for line in lines]
high_lows = [line for line in cleaned_lines if line.startswith(ok_search_terms)]
print(high_lows)

['Sun 04 AugNew moon on this day', 'Low Water of 0.7 metres, at 01:08.', 'High Water of 3.4 metres, at 07:46.', 'Low Water of 0.7 metres, at 13:21.', 'High Water of 3.7 metres, at 19:59.', 'Mon 05 Aug', 'Low Water of 0.6 metres, at 01:53.', 'High Water of 3.5 metres, at 08:28.', 'Low Water of 0.6 metres, at 14:03.', 'High Water of 3.8 metres, at 20:37.', 'Tues 06 Aug', 'Low Water of 0.6 metres, at 02:33.', 'High Water of 3.5 metres, at 09:01.', 'Low Water of 0.6 metres, at 14:39.', 'High Water of 3.8 metres, at 21:08.', 'Weds 07 Aug', 'Low Water of 0.6 metres, at 03:06.', 'High Water of 3.5 metres, at 09:28.', 'Low Water of 0.6 metres, at 15:10.', 'High Water of 3.7 metres, at 21:33.', 'Thurs 08 Aug', 'Low Water of 0.6 metres, at 03:33.', 'High Water of 3.5 metres, at 09:49.', 'Low Water of 0.6 metres, at 15:36.', 'High Water of 3.7 metres, at 21:55.', 'Fri 09 Aug', 'Low Water of 0.6 metres, at 03:56.', 'High Water of 3.4 metres, at 10:09.', 'Low Water of 0.6 metres, at 16:01.', 'High 

In [11]:
for line in high_lows:
    words = line.split()

    # If the sentence starts with Sun, Mon, etc, then subsequent records are
    # high/low water times. Keep words 1 (base 0)
    # (date, but not day). Month and year will be entered downstream
    if words[0] in ("Sun","Mon","Tues","Weds","Thurs","Fri","Sat"):
        curr_date = words[1]
        curr_seq = 0
        # new record required
        
        continue
    # After previous block, 7 or more words means a High or Low water sentence. 
    # Words 0 (base zero), 3 and 6 are respectively High/Low, tide-height, and 
    # time. Other words and characters are discarded
    if len(words) >= 7:
       
        curr_type = words[0]
        curr_time = words[6].rstrip('.')
        curr_height = words[3]
        # dataclass populates here:
        tide_mark = TideMark(curr_date, curr_seq, curr_type, curr_height, curr_time)
        tide_marks.append(tide_mark)
        #print(tide_mark)
        curr_seq += 1
        continue     


for tide in tide_marks:
    print(tide)

#print(tides)

TideMark(t_date='04', t_seq=0, t_type='Low', t_height='0.7', t_time='01:08')
TideMark(t_date='04', t_seq=1, t_type='High', t_height='3.4', t_time='07:46')
TideMark(t_date='04', t_seq=2, t_type='Low', t_height='0.7', t_time='13:21')
TideMark(t_date='04', t_seq=3, t_type='High', t_height='3.7', t_time='19:59')
TideMark(t_date='05', t_seq=0, t_type='Low', t_height='0.6', t_time='01:53')
TideMark(t_date='05', t_seq=1, t_type='High', t_height='3.5', t_time='08:28')
TideMark(t_date='05', t_seq=2, t_type='Low', t_height='0.6', t_time='14:03')
TideMark(t_date='05', t_seq=3, t_type='High', t_height='3.8', t_time='20:37')
TideMark(t_date='06', t_seq=0, t_type='Low', t_height='0.6', t_time='02:33')
TideMark(t_date='06', t_seq=1, t_type='High', t_height='3.5', t_time='09:01')
TideMark(t_date='06', t_seq=2, t_type='Low', t_height='0.6', t_time='14:39')
TideMark(t_date='06', t_seq=3, t_type='High', t_height='3.8', t_time='21:08')
TideMark(t_date='07', t_seq=0, t_type='Low', t_height='0.6', t_time='0

In [12]:
def pp(object_to_print):
    pprint.pprint(object_to_print)

In [14]:
import pprint
# Initialize a dictionary to store the highest and lowest heights for each date
heights_by_date = {}

# Get a sorted list of the unique dates for this set of tides. 
# Example: [4, 5, 6, 7]
unique_dates = list(set(int(mark.t_date) for mark in tide_marks))
unique_dates.sort()
pp(unique_dates)
water_marks = {}
for d in unique_dates:
    water_marks[d] = {'Low':100,'High':-100}

pp(water_marks)

tide_heights = [(int(mark.t_date), mark.t_seq, mark.t_type, float(mark.t_height), mark.t_time) for mark in tide_marks]

pp(tide_heights)

for test_tide in tide_heights:
    test_date = test_tide[0]
    test_type = test_tide[2]
    new_height = test_tide[3]
    
    #print(water_marks[test_date])
    #print(water_marks[test_date][test_type])
    if  test_type == 'Low':
        if  new_height < water_marks[test_date][test_type]:
            water_marks[test_date][test_type] = new_height
    else: # High
        if  new_height > water_marks[test_date][test_type]:
            water_marks[test_date][test_type] = new_height

pprint.pprint(water_marks)
    
pprint.pprint(water_marks)

[4, 5, 6, 7, 8, 9, 10]
{4: {'High': -100, 'Low': 100},
 5: {'High': -100, 'Low': 100},
 6: {'High': -100, 'Low': 100},
 7: {'High': -100, 'Low': 100},
 8: {'High': -100, 'Low': 100},
 9: {'High': -100, 'Low': 100},
 10: {'High': -100, 'Low': 100}}
[(4, 0, 'Low', 0.7, '01:08'),
 (4, 1, 'High', 3.4, '07:46'),
 (4, 2, 'Low', 0.7, '13:21'),
 (4, 3, 'High', 3.7, '19:59'),
 (5, 0, 'Low', 0.6, '01:53'),
 (5, 1, 'High', 3.5, '08:28'),
 (5, 2, 'Low', 0.6, '14:03'),
 (5, 3, 'High', 3.8, '20:37'),
 (6, 0, 'Low', 0.6, '02:33'),
 (6, 1, 'High', 3.5, '09:01'),
 (6, 2, 'Low', 0.6, '14:39'),
 (6, 3, 'High', 3.8, '21:08'),
 (7, 0, 'Low', 0.6, '03:06'),
 (7, 1, 'High', 3.5, '09:28'),
 (7, 2, 'Low', 0.6, '15:10'),
 (7, 3, 'High', 3.7, '21:33'),
 (8, 0, 'Low', 0.6, '03:33'),
 (8, 1, 'High', 3.5, '09:49'),
 (8, 2, 'Low', 0.6, '15:36'),
 (8, 3, 'High', 3.7, '21:55'),
 (9, 0, 'Low', 0.6, '03:56'),
 (9, 1, 'High', 3.4, '10:09'),
 (9, 2, 'Low', 0.6, '16:01'),
 (9, 3, 'High', 3.6, '22:18'),
 (10, 0, 'Low', 0.6,

In [15]:
# get the tidal range
for i in water_marks:
    t_high = water_marks[i]['High']
    t_low = water_marks[i]['Low']
    water_marks[i]['TidalRange'] = round(t_high - t_low,2)
pprint.pprint(water_marks)

{4: {'High': 3.7, 'Low': 0.7, 'TidalRange': 3.0},
 5: {'High': 3.8, 'Low': 0.6, 'TidalRange': 3.2},
 6: {'High': 3.8, 'Low': 0.6, 'TidalRange': 3.2},
 7: {'High': 3.7, 'Low': 0.6, 'TidalRange': 3.1},
 8: {'High': 3.7, 'Low': 0.6, 'TidalRange': 3.1},
 9: {'High': 3.6, 'Low': 0.6, 'TidalRange': 3.0},
 10: {'High': 3.4, 'Low': 0.6, 'TidalRange': 2.8}}


In [19]:
# populate the tide_day class from the dictionary
tidal_ranges_by_date = [TideForDay(t_date=key, tidal_range=value['TidalRange']) for key, value in water_marks.items()]
pp(tidal_ranges_by_date)
pp(tide_marks)


[TideForDay(t_date=4, tidal_range=3.0),
 TideForDay(t_date=5, tidal_range=3.2),
 TideForDay(t_date=6, tidal_range=3.2),
 TideForDay(t_date=7, tidal_range=3.1),
 TideForDay(t_date=8, tidal_range=3.1),
 TideForDay(t_date=9, tidal_range=3.0),
 TideForDay(t_date=10, tidal_range=2.8)]
[TideMark(t_date='04', t_seq=0, t_type='Low', t_height='0.7', t_time='01:08'),
 TideMark(t_date='04', t_seq=1, t_type='High', t_height='3.4', t_time='07:46'),
 TideMark(t_date='04', t_seq=2, t_type='Low', t_height='0.7', t_time='13:21'),
 TideMark(t_date='04', t_seq=3, t_type='High', t_height='3.7', t_time='19:59'),
 TideMark(t_date='05', t_seq=0, t_type='Low', t_height='0.6', t_time='01:53'),
 TideMark(t_date='05', t_seq=1, t_type='High', t_height='3.5', t_time='08:28'),
 TideMark(t_date='05', t_seq=2, t_type='Low', t_height='0.6', t_time='14:03'),
 TideMark(t_date='05', t_seq=3, t_type='High', t_height='3.8', t_time='20:37'),
 TideMark(t_date='06', t_seq=0, t_type='Low', t_height='0.6', t_time='02:33'),
 Tid

In [None]:
def insert_tidal_range(record):
    # Split the record by commas
    parts = record.split(',')
    
    # Extract the decimal values (skipping the first word)
    values = [float(part) for part in parts[1:] if part.replace('.', '', 1).isdigit()]
    
    # Calculate the difference between the highest and lowest values
    difference = max(values) - min(values)
    
    # Insert the difference between "Low" and "01:53"
    parts.insert(1, f"{difference:.1f}")
    
    # Join the modified parts back into a string
    modified_record = ','.join(parts)
    
    return modified_record

# Example usage:
#record = "05,Low,01:53,0.6,High,08:28,3.5,Low,14:03,0.6,High,20:37,3.8"
#result = insert_tidal_range(record)
#print(result)

In [None]:
# insert the tidal range - this is the difference between the max and
# min tidal heights for the day
tides_with_range = [insert_tidal_range(line) for line in day_tides]
for tide in tides_with_range:
    print(tide)

In [None]:
# return the usable tides for the day. Assume there are always 2,
# from a set of 4, and sometimes 3. 
# For the 4-set case, I keep the middle two tides.
# More work needed for the 3-set case, as needs more judgment about
# which end-tide to keep.


def get_usable_tides_for_day(record):
     # Split the record by commas
    parts = record.split(',')
    # remove the last tide of the day, as typically, this is not
    # usable:
    parts = parts[:-4] 
    # for a 4-tide day, remove the first tide
    if len(parts) == 11:
        parts = parts[:2] + parts[5:]
    modified_record = ','.join(parts)
    return modified_record

In [None]:
usable_tides = [get_usable_tides_for_day(line) for line in tides_with_range]
for tide in usable_tides:
    print(tide)

In [None]:
def set_month_year(record, month_year):
    parts = record.split(',')
    # word 0 is the date - add the month and year
    parts[0] = f"{parts[0]}/{month_year}"
    modified_record = ','.join(parts)
    return modified_record
    

In [None]:
full_tides = [set_month_year(line,"08/2024") for line in usable_tides]
for tide in full_tides:
    print(tide)