In [1]:
import os
import pandas as pd
import numpy as np
import geopandas as gpd
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np

In [2]:
# Input folders: one per data source, each holding one file per site.
waves_folder_path = "./dataset_Ondas"
shorelines_folder_path = "./dataset_linhascosta"
transects_folder_path = "./dataset_transects"

# Site codes; each one is the prefix of that site's wave, shoreline and
# transect file names.
site_names = [
    'CVCC', 'CCFT', 'FTAD', 'ADLA',
    'LABI', 'TRAT', 'ATMC', 'MCCO',
    'CCCL', 'NNOR', 'MEIA', 'TORR',
    'CVMR', 'MRMG', 'MGVR', 'COSN',
    'VAGR', 'GBHA', 'BARR', 'MIRA',
]

In [3]:
# Registry of the loaded inputs, keyed by site name; the loading loop fills it
# with one {'waves', 'shorelines', 'transects'} entry per site.
data = dict()

In [5]:
# 16-point compass rose, clockwise starting at north. It does not depend on the
# site, so it is defined once instead of on every loop iteration.
directions = ['N', 'NNE', 'NE', 'ENE', 'E', 'ESE', 'SE', 'SSE',
              'S', 'SSW', 'SW', 'WSW', 'W', 'WNW', 'NW', 'NNW']

# Load the wave, shoreline and transect inputs of every site.
for name in site_names:
    # Per-site input files (the site code is the file-name prefix)
    waves_file_path = os.path.join(waves_folder_path, f"{name}_wave_timeseries.csv")
    shorelines_file_path = os.path.join(shorelines_folder_path, f"{name}_shoreline_timeseries.csv")
    transects_file_path = os.path.join(transects_folder_path, f"{name}_T.geojson")

    # Wave time series: the first CSV row holds the column names
    waves_df = pd.read_csv(waves_file_path, sep=',', header=0)
    waves_df['time'] = pd.to_datetime(waves_df['time'])
    waves_df = waves_df.set_index('time')

    # Keep year and month both as plain columns and as a (years, months)
    # MultiIndex that replaces the timestamp index.
    waves_df['years'] = waves_df.index.year
    waves_df['months'] = waves_df.index.month
    waves_df.index = pd.MultiIndex.from_tuples(
        list(zip(waves_df.index.year, waves_df.index.month)),
        names=['years', 'months'])

    # Remove 1983 because satellite data is not available for that year.
    # .copy() detaches the filtered rows from the unfiltered frame, so the column
    # assignments that follow cannot trigger SettingWithCopyWarning.
    waves_df = waves_df[waves_df['years'] != 1983].copy()

    def degrees_to_direction(wave_direction_degrees):
        """Map a bearing in degrees to its 16-point compass-rose label.

        Each sector is 22.5 degrees wide and centred on its compass point; a
        bearing that falls exactly on a sector boundary belongs to the sector
        below it (11.25 -> 'N', 33.75 -> 'NNE', ...). North covers both ends of
        the circle: [0, 11.25] and (348.75, 360].

        Parameters
        ----------
        wave_direction_degrees : float
            Bearing in degrees, expected in the closed range [0, 360].

        Returns
        -------
        str
            One of the 16 compass labels, or the string 'false' when the input
            is outside [0, 360] or is NaN.
        """
        # Reject out-of-range input up front. Without this guard a negative
        # bearing would satisfy the first "<=" test it meets and be mislabelled.
        # NaN fails every comparison, so it is rejected here as well.
        if not (0 <= wave_direction_degrees <= 360):
            return 'false'

        # (inclusive upper bound of the sector, label), in increasing order.
        sector_upper_bounds = (
            (11.25, 'N'), (33.75, 'NNE'), (56.25, 'NE'), (78.75, 'ENE'),
            (101.25, 'E'), (123.75, 'ESE'), (146.25, 'SE'), (168.75, 'SSE'),
            (191.25, 'S'), (213.75, 'SSW'), (236.25, 'SW'), (258.75, 'WSW'),
            (281.25, 'W'), (303.75, 'WNW'), (326.25, 'NW'), (348.75, 'NNW'),
            (360, 'N'),
        )
        for upper_bound, label in sector_upper_bounds:
            if wave_direction_degrees <= upper_bound:
                return label

        # Not reachable for values in [0, 360]; kept as a defensive fallback.
        return 'false'

    # Label every record with the compass sector of its 'mwd' value (degrees);
    # the raw 'mwd' column is not needed afterwards.
    mwd_labels = waves_df['mwd'].apply(degrees_to_direction)
    waves_df = waves_df.drop(columns='mwd')

    # Split 'pp1d' and 'swh' into one column per sector: the value is kept where
    # the record comes from that sector and is 0 elsewhere.
    # The mask is built directly from the labels rather than from get_dummies
    # columns, so a sector that never occurs at this site simply yields an
    # all-zero column instead of a KeyError, and records labelled 'false'
    # (invalid bearing) leave no stray indicator column behind.
    for direction in directions:
        from_this_direction = mwd_labels == direction
        waves_df[f'pp1d_from_{direction}'] = waves_df['pp1d'] * from_this_direction
        waves_df[f'swh_from_{direction}'] = waves_df['swh'] * from_this_direction

    # Only the per-direction columns are kept
    waves_df = waves_df.drop(columns=['pp1d', 'swh'])

    # Shoreline time series. The first CSV column is discarded
    # (presumably a saved row index -- TODO confirm against the files).
    # .copy() makes the slice an independent frame before columns are added.
    shorelines_df = pd.read_csv(shorelines_file_path)
    shorelines_df = shorelines_df.iloc[:, 1:].copy()
    shorelines_df['dates'] = pd.to_datetime(shorelines_df['dates'])
    shorelines_df = shorelines_df.set_index('dates')

    # Same (years, months) layout as the wave frame: plain columns plus MultiIndex
    shorelines_df['years'] = shorelines_df.index.year
    shorelines_df['months'] = shorelines_df.index.month
    shorelines_df.index = pd.MultiIndex.from_tuples(
        list(zip(shorelines_df.index.year, shorelines_df.index.month)),
        names=['years', 'months'])

    # Read the transects GeoJSON file into a GeoDataFrame
    transects_gdf = gpd.read_file(transects_file_path, driver='GeoJSON')

    # Register the three inputs of this site under its site code
    data[name] = {
        'waves': waves_df,
        'shorelines': shorelines_df,
        'transects': transects_gdf
    }

In [6]:
# Display the whole nested dictionary of loaded inputs
# (site code -> {'waves', 'shorelines', 'transects'}); the repr is long.
data

{'CVCC': {'waves':               years  months  pp1d_from_N  swh_from_N  pp1d_from_NNE  \
  years months                                                          
  1984  1        1984       1          0.0         0.0            0.0   
        1        1984       1          0.0         0.0            0.0   
        1        1984       1          0.0         0.0            0.0   
        1        1984       1          0.0         0.0            0.0   
        1        1984       1          0.0         0.0            0.0   
  ...             ...     ...          ...         ...            ...   
  2022  12       2022      12          0.0         0.0            0.0   
        12       2022      12          0.0         0.0            0.0   
        12       2022      12          0.0         0.0            0.0   
        12       2022      12          0.0         0.0            0.0   
        12       2022      12          0.0         0.0            0.0   
  
                swh_from_NNE  p

In [7]:
# Aggregated results, keyed by site name; the aggregation loop stores one
# {'waves', 'shorelines'} entry per site here.
annual_data = dict()

In [15]:
# Aggregate the wave and shoreline series of every site to one row per
# (year, month) and fill the gaps in the shoreline table.
# NOTE: despite the "_annual" suffix in the names below, the grouping key is
# the (years, months) pair, i.e. the result is monthly.

# Compass sectors, in the order their per-direction columns are aggregated.
wave_directions = ['N', 'NNE', 'NE', 'ENE', 'E', 'ESE', 'SE', 'SSE',
                   'S', 'SSW', 'SW', 'WSW', 'W', 'WNW', 'NW', 'NNW']


def _nonzero_median(values):
    """Return the median of the non-zero entries of ``values``.

    Zeros are skipped because a per-direction column holds 0 for every record
    that does not come from that direction. Returns None when no non-zero
    entry exists; the caller turns the resulting missing value into 0.
    """
    nonzero = values[values != 0]
    return nonzero.quantile(0.5) if len(nonzero) > 0 else None


def _fill_gaps_from_neighbours(frame):
    """Fill missing cells of interior columns from the two adjacent columns.

    Columns are visited left to right; a missing cell receives the mean of the
    cells in the same row of the column before and the column after. The left
    neighbour may itself already have been filled by an earlier step.

    Only missing cells are written: ``fillna`` is used instead of a plain
    column assignment so that observed values are never replaced by the
    interpolated ones. The first and last columns have a single neighbour and
    are left untouched here. Cells whose neighbours are missing too stay NaN.

    Returns a new DataFrame; ``frame`` is not modified.
    """
    filled = frame.copy()
    labels = list(filled.columns)
    for position in range(1, len(labels) - 1):
        current = labels[position]
        if filled[current].isnull().any():
            neighbour_mean = (filled[labels[position - 1]] + filled[labels[position + 1]]) / 2
            filled[current] = filled[current].fillna(neighbour_mean)
    return filled


# The same statistic is applied to every per-direction wave column. The first
# item of each tuple becomes the second column level of the aggregated frame.
wave_aggregations = {
    f'{variable}_from_{direction}': [('50th_quantile', _nonzero_median)]
    for direction in wave_directions
    for variable in ('pp1d', 'swh')
}

for name in data.keys():
    waves_df = data[name]['waves']

    # Median of the non-zero values per (year, month); groups without any
    # non-zero value end up as 0.
    waves_df_annual = waves_df.groupby(level=['years', 'months']).agg(wave_aggregations)
    waves_df_annual = waves_df_annual.fillna(0)

    shoreline_df = data[name]['shorelines']

    # Every (year, month) pair for the years present in the shoreline record, so
    # that months without observations become explicit NaN rows when reindexing.
    all_years = shoreline_df.index.get_level_values('years').unique()
    full_index = pd.MultiIndex.from_tuples(
        [(year, month) for year in all_years for month in range(1, 13)],
        names=['years', 'months'])

    shoreline_df_annual = (
        shoreline_df.groupby(level=['years', 'months'])
        .median(numeric_only=True)
        .reindex(full_index)
        # the 'years'/'months' columns duplicate the index
        .drop(['years', 'months'], axis=1)
    )

    # Gap filling, from the most local estimate to the coarsest one:
    # 1) mean of the two adjacent columns (missing cells only),
    shoreline_df_annual = _fill_gaps_from_neighbours(shoreline_df_annual)

    # 2) median of the cell's own column. Assigning the result back replaces the
    #    chained `df[col].fillna(..., inplace=True)` form, which is not
    #    guaranteed to write through to the frame.
    shoreline_df_annual = shoreline_df_annual.fillna(shoreline_df_annual.median())

    # 3) median of the cell's own row, for columns that are empty altogether.
    #    Each column is filled from row_median by index alignment.
    row_median = shoreline_df_annual.median(axis=1)
    shoreline_df_annual = shoreline_df_annual.apply(
        lambda column: column.fillna(row_median))

    # Ensure no NaNs are left before model training
    if shoreline_df_annual.isna().any().any():
        print(f"NaNs remain in shorelines data for {name}")
        continue  # this site is left out of annual_data

    # Add the aggregated frames to the dictionary with site name as key
    annual_data[name] = {
        'waves': waves_df_annual,
        'shorelines': shoreline_df_annual
    }

In [16]:
# Inspect the aggregated, gap-filled shoreline frame stored for site 'CVCC'.
annual_data['CVCC']['shorelines']

Unnamed: 0_level_0,Unnamed: 1_level_0,CVCC_1,CVCC_2,CVCC_3,CVCC_4,CVCC_5,CVCC_6,CVCC_7,CVCC_8,CVCC_9,CVCC_10
years,months,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1984,1,561.538441,435.299056,370.403621,342.310336,313.467875,284.241889,233.135140,192.467087,182.631254,170.888232
1984,2,561.538441,435.299056,370.403621,342.310336,313.467875,284.241889,233.135140,192.467087,182.631254,170.888232
1984,3,561.538441,435.299056,370.403621,342.310336,313.467875,284.241889,233.135140,192.467087,182.631254,170.888232
1984,4,443.940701,435.299056,370.403621,342.310336,313.467875,284.241889,233.135140,192.467087,182.631254,170.888232
1984,5,561.538441,435.299056,370.403621,342.310336,313.467875,284.241889,233.135140,192.467087,182.631254,170.888232
...,...,...,...,...,...,...,...,...,...,...,...
2022,8,622.612417,472.860604,408.275353,369.223884,344.100295,312.176212,257.712198,227.819054,207.949689,188.080323
2022,9,622.641539,470.311391,401.480086,366.508152,339.292681,301.569043,254.062433,214.755789,203.450509,192.145228
2022,10,629.934427,483.057115,409.848238,371.576057,344.757205,310.048846,259.399465,217.948198,207.578082,197.207965
2022,11,573.758962,431.308617,365.060269,327.841843,298.891870,263.616561,216.429622,181.858568,175.312474,168.766380
