In [2]:
#!pip install geopandas

Collecting geopandas
  Obtaining dependency information for geopandas from https://files.pythonhosted.org/packages/3e/cf/ede993ed7070c5487e6db550aad490178c06eaa48ad26ff5e5263f995fba/geopandas-0.14.1-py3-none-any.whl.metadata
  Downloading geopandas-0.14.1-py3-none-any.whl.metadata (1.5 kB)
Collecting fiona>=1.8.21 (from geopandas)
  Obtaining dependency information for fiona>=1.8.21 from https://files.pythonhosted.org/packages/7f/27/b24c1610c7ae5716709321f04d38d7b8b71ed531f80df4f697b9ad99cfc3/fiona-1.9.5-cp311-cp311-win_amd64.whl.metadata
  Downloading fiona-1.9.5-cp311-cp311-win_amd64.whl.metadata (51 kB)
     ---------------------------------------- 0.0/51.1 kB ? eta -:--:--
     ---------------------------------------- 51.1/51.1 kB 1.3 MB/s eta 0:00:00
Collecting pyproj>=3.3.0 (from geopandas)
  Obtaining dependency information for pyproj>=3.3.0 from https://files.pythonhosted.org/packages/79/95/eb68113c5b5737c342bde1bab92705dabe69c16299c5a122616e50f1fbd6/pyproj-3.6.1-cp311-cp311-wi

In [3]:
import os
import pandas as pd
import numpy as np
import geopandas as gpd
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import DecisionTreeRegressor

In [96]:
# Specify the folder paths relative to the notebook's location
waves_folder_path = "./dataset_Ondas"
shorelines_folder_path = "./dataset_linhascosta"
transects_folder_path = "./dataset_transects"

In [97]:
# List of file names 
#site_names = ["CCFT", "TROI", "NNOR", "MEIA", "VAGR"]
site_names = ["CCFT", "NNOR", "MEIA"]

In [98]:
# Create an empty dictionary to store DataFrames
data = {}

In [99]:
# Loop through each file name
for name in site_names:
    # Construct the file paths
    waves_file_path = os.path.join(waves_folder_path, f"{name}_wave_timeseries.csv")
    shorelines_file_path = os.path.join(shorelines_folder_path, f"{name}_shoreline_timeseries.csv")
    transects_file_path = os.path.join(transects_folder_path, f"{name}_transects.geojson")

    # Read the waves CSV files into DataFrame
    waves_df = pd.read_csv(waves_file_path, sep=',', header=0) # Set header=0 to use the first row as column headers
    waves_df['time'] = pd.to_datetime(waves_df['time'])
    waves_df.set_index('time', inplace=True)
    waves_df['years'] = waves_df.index.year
    waves_df['months'] = waves_df.index.month
    waves_df.index = pd.MultiIndex.from_tuples(
    [(year, month) for year, month in zip(waves_df.index.year, waves_df.index.month)],
    names=['years', 'months'])
    waves_df = waves_df[waves_df['years'] != 1983] # Remove 1983 because satellite data for shorelines is not available for that year
    

    # List of directions (16 directions compass rose)
    directions = ['N', 'NNE', 'NE', 'ENE', 'E', 'ESE', 'SE', 'SSE', 'S', 'SSW', 'SW', 'WSW', 'W', 'WNW', 'NW', 'NNW']
    def degrees_to_direction(wave_direction_degrees):
        if wave_direction_degrees >= 0 and   wave_direction_degrees <= 11.25:
            return 'N'
        elif wave_direction_degrees <= 33.75:
            return 'NNE'
        elif wave_direction_degrees <= 56.25:
            return 'NE'
        elif wave_direction_degrees <= 78.75:
            return 'ENE'
        elif wave_direction_degrees <= 101.25:
            return 'E'
        elif wave_direction_degrees <= 123.75:
            return 'ESE'
        elif wave_direction_degrees <= 146.25:
            return 'SE'
        elif wave_direction_degrees <= 168.75:
            return 'SSE'
        elif wave_direction_degrees <= 191.25:
            return 'S'
        elif wave_direction_degrees <= 213.75:
            return 'SSW'
        elif wave_direction_degrees <= 236.25:
            return 'SW'
        elif wave_direction_degrees <= 258.75:
            return 'WSW'
        elif wave_direction_degrees <= 281.25:
            return 'W'
        elif wave_direction_degrees <= 303.75:
            return 'WNW'
        elif wave_direction_degrees <= 326.25:
            return 'NW'
        elif wave_direction_degrees <= 348.75:
            return 'NNW'
        elif wave_direction_degrees <= 360:
            return 'N'
        else:
            return 'false'
  
    # One-hot encode the 'mwd' column
    waves_df['mwd'] = waves_df['mwd'].apply(degrees_to_direction)

    # Create a DataFrame of dummy variables for 'mwd'
    one_hot_encode = pd.get_dummies(waves_df['mwd'], prefix='from')

    # Concatenate the one-hot encoded columns to the original DataFrame
    waves_df = pd.concat([waves_df, one_hot_encode], axis=1)
    waves_df = waves_df.drop('mwd', axis=1)

    # Iterate through directions and create new columns for each direction's pp1d and swh
    for direction in directions:
        # Create new columns for pp1d and swh
        pp1d_column_name = f'pp1d_from_{direction}'
        swh_column_name = f'swh_from_{direction}'
    
        # Use boolean indexing to set values based on the condition
        waves_df[pp1d_column_name] = waves_df['pp1d'] * waves_df[f'from_{direction}']
        waves_df[swh_column_name] = waves_df['swh'] * waves_df[f'from_{direction}']
    
    # Drop the original 'mwd' column and the 'pp1d' and 'swh' columns
    waves_df.drop(columns=[f'from_{direction}' for direction in directions], inplace=True)
    waves_df.drop(columns=['pp1d','swh'], inplace=True)

    # Read the shorelines CSV files into DataFrame
    shorelines_df = pd.read_csv(shorelines_file_path)
    shorelines_df = shorelines_df.iloc[:, 1:]
    shorelines_df['dates'] = pd.to_datetime(shorelines_df['dates'])
    shorelines_df.set_index('dates', inplace=True)
    shorelines_df['years'] = shorelines_df.index.year
    shorelines_df['months'] = shorelines_df.index.month
    shorelines_df.index = pd.MultiIndex.from_tuples(
    [(year, month) for year, month in zip(shorelines_df.index.year, shorelines_df.index.month)],
    names=['years', 'months'])
    
    # Add a new column to waves and shorelines dataframes to indicate the site name
    waves_df['site'] = name
    shorelines_df['site'] = name

    # Read the transects GeoJSON file into a GeoDataFrame
    transects_gdf = gpd.read_file(transects_file_path, driver='GeoJSON')

    # Add DataFrames to the dictionary with site name as key
    data[name] = {
        'waves': waves_df,
        'shorelines': shorelines_df,
        'transects': transects_gdf
    }

In [100]:
print(data['CCFT']['shorelines'].head())

                 CCFT_1     CCFT_2      CCFT_3      CCFT_4      CCFT_5  \
years months                                                             
1984  4       84.751213  93.216140  131.717585  157.047471  100.058992   
      5       66.886950  81.108110  129.465202  154.734336         NaN   
      7       71.531519  67.350891  119.668047  140.186758   81.285109   
      7       71.088816  67.438465  115.009565  140.487681   84.992270   
      8       70.654654  66.005839  111.353530  141.966511   87.893720   

                 CCFT_6     CCFT_7     CCFT_8  CCFT_9  CCFT_10  ...  \
years months                                                    ...   
1984  4       65.354326  56.503256  26.773429     NaN      NaN  ...   
      5             NaN        NaN        NaN     NaN      NaN  ...   
      7       48.430575  36.155372  24.228479     NaN      NaN  ...   
      7       49.368228  35.643790  30.910373     NaN      NaN  ...   
      8       49.643060  37.612934  30.549023     NaN  

In [84]:
#print(data["TROI"]['shorelines'].head())

In [101]:
# Initialize an empty dictionary to store the results
annual_data = {}

In [102]:
# Iterate over keys in the data dictionary
for name in data.keys():
    waves_df = data[name]['waves']
    
    # Group by 'years' and calculate quantiles for each column
    wave_df_annual = waves_df.groupby(level=['years', 'months']).agg(
           {
        'pp1d_from_N': [
            ('10th_quantile', lambda x: x[x != 0].quantile(0.1) if any(x != 0) else None),
            ('50th_quantile', lambda x: x[x != 0].quantile(0.5) if any(x != 0) else None),
            ('90th_quantile', lambda x: x[x != 0].quantile(0.9) if any(x != 0) else None)
        ],
        'swh_from_N': [
            ('10th_quantile', lambda x: x[x != 0].quantile(0.1) if any(x != 0) else None),
            ('50th_quantile', lambda x: x[x != 0].quantile(0.5) if any(x != 0) else None),
            ('90th_quantile', lambda x: x[x != 0].quantile(0.9) if any(x != 0) else None)
        ],
        'pp1d_from_NNE': [
            ('10th_quantile', lambda x: x[x != 0].quantile(0.1) if any(x != 0) else None),
            ('50th_quantile', lambda x: x[x != 0].quantile(0.5) if any(x != 0) else None),
            ('90th_quantile', lambda x: x[x != 0].quantile(0.9) if any(x != 0) else None)
        ],
        'swh_from_NNE': [
            ('10th_quantile', lambda x: x[x != 0].quantile(0.1) if any(x != 0) else None),
            ('50th_quantile', lambda x: x[x != 0].quantile(0.5) if any(x != 0) else None),
            ('90th_quantile', lambda x: x[x != 0].quantile(0.9) if any(x != 0) else None)
        ],
        'pp1d_from_NE': [
            ('10th_quantile', lambda x: x[x != 0].quantile(0.1) if any(x != 0) else None),
            ('50th_quantile', lambda x: x[x != 0].quantile(0.5) if any(x != 0) else None),
            ('90th_quantile', lambda x: x[x != 0].quantile(0.9) if any(x != 0) else None)
        ],
        'swh_from_NE': [
            ('10th_quantile', lambda x: x[x != 0].quantile(0.1) if any(x != 0) else None),
            ('50th_quantile', lambda x: x[x != 0].quantile(0.5) if any(x != 0) else None),
            ('90th_quantile', lambda x: x[x != 0].quantile(0.9) if any(x != 0) else None)
        ],
        'pp1d_from_ENE': [
            ('10th_quantile', lambda x: x[x != 0].quantile(0.1) if any(x != 0) else None),
            ('50th_quantile', lambda x: x[x != 0].quantile(0.5) if any(x != 0) else None),
            ('90th_quantile', lambda x: x[x != 0].quantile(0.9) if any(x != 0) else None)
        ],
        'swh_from_ENE': [
            ('10th_quantile', lambda x: x[x != 0].quantile(0.1) if any(x != 0) else None),
            ('50th_quantile', lambda x: x[x != 0].quantile(0.5) if any(x != 0) else None),
            ('90th_quantile', lambda x: x[x != 0].quantile(0.9) if any(x != 0) else None)
        ],
        'pp1d_from_E': [
            ('10th_quantile', lambda x: x[x != 0].quantile(0.1) if any(x != 0) else None),
            ('50th_quantile', lambda x: x[x != 0].quantile(0.5) if any(x != 0) else None),
            ('90th_quantile', lambda x: x[x != 0].quantile(0.9) if any(x != 0) else None) 
        ],
        'swh_from_E': [
            ('10th_quantile', lambda x: x[x != 0].quantile(0.1) if any(x != 0) else None), 
            ('50th_quantile', lambda x: x[x != 0].quantile(0.5) if any(x != 0) else None), 
            ('90th_quantile', lambda x: x[x != 0].quantile(0.9) if any(x != 0) else None) 
        ],
        'pp1d_from_ESE': [
            ('10th_quantile', lambda x: x[x != 0].quantile(0.1) if any(x != 0) else None), 
            ('50th_quantile', lambda x: x[x != 0].quantile(0.5) if any(x != 0) else None), 
            ('90th_quantile', lambda x: x[x != 0].quantile(0.9) if any(x != 0) else None) 
        ],
        'swh_from_ESE': [
            ('10th_quantile', lambda x: x[x != 0].quantile(0.1) if any(x != 0) else None), 
            ('50th_quantile', lambda x: x[x != 0].quantile(0.5) if any(x != 0) else None), 
            ('90th_quantile', lambda x: x[x != 0].quantile(0.9) if any(x != 0) else None) 
        ],
        'pp1d_from_SE': [
            ('10th_quantile', lambda x: x[x != 0].quantile(0.1) if any(x != 0) else None), 
            ('50th_quantile', lambda x: x[x != 0].quantile(0.5) if any(x != 0) else None), 
            ('90th_quantile', lambda x: x[x != 0].quantile(0.9) if any(x != 0) else None) 
        ], 
        'swh_from_SE': [
            ('10th_quantile', lambda x: x[x != 0].quantile(0.1) if any(x != 0) else None), 
            ('50th_quantile', lambda x: x[x != 0].quantile(0.5) if any(x != 0) else None), 
            ('90th_quantile', lambda x: x[x != 0].quantile(0.9) if any(x != 0) else None) 
        ],
        'pp1d_from_SSE': [
            ('10th_quantile', lambda x: x[x != 0].quantile(0.1) if any(x != 0) else None), 
            ('50th_quantile', lambda x: x[x != 0].quantile(0.5) if any(x != 0) else None), 
            ('90th_quantile', lambda x: x[x != 0].quantile(0.9) if any(x != 0) else None) 
        ],
        'swh_from_SSE': [
            ('10th_quantile', lambda x: x[x != 0].quantile(0.1) if any(x != 0) else None), 
            ('50th_quantile', lambda x: x[x != 0].quantile(0.5) if any(x != 0) else None), 
            ('90th_quantile', lambda x: x[x != 0].quantile(0.9) if any(x != 0) else None)
        ],
        'pp1d_from_S': [
            ('10th_quantile', lambda x: x[x != 0].quantile(0.1) if any(x != 0) else None),
            ('50th_quantile', lambda x: x[x != 0].quantile(0.5) if any(x != 0) else None),
            ('90th_quantile', lambda x: x[x != 0].quantile(0.9) if any(x != 0) else None) 
        ],
        'swh_from_S': [
            ('10th_quantile', lambda x: x[x != 0].quantile(0.1) if any(x != 0) else None), 
            ('50th_quantile', lambda x: x[x != 0].quantile(0.5) if any(x != 0) else None),
            ('90th_quantile', lambda x: x[x != 0].quantile(0.9) if any(x != 0) else None) 
        ],
        'pp1d_from_SSW': [
            ('10th_quantile', lambda x: x[x != 0].quantile(0.1) if any(x != 0) else None),
            ('50th_quantile', lambda x: x[x != 0].quantile(0.5) if any(x != 0) else None),
            ('90th_quantile', lambda x: x[x != 0].quantile(0.9) if any(x != 0) else None) 
        ],
        'swh_from_SSW': [
            ('10th_quantile', lambda x: x[x != 0].quantile(0.1) if any(x != 0) else None),
            ('50th_quantile', lambda x: x[x != 0].quantile(0.5) if any(x != 0) else None),
            ('90th_quantile', lambda x: x[x != 0].quantile(0.9) if any(x != 0) else None)
        ],
        'pp1d_from_SW': [
            ('10th_quantile', lambda x: x[x != 0].quantile(0.1) if any(x != 0) else None),
            ('50th_quantile', lambda x: x[x != 0].quantile(0.5) if any(x != 0) else None),
            ('90th_quantile', lambda x: x[x != 0].quantile(0.9) if any(x != 0) else None) 
        ],
        'swh_from_SW': [
            ('10th_quantile', lambda x: x[x != 0].quantile(0.1) if any(x != 0) else None),
            ('50th_quantile', lambda x: x[x != 0].quantile(0.5) if any(x != 0) else None),
            ('90th_quantile', lambda x: x[x != 0].quantile(0.9) if any(x != 0) else None)
        ],
        'pp1d_from_WSW': [
            ('10th_quantile', lambda x: x[x != 0].quantile(0.1) if any(x != 0) else None),
            ('50th_quantile', lambda x: x[x != 0].quantile(0.5) if any(x != 0) else None),
            ('90th_quantile', lambda x: x[x != 0].quantile(0.9) if any(x != 0) else None) 
        ],
        'swh_from_WSW': [
            ('10th_quantile', lambda x: x[x != 0].quantile(0.1) if any(x != 0) else None),
            ('50th_quantile', lambda x: x[x != 0].quantile(0.5) if any(x != 0) else None),
            ('90th_quantile', lambda x: x[x != 0].quantile(0.9) if any(x != 0) else None)
        ],
        'pp1d_from_W': [
            ('10th_quantile', lambda x: x[x != 0].quantile(0.1) if any(x != 0) else None), 
            ('50th_quantile', lambda x: x[x != 0].quantile(0.5) if any(x != 0) else None), 
            ('90th_quantile', lambda x: x[x != 0].quantile(0.9) if any(x != 0) else None)
        ],
        'swh_from_W': [
            ('10th_quantile', lambda x: x[x != 0].quantile(0.1) if any(x != 0) else None), 
            ('50th_quantile', lambda x: x[x != 0].quantile(0.5) if any(x != 0) else None), 
            ('90th_quantile', lambda x: x[x != 0].quantile(0.9) if any(x != 0) else None)
        ],
        'pp1d_from_WNW': [
            ('10th_quantile', lambda x: x[x != 0].quantile(0.1) if any(x != 0) else None), 
            ('50th_quantile', lambda x: x[x != 0].quantile(0.5) if any(x != 0) else None), 
            ('90th_quantile', lambda x: x[x != 0].quantile(0.9) if any(x != 0) else None)
        ],
        'swh_from_WNW': [
            ('10th_quantile', lambda x: x[x != 0].quantile(0.1) if any(x != 0) else None), 
            ('50th_quantile', lambda x: x[x != 0].quantile(0.5) if any(x != 0) else None), 
            ('90th_quantile', lambda x: x[x != 0].quantile(0.9) if any(x != 0) else None)
        ],
        'pp1d_from_NW': [
            ('10th_quantile', lambda x: x[x != 0].quantile(0.1) if any(x != 0) else None), 
            ('50th_quantile', lambda x: x[x != 0].quantile(0.5) if any(x != 0) else None), 
            ('90th_quantile', lambda x: x[x != 0].quantile(0.9) if any(x != 0) else None)
        ],
        'swh_from_NW': [
            ('10th_quantile', lambda x: x[x != 0].quantile(0.1) if any(x != 0) else None), 
            ('50th_quantile', lambda x: x[x != 0].quantile(0.5) if any(x != 0) else None), 
            ('90th_quantile', lambda x: x[x != 0].quantile(0.9) if any(x != 0) else None)
        ],
        'pp1d_from_NNW': [
            ('10th_quantile', lambda x: x[x != 0].quantile(0.1) if any(x != 0) else None),
            ('50th_quantile', lambda x: x[x != 0].quantile(0.5) if any(x != 0) else None),
            ('90th_quantile', lambda x: x[x != 0].quantile(0.9) if any(x != 0) else None)
        ],
        'swh_from_NNW': [
            ('10th_quantile', lambda x: x[x != 0].quantile(0.1) if any(x != 0) else None),
            ('50th_quantile', lambda x: x[x != 0].quantile(0.5) if any(x != 0) else None),
            ('90th_quantile', lambda x: x[x != 0].quantile(0.9) if any(x != 0) else None)
        ]})
    

    # Replace NaN values with zero
    wave_df_annual = wave_df_annual.fillna(0)

    shoreline_df = data[name]['shorelines']

    # Group by 'years' and calculate median for each column
    shoreline_df_annual = shoreline_df.groupby(level=['years', 'months']).median(numeric_only=True)
    
    # Drop year and month columns
    #shoreline_df_annual = shoreline_df_annual.drop(['years', 'months'], axis=1)

    # Iterate over each column in the DataFrame

    for i in range(1, len(shoreline_df_annual.columns) - 1):
        col = shoreline_df_annual.columns[i]
        prev_col = shoreline_df_annual.columns[i - 1] if i - 1 >= 0 else None
        next_col = shoreline_df_annual.columns[i + 1] if i + 1 < len(shoreline_df_annual.columns) else None

        # Check if there are any NaN values in the current column
        if shoreline_df_annual[col].isnull().any():
            # Fill NaN values with the mean of the available previous and next columns
            if prev_col is not None and next_col is not None:
                shoreline_df_annual[col] = (shoreline_df_annual[prev_col] + shoreline_df_annual[next_col]) / 2
            elif prev_col is not None:
                shoreline_df_annual[col] = shoreline_df_annual[prev_col]
            elif next_col is not None:
                shoreline_df_annual[col] = shoreline_df_annual[next_col]
            else:
                # If there are no immediate previous and next columns, extend the search to 3 columns
                prev_cols = [shoreline_df_annual.columns[j] for j in range(i - 2, i) if j >= 0]
                next_cols = [shoreline_df_annual.columns[j] for j in range(i + 1, i + 4) if j < len(shoreline_df_annual.columns)]

                available_cols = prev_cols + next_cols

                # Filter out None values (columns that are out of range)
                available_cols = [col for col in available_cols if col is not None]

                # Take the mean of available columns
                if len(available_cols) > 0:
                    shoreline_df_annual[col] = shoreline_df_annual[available_cols].mean(axis=1)

    for column in shoreline_df_annual.columns:
        # Check if there are any NaN values in the column
        if shoreline_df_annual[column].isnull().any():
            # Calculate the median value of the column (excluding NaN values)
            median_value = shoreline_df_annual[column].median()
        
            # Replace NaN values with the calculated median value
            shoreline_df_annual[column].fillna(median_value, inplace=True)
            
            # Final check and fill/drop remaining NaNs
            shoreline_df_annual.fillna(method='ffill', inplace=True)  # Forward fill
            shoreline_df_annual.fillna(method='bfill', inplace=True)  # Backward fill

            # Optional: Drop any rows that still have NaNs
            shoreline_df_annual.dropna(inplace=True)

            # Ensure no NaNs are left before model training
            if shoreline_df_annual.isna().any().any():
                print(f"NaNs remain in shorelines data for {name}")
                continue  # Skip this iteration if NaNs are still present
    
    # Add the DataFrame to the dictionary with site name as key
    annual_data[name] = {
        'waves': wave_df_annual,
        'shorelines': shoreline_df_annual
    }

In [103]:
wave_df_annual.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,pp1d_from_N,pp1d_from_N,pp1d_from_N,swh_from_N,swh_from_N,swh_from_N,pp1d_from_NNE,pp1d_from_NNE,pp1d_from_NNE,swh_from_NNE,...,pp1d_from_NW,swh_from_NW,swh_from_NW,swh_from_NW,pp1d_from_NNW,pp1d_from_NNW,pp1d_from_NNW,swh_from_NNW,swh_from_NNW,swh_from_NNW
Unnamed: 0_level_1,Unnamed: 1_level_1,10th_quantile,50th_quantile,90th_quantile,10th_quantile,50th_quantile,90th_quantile,10th_quantile,50th_quantile,90th_quantile,10th_quantile,...,90th_quantile,10th_quantile,50th_quantile,90th_quantile,10th_quantile,50th_quantile,90th_quantile,10th_quantile,50th_quantile,90th_quantile
years,months,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
1984,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,16.568388,1.159329,2.113522,3.003425,10.858882,11.48967,12.711223,0.784305,1.325311,2.069451
1984,2,11.0015,11.651595,12.257721,0.581329,0.698613,0.849621,11.09175,12.218543,12.235737,0.602527,...,15.556421,0.944696,1.942769,2.496285,8.990899,10.270372,12.130041,0.64854,1.061916,1.478057
1984,3,10.03587,10.03587,10.03587,0.782094,0.782094,0.782094,10.085448,10.130714,10.176769,0.842339,...,16.012798,0.664541,1.757484,2.532742,10.067748,10.614656,11.01813,0.748528,0.768114,0.841217
1984,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,11.969103,0.695217,1.11033,1.613585,0.0,0.0,0.0,0.0,0.0,0.0
1984,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,11.222614,0.71959,1.001971,1.661999,7.46035,7.915628,11.252801,0.975197,1.12968,1.177368


In [104]:
# Initialize empty dataframes for combined data
combined_waves = pd.DataFrame()
combined_shorelines = pd.DataFrame()

# Concatenate data from all sites
for name in site_names:
    combined_waves = pd.concat([combined_waves, annual_data[name]['waves']], axis=0)
    combined_shorelines = pd.concat([combined_shorelines, annual_data[name]['shorelines']], axis=0)

In [109]:
combined_shorelines.tail(20)

Unnamed: 0_level_0,Unnamed: 1_level_0,CCFT_1,CCFT_2,CCFT_3,CCFT_4,CCFT_5,CCFT_6,CCFT_7,CCFT_8,CCFT_9,CCFT_10,...,MEIA_2,MEIA_3,MEIA_4,MEIA_5,MEIA_6,MEIA_7,MEIA_8,MEIA_9,MEIA_10,MEIA_11
years,months,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2021,5,,,,,,,,,,,...,229.309016,198.395144,190.914321,172.296611,152.826332,127.555248,108.777387,105.197785,103.552901,1062.276451
2021,6,,,,,,,,,,,...,229.309016,198.395144,190.914321,172.296611,152.826332,127.555248,108.777387,105.197785,103.552901,1062.276451
2021,7,,,,,,,,,,,...,229.309016,198.395144,190.914321,172.296611,152.826332,127.555248,108.777387,105.197785,103.552901,1062.276451
2021,8,,,,,,,,,,,...,229.309016,198.395144,190.914321,172.296611,152.826332,127.555248,108.777387,105.197785,103.552901,1062.276451
2021,9,,,,,,,,,,,...,229.309016,198.395144,190.914321,172.296611,152.826332,127.555248,108.777387,105.197785,103.552901,1062.276451
2021,10,,,,,,,,,,,...,209.191402,190.777682,185.322991,172.088571,157.297331,130.333228,106.715292,104.528854,97.019131,1059.009566
2021,11,,,,,,,,,,,...,209.191402,190.777682,185.322991,172.088571,157.297331,130.333228,106.715292,104.528854,97.019131,1059.009566
2021,12,,,,,,,,,,,...,209.191402,190.777682,185.322991,172.088571,157.297331,130.333228,106.715292,104.528854,97.019131,1059.009566
2022,1,,,,,,,,,,,...,193.040881,172.605021,163.081957,151.118704,139.907053,113.667741,88.052686,81.237752,74.231731,1048.115865
2022,2,,,,,,,,,,,...,193.040881,172.605021,163.081957,151.118704,139.907053,113.667741,88.052686,81.237752,74.231731,1048.115865


In [89]:
# Modified section for aligning waves and shorelines data and creating training/testing sets
for name in annual_data.keys():
    # Extracting waves and shorelines data
    waves_df = annual_data[name]['waves']
    shorelines_df = annual_data[name]['shorelines']

    # Identify common timestamps
    common_index = waves_df.index.intersection(shorelines_df.index)

    # Filter waves data to have only common timestamps
    waves_df_filtered = waves_df.loc[common_index]

    # Split into training and testing sets
    x_train = waves_df_filtered[waves_df_filtered.index.get_level_values(0) <= 2014]
    y_train = shorelines_df[shorelines_df.index.get_level_values(0) <= 2014]
    x_test = waves_df_filtered[waves_df_filtered.index.get_level_values(0) > 2014]
    y_test = shorelines_df[shorelines_df.index.get_level_values(0) > 2014]

    # Create an instance of the DecisionTreeRegressor model
    model = DecisionTreeRegressor()

    # Train the model
    model.fit(x_train, y_train)

    # Use the model to predict shoreline positions
    y_pred = model.predict(x_test)
    y_pred = pd.DataFrame(y_pred, columns=y_test.columns, index=y_test.index)

    # Calculate the RMSE
    from sklearn.metrics import mean_squared_error
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))

    # Print the RMSE by name
    print(f'{name}: {rmse}')

CCFT: 21.065719684713606
NNOR: 20.288542313344408
MEIA: 30.11058159694361
