In [1]:
import os
import pandas as pd
import numpy as np
import geopandas as gpd
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import TimeSeriesSplit

In [3]:
# Paths and site names setup
waves_folder_path = "./dataset_Ondas"
shorelines_folder_path = "./dataset_linhascosta"
transects_folder_path = "./dataset_transects"
site_names = ['CVCC','CCFT','FTAD','ADLA','LABI',
              'TRAT','ATMC','MCCO','CCCL','NNOR',
              'MEIA','TORR','CVMR','MRMG','MGVR',
              'COSN','VAGR','GBHA','BARR','MIRA']

In [2]:
# Create an empty dictionary to store DataFrames
data = {}

In [4]:
# Loop through each file name
for name in site_names:
    # Construct the file paths
    waves_file_path = os.path.join(waves_folder_path, f"{name}_wave_timeseries.csv")
    shorelines_file_path = os.path.join(shorelines_folder_path, f"{name}_shoreline_timeseries.csv")
    transects_file_path = os.path.join(transects_folder_path, f"{name}_T.geojson")

    # Read the waves CSV files into DataFrame
    waves_df = pd.read_csv(waves_file_path, sep=',', header=0) # Set header=0 to use the first row as column headers
    
    waves_df['time'] = pd.to_datetime(waves_df['time'])
    waves_df.set_index('time', inplace=True)
    waves_df['years'] = waves_df.index.year
    waves_df['months'] = waves_df.index.month
    waves_df.index = pd.MultiIndex.from_tuples(
    [(year, month) for year, month in zip(waves_df.index.year, waves_df.index.month)],
    names=['years', 'months'])
    waves_df = waves_df[waves_df['years'] != 1983] # Remove 1983 because satellite data is not available for that year
    
    
    # List of directions (16 directions compass rose)
    directions = ['N', 'NNE', 'NE', 'ENE', 'E', 'ESE', 'SE', 'SSE', 'S', 'SSW', 'SW', 'WSW', 'W', 'WNW', 'NW', 'NNW']
    def degrees_to_direction(wave_direction_degrees):
        if wave_direction_degrees >= 0 and   wave_direction_degrees <= 11.25:
            return 'N'
        elif wave_direction_degrees <= 33.75:
            return 'NNE'
        elif wave_direction_degrees <= 56.25:
            return 'NE'
        elif wave_direction_degrees <= 78.75:
            return 'ENE'
        elif wave_direction_degrees <= 101.25:
            return 'E'
        elif wave_direction_degrees <= 123.75:
            return 'ESE'
        elif wave_direction_degrees <= 146.25:
            return 'SE'
        elif wave_direction_degrees <= 168.75:
            return 'SSE'
        elif wave_direction_degrees <= 191.25:
            return 'S'
        elif wave_direction_degrees <= 213.75:
            return 'SSW'
        elif wave_direction_degrees <= 236.25:
            return 'SW'
        elif wave_direction_degrees <= 258.75:
            return 'WSW'
        elif wave_direction_degrees <= 281.25:
            return 'W'
        elif wave_direction_degrees <= 303.75:
            return 'WNW'
        elif wave_direction_degrees <= 326.25:
            return 'NW'
        elif wave_direction_degrees <= 348.75:
            return 'NNW'
        elif wave_direction_degrees <= 360:
            return 'N'
        else:
            return 'false'

    # One-hot encode the 'mwd' column
    waves_df['mwd'] = waves_df['mwd'].apply(degrees_to_direction)

    # Create a DataFrame of dummy variables for 'mwd'
    one_hot_encode = pd.get_dummies(waves_df['mwd'], prefix='from')

    # Concatenate the one-hot encoded columns to the original DataFrame
    waves_df = pd.concat([waves_df, one_hot_encode], axis=1)
    waves_df = waves_df.drop('mwd', axis=1)

    # Iterate through directions and create new columns for each direction's pp1d and swh
    for direction in directions:
        # Create new columns for pp1d and swh
        pp1d_column_name = f'{name}_pp1d_from_{direction}'
        swh_column_name = f'{name}_swh_from_{direction}'
    
        # Use boolean indexing to set values based on the condition
        waves_df[pp1d_column_name] = waves_df['pp1d'] * waves_df[f'from_{direction}']
        waves_df[swh_column_name] = waves_df['swh'] * waves_df[f'from_{direction}']
    
    # Drop the original 'mwd' column and the 'pp1d' and 'swh' columns
    waves_df.drop(columns=[f'from_{direction}' for direction in directions], inplace=True)
    waves_df.drop(columns=['pp1d','swh'], inplace=True)

    # Read the shorelines CSV files into DataFrame
    shorelines_df = pd.read_csv(shorelines_file_path)
    shorelines_df = shorelines_df.iloc[:, 1:]
    shorelines_df['dates'] = pd.to_datetime(shorelines_df['dates'])
    shorelines_df.set_index('dates', inplace=True)
    shorelines_df['years'] = shorelines_df.index.year
    shorelines_df['months'] = shorelines_df.index.month
    shorelines_df.index = pd.MultiIndex.from_tuples(
    [(year, month) for year, month in zip(shorelines_df.index.year, shorelines_df.index.month)],
    names=['years', 'months'])

   
    # Read the transects GeoJSON file into a GeoDataFrame
    transects_gdf = gpd.read_file(transects_file_path, driver='GeoJSON')

    # Add DataFrames to the dictionary with site name as key
    data[name] = {
        'waves': waves_df,
        'shorelines': shorelines_df,
        'transects': transects_gdf
    }

In [5]:
data['CCFT']['shorelines']

Unnamed: 0_level_0,Unnamed: 1_level_0,CCFT_1,CCFT_2,CCFT_3,CCFT_4,CCFT_5,CCFT_6,CCFT_7,CCFT_8,CCFT_9,CCFT_10,years,months
years,months,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1984,4,131.717585,157.047471,100.058992,65.354326,56.503256,26.773429,,,117.167081,78.879555,1984,4
1984,5,129.465202,154.734336,,,,,,,,,1984,5
1984,7,119.668047,140.186758,81.285109,48.430575,36.155372,24.228479,,,113.328121,67.895648,1984,7
1984,7,115.009565,140.487681,84.992270,49.368228,35.643790,30.910373,,,119.407154,77.699004,1984,7
1984,8,111.353530,141.966511,87.893720,49.643060,37.612934,30.549023,,,118.115768,64.493328,1984,8
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022,11,,157.442800,87.408285,,,,5.619913,,94.378403,59.565591,2022,11
2022,11,,,,,,,,47.411269,105.570633,67.595964,2022,11
2022,12,136.089446,148.179310,57.405931,31.692820,6.991199,-12.821248,-22.457253,27.869996,88.559314,52.358776,2022,12
2022,12,139.734505,152.850217,84.838118,34.742395,11.491148,-7.419695,-2.770608,31.340796,86.182920,53.393450,2022,12


In [6]:
# Initialize an empty dictionary to store the results
annual_data = {}

In [14]:
# Iterate over keys in the data dictionary
for name in data.keys():
    
    shoreline_df = data[name]['shorelines']

    # Create a MultiIndex with all possible combinations of years and months
    all_years = range(1984, 2023)
    all_months = range(1, 13)
    all_combinations = [(year, month) for year in all_years for month in all_months]

    full_index = pd.MultiIndex.from_tuples(all_combinations, names=['years', 'months'])

    # Group by the MultiIndex and calculate the median
    shoreline_df_annual = shoreline_df.groupby(level=['years', 'months']).median(numeric_only=True)

    # Reindex with the full MultiIndex to fill missing combinations with NaN
    shoreline_df_annual = shoreline_df_annual.reindex(full_index)
    
    # Check for columns that only have NaN values after reindexing
    empty_columns_after_reindexing = shoreline_df_annual.columns[shoreline_df_annual.isnull().all()]

   # Drop year and month columns
    shoreline_df_annual = shoreline_df_annual.drop(['years', 'months'], axis=1)

        # Iterate over each column in the DataFrame 

    # Iterate over each column in the DataFrame
    for i in range(1, len(shoreline_df_annual.columns) - 1):
        col = shoreline_df_annual.columns[i]

        # Skip columns with names "years" or "months"
        if col.lower() not in ['years', 'months']:
            for idx in shoreline_df_annual[col][shoreline_df_annual[col].isnull()].index:
                prev_col = shoreline_df_annual.columns[i - 1] if i - 1 >= 0 else None
                next_col = shoreline_df_annual.columns[i + 1] if i + 1 < len(shoreline_df_annual.columns) else None

                # Check if adjacent columns have non-NaN values and use them for filling NaNs
                if prev_col and next_col:
                    prev_val = shoreline_df_annual.at[idx, prev_col]
                    next_val = shoreline_df_annual.at[idx, next_col]
                    if pd.notnull(prev_val) and pd.notnull(next_val):
                        shoreline_df_annual.at[idx, col] = (prev_val + next_val) / 2
                    elif pd.notnull(prev_val):
                        shoreline_df_annual.at[idx, col] = prev_val
                    elif pd.notnull(next_val):
                        shoreline_df_annual.at[idx, col] = next_val
                elif prev_col:
                    prev_val = shoreline_df_annual.at[idx, prev_col]
                    if pd.notnull(prev_val):
                        shoreline_df_annual.at[idx, col] = prev_val
                elif next_col:
                    next_val = shoreline_df_annual.at[idx, next_col]
                    if pd.notnull(next_val):
                        shoreline_df_annual.at[idx, col] = next_val

    # Perform median replacement only for columns that are not "years" or "months"
    for column in shoreline_df_annual.columns:
        if column.lower() not in ['years', 'months']:
            # Check if there are any NaN values in the column
            if shoreline_df_annual[column].isnull().any():
                # Calculate the median value of the column (excluding NaN values)
                median_value = shoreline_df_annual[column].median()
        
                # Replace NaN values with the calculated median value
                shoreline_df_annual[column].fillna(median_value, inplace=True)

    # Ensure no NaNs are left before model training
    if shoreline_df_annual.isna().any().any():
        print(f"NaNs remain in shorelines data for {name}")
        continue  # Skip this iteration if NaNs are still present


    # Add the DataFrame to the dictionary with site name as key
    annual_data[name] = {
        'shorelines': shoreline_df_annual
    }

In [13]:
annual_data['CCFT']['shorelines']

Unnamed: 0_level_0,Unnamed: 1_level_0,CCFT_1,CCFT_2,CCFT_3,CCFT_4,CCFT_5,CCFT_6,CCFT_7,CCFT_8,CCFT_9,CCFT_10
years,months,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1984,1,126.276176,146.092807,87.699840,42.732468,19.573753,10.433007,12.988661,44.899836,102.411942,69.297154
1984,2,126.276176,146.092807,87.699840,42.732468,19.573753,10.433007,12.988661,44.899836,102.411942,69.297154
1984,3,126.276176,146.092807,87.699840,42.732468,19.573753,10.433007,12.988661,44.899836,102.411942,69.297154
1984,4,131.717585,157.047471,100.058992,65.354326,56.503256,26.773429,26.773429,71.970255,117.167081,78.879555
1984,5,129.465202,154.734336,154.734336,154.734336,154.734336,154.734336,154.734336,154.734336,154.734336,69.297154
...,...,...,...,...,...,...,...,...,...,...,...
2022,8,168.441310,185.994583,112.212179,82.721207,36.299678,13.617888,17.349859,65.297274,113.244690,82.535752
2022,9,163.526648,172.052519,102.733551,81.812022,51.676672,30.309495,12.384210,44.899836,112.504532,86.859968
2022,10,155.641212,181.973479,116.273569,88.437798,42.148920,13.691258,10.669287,42.372890,116.831469,75.864496
2022,11,143.606714,157.366743,85.605895,38.036442,14.352221,-6.690595,-3.293454,27.724979,92.185263,58.708060


In [None]:
# Iterate over keys in the data dictionary
for name in data.keys():
    waves_df = data[name]['waves']

    waves_df = waves_df.drop(['years', 'months'], axis=1)
    
    waves_df_annual = waves_df.groupby([waves_df.index.get_level_values('years'), waves_df.index.get_level_values('months')]).agg(
           {
        f'{name}_pp1d_from_N'  : lambda x: x[x != 0].quantile(0.5) if any(x != 0) else None,
        f'{name}_swh_from_N'  : lambda x: x[x != 0].quantile(0.5) if any(x != 0) else None,
        f'{name}_pp1d_from_NNE'  : lambda x: x[x != 0].quantile(0.5) if any(x != 0) else None,
        f'{name}_swh_from_NNE'  : lambda x: x[x != 0].quantile(0.5) if any(x != 0) else None,
        f'{name}_pp1d_from_NE'  : lambda x: x[x != 0].quantile(0.5) if any(x != 0) else None,
        f'{name}_swh_from_NE'  : lambda x: x[x != 0].quantile(0.5) if any(x != 0) else None,
        f'{name}_pp1d_from_ENE'  : lambda x: x[x != 0].quantile(0.5) if any(x != 0) else None,
        f'{name}_swh_from_ENE'  : lambda x: x[x != 0].quantile(0.5) if any(x != 0) else None,
        f'{name}_pp1d_from_E'  : lambda x: x[x != 0].quantile(0.5) if any(x != 0) else None,
        f'{name}_swh_from_E'  : lambda x: x[x != 0].quantile(0.5) if any(x != 0) else None,
        f'{name}_pp1d_from_ESE'  : lambda x: x[x != 0].quantile(0.5) if any(x != 0) else None,
        f'{name}_swh_from_ESE'  : lambda x: x[x != 0].quantile(0.5) if any(x != 0) else None,
        f'{name}_pp1d_from_SE'  : lambda x: x[x != 0].quantile(0.5) if any(x != 0) else None,
        f'{name}_swh_from_SE'  : lambda x: x[x != 0].quantile(0.5) if any(x != 0) else None,
        f'{name}_pp1d_from_SSE'  : lambda x: x[x != 0].quantile(0.5) if any(x != 0) else None,
        f'{name}_swh_from_SSE'  : lambda x: x[x != 0].quantile(0.5) if any(x != 0) else None,
        f'{name}_pp1d_from_S'  : lambda x: x[x != 0].quantile(0.5) if any(x != 0) else None,
        f'{name}_swh_from_S'  : lambda x: x[x != 0].quantile(0.5) if any(x != 0) else None,
        f'{name}_pp1d_from_SSW'  : lambda x: x[x != 0].quantile(0.5) if any(x != 0) else None,
        f'{name}_swh_from_SSW'  : lambda x: x[x != 0].quantile(0.5) if any(x != 0) else None,
        f'{name}_pp1d_from_SW'  : lambda x: x[x != 0].quantile(0.5) if any(x != 0) else None,
        f'{name}_swh_from_SW'  : lambda x: x[x != 0].quantile(0.5) if any(x != 0) else None,
        f'{name}_pp1d_from_WSW'  : lambda x: x[x != 0].quantile(0.5) if any(x != 0) else None,
        f'{name}_swh_from_WSW'  : lambda x: x[x != 0].quantile(0.5) if any(x != 0) else None,
        f'{name}_pp1d_from_W'  : lambda x: x[x != 0].quantile(0.5) if any(x != 0) else None,
        f'{name}_swh_from_W'  : lambda x: x[x != 0].quantile(0.5) if any(x != 0) else None,
        f'{name}_pp1d_from_WNW'  : lambda x: x[x != 0].quantile(0.5) if any(x != 0) else None,
        f'{name}_swh_from_WNW'  : lambda x: x[x != 0].quantile(0.5) if any(x != 0) else None,
        f'{name}_pp1d_from_NW'  : lambda x: x[x != 0].quantile(0.5) if any(x != 0) else None,
        f'{name}_swh_from_NW'  : lambda x: x[x != 0].quantile(0.5) if any(x != 0) else None,
        f'{name}_pp1d_from_NNW'  : lambda x: x[x != 0].quantile(0.5) if any(x != 0) else None,
        f'{name}_swh_from_NNW'  : lambda x: x[x != 0].quantile(0.5) if any(x != 0) else None
        })

    #waves_df_annual = waves_df_annual.fillna(0)
    
    shoreline_df = data[name]['shorelines']

    # Create a MultiIndex with all possible combinations of years and months
    all_years = range(1984, 2023)
    all_months = range(1, 13)
    all_combinations = [(year, month) for year in all_years for month in all_months]

    full_index = pd.MultiIndex.from_tuples(all_combinations, names=['years', 'months'])

    # Group by the MultiIndex and calculate the median
    shoreline_df_annual = shoreline_df.groupby(level=['years', 'months']).median(numeric_only=True)

    # Reindex with the full MultiIndex to fill missing combinations with NaN
    shoreline_df_annual = shoreline_df_annual.reindex(full_index)
    
    # Check for columns that only have NaN values after reindexing
    empty_columns_after_reindexing = shoreline_df_annual.columns[shoreline_df_annual.isnull().all()]

    # Output the results
    print(f"Columns with only NaN values after reindexing in '{name}' dataset: {empty_columns_after_reindexing.tolist()}")
    
    # Drop year and month columns
    shoreline_df_annual = shoreline_df_annual.drop(['years', 'months'], axis=1)

        # Iterate over each column in the DataFrame 

    for i in range(1, len(shoreline_df_annual.columns) - 1):
        col = shoreline_df_annual.columns[i]
    
        # Skip columns with names "years" or "months"
        if col.lower() not in ['years', 'months']:
            prev_col = shoreline_df_annual.columns[i - 1] if i - 1 >= 0 else None
            next_col = shoreline_df_annual.columns[i + 1] if i + 1 < len(shoreline_df_annual.columns) else None

            # Check if there are any NaN values in the current column
            if shoreline_df_annual[col].isnull().any():
                # Fill NaN values with the mean of the available previous and next columns
                if prev_col is not None and next_col is not None:
                    shoreline_df_annual[col] = (shoreline_df_annual[prev_col] + shoreline_df_annual[next_col]) / 2
                elif prev_col is not None:
                    shoreline_df_annual[col] = shoreline_df_annual[prev_col]
                elif next_col is not None:
                    shoreline_df_annual[col] = shoreline_df_annual[next_col]
                else:
                    # If there are no immediate previous and next columns, extend the search to 3 columns
                    prev_cols = [shoreline_df_annual.columns[j] for j in range(i - 2, i) if j >= 0]
                    next_cols = [shoreline_df_annual.columns[j] for j in range(i + 1, i + 4) if j < len(shoreline_df_annual.columns)]

                    available_cols = prev_cols + next_cols

                    # Filter out None values (columns that are out of range)
                    available_cols = [col for col in available_cols if col is not None]

                    # Take the mean of available columns
                    if len(available_cols) > 0:
                        shoreline_df_annual[col] = shoreline_df_annual[available_cols].mean(axis=1)

    # Perform median replacement only for columns that are not "years" or "months"
    for column in shoreline_df_annual.columns:
        if column.lower() not in ['years', 'months']:
            # Check if there are any NaN values in the column
            if shoreline_df_annual[column].isnull().any():
                # Calculate the median value of the column (excluding NaN values)
                median_value = shoreline_df_annual[column].median()
        
                # Replace NaN values with the calculated median value
                shoreline_df_annual[column].fillna(median_value, inplace=True)
    
    
    if shoreline_df_annual.isna().any().any():
        print("There are still NaN values in the DataFrame")
    else:
        print("All NaN values have been filled")
    
     
                
    # Add the DataFrame to the dictionary with site name as key
    annual_data[name] = {
        'waves': waves_df_annual,
        'shorelines': shoreline_df_annual
    }

In [None]:
annual_data['CCFT']['shorelines']

In [None]:
# Access the 'shorelines' DataFrame for 'TRAT'
shorelines_df = annual_data['TRAT']['shorelines']

# Check for columns that only have NaN values
empty_columns = shorelines_df.columns[shorelines_df.isnull().all()]

# Output the results
print(f"Columns with only NaN values in 'TRAT' shorelines dataset: {empty_columns.tolist()}")
