In [1]:
import os
import pandas as pd
import numpy as np
import geopandas as gpd
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np

In [2]:
# Paths and site names setup
waves_folder_path = "./dataset_Ondas"
shorelines_folder_path = "./dataset_linhascosta"
transects_folder_path = "./dataset_transects"
site_names = ['CVCC','CCFT','FTAD','ADLA','LABI',
              'TRAT','ATMC','MCCO','CCCL','NNOR',
              'MEIA','TORR','CVMR','MRMG','MGVR',
              'COSN','VAGR','GBHA','BARR','MIRA']

In [3]:
# Create an empty dictionary to store DataFrames
data = {}

In [4]:
# Loop through each file name
for name in site_names:
    # Construct the file paths
    waves_file_path = os.path.join(waves_folder_path, f"{name}_wave_timeseries.csv")
    shorelines_file_path = os.path.join(shorelines_folder_path, f"{name}_shoreline_timeseries.csv")
    transects_file_path = os.path.join(transects_folder_path, f"{name}_T.geojson")

    # Read the waves CSV files into DataFrame
    waves_df = pd.read_csv(waves_file_path, sep=',', header=0) # Set header=0 to use the first row as column headers
    
    waves_df['time'] = pd.to_datetime(waves_df['time'])
    waves_df.set_index('time', inplace=True)
    waves_df['years'] = waves_df.index.year
    waves_df['months'] = waves_df.index.month
    waves_df.index = pd.MultiIndex.from_tuples(
    [(year, month) for year, month in zip(waves_df.index.year, waves_df.index.month)],
    names=['years', 'months'])
    waves_df = waves_df[waves_df['years'] != 1983] # Remove 1983 because satellite data is not available for that year
    
    
    # List of directions (16 directions compass rose)
    directions = ['N', 'NNE', 'NE', 'ENE', 'E', 'ESE', 'SE', 'SSE', 'S', 'SSW', 'SW', 'WSW', 'W', 'WNW', 'NW', 'NNW']
    def degrees_to_direction(wave_direction_degrees):
        if wave_direction_degrees >= 0 and   wave_direction_degrees <= 11.25:
            return 'N'
        elif wave_direction_degrees <= 33.75:
            return 'NNE'
        elif wave_direction_degrees <= 56.25:
            return 'NE'
        elif wave_direction_degrees <= 78.75:
            return 'ENE'
        elif wave_direction_degrees <= 101.25:
            return 'E'
        elif wave_direction_degrees <= 123.75:
            return 'ESE'
        elif wave_direction_degrees <= 146.25:
            return 'SE'
        elif wave_direction_degrees <= 168.75:
            return 'SSE'
        elif wave_direction_degrees <= 191.25:
            return 'S'
        elif wave_direction_degrees <= 213.75:
            return 'SSW'
        elif wave_direction_degrees <= 236.25:
            return 'SW'
        elif wave_direction_degrees <= 258.75:
            return 'WSW'
        elif wave_direction_degrees <= 281.25:
            return 'W'
        elif wave_direction_degrees <= 303.75:
            return 'WNW'
        elif wave_direction_degrees <= 326.25:
            return 'NW'
        elif wave_direction_degrees <= 348.75:
            return 'NNW'
        elif wave_direction_degrees <= 360:
            return 'N'
        else:
            return 'false'

    # One-hot encode the 'mwd' column
    waves_df['mwd'] = waves_df['mwd'].apply(degrees_to_direction)

    # Create a DataFrame of dummy variables for 'mwd'
    one_hot_encode = pd.get_dummies(waves_df['mwd'], prefix='from')

    # Concatenate the one-hot encoded columns to the original DataFrame
    waves_df = pd.concat([waves_df, one_hot_encode], axis=1)
    waves_df = waves_df.drop('mwd', axis=1)

    # Iterate through directions and create new columns for each direction's pp1d and swh
    for direction in directions:
        # Create new columns for pp1d and swh
        pp1d_column_name = f'{name}_pp1d_from_{direction}'
        swh_column_name = f'{name}_swh_from_{direction}'
    
        # Use boolean indexing to set values based on the condition
        waves_df[pp1d_column_name] = waves_df['pp1d'] * waves_df[f'from_{direction}']
        waves_df[swh_column_name] = waves_df['swh'] * waves_df[f'from_{direction}']
    
    # Drop the original 'mwd' column and the 'pp1d' and 'swh' columns
    waves_df.drop(columns=[f'from_{direction}' for direction in directions], inplace=True)
    waves_df.drop(columns=['pp1d','swh'], inplace=True)

    # Read the shorelines CSV files into DataFrame
    shorelines_df = pd.read_csv(shorelines_file_path)
    shorelines_df = shorelines_df.iloc[:, 1:]
    shorelines_df['dates'] = pd.to_datetime(shorelines_df['dates'])
    shorelines_df.set_index('dates', inplace=True)
    shorelines_df['years'] = shorelines_df.index.year
    shorelines_df['months'] = shorelines_df.index.month
    shorelines_df.index = pd.MultiIndex.from_tuples(
    [(year, month) for year, month in zip(shorelines_df.index.year, shorelines_df.index.month)],
    names=['years', 'months'])

   
    # Read the transects GeoJSON file into a GeoDataFrame
    transects_gdf = gpd.read_file(transects_file_path, driver='GeoJSON')

    # Add DataFrames to the dictionary with site name as key
    data[name] = {
        'waves': waves_df,
        'shorelines': shorelines_df,
        'transects': transects_gdf
    }

In [15]:
waves_df

Unnamed: 0_level_0,Unnamed: 1_level_0,MIRA_pp1d_from_N,MIRA_swh_from_N,MIRA_pp1d_from_NNE,MIRA_swh_from_NNE,MIRA_pp1d_from_NE,MIRA_swh_from_NE,MIRA_pp1d_from_ENE,MIRA_swh_from_ENE,MIRA_pp1d_from_E,MIRA_swh_from_E,...,MIRA_pp1d_from_WSW,MIRA_swh_from_WSW,MIRA_pp1d_from_W,MIRA_swh_from_W,MIRA_pp1d_from_WNW,MIRA_swh_from_WNW,MIRA_pp1d_from_NW,MIRA_swh_from_NW,MIRA_pp1d_from_NNW,MIRA_swh_from_NNW
years,months,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1984,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.0,0.0,0.0,0.0,11.810025,1.683745,0.0,0.0
1984,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.0,0.0,0.0,0.0,11.818320,1.692335,0.0,0.0
1984,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.0,0.0,0.0,0.0,11.826023,1.698778,0.0,0.0
1984,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.0,0.0,0.0,0.0,11.831356,1.703569,0.0,0.0
1984,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.0,0.0,0.0,0.0,11.833726,1.706873,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022,12,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,11.463697,3.416870,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0
2022,12,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,11.729146,3.524581,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0
2022,12,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,11.734775,3.472212,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0
2022,12,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,11.741293,3.392420,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0


In [13]:
data['TORR']['shorelines']

Unnamed: 0_level_0,Unnamed: 1_level_0,TORR_1,TORR_2,TORR_3,TORR_4,TORR_5,TORR_6,TORR_7,TORR_8,TORR_9,TORR_10,years,months
years,months,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1984,4,28.199852,27.857812,21.659705,68.567537,155.424034,167.288366,201.861150,365.596229,264.584021,191.368112,1984,4
1984,4,108.205425,67.909404,63.464646,108.032865,196.230766,208.808396,240.801582,410.720696,279.358381,229.439004,1984,4
1984,5,23.905950,21.591706,8.765710,59.761127,156.563893,174.609099,189.618025,366.513982,278.123575,185.279591,1984,5
1984,5,62.983236,58.327695,51.164384,103.886794,185.742724,195.761140,231.557481,393.618782,295.809669,213.150042,1984,5
1984,6,77.234300,43.599558,45.169561,91.305825,178.547410,194.633333,225.548081,390.151419,274.471132,215.073226,1984,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022,11,173.316393,91.003960,92.628162,127.466446,205.662522,215.617481,255.088643,392.634047,293.680964,207.483577,2022,11
2022,11,179.189722,90.467054,88.219415,129.486842,208.831646,217.055888,254.818872,406.313533,288.007809,205.895226,2022,11
2022,12,141.829550,65.652545,68.701217,108.641498,184.574416,190.444602,237.020617,385.907255,255.913321,196.741275,2022,12
2022,12,119.157270,78.867719,80.334237,118.975886,189.785496,199.457001,255.673885,377.107906,208.542536,192.453560,2022,12


In [5]:
# Initialize an empty dictionary to store the results
annual_data = {}

In [6]:
# Iterate over keys in the data dictionary
for name in data.keys():
    waves_df = data[name]['waves']

    waves_df = waves_df.drop(['years', 'months'], axis=1)
    
    waves_df_annual = waves_df.groupby([waves_df.index.get_level_values('years'), waves_df.index.get_level_values('months')]).agg(
           {
        f'{name}_pp1d_from_N'  : lambda x: x[x != 0].quantile(0.5) if any(x != 0) else None,
        f'{name}_swh_from_N'  : lambda x: x[x != 0].quantile(0.5) if any(x != 0) else None,
        f'{name}_pp1d_from_NNE'  : lambda x: x[x != 0].quantile(0.5) if any(x != 0) else None,
        f'{name}_swh_from_NNE'  : lambda x: x[x != 0].quantile(0.5) if any(x != 0) else None,
        f'{name}_pp1d_from_NE'  : lambda x: x[x != 0].quantile(0.5) if any(x != 0) else None,
        f'{name}_swh_from_NE'  : lambda x: x[x != 0].quantile(0.5) if any(x != 0) else None,
        f'{name}_pp1d_from_ENE'  : lambda x: x[x != 0].quantile(0.5) if any(x != 0) else None,
        f'{name}_swh_from_ENE'  : lambda x: x[x != 0].quantile(0.5) if any(x != 0) else None,
        f'{name}_pp1d_from_E'  : lambda x: x[x != 0].quantile(0.5) if any(x != 0) else None,
        f'{name}_swh_from_E'  : lambda x: x[x != 0].quantile(0.5) if any(x != 0) else None,
        f'{name}_pp1d_from_ESE'  : lambda x: x[x != 0].quantile(0.5) if any(x != 0) else None,
        f'{name}_swh_from_ESE'  : lambda x: x[x != 0].quantile(0.5) if any(x != 0) else None,
        f'{name}_pp1d_from_SE'  : lambda x: x[x != 0].quantile(0.5) if any(x != 0) else None,
        f'{name}_swh_from_SE'  : lambda x: x[x != 0].quantile(0.5) if any(x != 0) else None,
        f'{name}_pp1d_from_SSE'  : lambda x: x[x != 0].quantile(0.5) if any(x != 0) else None,
        f'{name}_swh_from_SSE'  : lambda x: x[x != 0].quantile(0.5) if any(x != 0) else None,
        f'{name}_pp1d_from_S'  : lambda x: x[x != 0].quantile(0.5) if any(x != 0) else None,
        f'{name}_swh_from_S'  : lambda x: x[x != 0].quantile(0.5) if any(x != 0) else None,
        f'{name}_pp1d_from_SSW'  : lambda x: x[x != 0].quantile(0.5) if any(x != 0) else None,
        f'{name}_swh_from_SSW'  : lambda x: x[x != 0].quantile(0.5) if any(x != 0) else None,
        f'{name}_pp1d_from_SW'  : lambda x: x[x != 0].quantile(0.5) if any(x != 0) else None,
        f'{name}_swh_from_SW'  : lambda x: x[x != 0].quantile(0.5) if any(x != 0) else None,
        f'{name}_pp1d_from_WSW'  : lambda x: x[x != 0].quantile(0.5) if any(x != 0) else None,
        f'{name}_swh_from_WSW'  : lambda x: x[x != 0].quantile(0.5) if any(x != 0) else None,
        f'{name}_pp1d_from_W'  : lambda x: x[x != 0].quantile(0.5) if any(x != 0) else None,
        f'{name}_swh_from_W'  : lambda x: x[x != 0].quantile(0.5) if any(x != 0) else None,
        f'{name}_pp1d_from_WNW'  : lambda x: x[x != 0].quantile(0.5) if any(x != 0) else None,
        f'{name}_swh_from_WNW'  : lambda x: x[x != 0].quantile(0.5) if any(x != 0) else None,
        f'{name}_pp1d_from_NW'  : lambda x: x[x != 0].quantile(0.5) if any(x != 0) else None,
        f'{name}_swh_from_NW'  : lambda x: x[x != 0].quantile(0.5) if any(x != 0) else None,
        f'{name}_pp1d_from_NNW'  : lambda x: x[x != 0].quantile(0.5) if any(x != 0) else None,
        f'{name}_swh_from_NNW'  : lambda x: x[x != 0].quantile(0.5) if any(x != 0) else None]
           })

    waves_df_annual = waves_df_annual.fillna(0)
    
    shoreline_df = data[name]['shorelines']

    # Create a MultiIndex with all possible combinations of years and months
    all_years = shoreline_df.index.get_level_values('years').unique()
    all_months = range(1, 13)
    all_combinations = [(year, month) for year in all_years for month in all_months]

    full_index = pd.MultiIndex.from_tuples(all_combinations, names=['years', 'months'])

    # Group by the MultiIndex and calculate the median
    shoreline_df_annual = shoreline_df.groupby(level=['years', 'months']).median(numeric_only=True)

    # Reindex with the full MultiIndex to fill missing combinations with NaN
    shoreline_df_annual = shoreline_df_annual.reindex(full_index)
    
    # Drop year and month columns
    shoreline_df_annual = shoreline_df_annual.drop(['years', 'months'], axis=1)

    # Iterate over each column in the DataFrame

    for i in range(1, len(shoreline_df_annual.columns) - 1):
        col = shoreline_df_annual.columns[i]
    
        # Skip columns with names "years" or "months"
        if col.lower() not in ['years', 'months']:
            prev_col = shoreline_df_annual.columns[i - 1] if i - 1 >= 0 else None
            next_col = shoreline_df_annual.columns[i + 1] if i + 1 < len(shoreline_df_annual.columns) else None

            # Check if there are any NaN values in the current column
            if shoreline_df_annual[col].isnull().any():
                # Fill NaN values with the mean of the available previous and next columns
                if prev_col is not None and next_col is not None:
                    shoreline_df_annual[col] = (shoreline_df_annual[prev_col] + shoreline_df_annual[next_col]) / 2
                elif prev_col is not None:
                    shoreline_df_annual[col] = shoreline_df_annual[prev_col]
                elif next_col is not None:
                    shoreline_df_annual[col] = shoreline_df_annual[next_col]
                else:
                    # If there are no immediate previous and next columns, extend the search to 3 columns
                    prev_cols = [shoreline_df_annual.columns[j] for j in range(i - 2, i) if j >= 0]
                    next_cols = [shoreline_df_annual.columns[j] for j in range(i + 1, i + 4) if j < len(shoreline_df_annual.columns)]

                    available_cols = prev_cols + next_cols

                    # Filter out None values (columns that are out of range)
                    available_cols = [col for col in available_cols if col is not None]

                    # Take the mean of available columns
                    if len(available_cols) > 0:
                        shoreline_df_annual[col] = shoreline_df_annual[available_cols].mean(axis=1)

    # Perform median replacement only for columns that are not "years" or "months"
    for column in shoreline_df_annual.columns:
        if column.lower() not in ['years', 'months']:
            # Check if there are any NaN values in the column
            if shoreline_df_annual[column].isnull().any():
                # Calculate the median value of the column (excluding NaN values)
                median_value = shoreline_df_annual[column].median()
        
                # Replace NaN values with the calculated median value
                shoreline_df_annual[column].fillna(median_value, inplace=True)
        
    # Exclude 'years' and 'months' columns for row-wise median calculation
    columns_to_consider = [col for col in shoreline_df_annual.columns if col.lower() not in ['years', 'months']]

    # Calculate the median for each row, excluding NaNs, across the specified columns
    row_median = shoreline_df_annual[columns_to_consider].median(axis=1)

    # Use apply along with lambda to replace NaN values in each row with the row's median
    shoreline_df_annual[columns_to_consider] = shoreline_df_annual[columns_to_consider].apply(
        lambda x: x.fillna(row_median[x.name]), axis=1)    
              
    # Ensure no NaNs are left before model training
    if shoreline_df_annual.isna().any().any():
        print(f"NaNs remain in shorelines data for {name}")
        continue  # Skip this iteration if NaNs are still present
            
                
    # Add the DataFrame to the dictionary with site name as key
    annual_data[name] = {
        'waves': waves_df_annual,
        'shorelines': shoreline_df_annual
    }

In [None]:
#annual_data['TORR']['waves']

In [None]:
#annual_data['CVCC']['shorelines']

In [7]:
# Combine standardized data into a single empty DataFrame with all months and years from the previous tables
combined_shorelines = pd.DataFrame(index=pd.MultiIndex.from_product(
    [shoreline_df.index.get_level_values(0).unique(), shoreline_df.index.get_level_values(1).unique()],
    names=['years', 'months']))


In [8]:
# Merge data from all sites - SHORELINES
for name in site_names:
    df_to_merge = annual_data[name]['shorelines']

    # Reset index if 'years' and 'months' are part of the index
    if 'years' in df_to_merge.index.names and 'months' in df_to_merge.index.names:
        df_to_merge = df_to_merge.reset_index()

    combined_shorelines = combined_shorelines.merge(df_to_merge, 
                                                   on=['years', 'months'], 
                                                   how='left')

# Handling NaNs - Group by 'years' and fill NaNs with the mean of the respective year
for column in combined_shorelines.columns:
    if column not in ['site', 'years', 'months']:
        combined_shorelines[column] = combined_shorelines.groupby('years')[column].transform(lambda x: x.fillna(x.mean()))


In [None]:
combined_shorelines

In [9]:
# making sure that the waves dataframe has the same dimensions as the shorelines

combined_waves = pd.DataFrame(index=pd.MultiIndex.from_product(
    [waves_df.index.get_level_values(0).unique(), waves_df.index.get_level_values(1).unique()],
    names=['years', 'months']))


In [11]:
annual_data['CCFT']['waves']

Unnamed: 0_level_0,Unnamed: 1_level_0,CCFT_pp1d_from_N,CCFT_swh_from_N,CCFT_pp1d_from_NNE,CCFT_swh_from_NNE,CCFT_pp1d_from_NE,CCFT_swh_from_NE,CCFT_pp1d_from_ENE,CCFT_swh_from_ENE,CCFT_pp1d_from_E,CCFT_swh_from_E,...,CCFT_pp1d_from_WSW,CCFT_swh_from_WSW,CCFT_pp1d_from_W,CCFT_swh_from_W,CCFT_pp1d_from_WNW,CCFT_swh_from_WNW,CCFT_pp1d_from_NW,CCFT_swh_from_NW,CCFT_pp1d_from_NNW,CCFT_swh_from_NNW
Unnamed: 0_level_1,Unnamed: 1_level_1,50th_quantile,50th_quantile,50th_quantile,50th_quantile,50th_quantile,50th_quantile,50th_quantile,50th_quantile,50th_quantile,50th_quantile,...,50th_quantile,50th_quantile,50th_quantile,50th_quantile,50th_quantile,50th_quantile,50th_quantile,50th_quantile,50th_quantile,50th_quantile
years,months,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
1984,1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,11.347860,2.497525,14.537097,2.851056,12.987213,2.683459
1984,2,8.124553,0.898709,10.150081,0.979162,4.325909,0.938027,12.076363,0.864512,12.188941,0.916551,...,0.000000,0.000000,11.780992,1.820531,10.590620,1.852415,13.017728,1.713646,12.319592,2.777541
1984,3,11.416148,1.212591,10.845699,1.179881,9.994397,1.151467,10.096162,1.828461,9.886262,1.731818,...,12.280782,1.696713,11.603828,1.136268,11.095743,1.845146,11.499841,2.191573,8.317714,1.289162
1984,4,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,10.436861,1.644427,12.143021,1.578842,11.779658,1.670776,9.794569,1.529034,10.800816,1.398195
1984,5,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,9.772054,2.445487,9.422171,1.687710,8.291199,1.530686
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022,8,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,8.361857,0.588461,8.272535,0.814704,8.989631,1.534320
2022,9,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,9.351217,2.712948,12.298557,2.003739,11.019308,1.208461,10.772375,1.537294,8.437996,1.850680
2022,10,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,9.328849,1.973673,11.224319,2.195207,11.098557,2.064533,11.716999,1.446846,10.217036,1.355821
2022,11,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,12.980251,2.894834,13.631282,2.850725,12.957736,2.471258,0.000000,0.000000


In [None]:
combined_waves

In [None]:
annual_data['NNOR']['waves']

In [None]:
# Convert MultiIndex to columns for combined_waves
combined_waves.reset_index(inplace=True)

for name in site_names:
    df_to_merge_waves = annual_data[name]['waves'].copy()

    # Convert MultiIndex to columns if needed
    if 'years' in df_to_merge_waves.index.names and 'months' in df_to_merge_waves.index.names:
        df_to_merge_waves.reset_index(inplace=True)

    # Check and print the structure of DataFrames
    print("combined_waves structure:", combined_waves.columns)
    print("df_to_merge_waves structure:", df_to_merge_waves.columns)

    # Merge operation
    combined_waves = combined_waves.merge(df_to_merge_waves, on=['years', 'months'], how='left')

In [None]:
df_to_merge_waves

In [None]:
# Merge data from all sites - WAVES
for name in site_names:
    df_to_merge_waves = annual_data[name]['waves']

    # Reset index if 'years' and 'months' are part of the index
    if 'years' in df_to_merge_waves.index.names and 'months' in df_to_merge_waves.index.names:
        df_to_merge_waves = df_to_merge_waves.reset_index()

    combined_waves = combined_waves.merge(df_to_merge_waves, 
                                                   on=['years', 'months'], 
                                                   how='left')

# Handling NaNs - Group by 'years' and fill NaNs with the mean of the respective year
# for column in combined_waves.columns:
#     if column not in ['site', 'years', 'months']:
#         combined_waves[column] = combined_waves.groupby('years')[column].transform(lambda x: x.fillna(x.mean()))


In [None]:
combined_waves

In [None]:
# Merge data from all sites - WAVES
for name in site_names:
    df_to_merge_waves = annual_data[name]['waves'].copy()

    # Reset index and sort
    df_to_merge_waves.reset_index(inplace=True)
    df_to_merge_waves.sort_index(inplace=True)

    # Ensure combined_waves is also properly formatted
    if 'years' in combined_waves.index.names and 'months' in combined_waves.index.names:
        combined_waves.reset_index(inplace=True, drop=True)
        combined_waves.sort_index(inplace=True)

    # Merge operation
    combined_waves = combined_waves.merge(df_to_merge_waves, on=['years', 'months'], how='left')

In [None]:
combined

In [None]:
print(combined_waves.index.levels)
print(annual_data['TORR']['waves'].index.levels)

# sao os mesmos niveis nao sei porque nao está a dar!!!!!!