In [1]:
import os
import pandas as pd
import numpy as np
import geopandas as gpd
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np
from sklearn.preprocessing import OneHotEncoder

In [2]:
# Paths and site names setup
waves_folder_path = "./dataset_Ondas"
shorelines_folder_path = "./dataset_linhascosta"
transects_folder_path = "./dataset_transects"
site_names = ['CVCC','CCFT','FTAD','ADLA','LABI',
              'TRAT','ATMC','MCCO','CCCL','NNOR',
              'MEIA','TORR','CVMR','MRMG','MGVR',
              'COSN','VAGR','GBHA','BARR','MIRA']

In [3]:
# Create an empty dictionary to store DataFrames
data = {}

In [4]:
# Loop through each file name
for name in site_names:
    # Construct the file paths
    waves_file_path = os.path.join(waves_folder_path, f"{name}_wave_timeseries.csv")
    shorelines_file_path = os.path.join(shorelines_folder_path, f"{name}_shoreline_timeseries.csv")
    transects_file_path = os.path.join(transects_folder_path, f"{name}_T.geojson")

    # Read the waves CSV files into DataFrame
    waves_df = pd.read_csv(waves_file_path, sep=',', header=0) # Set header=0 to use the first row as column headers
    
    waves_df['time'] = pd.to_datetime(waves_df['time'])
    waves_df.set_index('time', inplace=True)
    waves_df['years'] = waves_df.index.year
    waves_df['months'] = waves_df.index.month
    waves_df.index = pd.MultiIndex.from_tuples(
    [(year, month) for year, month in zip(waves_df.index.year, waves_df.index.month)],
    names=['years', 'months'])
    waves_df = waves_df[waves_df['years'] != 1983] # Remove 1983 because satellite data is not available for that year
    
    
    # List of directions (16 directions compass rose)
    directions = ['N', 'NNE', 'NE', 'ENE', 'E', 'ESE', 'SE', 'SSE', 'S', 'SSW', 'SW', 'WSW', 'W', 'WNW', 'NW', 'NNW']
    def degrees_to_direction(wave_direction_degrees):
        if wave_direction_degrees >= 0 and   wave_direction_degrees <= 11.25:
            return 'N'
        elif wave_direction_degrees <= 33.75:
            return 'NNE'
        elif wave_direction_degrees <= 56.25:
            return 'NE'
        elif wave_direction_degrees <= 78.75:
            return 'ENE'
        elif wave_direction_degrees <= 101.25:
            return 'E'
        elif wave_direction_degrees <= 123.75:
            return 'ESE'
        elif wave_direction_degrees <= 146.25:
            return 'SE'
        elif wave_direction_degrees <= 168.75:
            return 'SSE'
        elif wave_direction_degrees <= 191.25:
            return 'S'
        elif wave_direction_degrees <= 213.75:
            return 'SSW'
        elif wave_direction_degrees <= 236.25:
            return 'SW'
        elif wave_direction_degrees <= 258.75:
            return 'WSW'
        elif wave_direction_degrees <= 281.25:
            return 'W'
        elif wave_direction_degrees <= 303.75:
            return 'WNW'
        elif wave_direction_degrees <= 326.25:
            return 'NW'
        elif wave_direction_degrees <= 348.75:
            return 'NNW'
        elif wave_direction_degrees <= 360:
            return 'N'
        else:
            return 'false'

    # One-hot encode the 'mwd' column
    waves_df['mwd'] = waves_df['mwd'].apply(degrees_to_direction)

    # Create a DataFrame of dummy variables for 'mwd'
    one_hot_encode = pd.get_dummies(waves_df['mwd'], prefix='from')

    # Concatenate the one-hot encoded columns to the original DataFrame
    waves_df = pd.concat([waves_df, one_hot_encode], axis=1)
    waves_df = waves_df.drop('mwd', axis=1)

    # Iterate through directions and create new columns for each direction's pp1d and swh
    for direction in directions:
        # Create new columns for pp1d and swh
        pp1d_column_name = f'{name}_pp1d_from_{direction}'
        swh_column_name = f'{name}_swh_from_{direction}'
    
        # Use boolean indexing to set values based on the condition
        waves_df[pp1d_column_name] = waves_df['pp1d'] * waves_df[f'from_{direction}']
        waves_df[swh_column_name] = waves_df['swh'] * waves_df[f'from_{direction}']
    
    # Drop the original 'mwd' column and the 'pp1d' and 'swh' columns
    waves_df.drop(columns=[f'from_{direction}' for direction in directions], inplace=True)
    waves_df.drop(columns=['pp1d','swh'], inplace=True)

    # Read the shorelines CSV files into DataFrame
    shorelines_df = pd.read_csv(shorelines_file_path)
    shorelines_df = shorelines_df.iloc[:, 1:]
    shorelines_df['dates'] = pd.to_datetime(shorelines_df['dates'])
    shorelines_df.set_index('dates', inplace=True)
    shorelines_df['years'] = shorelines_df.index.year
    shorelines_df['months'] = shorelines_df.index.month
    shorelines_df.index = pd.MultiIndex.from_tuples(
    [(year, month) for year, month in zip(shorelines_df.index.year, shorelines_df.index.month)],
    names=['years', 'months'])

   
    # Read the transects GeoJSON file into a GeoDataFrame
    transects_gdf = gpd.read_file(transects_file_path, driver='GeoJSON')

    # Add DataFrames to the dictionary with site name as key
    data[name] = {
        'waves': waves_df,
        'shorelines': shorelines_df,
        'transects': transects_gdf
    }

In [5]:
# Initialize an empty dictionary to store the results
annual_data = {}

In [6]:
# Iterate over keys in the data dictionary
for name in data.keys():
    waves_df = data[name]['waves']

    waves_df = waves_df.drop(['years', 'months'], axis=1)
    
    waves_df_annual = waves_df.groupby([waves_df.index.get_level_values('years'), waves_df.index.get_level_values('months')]).agg(
           {
        f'{name}_pp1d_from_N'  : lambda x: x[x != 0].quantile(0.5) if any(x != 0) else None,
        f'{name}_swh_from_N'  : lambda x: x[x != 0].quantile(0.5) if any(x != 0) else None,
        f'{name}_pp1d_from_NNE'  : lambda x: x[x != 0].quantile(0.5) if any(x != 0) else None,
        f'{name}_swh_from_NNE'  : lambda x: x[x != 0].quantile(0.5) if any(x != 0) else None,
        f'{name}_pp1d_from_NE'  : lambda x: x[x != 0].quantile(0.5) if any(x != 0) else None,
        f'{name}_swh_from_NE'  : lambda x: x[x != 0].quantile(0.5) if any(x != 0) else None,
        f'{name}_pp1d_from_ENE'  : lambda x: x[x != 0].quantile(0.5) if any(x != 0) else None,
        f'{name}_swh_from_ENE'  : lambda x: x[x != 0].quantile(0.5) if any(x != 0) else None,
        f'{name}_pp1d_from_E'  : lambda x: x[x != 0].quantile(0.5) if any(x != 0) else None,
        f'{name}_swh_from_E'  : lambda x: x[x != 0].quantile(0.5) if any(x != 0) else None,
        f'{name}_pp1d_from_ESE'  : lambda x: x[x != 0].quantile(0.5) if any(x != 0) else None,
        f'{name}_swh_from_ESE'  : lambda x: x[x != 0].quantile(0.5) if any(x != 0) else None,
        f'{name}_pp1d_from_SE'  : lambda x: x[x != 0].quantile(0.5) if any(x != 0) else None,
        f'{name}_swh_from_SE'  : lambda x: x[x != 0].quantile(0.5) if any(x != 0) else None,
        f'{name}_pp1d_from_SSE'  : lambda x: x[x != 0].quantile(0.5) if any(x != 0) else None,
        f'{name}_swh_from_SSE'  : lambda x: x[x != 0].quantile(0.5) if any(x != 0) else None,
        f'{name}_pp1d_from_S'  : lambda x: x[x != 0].quantile(0.5) if any(x != 0) else None,
        f'{name}_swh_from_S'  : lambda x: x[x != 0].quantile(0.5) if any(x != 0) else None,
        f'{name}_pp1d_from_SSW'  : lambda x: x[x != 0].quantile(0.5) if any(x != 0) else None,
        f'{name}_swh_from_SSW'  : lambda x: x[x != 0].quantile(0.5) if any(x != 0) else None,
        f'{name}_pp1d_from_SW'  : lambda x: x[x != 0].quantile(0.5) if any(x != 0) else None,
        f'{name}_swh_from_SW'  : lambda x: x[x != 0].quantile(0.5) if any(x != 0) else None,
        f'{name}_pp1d_from_WSW'  : lambda x: x[x != 0].quantile(0.5) if any(x != 0) else None,
        f'{name}_swh_from_WSW'  : lambda x: x[x != 0].quantile(0.5) if any(x != 0) else None,
        f'{name}_pp1d_from_W'  : lambda x: x[x != 0].quantile(0.5) if any(x != 0) else None,
        f'{name}_swh_from_W'  : lambda x: x[x != 0].quantile(0.5) if any(x != 0) else None,
        f'{name}_pp1d_from_WNW'  : lambda x: x[x != 0].quantile(0.5) if any(x != 0) else None,
        f'{name}_swh_from_WNW'  : lambda x: x[x != 0].quantile(0.5) if any(x != 0) else None,
        f'{name}_pp1d_from_NW'  : lambda x: x[x != 0].quantile(0.5) if any(x != 0) else None,
        f'{name}_swh_from_NW'  : lambda x: x[x != 0].quantile(0.5) if any(x != 0) else None,
        f'{name}_pp1d_from_NNW'  : lambda x: x[x != 0].quantile(0.5) if any(x != 0) else None,
        f'{name}_swh_from_NNW'  : lambda x: x[x != 0].quantile(0.5) if any(x != 0) else None
        })

    waves_df_annual = waves_df_annual.fillna(0)
    
    shoreline_df = data[name]['shorelines']

    # Create a MultiIndex with all possible combinations of years and months
    all_years = shoreline_df.index.get_level_values('years').unique()
    all_months = range(1, 13)
    all_combinations = [(year, month) for year in all_years for month in all_months]

    full_index = pd.MultiIndex.from_tuples(all_combinations, names=['years', 'months'])

    # Group by the MultiIndex and calculate the median
    shoreline_df_annual = shoreline_df.groupby(level=['years', 'months']).median(numeric_only=True)

    # Reindex with the full MultiIndex to fill missing combinations with NaN
    shoreline_df_annual = shoreline_df_annual.reindex(full_index)
    
    # Drop year and month columns
    shoreline_df_annual = shoreline_df_annual.drop(['years', 'months'], axis=1)

    # Iterate over each column in the DataFrame

    for i in range(1, len(shoreline_df_annual.columns) - 1):
        col = shoreline_df_annual.columns[i]
    
        # Skip columns with names "years" or "months"
        if col.lower() not in ['years', 'months']:
            prev_col = shoreline_df_annual.columns[i - 1] if i - 1 >= 0 else None
            next_col = shoreline_df_annual.columns[i + 1] if i + 1 < len(shoreline_df_annual.columns) else None

            # Check if there are any NaN values in the current column
            if shoreline_df_annual[col].isnull().any():
                # Fill NaN values with the mean of the available previous and next columns
                if prev_col is not None and next_col is not None:
                    shoreline_df_annual[col] = (shoreline_df_annual[prev_col] + shoreline_df_annual[next_col]) / 2
                elif prev_col is not None:
                    shoreline_df_annual[col] = shoreline_df_annual[prev_col]
                elif next_col is not None:
                    shoreline_df_annual[col] = shoreline_df_annual[next_col]
                else:
                    # If there are no immediate previous and next columns, extend the search to 3 columns
                    prev_cols = [shoreline_df_annual.columns[j] for j in range(i - 2, i) if j >= 0]
                    next_cols = [shoreline_df_annual.columns[j] for j in range(i + 1, i + 4) if j < len(shoreline_df_annual.columns)]

                    available_cols = prev_cols + next_cols

                    # Filter out None values (columns that are out of range)
                    available_cols = [col for col in available_cols if col is not None]

                    # Take the mean of available columns
                    if len(available_cols) > 0:
                        shoreline_df_annual[col] = shoreline_df_annual[available_cols].mean(axis=1)

    # Perform median replacement only for columns that are not "years" or "months"
    for column in shoreline_df_annual.columns:
        if column.lower() not in ['years', 'months']:
            # Check if there are any NaN values in the column
            if shoreline_df_annual[column].isnull().any():
                # Calculate the median value of the column (excluding NaN values)
                median_value = shoreline_df_annual[column].median()
        
                # Replace NaN values with the calculated median value
                shoreline_df_annual[column].fillna(median_value, inplace=True)
        
    # Exclude 'years' and 'months' columns for row-wise median calculation
    columns_to_consider = [col for col in shoreline_df_annual.columns if col.lower() not in ['years', 'months']]

    # Calculate the median for each row, excluding NaNs, across the specified columns
    row_median = shoreline_df_annual[columns_to_consider].median(axis=1)

    # Use apply along with lambda to replace NaN values in each row with the row's median
    shoreline_df_annual[columns_to_consider] = shoreline_df_annual[columns_to_consider].apply(
        lambda x: x.fillna(row_median[x.name]), axis=1)    
              
    # Ensure no NaNs are left before model training
    if shoreline_df_annual.isna().any().any():
        print(f"NaNs remain in shorelines data for {name}")
        continue  # Skip this iteration if NaNs are still present
            
                
    # Add the DataFrame to the dictionary with site name as key
    annual_data[name] = {
        'waves': waves_df_annual,
        'shorelines': shoreline_df_annual
    }

In [None]:
annual_data['TORR']['shorelines']

In [7]:
# List of all wave dataframes
waves_dfs = []

# Loop through each beach's data
for name in annual_data:
    # Copy the wave DataFrame
    df = annual_data[name]['waves'].copy()

    # Reset index to turn MultiIndex into columns
    df = df.reset_index()

    # Rename columns to remove the beach prefix
    new_columns = {col: col.replace(f'{name}_', '') for col in df.columns}
    df.rename(columns=new_columns, inplace=True)

    # Add a column for the beach name
    df['beach'] = name

    # Reorder columns
    cols = df.columns.tolist()
    cols = cols[-1:] + cols[:-1]  # move 'beach' to the front
    df = df[cols]

    # Append to the list
    waves_dfs.append(df)

# Combine all dataframes into one
waves_combined = pd.concat(waves_dfs, ignore_index=True)

In [None]:
waves_combined

In [8]:
# List of all shoreline dataframes
shorelines_dfs = []

# Loop through each beach's data
for name in annual_data:
    # Copy the shoreline DataFrame
    df = annual_data[name]['shorelines'].copy()

    # Reset index to turn MultiIndex into columns
    df = df.reset_index()

    # Rename columns to use generic names
    new_columns = {col: f'{col.split("_")[-1]}' for col in df.columns}
    df.rename(columns=new_columns, inplace=True)

    # Add a column for the beach name
    df['beach'] = name

    # Reorder columns
    cols = df.columns.tolist()
    cols = cols[-1:] + cols[:-1]  # move 'beach' to the front
    df = df[cols]

    # Append to the list
    shorelines_dfs.append(df)

# Combine all dataframes into one
shorelines_combined = pd.concat(shorelines_dfs, ignore_index=True)

In [None]:
torr_shorelines = shorelines_combined[shorelines_combined['beach'] == 'TORR']
torr_shorelines

In [9]:
# Check if there are any NaN values in waves_combined
waves_nans = waves_combined.isna().any().any()

if waves_nans:
    print("There are NaN values in the waves_combined table.")
else:
    print("No NaN values in the waves_combined table.")

No NaN values in the waves_combined table.


In [10]:
# Check if there are any NaN values in shorelines_combined
shorelines_nans = shorelines_combined.isna().any().any()

if shorelines_nans:
    print("There are NaN values in the shorelines_combined table.")
else:
    print("No NaN values in the shorelines_combined table.")

No NaN values in the shorelines_combined table.


In [11]:
# Define a function to normalize columns by subtracting the value of month 1, year 1984
def normalize_by_reference(df):
    # Find the reference row (month 1, year 1984) for the current beach
    reference_row = df[(df['years'] == 1984) & (df['months'] == 1)]


    # Otherwise, subtract the reference value from all rows in the dataframe
    for col in df.columns:
        if col not in ['years', 'months', 'beach']:
            df[col] -= reference_row[col].values[0]

    return df

# Apply the normalization function to each beach group
shorelines_normalized = shorelines_combined.groupby('beach', group_keys=True).apply(normalize_by_reference).reset_index(drop=True)
shorelines_normalized

Unnamed: 0,beach,years,months,1,2,3,4,5,6,7,8,9,10
0,ADLA,1984,1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,ADLA,1984,2,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2,ADLA,1984,3,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3,ADLA,1984,4,18.235956,20.693159,18.540051,15.133533,14.589777,14.466450,13.992007,16.393314,11.129181,5.573088
4,ADLA,1984,5,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9355,VAGR,2022,8,10.804137,16.260811,-0.163789,-0.930297,10.643900,1.324937,0.999446,13.133079,5.687721,-2.328197
9356,VAGR,2022,9,1.041706,19.780664,-0.959917,-3.462635,9.349269,4.430923,2.758224,6.350465,-3.245225,-13.411474
9357,VAGR,2022,10,4.119814,23.835732,7.196150,1.242256,12.357720,12.165696,7.851352,20.625551,16.372356,11.548602
9358,VAGR,2022,11,-11.645939,-5.928786,-15.845567,-22.010816,-4.342059,-9.861837,-12.485654,-14.475632,-18.131533,-22.357994


In [None]:
torr_shorelines_normalized = shorelines_normalized[shorelines_normalized['beach'] == 'TORR']
torr_shorelines_normalized

In [12]:
# Merge the waves_combined and shorelines_normalized dataframes
# This will join the tables on 'years', 'months', and 'beach'
combined_data = pd.merge(waves_combined, shorelines_normalized, how='inner', on=['years', 'months', 'beach'])
combined_data

Unnamed: 0,beach,years,months,pp1d_from_N,swh_from_N,pp1d_from_NNE,swh_from_NNE,pp1d_from_NE,swh_from_NE,pp1d_from_ENE,...,1,2,3,4,5,6,7,8,9,10
0,CVCC,1984,1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,CVCC,1984,2,8.124553,0.898709,10.150081,0.979162,4.325909,0.938027,12.076363,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2,CVCC,1984,3,11.416148,1.212591,10.845699,1.179881,9.994397,1.151467,10.096162,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3,CVCC,1984,4,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,-117.597740,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4,CVCC,1984,5,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9355,MIRA,2022,8,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,7.684163,6.985749,8.469560,2.602996,14.479233,-2.095405,-34.434764,-44.139283,-28.326693,-16.634225
9356,MIRA,2022,9,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,-14.976500,-4.118891,1.947118,2.640465,5.385459,-16.721884,-40.496321,-53.866435,-40.018042,-30.289771
9357,MIRA,2022,10,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,8.311467,16.138400,17.186364,10.319524,15.116868,6.445768,-30.248808,-36.877415,-27.219336,-21.681380
9358,MIRA,2022,11,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,-39.317704,-27.213373,-24.844876,-20.613260,-13.874573,-28.102132,-51.060485,-65.600963,-55.998810,-50.516779


In [27]:
# One-hot encode the 'beach' column
encoder = OneHotEncoder(sparse=False)
beach_encoded = encoder.fit_transform(combined_data[['beach']])

# Create a DataFrame from the encoded array
beach_encoded_df = pd.DataFrame(beach_encoded, columns=encoder.get_feature_names_out(['beach']), index=combined_data.index)

# Drop the original 'beach' column from combined_data
combined_data = combined_data.drop('beach', axis=1)

# Concatenate the one-hot encoded beach column with combined_data
combined_data = pd.concat([combined_data, beach_encoded_df], axis=1)



In [35]:
# Splitting the combined data into training and testing sets

# Filter rows for training data (years <= 2014)
training_data = combined_data[combined_data['years'] <= 2014]

# Filter rows for testing data (years > 2014)
testing_data = combined_data[combined_data['years'] > 2014]

# Define the columns for the target variables
target_columns = ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10']

# For training set
x_train = training_data.drop(target_columns + ['years', 'months'], axis=1)
y_train = training_data[target_columns]

# For testing set
x_test = testing_data.drop(target_columns + ['years', 'months'], axis=1)
y_test = testing_data[target_columns]

In [40]:
x_test

Unnamed: 0,pp1d_from_N,swh_from_N,pp1d_from_NNE,swh_from_NNE,pp1d_from_NE,swh_from_NE,pp1d_from_ENE,swh_from_ENE,pp1d_from_E,swh_from_E,...,beach_LABI,beach_MCCO,beach_MEIA,beach_MGVR,beach_MIRA,beach_MRMG,beach_NNOR,beach_TORR,beach_TRAT,beach_VAGR
372,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
373,11.011012,1.164683,11.073523,1.369202,10.976942,1.059284,0.0,0.0,10.958426,1.115618,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
374,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
375,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
376,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9355,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
9356,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
9357,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
9358,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [26]:
x_test

Unnamed: 0,beach,pp1d_from_N,swh_from_N,pp1d_from_NNE,swh_from_NNE,pp1d_from_NE,swh_from_NE,pp1d_from_ENE,swh_from_ENE,pp1d_from_E,...,pp1d_from_WSW,swh_from_WSW,pp1d_from_W,swh_from_W,pp1d_from_WNW,swh_from_WNW,pp1d_from_NW,swh_from_NW,pp1d_from_NNW,swh_from_NNW
372,CVCC,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,...,0.000000,0.000000,10.491076,1.198384,13.541367,1.905114,14.648787,2.826441,11.905569,2.097821
373,CVCC,11.011012,1.164683,11.073523,1.369202,10.976942,1.059284,0.0,0.0,10.958426,...,13.551736,0.804875,13.325838,0.783068,13.400940,2.766638,12.943959,2.771264,10.992348,2.137387
374,CVCC,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,...,0.000000,0.000000,12.071030,2.317126,13.691127,2.108807,13.690386,2.240968,10.917246,2.451765
375,CVCC,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,...,10.887028,2.006135,11.129812,1.725045,12.616592,1.559513,11.381337,1.471296,12.660587,2.522553
376,CVCC,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,...,9.955587,3.200043,10.760673,1.901975,10.024467,1.501032,11.145662,1.728762,8.764474,1.700430
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9355,MIRA,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,...,0.000000,0.000000,0.000000,0.000000,8.188249,0.664454,8.457845,1.108349,8.790249,1.480382
9356,MIRA,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,...,12.173240,2.308535,9.700359,1.217052,10.688978,1.292548,10.430787,1.813675,8.112554,2.067011
9357,MIRA,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,...,10.234812,2.558815,11.438960,2.600941,10.592397,1.730166,11.204914,1.609404,11.528874,1.703321
9358,MIRA,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,...,14.634862,2.579548,12.441947,2.454408,13.128974,3.174024,12.206421,2.616470,0.000000,0.000000


In [41]:
# Create an instance of the DecisionTreeRegressor model
model = DecisionTreeRegressor()

In [42]:
# Train the model
model.fit(x_train, y_train)

In [44]:
# Make predictions on the test set
y_pred = model.predict(x_test)


In [46]:
# Calculate and print the metrics
mse = mean_squared_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred, squared=False)
rmse

26.527286898965592

In [None]:
# cross validation com time series split

In [55]:
# baseline dummy que prevê o ano a seguir com base no anterior (igual)

# Splitting the combined data into training and testing sets - Naive model to compare

# Train for 2021
training_data_naive = combined_data[combined_data['years'] == 2021]

# Test for 2022
testing_data_naive = combined_data[combined_data['years'] == 2022]

# Define the columns for the target variables
target_columns = ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10']

# For training set
x_train_naive = training_data_naive.drop(target_columns + ['years', 'months'], axis=1)
y_train_naive = training_data_naive[target_columns]

# For testing set
x_test_naive = testing_data_naive.drop(target_columns + ['years', 'months'], axis=1)
y_test_naive = testing_data_naive[target_columns]

In [56]:
# Create an instance of the DecisionTreeRegressor model
model_naive = DecisionTreeRegressor()

In [57]:
# Train the model
model_naive.fit(x_train_naive, y_train_naive)

In [58]:
# Make predictions on the test set
y_pred_naive = model_naive.predict(x_test_naive)

In [60]:
# Calculate and print the metrics for the naive model
mse_naive = mean_squared_error(y_test_naive, y_pred_naive)
rmse_naive = mean_squared_error(y_test_naive, y_pred_naive, squared=False)
rmse_naive

14.8933687600227