# Capstone Project - Wave whisperers (Group 6)

In this notebook we will cover the following:
    1. 

# Imports

In [1]:
import os
import pandas as pd
import numpy as np
import geopandas as gpd
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import TimeSeriesSplit

# Data collection and preprocessing

Data is retrieved after the previous extraction on waves and shorelines for the 20 selected sites:
- Shoreline data was retrieved using CoastSat, an open-source software toolkit that enables users to obtain time-series of shoreline position at any coastal stretch from publicly available satellite imagery.
- Wave data was extracted from the ERA5 programme dataset for meteorological and global climate reanalysis, available at the European Centre for Medium-Range Weather Forecasts (ECMWF) website.

After loading the data from the csv's, wave and shoreline data are manipulated with a loop through each site name, structuring waves_df and shorelines_df into a combined dictionary.
Regarding waves, a deeper preprocessing is required in order to convert degrees into directions and considering both height and period.

In [2]:
# Paths and site names setup
waves_folder_path = "./dataset_Ondas"
shorelines_folder_path = "./dataset_linhascosta"
transects_folder_path = "./dataset_transects"
site_names = ['CVCC','CCFT','FTAD','ADLA','LABI',
              'TRAT','ATMC','MCCO','CCCL','NNOR',
              'MEIA','TORR','CVMR','MRMG','MGVR',
              'COSN','VAGR','GBHA','BARR','MIRA']

In [3]:
# Create an empty dictionary to store DataFrames
data = {}

In [4]:
# Loop through each file name
for name in site_names:
    # Construct the file paths
    waves_file_path = os.path.join(waves_folder_path, f"{name}_wave_timeseries.csv")
    shorelines_file_path = os.path.join(shorelines_folder_path, f"{name}_shoreline_timeseries.csv")
    transects_file_path = os.path.join(transects_folder_path, f"{name}_T.geojson")

    # Read the waves CSV files into DataFrame
    waves_df = pd.read_csv(waves_file_path, sep=',', header=0) # Set header=0 to use the first row as column headers
    
    waves_df['time'] = pd.to_datetime(waves_df['time'])
    waves_df.set_index('time', inplace=True)
    waves_df['years'] = waves_df.index.year
    waves_df['months'] = waves_df.index.month
    waves_df.index = pd.MultiIndex.from_tuples(
    [(year, month) for year, month in zip(waves_df.index.year, waves_df.index.month)],
    names=['years', 'months'])
    waves_df = waves_df[waves_df['years'] != 1983] # Remove 1983 because satellite data is not available for that year
    
    
    # List of directions (16 directions compass rose)
    directions = ['N', 'NNE', 'NE', 'ENE', 'E', 'ESE', 'SE', 'SSE', 'S', 'SSW', 'SW', 'WSW', 'W', 'WNW', 'NW', 'NNW']
    def degrees_to_direction(wave_direction_degrees):
        if wave_direction_degrees >= 0 and   wave_direction_degrees <= 11.25:
            return 'N'
        elif wave_direction_degrees <= 33.75:
            return 'NNE'
        elif wave_direction_degrees <= 56.25:
            return 'NE'
        elif wave_direction_degrees <= 78.75:
            return 'ENE'
        elif wave_direction_degrees <= 101.25:
            return 'E'
        elif wave_direction_degrees <= 123.75:
            return 'ESE'
        elif wave_direction_degrees <= 146.25:
            return 'SE'
        elif wave_direction_degrees <= 168.75:
            return 'SSE'
        elif wave_direction_degrees <= 191.25:
            return 'S'
        elif wave_direction_degrees <= 213.75:
            return 'SSW'
        elif wave_direction_degrees <= 236.25:
            return 'SW'
        elif wave_direction_degrees <= 258.75:
            return 'WSW'
        elif wave_direction_degrees <= 281.25:
            return 'W'
        elif wave_direction_degrees <= 303.75:
            return 'WNW'
        elif wave_direction_degrees <= 326.25:
            return 'NW'
        elif wave_direction_degrees <= 348.75:
            return 'NNW'
        elif wave_direction_degrees <= 360:
            return 'N'
        else:
            return 'false'

    # One-hot encode the 'mwd' column
    waves_df['mwd'] = waves_df['mwd'].apply(degrees_to_direction)

    # Create a DataFrame of dummy variables for 'mwd'
    one_hot_encode = pd.get_dummies(waves_df['mwd'], prefix='from')

    # Concatenate the one-hot encoded columns to the original DataFrame
    waves_df = pd.concat([waves_df, one_hot_encode], axis=1)
    waves_df = waves_df.drop('mwd', axis=1)

    # Iterate through directions and create new columns for each direction's pp1d and swh
    for direction in directions:
        # Create new columns for pp1d and swh
        pp1d_column_name = f'{name}_pp1d_from_{direction}'
        swh_column_name = f'{name}_swh_from_{direction}'
    
        # Use boolean indexing to set values based on the condition
        waves_df[pp1d_column_name] = waves_df['pp1d'] * waves_df[f'from_{direction}']
        waves_df[swh_column_name] = waves_df['swh'] * waves_df[f'from_{direction}']
    
    # Drop the original 'mwd' column and the 'pp1d' and 'swh' columns
    waves_df.drop(columns=[f'from_{direction}' for direction in directions], inplace=True)
    waves_df.drop(columns=['pp1d','swh'], inplace=True)

    # Read the shorelines CSV files into DataFrame
    shorelines_df = pd.read_csv(shorelines_file_path)
    shorelines_df = shorelines_df.iloc[:, 1:]
    shorelines_df['dates'] = pd.to_datetime(shorelines_df['dates'])
    shorelines_df.set_index('dates', inplace=True)
    shorelines_df['years'] = shorelines_df.index.year
    shorelines_df['months'] = shorelines_df.index.month
    shorelines_df.index = pd.MultiIndex.from_tuples(
    [(year, month) for year, month in zip(shorelines_df.index.year, shorelines_df.index.month)],
    names=['years', 'months'])

   
    # Read the transects GeoJSON file into a GeoDataFrame
    transects_gdf = gpd.read_file(transects_file_path, driver='GeoJSON')

    # Add DataFrames to the dictionary with site name as key
    data[name] = {
        'waves': waves_df,
        'shorelines': shorelines_df,
        'transects': transects_gdf
    }

# Data Aggregation and Missing Data Handling

As seen in the Exploratory Data Analysis, regarding shoreline data (available since 1984 for all sites), there is no defined periodicity observed, since there could be many observations in the same month or none.
Regarding wave data, periodicity is hourly (and the availability is since before 1984, but it was only considered since then).
In order to define the same periodicity for both datasets, it was defined that the data would be aggregated by year and month.
To do this aggregation, median values by year and month were considered.

Regarding NaN handling:
- Wave data: NaNs were filled with 0's. This is because NaNs occur when creating the columns for heigh and period, when they're not specified. And in this case we considered it to be 0.
- Shoreline data: 

In [5]:
# Initialize an empty dictionary to store the results
annual_data = {}

In [6]:
# Iterate over keys in the data dictionary
for name in data.keys():
    waves_df = data[name]['waves']

    waves_df = waves_df.drop(['years', 'months'], axis=1)
    
    waves_df_annual = waves_df.groupby([waves_df.index.get_level_values('years'), waves_df.index.get_level_values('months')]).agg(
           {
        f'{name}_pp1d_from_N'  : lambda x: x[x != 0].quantile(0.5) if any(x != 0) else None,
        f'{name}_swh_from_N'  : lambda x: x[x != 0].quantile(0.5) if any(x != 0) else None,
        f'{name}_pp1d_from_NNE'  : lambda x: x[x != 0].quantile(0.5) if any(x != 0) else None,
        f'{name}_swh_from_NNE'  : lambda x: x[x != 0].quantile(0.5) if any(x != 0) else None,
        f'{name}_pp1d_from_NE'  : lambda x: x[x != 0].quantile(0.5) if any(x != 0) else None,
        f'{name}_swh_from_NE'  : lambda x: x[x != 0].quantile(0.5) if any(x != 0) else None,
        f'{name}_pp1d_from_ENE'  : lambda x: x[x != 0].quantile(0.5) if any(x != 0) else None,
        f'{name}_swh_from_ENE'  : lambda x: x[x != 0].quantile(0.5) if any(x != 0) else None,
        f'{name}_pp1d_from_E'  : lambda x: x[x != 0].quantile(0.5) if any(x != 0) else None,
        f'{name}_swh_from_E'  : lambda x: x[x != 0].quantile(0.5) if any(x != 0) else None,
        f'{name}_pp1d_from_ESE'  : lambda x: x[x != 0].quantile(0.5) if any(x != 0) else None,
        f'{name}_swh_from_ESE'  : lambda x: x[x != 0].quantile(0.5) if any(x != 0) else None,
        f'{name}_pp1d_from_SE'  : lambda x: x[x != 0].quantile(0.5) if any(x != 0) else None,
        f'{name}_swh_from_SE'  : lambda x: x[x != 0].quantile(0.5) if any(x != 0) else None,
        f'{name}_pp1d_from_SSE'  : lambda x: x[x != 0].quantile(0.5) if any(x != 0) else None,
        f'{name}_swh_from_SSE'  : lambda x: x[x != 0].quantile(0.5) if any(x != 0) else None,
        f'{name}_pp1d_from_S'  : lambda x: x[x != 0].quantile(0.5) if any(x != 0) else None,
        f'{name}_swh_from_S'  : lambda x: x[x != 0].quantile(0.5) if any(x != 0) else None,
        f'{name}_pp1d_from_SSW'  : lambda x: x[x != 0].quantile(0.5) if any(x != 0) else None,
        f'{name}_swh_from_SSW'  : lambda x: x[x != 0].quantile(0.5) if any(x != 0) else None,
        f'{name}_pp1d_from_SW'  : lambda x: x[x != 0].quantile(0.5) if any(x != 0) else None,
        f'{name}_swh_from_SW'  : lambda x: x[x != 0].quantile(0.5) if any(x != 0) else None,
        f'{name}_pp1d_from_WSW'  : lambda x: x[x != 0].quantile(0.5) if any(x != 0) else None,
        f'{name}_swh_from_WSW'  : lambda x: x[x != 0].quantile(0.5) if any(x != 0) else None,
        f'{name}_pp1d_from_W'  : lambda x: x[x != 0].quantile(0.5) if any(x != 0) else None,
        f'{name}_swh_from_W'  : lambda x: x[x != 0].quantile(0.5) if any(x != 0) else None,
        f'{name}_pp1d_from_WNW'  : lambda x: x[x != 0].quantile(0.5) if any(x != 0) else None,
        f'{name}_swh_from_WNW'  : lambda x: x[x != 0].quantile(0.5) if any(x != 0) else None,
        f'{name}_pp1d_from_NW'  : lambda x: x[x != 0].quantile(0.5) if any(x != 0) else None,
        f'{name}_swh_from_NW'  : lambda x: x[x != 0].quantile(0.5) if any(x != 0) else None,
        f'{name}_pp1d_from_NNW'  : lambda x: x[x != 0].quantile(0.5) if any(x != 0) else None,
        f'{name}_swh_from_NNW'  : lambda x: x[x != 0].quantile(0.5) if any(x != 0) else None
        })

    waves_df_annual = waves_df_annual.fillna(0)
    
    shoreline_df = data[name]['shorelines']

    # Create a MultiIndex with all possible combinations of years and months
    all_years = range(1984, 2023)
    all_months = range(1, 13)
    all_combinations = [(year, month) for year in all_years for month in all_months]

    full_index = pd.MultiIndex.from_tuples(all_combinations, names=['years', 'months'])

    # Group by the MultiIndex and calculate the median
    shoreline_df_annual = shoreline_df.groupby(level=['years', 'months']).median(numeric_only=True)

    # Reindex with the full MultiIndex to fill missing combinations with NaN
    shoreline_df_annual = shoreline_df_annual.reindex(full_index)
    
    # Check for columns that only have NaN values after reindexing
    empty_columns_after_reindexing = shoreline_df_annual.columns[shoreline_df_annual.isnull().all()]

   # Drop year and month columns
    shoreline_df_annual = shoreline_df_annual.drop(['years', 'months'], axis=1)

        # Iterate over each column in the DataFrame 

    # Iterate over each column in the DataFrame
    for i in range(1, len(shoreline_df_annual.columns) - 1):
        col = shoreline_df_annual.columns[i]

        # Skip columns with names "years" or "months"
        if col.lower() not in ['years', 'months']:
            for idx in shoreline_df_annual[col][shoreline_df_annual[col].isnull()].index:
                prev_col = shoreline_df_annual.columns[i - 1] if i - 1 >= 0 else None
                next_col = shoreline_df_annual.columns[i + 1] if i + 1 < len(shoreline_df_annual.columns) else None

                # Check if adjacent columns have non-NaN values and use them for filling NaNs
                if prev_col and next_col:
                    prev_val = shoreline_df_annual.at[idx, prev_col]
                    next_val = shoreline_df_annual.at[idx, next_col]
                    if pd.notnull(prev_val) and pd.notnull(next_val):
                        shoreline_df_annual.at[idx, col] = (prev_val + next_val) / 2
                    elif pd.notnull(prev_val):
                        shoreline_df_annual.at[idx, col] = prev_val
                    elif pd.notnull(next_val):
                        shoreline_df_annual.at[idx, col] = next_val
                elif prev_col:
                    prev_val = shoreline_df_annual.at[idx, prev_col]
                    if pd.notnull(prev_val):
                        shoreline_df_annual.at[idx, col] = prev_val
                elif next_col:
                    next_val = shoreline_df_annual.at[idx, next_col]
                    if pd.notnull(next_val):
                        shoreline_df_annual.at[idx, col] = next_val

    # Perform median replacement only for columns that are not "years" or "months"
    for column in shoreline_df_annual.columns:
        if column.lower() not in ['years', 'months']:
            # Check if there are any NaN values in the column
            if shoreline_df_annual[column].isnull().any():
                # Calculate the median value of the column (excluding NaN values)
                median_value = shoreline_df_annual[column].median()
        
                # Replace NaN values with the calculated median value
                shoreline_df_annual[column].fillna(median_value, inplace=True)

    # Ensure no NaNs are left before model training
    if shoreline_df_annual.isna().any().any():
        print(f"NaNs remain in shorelines data for {name}")
        continue  # Skip this iteration if NaNs are still present

                
    # Add the DataFrame to the dictionary with site name as key
    annual_data[name] = {
        'waves': waves_df_annual,
        'shorelines': shoreline_df_annual
    }

In [None]:
annual_data['TORR']['waves']

In [7]:
# List of all wave dataframes
waves_dfs = []

# Loop through each beach's data
for name in annual_data:
    # Copy the wave DataFrame
    df = annual_data[name]['waves'].copy()

    # Reset index to turn MultiIndex into columns
    df = df.reset_index()

    # Rename columns to remove the beach prefix
    new_columns = {col: col.replace(f'{name}_', '') for col in df.columns}
    df.rename(columns=new_columns, inplace=True)

    # Add a column for the beach name
    df['beach'] = name

    # Reorder columns
    cols = df.columns.tolist()
    cols = cols[-1:] + cols[:-1]  # move 'beach' to the front
    df = df[cols]

    # Append to the list
    waves_dfs.append(df)

# Combine all dataframes into one
waves_combined = pd.concat(waves_dfs, ignore_index=True)

In [9]:
waves_combined

Unnamed: 0,beach,years,months,pp1d_from_N,swh_from_N,pp1d_from_NNE,swh_from_NNE,pp1d_from_NE,swh_from_NE,pp1d_from_ENE,...,pp1d_from_WSW,swh_from_WSW,pp1d_from_W,swh_from_W,pp1d_from_WNW,swh_from_WNW,pp1d_from_NW,swh_from_NW,pp1d_from_NNW,swh_from_NNW
0,CVCC,1984,1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,11.347860,2.497525,14.537097,2.851056,12.987213,2.683459
1,CVCC,1984,2,8.124553,0.898709,10.150081,0.979162,4.325909,0.938027,12.076363,...,0.000000,0.000000,11.780992,1.820531,10.590620,1.852415,13.017728,1.713646,12.319592,2.777541
2,CVCC,1984,3,11.416148,1.212591,10.845699,1.179881,9.994397,1.151467,10.096162,...,12.280782,1.696713,11.603828,1.136268,11.095743,1.845146,11.499841,2.191573,8.317714,1.289162
3,CVCC,1984,4,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,10.436861,1.644427,12.143021,1.578842,11.779658,1.670776,9.794569,1.529034,10.800816,1.398195
4,CVCC,1984,5,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,9.772054,2.445487,9.422171,1.687710,8.291199,1.530686
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9355,MIRA,2022,8,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,8.188249,0.664454,8.457845,1.108349,8.790249,1.480382
9356,MIRA,2022,9,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,12.173240,2.308535,9.700359,1.217052,10.688978,1.292548,10.430787,1.813675,8.112554,2.067011
9357,MIRA,2022,10,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,10.234812,2.558815,11.438960,2.600941,10.592397,1.730166,11.204914,1.609404,11.528874,1.703321
9358,MIRA,2022,11,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,14.634862,2.579548,12.441947,2.454408,13.128974,3.174024,12.206421,2.616470,0.000000,0.000000


In [8]:
# List of all shoreline dataframes
shorelines_dfs = []

# Loop through each beach's data
for name in annual_data:
    # Copy the shoreline DataFrame
    df = annual_data[name]['shorelines'].copy()

    # Reset index to turn MultiIndex into columns
    df = df.reset_index()

    # Rename columns to use generic names
    new_columns = {col: f'{col.split("_")[-1]}' for col in df.columns}
    df.rename(columns=new_columns, inplace=True)

    # Add a column for the beach name
    df['beach'] = name

    # Reorder columns
    cols = df.columns.tolist()
    cols = cols[-1:] + cols[:-1]  # move 'beach' to the front
    df = df[cols]

    # Append to the list
    shorelines_dfs.append(df)

# Combine all dataframes into one
shorelines_combined = pd.concat(shorelines_dfs, ignore_index=True)

In [10]:
torr_shorelines = shorelines_combined[shorelines_combined['beach'] == 'TORR']
torr_shorelines

Unnamed: 0,beach,years,months,1,2,3,4,5,6,7,8,9,10
5148,TORR,1984,1,127.341791,64.146577,70.547531,115.017662,193.767937,201.470491,226.721756,389.911991,291.977818,211.161585
5149,TORR,1984,2,127.341791,64.146577,70.547531,115.017662,193.767937,201.470491,226.721756,389.911991,291.977818,211.161585
5150,TORR,1984,3,127.341791,64.146577,70.547531,115.017662,193.767937,201.470491,226.721756,389.911991,291.977818,211.161585
5151,TORR,1984,4,68.202639,47.883608,42.562175,88.300201,175.827400,188.048381,221.331366,388.158462,271.971201,210.403558
5152,TORR,1984,5,43.444593,39.959700,29.965047,81.823960,171.153308,185.185120,210.587753,380.066382,286.966622,199.214816
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5611,TORR,2022,8,181.887574,83.798995,77.919675,126.238288,196.958431,219.431143,236.212974,407.935886,340.655383,215.041576
5612,TORR,2022,9,180.021687,84.210283,81.297094,129.860962,201.877167,221.104556,249.945333,408.507229,330.339002,215.707524
5613,TORR,2022,10,172.687428,86.963107,82.404846,132.245063,204.154545,226.140038,251.290478,391.995482,296.135846,215.890059
5614,TORR,2022,11,173.316393,79.779173,80.172242,120.359394,197.344095,211.940968,246.774350,392.602328,296.745704,204.052931


In [11]:
# Define a function to normalize columns by subtracting the value of month 1, year 1984
def normalize_by_reference(df):
    # Find the reference row (month 1, year 1984) for the current beach
    reference_row = df[(df['years'] == 1984) & (df['months'] == 1)]


    # Otherwise, subtract the reference value from all rows in the dataframe
    for col in df.columns:
        if col not in ['years', 'months', 'beach']:
            df[col] -= reference_row[col].values[0]

    return df

# Apply the normalization function to each beach group
shorelines_normalized = shorelines_combined.groupby('beach', group_keys=True).apply(normalize_by_reference).reset_index(drop=True)
shorelines_normalized

Unnamed: 0,beach,years,months,1,2,3,4,5,6,7,8,9,10
0,ADLA,1984,1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,ADLA,1984,2,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2,ADLA,1984,3,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3,ADLA,1984,4,18.235956,16.544675,24.035282,17.126782,12.230469,12.620807,16.279454,13.363555,19.099203,5.573088
4,ADLA,1984,5,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9355,VAGR,2022,8,10.804137,46.990446,19.388448,-16.260244,-2.908776,21.484457,-8.514907,-2.115407,20.072137,-2.328197
9356,VAGR,2022,9,1.041706,15.288826,36.190586,-21.372355,-7.177324,21.427532,-1.008303,-1.703837,4.748132,-13.411474
9357,VAGR,2022,10,4.119814,35.017437,41.222614,-9.115288,-5.923609,22.739544,11.452791,0.747645,28.205175,11.548602
9358,VAGR,2022,11,-11.645939,-1.866492,-2.540668,-25.434204,-29.388037,12.593057,-15.902495,-17.898834,-21.660184,-22.357994


In [12]:
torr_shorelines_normalized = shorelines_normalized[shorelines_normalized['beach'] == 'TORR']
torr_shorelines_normalized

Unnamed: 0,beach,years,months,1,2,3,4,5,6,7,8,9,10
7956,TORR,1984,1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
7957,TORR,1984,2,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
7958,TORR,1984,3,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
7959,TORR,1984,4,-59.139153,-16.262969,-27.985356,-26.717461,-17.940537,-13.422110,-5.390391,-1.753529,-20.006618,-0.758027
7960,TORR,1984,5,-83.897198,-24.186877,-40.582484,-33.193701,-22.614628,-16.285372,-16.134003,-9.845609,-5.011197,-11.946769
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8419,TORR,2022,8,54.545783,19.652418,7.372144,11.220626,3.190495,17.960652,9.491218,18.023894,48.677564,3.879991
8420,TORR,2022,9,52.679896,20.063706,10.749562,14.843300,8.109230,19.634065,23.223577,18.595237,38.361184,4.545939
8421,TORR,2022,10,45.345637,22.816529,11.857315,17.227401,10.386608,24.669546,24.568722,2.083491,4.158028,4.728473
8422,TORR,2022,11,45.974602,15.632595,9.624711,5.341732,3.576158,10.470477,20.052594,2.690337,4.767886,-7.108654


In [13]:
# Merge the waves_combined and shorelines_normalized dataframes
# This will join the tables on 'years', 'months', and 'beach'
combined_data_1 = pd.merge(waves_combined, shorelines_normalized, how='inner', on=['years', 'months', 'beach'])
combined_data_1

Unnamed: 0,beach,years,months,pp1d_from_N,swh_from_N,pp1d_from_NNE,swh_from_NNE,pp1d_from_NE,swh_from_NE,pp1d_from_ENE,...,1,2,3,4,5,6,7,8,9,10
0,CVCC,1984,1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,CVCC,1984,2,8.124553,0.898709,10.150081,0.979162,4.325909,0.938027,12.076363,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2,CVCC,1984,3,11.416148,1.212591,10.845699,1.179881,9.994397,1.151467,10.096162,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3,CVCC,1984,4,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,-117.597740,51.136046,138.685651,131.161611,127.123823,156.872864,186.447649,256.959403,288.838791,0.000000
4,CVCC,1984,5,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9355,MIRA,2022,8,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,7.684163,-1.958026,3.408113,10.227749,-3.387626,30.818624,-29.271880,-63.683315,-54.158489,-16.634225
9356,MIRA,2022,9,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,-14.976500,-8.291064,3.859495,8.287506,3.209754,12.593605,-49.431063,-61.179950,-67.551234,-30.289771
9357,MIRA,2022,10,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,8.311467,7.213318,21.086111,18.508708,3.328626,24.377364,-12.827168,-63.852576,-43.820709,-21.681380
9358,MIRA,2022,11,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,-39.317704,-23.573551,-17.988263,-22.202000,-16.505702,-2.672734,-52.931528,-70.928031,-80.456126,-50.516779


In [15]:
# One-hot encode the 'beach' column
encoder = OneHotEncoder(sparse=False)
beach_encoded = encoder.fit_transform(combined_data_1[['beach']])

# Create a DataFrame from the encoded array
beach_encoded_df = pd.DataFrame(beach_encoded, columns=encoder.get_feature_names_out(['beach']), index=combined_data_1.index)

# Drop the original 'beach' column from combined_data
combined_data = combined_data_1.drop('beach', axis=1)

# Concatenate the one-hot encoded beach column with combined_data
combined_data = pd.concat([combined_data, beach_encoded_df], axis=1)



In [16]:
# Splitting the combined data into training and testing sets

# Filter rows for training data (years <= 2014)
training_data = combined_data[combined_data['years'] <= 2014]

# Filter rows for testing data (years > 2014)
testing_data = combined_data[combined_data['years'] > 2014]

# Define the columns for the target variables
target_columns = ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10']

# For training set
x_train = training_data.drop(target_columns + ['years', 'months'], axis=1)
y_train = training_data[target_columns]

# For testing set
x_test = testing_data.drop(target_columns + ['years', 'months'], axis=1)
y_test = testing_data[target_columns]

In [None]:
x_test

In [17]:
# Create an instance of the DecisionTreeRegressor model
model = DecisionTreeRegressor()

In [18]:
# Train the model
model.fit(x_train, y_train)

In [19]:
# Make predictions on the test set
y_pred = model.predict(x_test)


In [20]:
# Calculate and print the metrics
mse = mean_squared_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred, squared=False)
rmse

37.60418306152313

In [None]:
# cross validation com time series split

In [21]:
# baseline dummy que prevê o ano a seguir com base no anterior (igual)

# Splitting the combined data into training and testing sets - Naive model to compare

# Train for 2021
training_data_naive = combined_data_1[combined_data_1['years'] == 2021]

# Test for 2022
testing_data_naive = combined_data_1[combined_data_1['years'] == 2022]

# Define the columns for the target variables
target_columns = ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10']

# Initialize an empty DataFrame for naive predictions
y_pred_naive = pd.DataFrame()

# Iterate over each beach and month, and use the 2021 value as the prediction for 2022
for beach in training_data_naive['beach'].unique():
    for month in range(1, 13):
        # Extract the last entry for the month in 2021 for the current beach
        last_entry = training_data_naive[(training_data_naive['beach'] == beach) & 
                                         (training_data_naive['months'] == month)][target_columns].iloc[-1]

        # Add the 'beach' and 'months' columns to last_entry for alignment with the test data
        last_entry['beach'] = beach
        last_entry['months'] = month

        # Append this entry to the predictions DataFrame
        y_pred_naive = y_pred_naive.append(last_entry, ignore_index=True)

# Align y_pred_naive with testing_data_naive
y_pred_naive = y_pred_naive.set_index(['beach', 'months'])
testing_data_naive = testing_data_naive.set_index(['beach', 'months'])
y_test_naive = testing_data_naive[target_columns]


  y_pred_naive = y_pred_naive.append(last_entry, ignore_index=True)
  y_pred_naive = y_pred_naive.append(last_entry, ignore_index=True)
  y_pred_naive = y_pred_naive.append(last_entry, ignore_index=True)
  y_pred_naive = y_pred_naive.append(last_entry, ignore_index=True)
  y_pred_naive = y_pred_naive.append(last_entry, ignore_index=True)
  y_pred_naive = y_pred_naive.append(last_entry, ignore_index=True)
  y_pred_naive = y_pred_naive.append(last_entry, ignore_index=True)
  y_pred_naive = y_pred_naive.append(last_entry, ignore_index=True)
  y_pred_naive = y_pred_naive.append(last_entry, ignore_index=True)
  y_pred_naive = y_pred_naive.append(last_entry, ignore_index=True)
  y_pred_naive = y_pred_naive.append(last_entry, ignore_index=True)
  y_pred_naive = y_pred_naive.append(last_entry, ignore_index=True)
  y_pred_naive = y_pred_naive.append(last_entry, ignore_index=True)
  y_pred_naive = y_pred_naive.append(last_entry, ignore_index=True)
  y_pred_naive = y_pred_naive.append(last_entry,

  y_pred_naive = y_pred_naive.append(last_entry, ignore_index=True)
  y_pred_naive = y_pred_naive.append(last_entry, ignore_index=True)
  y_pred_naive = y_pred_naive.append(last_entry, ignore_index=True)
  y_pred_naive = y_pred_naive.append(last_entry, ignore_index=True)
  y_pred_naive = y_pred_naive.append(last_entry, ignore_index=True)
  y_pred_naive = y_pred_naive.append(last_entry, ignore_index=True)
  y_pred_naive = y_pred_naive.append(last_entry, ignore_index=True)
  y_pred_naive = y_pred_naive.append(last_entry, ignore_index=True)
  y_pred_naive = y_pred_naive.append(last_entry, ignore_index=True)
  y_pred_naive = y_pred_naive.append(last_entry, ignore_index=True)
  y_pred_naive = y_pred_naive.append(last_entry, ignore_index=True)
  y_pred_naive = y_pred_naive.append(last_entry, ignore_index=True)
  y_pred_naive = y_pred_naive.append(last_entry, ignore_index=True)
  y_pred_naive = y_pred_naive.append(last_entry, ignore_index=True)
  y_pred_naive = y_pred_naive.append(last_entry,

  y_pred_naive = y_pred_naive.append(last_entry, ignore_index=True)
  y_pred_naive = y_pred_naive.append(last_entry, ignore_index=True)
  y_pred_naive = y_pred_naive.append(last_entry, ignore_index=True)
  y_pred_naive = y_pred_naive.append(last_entry, ignore_index=True)
  y_pred_naive = y_pred_naive.append(last_entry, ignore_index=True)
  y_pred_naive = y_pred_naive.append(last_entry, ignore_index=True)
  y_pred_naive = y_pred_naive.append(last_entry, ignore_index=True)
  y_pred_naive = y_pred_naive.append(last_entry, ignore_index=True)
  y_pred_naive = y_pred_naive.append(last_entry, ignore_index=True)
  y_pred_naive = y_pred_naive.append(last_entry, ignore_index=True)
  y_pred_naive = y_pred_naive.append(last_entry, ignore_index=True)
  y_pred_naive = y_pred_naive.append(last_entry, ignore_index=True)
  y_pred_naive = y_pred_naive.append(last_entry, ignore_index=True)
  y_pred_naive = y_pred_naive.append(last_entry, ignore_index=True)
  y_pred_naive = y_pred_naive.append(last_entry,

  y_pred_naive = y_pred_naive.append(last_entry, ignore_index=True)
  y_pred_naive = y_pred_naive.append(last_entry, ignore_index=True)
  y_pred_naive = y_pred_naive.append(last_entry, ignore_index=True)
  y_pred_naive = y_pred_naive.append(last_entry, ignore_index=True)
  y_pred_naive = y_pred_naive.append(last_entry, ignore_index=True)
  y_pred_naive = y_pred_naive.append(last_entry, ignore_index=True)
  y_pred_naive = y_pred_naive.append(last_entry, ignore_index=True)
  y_pred_naive = y_pred_naive.append(last_entry, ignore_index=True)
  y_pred_naive = y_pred_naive.append(last_entry, ignore_index=True)
  y_pred_naive = y_pred_naive.append(last_entry, ignore_index=True)
  y_pred_naive = y_pred_naive.append(last_entry, ignore_index=True)
  y_pred_naive = y_pred_naive.append(last_entry, ignore_index=True)
  y_pred_naive = y_pred_naive.append(last_entry, ignore_index=True)
  y_pred_naive = y_pred_naive.append(last_entry, ignore_index=True)
  y_pred_naive = y_pred_naive.append(last_entry,

In [22]:
# Calculate RMSE for the naive model
rmse_naive = mean_squared_error(y_test_naive, y_pred_naive, squared=False)
rmse_naive

16.51363422154244

In [23]:
# Feature Engineering - create a column that represents time passing
combined_data['elapsed_time'] = (combined_data['years'] - combined_data['years'].min()) * 12 + combined_data['months']

# Filter rows for training data (years <= 2014)
training_data_2 = combined_data[combined_data['years'] <= 2014]

# Filter rows for testing data (years > 2014)
testing_data_2 = combined_data[combined_data['years'] > 2014]

# Define the columns for the target variables
target_columns = ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10']

# For training set
x_train_2 = training_data_2.drop(target_columns + ['years', 'months'], axis=1)
y_train_2 = training_data_2[target_columns]

# For testing set
x_test_2 = testing_data_2.drop(target_columns + ['years', 'months'], axis=1)
y_test_2 = testing_data_2[target_columns]

# Creating and training the model
model_2 = DecisionTreeRegressor(random_state=42)
model_2.fit(x_train_2, y_train_2)

# Making predictions and evaluating the model
y_pred_2 = model_2.predict(x_test_2)
mse_2 = mean_squared_error(y_test_2, y_pred_2)
rmse_2 = mean_squared_error(y_test_2, y_pred_2, squared=False)


print("Root Mean Squared Error:", rmse_2)

Root Mean Squared Error: 25.56865357522829


In [24]:
# RANDOM FOREST MODEL

# Creating and training the Random Forest model
model_rf = RandomForestRegressor(random_state=42)
model_rf.fit(x_train_2, y_train_2)

# Making predictions and evaluating the model
y_pred_rf = model_rf.predict(x_test_2)
mse_rf = mean_squared_error(y_test_2, y_pred_rf)
rmse_rf = mean_squared_error(y_test_2, y_pred_rf, squared=False)

print("Root Mean Squared Error with Random Forest:", rmse_rf)

Root Mean Squared Error with Random Forest: 18.26935090197734


In [25]:
#Time series cross-validation ensures that each test set is 'ahead in time' relative to its corresponding training set, 
#thereby better mimicking the scenario where you predict future values based on past observations. 
#This method provides a more robust and realistic evaluation of your model's performance over time.

# Define the columns for the target variables and features
target_columns = ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10']
feature_columns = [col for col in combined_data.columns if col not in target_columns + ['years', 'months']]

# Preparing the dataset
X = combined_data[feature_columns]
y = combined_data[target_columns]

# Create time series cross-validator object
tscv = TimeSeriesSplit(n_splits=5)

# Initialize lists to store results
rmse_scores = []

# Loop through each split
for train_index, test_index in tscv.split(X):
    X_train_cv, X_test_cv = X.iloc[train_index], X.iloc[test_index]
    y_train_cv, y_test_cv = y.iloc[train_index], y.iloc[test_index]

    # Create and fit the Random Forest model
    model_cv = RandomForestRegressor(random_state=42)
    model_cv.fit(X_train_cv, y_train_cv)

    # Make predictions and evaluate
    y_pred_cv = model_cv.predict(X_test_cv)
    rmse_cv = mean_squared_error(y_test_cv, y_pred_cv, squared=False)
    rmse_scores.append(rmse_cv)

# Calculate average RMSE
average_rmse = np.mean(rmse_scores)
print("Average RMSE with Time Series Cross-Validation:", average_rmse)

Average RMSE with Time Series Cross-Validation: 22.932340101712455
