# Random Forest Model

Random Forest is an ensemble method based on decision trees. The method builds each decision tree on a bootstrap sample, i.e. a random sample drawn from the training data with replacement. This means that some samples may be repeated in the bootstrap sample while others are left out. In addition, only a random subset of the variables is considered when growing each tree. The samples that are not included in the bootstrap sample for a particular tree are known as out-of-bag (OOB) samples. On average, about one-third of the training data is left out as OOB samples for each tree. The final prediction is the average of the predictions of all decision trees. This improves prediction accuracy and reduces overfitting [@breiman2001random].
Depending on the size of the training set, a few hundred to several thousand trees are necessary. Here, only 100 trees are used because of processing-time and memory constraints.

## Overview
1. import packages
2. define base directory
3. define functions
4. load data and define variables
5. train model and make prediction for testing period 
    - input to model
      - lags = previous x time steps to predict the next step
      - n_estimators: number of trees
6. calculate MSE for predicted and testing data
7. save data (predicted NDVI and MSE) to netCDF file

## Load packages

In [3]:
from pathlib import Path
import os
import xarray as xr
import numpy as np
import pandas as pd
from darts import TimeSeries
from darts.models import RandomForest
from sklearn.metrics import mean_squared_error
import netCDF4 as nc
import matplotlib.pyplot as plt

## Define base directory

In [4]:
# Anchor all paths at the project root, i.e. the parent of the directory
# the notebook is executed from.
notebook_dir = Path.cwd().resolve()
base_dir = notebook_dir.parent
print(base_dir)

/home/cgoehler/team-extra/ndvi-time-series-prediction


## Define functions

In [33]:
# Load data from a NetCDF file and extract its variables
def load_nc_file(file_path):
    """
    Open a NetCDF file and pull out the NDVI cube with its coordinates.

    Parameters:
    file_path (str): Location of the NetCDF file to read.

    Returns:
    tuple: ``(ndvi, times, x, y)`` -- the dataset variables 'NDVI', 'time',
    'x' and 'y', in that order.
    """
    dataset = xr.open_dataset(file_path)
    # Look the four variables up in the order callers unpack them
    return tuple(dataset[name] for name in ('NDVI', 'time', 'x', 'y'))

# turn data into darts TimeSeries
def prepare_darts_timeseries(ndvi_data, times):
    """
    Turn NDVI data into a list of Darts TimeSeries objects, one per pixel.

    Pixels are visited with axis 1 of ``ndvi_data`` as the outer loop and
    axis 2 as the inner loop (this notebook treats them as x and y), so the
    series for pixel ``(i, j)`` sits at index ``i * ndvi_data.shape[2] + j``.

    Parameters:
    ndvi_data (xarray.DataArray): The NDVI data array; indexed as
        ``[time, i, j]``.
    times (xarray.DataArray): The time data array, interpreted as seconds
        (``unit='s'``) when converted to datetimes.

    Returns:
    list: A list of Darts TimeSeries objects.
    """
    # The time axis is the same for every pixel, so convert it only once
    # instead of once per pixel.
    time_index = pd.to_datetime(times, unit='s')
    n_outer, n_inner = ndvi_data.shape[1], ndvi_data.shape[2]

    series_list = []
    for i in range(n_outer):
        for j in range(n_inner):
            # replace NaN values by zeros (assuming only pixels with just NaNs exist)
            values = np.nan_to_num(ndvi_data[:, i, j], nan=0.0)
            series_list.append(TimeSeries.from_times_and_values(time_index, values))
    return series_list

# train a Random Forest model per pixel and predict the testing period
def prediction_series(train_ndvi_data, train_times, test_times):
    """
    Train a Random Forest model and predict NDVI values for the test period.

    One model object (``lags=25``, ``n_estimators=100``) is created and then
    fitted on each pixel's training series in turn; directly after each fit
    it forecasts ``len(test_times)`` steps. Pixels are visited with axis 1 of
    ``train_ndvi_data`` as the outer loop and axis 2 as the inner loop, so the
    forecast for pixel ``(i, j)`` sits at index
    ``i * train_ndvi_data.shape[2] + j`` of the returned list.

    Parameters:
    train_ndvi_data (xarray.DataArray): The NDVI data for training; indexed
        as ``[time, i, j]``.
    train_times (xarray.DataArray): The time data for training, interpreted
        as seconds (``unit='s'``).
    test_times (xarray.DataArray): The time data for testing; only its length
        is used, as the forecast horizon.

    Returns:
    list: A list of predicted Darts TimeSeries objects.
    """
    model = RandomForest(
            lags=25,
            n_estimators=100)

    # Loop invariants: the training time axis and the forecast horizon are the
    # same for every pixel, so compute them once.
    time_index = pd.to_datetime(train_times, unit='s')
    horizon = len(test_times)
    n_outer, n_inner = train_ndvi_data.shape[1], train_ndvi_data.shape[2]

    pred_series = []
    for i in range(n_outer):
        # progress indicator, overwritten in place via carriage return
        print(f'{i}/{train_ndvi_data.shape[1]}', end='\r')
        for j in range(n_inner):
            # replace NaN values by zeros (assuming they only exist in pixels that are completely NaN)
            values = np.nan_to_num(train_ndvi_data[:, i, j], nan=0.0)
            series = TimeSeries.from_times_and_values(time_index, values)
            # fit on this pixel's history, then forecast the testing period
            model.fit(series)
            pred_series.append(model.predict(n=horizon))
    return pred_series

# Save predictions and MSE to a new NetCDF file
def save_to_nc_file(output_file, pred_data, mse_data, times, x, y):
    """
    Save predictions and MSE to a new NetCDF file.

    The file gets the dimensions ``time``, ``x`` and ``y`` and the float32
    variables ``time``, ``x``, ``y``, ``pred_ndvi`` (time, x, y) and
    ``mse`` (x, y).

    ``pred_data`` and ``mse_data`` must hold one entry per pixel in x-major
    order (pixel ``(i, j)`` at index ``i * len(y) + j``), which is the order
    in which the per-pixel loops in this notebook build their lists.

    Parameters:
    output_file (str): Path to the output NetCDF file.
    pred_data (list): List of predicted Darts TimeSeries objects, each with
        ``len(times)`` values.
    mse_data (list): List of Mean Squared Error (MSE) values.
    times (numpy.array): Array of time values.
    x (numpy.array): Array of x coordinates.
    y (numpy.array): Array of y coordinates.

    Raises:
    ValueError: If the number of predictions or MSE values does not match
        ``len(x) * len(y)`` (raised by the reshape, before any file is created).
    """
    n_times, n_x, n_y = len(times), len(x), len(y)

    # Stacking the per-pixel forecasts gives (n_x * n_y, n_times). Unfold the
    # pixel axis first and only then move time to the front; reshaping
    # straight to (time, x, y) would assign values to the wrong cells.
    per_pixel = np.array([pred.values().flatten() for pred in pred_data])
    pred_ndvi = per_pixel.reshape((n_x, n_y, n_times)).transpose(2, 0, 1)

    # Use the function argument, not a notebook-level global.
    mse_grid = np.array(mse_data).reshape((n_x, n_y))

    with nc.Dataset(output_file, 'w', format='NETCDF4') as ds:
        ds.createDimension('time', n_times)
        ds.createDimension('x', n_x)
        ds.createDimension('y', n_y)

        # NOTE(review): 'f4' cannot represent epoch seconds exactly; if times
        # are Unix timestamps, consider 'f8' for the time variable -- confirm.
        time_var = ds.createVariable('time', 'f4', ('time',))
        x_var = ds.createVariable('x', 'f4', ('x',))
        y_var = ds.createVariable('y', 'f4', ('y',))
        pred_var = ds.createVariable('pred_ndvi', 'f4', ('time', 'x', 'y'))
        mse_var = ds.createVariable('mse', 'f4', ('x', 'y'))

        time_var[:] = times
        x_var[:] = x
        y_var[:] = y
        pred_var[:] = pred_ndvi
        mse_var[:] = mse_grid

## Load data and define variables

In [5]:
# Load training and testing data
train_path = base_dir / "data" / "data_train" / "ds_B_Cube_665_train.nc"
test_path = base_dir / "data" / "data_test" / "Cube_665_test.nc"

# Read both cubes; the later cells use these variables for training,
# prediction, scoring and saving.
train_ndvi, train_times, train_x, train_y = load_nc_file(train_path)
test_ndvi, test_times, test_x, test_y = load_nc_file(test_path)

## Train model and make prediction for testing period

In [31]:
# Fit on the training cube and forecast every pixel over the testing period
pred_series = prediction_series(
    train_ndvi_data=train_ndvi,
    train_times=train_times,
    test_times=test_times,
)
print("done with prediction")

done with prediction


## Calculate MSE for predicted and testing data and save data

In [45]:
# Convert the testing cube into one darts TimeSeries per pixel
test_series = prepare_darts_timeseries(ndvi_data=test_ndvi, times=test_times)

In [60]:
# Calculate MSE for each pixel between prediction and testing data
mse_list = []
for pred, actual in zip(pred_series, test_series):
    actual_values = actual.values().flatten()
    pred_values = pred.values().flatten()
    # Score only time steps with a finite reference value.
    # NOTE(review): prepare_darts_timeseries already replaces NaNs by zeros,
    # so this mask presumably selects everything -- confirm that is intended.
    mask = np.isfinite(actual_values)
    mse_list.append(mean_squared_error(actual_values[mask], pred_values[mask]))
print("done with mse")

# Save the data (prediction and MSE)
pred_path = base_dir / "data" / "data_predictions"
# The NetCDF writer does not create missing directories.
pred_path.mkdir(parents=True, exist_ok=True)
# save_to_nc_file takes one output path, so join directory and file name
# instead of passing them as two separate arguments.
output_file = pred_path / 'Random_Forest_Cube_665.nc'
save_to_nc_file(str(output_file), pred_series, mse_list, test_times, test_x, test_y)
print("done saving")

done with mse
done saving
