# Testing different ML models
Random forest regressor, gradient boosting regressor, zero-inflated beta regressor

In [1]:
import numpy as np
import pandas as pd

import csv

import random

from sklearn.model_selection import train_test_split, cross_val_predict
from sklearn.metrics import mean_squared_error

from sklearn.base import BaseEstimator, RegressorMixin

from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVC
from sklego.meta import ZeroInflatedRegressor

import statsmodels.api as sm
from statsmodels.othermod.betareg import BetaModel

from matplotlib import pyplot as plt

from IPython.display import clear_output

import warnings

from typing import Dict

## Auxiliary functions and classes

In [2]:
def _window_bounds(row:int, points:float) -> tuple:
    '''Returns the inclusive first and last row of the sliding window centred on a row.
    The window spans two rows on either side of the row and is clipped to the block of
    `points` consecutive rows that contains it.
    :param: row: index of the row the window is centred on.
    :param: points: the number of rows in one block (may be a float such as 75.0).
    :return: a (first_row, last_row) tuple, both inclusive.
    '''

    track = row // points

    first_row = max(row - 2, int(track * points))
    last_row = min(row + 2, int((track + 1) * points - 1))

    return first_row, last_row

def aggregate(data_role:str, data_X:np.ndarray, data_y:np.ndarray, noSupply_value:float, points:int) -> np.ndarray:
    '''Creates a dataset from raw data according to an aggregated sliding window of up to 5 rows
    (the row itself and two rows on either side, clipped to the row's own track). Assumes no header rows.
    For "X", every third predictor column (0, 3, 6, ...) is replaced by its sum over the window; the other
    columns are only cast to float. For "y", target column j is replaced by the average of data_y[:, j] over
    the window, weighted by data_X[:, 3 * j]; if that weight column sums to nothing above 0 in the window,
    noSupply_value is stored instead.
    :param: data_role: whether the array to aggregate is the predictor ("X") or target ("y")
    :param: data_X: the predictor columns of the dataset to aggregate
    :param: data_y: the target columns of the dataset to aggregate
    :param: noSupply_value: the value to substitute in the target variable if there was no shoot supply
    :param: points: the number of sampling points within a forest track.
    :return: the aggregated data array (a new float array; the inputs are not modified).
    :raises: ValueError: if data_role is neither "X" nor "y".
    '''

    # Previously an unknown role fell through to an UnboundLocalError at the return statement.
    if data_role not in ('X', 'y'):
        raise ValueError(f"data_role must be 'X' or 'y', got {data_role!r}")

    if data_role == 'X':
        processed_data = data_X.astype(float)   # astype returns a copy

        rows, cols = np.shape(data_X)

        for i in range(rows):
            # Choosing 5 adjacent points within the limit of one forest.
            start_row, end_row = _window_bounds(i, points)

            for j in range(0, cols, 3):
                # Sums are read from the raw input so earlier results never feed into later windows.
                processed_data[i, j] = np.sum(data_X[start_row:end_row+1, j].astype(float))

        return processed_data

    processed_data = data_y.astype(float)   # astype returns a copy

    rows, cols = np.shape(data_y)

    for i in range(rows):
        # Choosing 5 adjacent points within the limit of one forest.
        start_row, end_row = _window_bounds(i, points)
        window_rows = range(start_row, end_row + 1)

        for j in range(cols):
            aggregated_supply = np.sum(data_X[start_row:end_row+1, j * 3].astype(float))

            if aggregated_supply > 0:
                processed_data[i, j] = sum([(float(data_X[row, j * 3]) / float(aggregated_supply)) * float(data_y[row, j]) \
                    for row in window_rows])

            else:
                processed_data[i, j] = noSupply_value

    return processed_data

def normalize_TS(X:np.ndarray) -> np.ndarray:
    '''Min-max normalizes the TS columns of a predictor array, i.e. every third column
    starting at column 0. Each of these columns is shifted by its minimum and divided by
    its range (plus a small smoothing term), the other columns are returned unchanged.
    :param: X: the array which contains the columns to normalize.
    :return: the complete array in which the TS columns are now normalized. Non-float input
    is returned as a float array so that the normalized values are not truncated.'''

    # An integer array would silently truncate the normalized values (all below 1) on assignment.
    if np.issubdtype(X.dtype, np.floating):
        X_normalized = X.copy()
    else:
        X_normalized = X.astype(float)

    smoothing = 0.00001     # Applied so that 0 division does not cause any problems while normalizing.

    ts_columns = X_normalized[:, ::3]
    column_min = ts_columns.min(axis=0)
    column_max = ts_columns.max(axis=0)

    # The right-hand side is a fresh array, so writing it back into the view is safe.
    X_normalized[:, ::3] = (ts_columns - column_min) / (column_max - column_min + smoothing)

    return X_normalized

def adjust_predictions(y_pred:np.ndarray, X:np.ndarray, noSupply_value:float) -> np.ndarray:
    '''Post-processes predictions: wherever predictor column 3 * j is exactly 0, prediction
    column j is replaced by noSupply_value; any remaining negative prediction is raised to 0.
    :param: y_pred: the array of original predictions.
    :param: X: the array of predictors.
    :param: noSupply_value: the value to predict in case of no supply.
    :return: the adjusted array of predictions (y_pred itself is left untouched).
    '''

    n_rows = y_pred.shape[0]
    n_targets = y_pred.shape[1]

    # Predictor column 3 * j is the one checked for target column j.
    supply = X[:n_rows][:, 3 * np.arange(n_targets)]
    without_supply = supply == 0

    adjusted = y_pred.copy()
    adjusted[~without_supply & (y_pred < 0)] = 0
    adjusted[without_supply] = noSupply_value

    return adjusted

def _parse_csv_value(text:str):
    '''Converts one csv cell to None, an int or a float; text that is not numeric is returned as is.'''

    if text == 'None':
        return None

    # int() is tried first so whole numbers stay integers; anything else numeric
    # (decimals as well as exponent notation such as 1e-3) becomes a float.
    for convert in (int, float):
        try:
            return convert(text)

        except ValueError:
            continue

    return text

def csv_to_dict(filename:str) -> Dict[str, object]:
    '''Reads a csv file to a dictionary. Keys come from the first column, values from the second.
    Rows with fewer than two columns are skipped. The value 'None' becomes None, whole numbers become
    int, other numeric values become float and any remaining text is kept as a string.
    :param: filename: name of the file to read, must include path if in different directory.
    :return: a dictionary constructed from the csv file.'''

    result_dict = {}

    # newline='' lets the csv module handle line endings itself, as its documentation recommends.
    with open(filename, 'r', newline='') as file:
        for row in csv.reader(file):
            if len(row) >= 2:
                result_dict[row[0]] = _parse_csv_value(row[1])

    return result_dict

def reduce_data(X_array:np.ndarray, y_array:np.ndarray, proportion:float, seed:int, rows_per_track:int=100) -> 'tuple[np.ndarray, np.ndarray]':
    '''Reduces forest browsing data by a specified proportion using a random seed.
    The rows are treated as consecutive tracks of rows_per_track rows; int(rows_per_track * proportion)
    rows are dropped from every complete track. Rows after the last complete track are always kept.
    The same seed is applied to each track, so the same positions are dropped in every track.
    :param: X_array: numpy array containing the predictors of the data.
    :param: y_array: numpy array containing freshly browsed data.
    :param: proportion: the proportion to reduce the data by.
    :param: seed: the seed for pseudo-random row selection.
    :param: rows_per_track: the number of consecutive rows that make up one track.
    :return: the reduced predictor and target arrays as a (X, y) tuple. With a proportion of 0.0 the
    input arrays themselves are returned.
    :raises: ValueError: if proportion is not between 0 and 1.
    '''

    if proportion == 0.0:
        return X_array, y_array

    if not 0.0 <= proportion <= 1.0:
        raise ValueError(f'proportion must be between 0 and 1, got {proportion!r}')

    rows_to_remove = int(rows_per_track * proportion)
    nr_tracks = len(X_array) // rows_per_track

    # A private generator gives the same draws as seeding the global one, without
    # touching the global random state that other code may rely on.
    offsets_to_remove = np.array(random.Random(seed).sample(range(rows_per_track), rows_to_remove), dtype=int)

    keep_indices = np.ones(len(X_array), dtype=bool)

    for track in range(nr_tracks):
        # Set the indices to remove to False in keep_indices
        keep_indices[track * rows_per_track + offsets_to_remove] = False

    # Filter the arrays using the keep_indices
    filtered_X, filtered_y = X_array[keep_indices, :], y_array[keep_indices]

    return filtered_X, filtered_y
    

In [3]:
class BetaRegressor(BaseEstimator, RegressorMixin):
    '''scikit-learn style wrapper around the statsmodels BetaModel.
    An intercept column is added to the predictors with statsmodels' add_constant before fitting,
    and the same design layout is reproduced when predicting.'''

    def __init__(self):
        self.model = None

    def fit(self, X, y):
        '''Fits a BetaModel of y on X.
        :param: X: the predictors, without an intercept column.
        :param: y: the target values.
        :return: the fitted estimator itself, as scikit-learn estimators are expected to.'''

        X_statsmodels = sm.add_constant(X)

        # add_constant leaves the data as it is when it already holds a constant column.
        # Remember what happened here so that predict builds a matching design matrix.
        n_features = np.shape(X)[1] if np.ndim(X) > 1 else 1
        self.constant_added_ = np.shape(X_statsmodels)[1] != n_features

        self.model = BetaModel(y, X_statsmodels)

        self.results = self.model.fit()

        return self

    def predict(self, X):
        '''Predicts with the fitted model.
        :param: X: the predictors, laid out like the ones passed to fit.
        :return: the predictions of the fitted statsmodels results object.'''

        if self.constant_added_:
            # has_constant='add' forces the intercept column even when the rows to predict happen to
            # contain a constant column (e.g. a single row); the default would silently skip it.
            X_statsmodels = sm.add_constant(X, has_constant='add')

        else:
            X_statsmodels = X

        return self.results.predict(X_statsmodels)

## Testing

In [4]:
# Reading the hyperparameters chosen earlier for each model from their csv files.
hyperparameter_files = ('./CV/rf.csv', './CV/gb.csv', './CV/br.csv')
rf_hp, gb_hp, br_hp = map(csv_to_dict, hyperparameter_files)

In [5]:
# Display names of the models; the order matches the order of the models list below.
labels = [
    'Random Forest Regressor',
    'Gradient Boosting Regressor',
    'Zero-Inflated Beta Regressor',
]

random_forest = RandomForestRegressor(
    n_estimators=rf_hp['best_n_estimators'],
    max_depth=rf_hp['best_max_depth'],
    max_samples=rf_hp['best_max_samples'],
    random_state=23,
)

gradient_boosting = GradientBoostingRegressor(
    learning_rate=gb_hp['best_rate'],
    n_estimators=gb_hp['best_n_estimators'],
    max_depth=gb_hp['best_max_depth'],
    random_state=23,
)

# An SVC decides between zero and non-zero targets, the beta regression predicts the non-zero ones.
zero_inflated_beta = ZeroInflatedRegressor(
    classifier=SVC(C=br_hp['best_C']),
    regressor=BetaRegressor(),
)

models = [random_forest, gradient_boosting, zero_inflated_beta]

# Proportions of the data to keep, and the seeds used for the random row removal.
sample_sizes = [1.0, 0.75, 0.5]
seeds = list(range(50))

# Target value stored and predicted wherever there was no supply.
noSupply_value = -0.00001

In [6]:
# Predictor and target tables, read with pandas' default csv settings.
predictors_file = './Data/predictors_fl.csv'
targets_file = './Data/freshly_browsed_dist_fl.csv'

X_df, y_df = pd.read_csv(predictors_file), pd.read_csv(targets_file)

In [7]:
# The original (unreduced) data as plain numpy arrays.
og_X, og_y = X_df.values, y_df.values

In [8]:
# results[label][size] collects one prediction array and one error array per dataset variation.
results = {
    label: {size: {'y_pred' : [], 'errors' : []} for size in sample_sizes}
    for label in labels
}

# data[size] collects the train/test splits of every dataset variation of that size.
data = {size: [] for size in sample_sizes}

In [9]:
# One dataset variation is built for every combination of sample size and seed.
total_datas = len(sample_sizes) * len(seeds)
progress_counter = 0

for size in sample_sizes:
    # Share of rows dropped from every track, and the track length passed on to the aggregation.
    removed_share = 1 - size
    points_per_track = 100 * size

    for seed in seeds:
        X_unagg, y_unagg = reduce_data(og_X, og_y, removed_share, seed)

        # Sliding-window aggregation of predictors and targets, then normalizing the TS columns.
        X = normalize_TS(aggregate('X', X_unagg, y_unagg, noSupply_value, points_per_track))
        y = aggregate('y', X_unagg, y_unagg, noSupply_value, points_per_track)

        # The split itself is the same for every variation (fixed random_state, 200 test rows).
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=200, random_state=23)
        data[size].append((X_train, X_test, y_train, y_test))

        progress_counter += 1
        progress_percent = progress_counter / total_datas * 100

        clear_output(wait=True)
        print(f'Progress: {progress_percent:.1f}%')

Progress: 100.0%


In [10]:
# File-name suffixes, in the same order as the arrays stored in each variation tuple.
array_labels = ['Xtrain', 'Xtest', 'ytrain', 'ytest']

for size, variations in data.items():
    # The sample size without its decimal point, e.g. 0.75 -> '075'.
    size_label = str(size).replace('.', '')

    for index, variation in enumerate(variations):
        for label, array in zip(array_labels, variation):
            np.savetxt(f'./Data for testing/{size_label}_{index}_{label}.csv', array, delimiter=',', fmt='%s')

In [11]:
# Every model is fitted and tested once per dataset variation (sample size x seed).
total_runs = len(sample_sizes) * len(seeds)

# Number of beta regression fits/predictions that raised and were left at the default prediction of 0.
beta_fit_failures = 0

for index, label in enumerate(labels):
    progress_counter = 0

    # The same estimator object is refitted for every run.
    model = models[index]

    for size in sample_sizes:
        for seed_index, seed in enumerate(seeds):
            X_train, X_test, y_train, y_test = data[size][seed_index]

            if label == 'Random Forest Regressor':
                # All target columns are fitted at once.
                model.fit(X_train, y_train)

                y_pred_test = model.predict(X_test)

                y_pred_test = adjust_predictions(y_pred_test, X_test, noSupply_value)

            elif label == 'Gradient Boosting Regressor':
                # The model is fitted separately for every target column.
                y_pred_test = np.zeros_like(y_test)

                for i in range(np.shape(y_test)[1]):
                    model.fit(X_train, y_train[:, i])
                    y_pred_test[:, i] = model.predict(X_test)

                y_pred_test = adjust_predictions(y_pred_test, X_test, noSupply_value)

            elif label == 'Zero-Inflated Beta Regressor':
                # Exchanging ones to a value very close to 1 since beta regression cannot handle exactly 1.0 values.
                y_train_beta = y_train.copy()
                y_train_beta[y_train_beta == 1.0] = 1.0 - (-noSupply_value)

                y_test_beta = y_test.copy()
                y_test_beta[y_test_beta == 1.0] = 1.0 - (-noSupply_value)

                y_pred_test = np.zeros_like(y_test)

                total_models = np.shape(y_pred_test)[1]

                SVC_zeros = 0

                for i in range(np.shape(y_test_beta)[1]):
                    # Using only rows where there WAS supply from the species (no supply is stored as a
                    # negative target). The train and test rows are selected separately: the row numbers
                    # of the test set say nothing about the rows of the training set.
                    train_reg_rows = y_train_beta[:, i] >= 0
                    test_reg_rows = y_test_beta[:, i] >= 0

                    try:    # Runs into error if all target values are 0.
                        model.fit(X_train[train_reg_rows], y_train_beta[train_reg_rows, i])
                        y_pred_test[test_reg_rows, i] = model.predict(X_test[test_reg_rows])

                    except Exception:
                        # Nothing else to do as default prediction is 0; KeyboardInterrupt is no longer swallowed.
                        beta_fit_failures += 1

                    # Setting prediction for rows with no supply.
                    y_pred_test[~test_reg_rows, i] = noSupply_value

            errors = y_pred_test - y_test

            results[label][size]['y_pred'].append(y_pred_test)
            results[label][size]['errors'].append(errors)

            progress_counter += 1
            progress_percent = progress_counter / total_runs * 100

            clear_output(wait=True)
            print(f'Progress for {index+1}. model, {label}: {progress_percent:.1f}%')

Progress for 3. model, Zero-Inflated Beta Regressor: 100.0%


In [12]:
# Short file-name prefix of every model.
model_names = {
    'Random Forest Regressor': 'rf',
    'Gradient Boosting Regressor': 'gb',
    'Zero-Inflated Beta Regressor': 'br',
}

for label, size_dict in results.items():
    model_name = model_names[label]

    for size, result in size_dict.items():
        # The sample size without its decimal point, e.g. 0.75 -> '075'.
        size_name = str(size).replace('.', '')

        # One file per model, sample size, dataset variation and component ('y_pred' / 'errors').
        for component, arrays in result.items():
            for i, array in enumerate(arrays):
                np.savetxt(f'./Results/{model_name}_{size_name}_{i}_{component}.csv', array, delimiter=',', fmt='%s')