In [9]:
from sklearn.neighbors import BallTree
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_predict, KFold
from sklearn.metrics import mean_squared_error
import numpy as np
import pandas as pd
def get_nearest_neighbors(gdf, k_neighbors=5):
    """Append the columns of each row's k nearest neighbours to ``gdf``.

    Neighbours are searched with a ``BallTree`` (Euclidean metric) built on
    the ``gdf.geometry.x`` / ``gdf.geometry.y`` coordinates. For every row,
    the columns of its j-th closest *other* row are appended with the suffix
    ``_j`` (``_1`` is the closest neighbour).

    Parameters
    ----------
    gdf : GeoDataFrame-like
        Frame whose ``geometry`` accessor exposes ``x`` and ``y``
        (presumably point geometries -- confirm against the data source).
    k_neighbors : int, default 5
        Number of neighbours to attach to every row.

    Returns
    -------
    DataFrame
        The original columns followed by ``k_neighbors`` groups of suffixed
        neighbour columns, indexed like ``gdf``.

    Raises
    ------
    ValueError
        If ``gdf`` holds fewer than ``k_neighbors + 1`` rows.
    """
    coords = np.column_stack((
        np.asarray(gdf.geometry.x, dtype=float),
        np.asarray(gdf.geometry.y, dtype=float),
    ))
    n_points = len(coords)
    if k_neighbors + 1 > n_points:
        raise ValueError(
            f"k_neighbors={k_neighbors} requires at least {k_neighbors + 1} "
            f"points, but gdf has {n_points}"
        )

    # Query one extra neighbour because every point is also a candidate and
    # therefore finds itself at distance 0.
    tree = BallTree(coords, leaf_size=15, metric='euclidean')
    _, indices = tree.query(coords, k=k_neighbors + 1)

    # Drop the point itself from its own neighbour list. With duplicated
    # coordinates the point is not guaranteed to be the first hit, so it is
    # located explicitly instead of assuming column 0. A stable sort on the
    # boolean mask pushes the self-match last while keeping the remaining
    # hits in ascending-distance order; if the point is absent (more than
    # k exact duplicates) the farthest hit is the one discarded.
    is_self = indices == np.arange(n_points)[:, None]
    keep = np.argsort(is_self, axis=1, kind='stable')[:, :k_neighbors]
    neighbor_pos = np.take_along_axis(indices, keep, axis=1)

    frames = [gdf]
    for rank in range(1, k_neighbors + 1):
        neighbor = gdf.iloc[neighbor_pos[:, rank - 1]].add_suffix(f'_{rank}')
        # Reuse the index of ``gdf`` so that the column-wise concat below
        # pairs rows by position even when ``gdf`` has a non-default index.
        neighbor.index = gdf.index
        frames.append(neighbor)

    return pd.concat(frames, axis=1)

def rfsi(gdf, k_neighbors=5, vname='', ntrees=150, seed=None, folds=5):
    """Train a random forest that predicts ``vname`` from its nearest neighbours.

    The predictors are the ``vname`` values of the ``k`` nearest neighbours of
    every row, as produced by ``get_nearest_neighbors`` (the function name
    suggests Random Forest Spatial Interpolation).

    Parameters
    ----------
    gdf : GeoDataFrame-like
        Observations; must contain the column ``vname``.
    k_neighbors : int or sequence of int, default 5
        A single neighbourhood size, or a list/tuple/array of candidate
        sizes. With candidates, each one is scored by K-fold cross-validated
        RMSE, the best one (first on ties) is printed and used.
    vname : str
        Name of the target column.
    ntrees : int, default 150
        Number of trees of the forest.
    seed : int or None
        Random state shared by the forest and the K-fold shuffling.
    folds : int, default 5
        Number of cross-validation folds (only used with candidates).

    Returns
    -------
    RandomForestRegressor
        Model fitted on all rows with the selected neighbourhood size.

    Raises
    ------
    KeyError
        If ``vname`` is not a column of ``gdf``.
    ValueError
        If ``k_neighbors`` is an empty sequence.
    TypeError
        If ``k_neighbors`` is neither an integer nor a sequence.
    """
    target = vname
    if target not in gdf.columns:
        raise KeyError(f"La columna {target!r} no existe en gdf")

    random_forest_model = RandomForestRegressor(n_estimators=ntrees, random_state=seed)

    if isinstance(k_neighbors, (list, tuple, np.ndarray)):
        candidates = list(k_neighbors)
        if not candidates:
            raise ValueError("k_neighbors no puede ser una secuencia vacia")

        # Scores are kept in a plain list instead of a DataFrame row: reading
        # a mixed int/float row back from a DataFrame can upcast the chosen k
        # to float, which ``range`` and the neighbour query reject.
        scores = [
            _rfsi_cv_rmse(gdf, target, candidate, random_forest_model, folds, seed)
            for candidate in candidates
        ]
        best = int(np.argmin(scores))  # first minimum wins on ties
        knn, rmse = candidates[best], scores[best]
        print("Numero vecinos optimo: ", knn, " - rmse: ", round(rmse,3))
    elif isinstance(k_neighbors, (int, np.integer)):
        knn = k_neighbors
    else:
        # Raise instead of returning the message: callers expect a model.
        raise TypeError("Tipo de dato no soportado para k_neighbors")

    # Final fit on every row with the selected neighbourhood size.
    X, y = _rfsi_design_matrix(gdf, target, knn)
    random_forest_model.fit(X, y)
    return random_forest_model


def _rfsi_design_matrix(gdf, target, knn):
    """Return ``(X, y)``: the target values of the ``knn`` neighbours and the target."""
    gdf_nn = get_nearest_neighbors(gdf=gdf, k_neighbors=knn)
    features = [f'{target}_{i}' for i in range(1, knn + 1)]
    return gdf_nn[features], gdf_nn[target]


def _rfsi_cv_rmse(gdf, target, knn, model, folds, seed):
    """Return the shuffled K-fold cross-validated RMSE of ``model`` for ``knn`` neighbours."""
    X, y = _rfsi_design_matrix(gdf, target, knn)
    kf = KFold(n_splits=folds, shuffle=True, random_state=seed)
    predictions = cross_val_predict(model, X, y, cv=kf)
    return float(np.sqrt(mean_squared_error(y, predictions)))

In [5]:
import geopandas as gpd
# Read layer "EC_field_01" of the GeoPackage into a GeoDataFrame.
obs = gpd.read_file(
    filename="Data/ECa.gpkg",
    layer="EC_field_01",
)

# Fit the model with a single, fixed neighbourhood size.
knn = 5
model = rfsi(
    gdf=obs,
    k_neighbors=knn,
    vname='EC90',
    seed=701408733,
)

RMSE:  6.816


In [10]:
import geopandas as gpd
# Read layer "EC_field_01" of the GeoPackage into a GeoDataFrame.
obs = gpd.read_file(
    filename="Data/ECa.gpkg",
    layer="EC_field_01",
)

# Candidate neighbourhood sizes: passing a list makes rfsi score each one by
# cross-validated RMSE before fitting the final model.
knn = [5, 10, 15, 20, 25]
model = rfsi(
    gdf=obs,
    k_neighbors=knn,
    vname='EC90',
    seed=701408733,
)
