# Leave-one-site-out random forest prediction accuracy

In [12]:
import pickle
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV

In [7]:
# Read the full dataset, taking the first CSV column as the row index,
# then preview the first rows.
dataset_path = 'full_dataset.csv'
data = pd.read_csv(dataset_path, index_col=0)
data.head()

Unnamed: 0,Aspect,Aspect_LP10,Aspect_LP30,Aspect_ss_5,CV,DEM5,DEM5_ns,DEM_ss_5,DEM_ss_5_ns,FW,...,geomorphons_LP10_3,geomorphons_LP10_4,geomorphons_LP10_5,geomorphons_LP10_6,geomorphons_LP10_7,geomorphons_LP10_8,geomorphons_LP10_9,SD,One_plus_Ymean_minus_SD_over_Ymean_plus_SD_over_2,Ymean_times_One_plus_Ymean_minus_SD_over_Ymean_plus_SD_over_2
6,3.037332,2.606362,2.52538,2.928303,0.164828,1245.4928,1245.4928,1245.4492,1245.4492,5.493209,...,False,False,False,False,False,False,False,0.056301,0.858495,0.293239
7,3.044177,2.693667,2.576599,2.977511,0.157282,1245.4727,1245.4727,1245.4326,1245.4326,5.462603,...,False,False,False,False,False,False,False,0.055823,0.864094,0.306688
8,3.044177,2.693667,2.576599,2.977511,0.171515,1245.4727,1245.4727,1245.4326,1245.4326,5.462603,...,False,False,False,False,False,False,False,0.065438,0.853595,0.325669
9,0.043711,0.935594,2.540089,0.304161,0.109724,1245.6732,1245.6732,1245.4695,1245.4695,5.213712,...,False,False,False,False,False,False,False,0.041659,0.901125,0.342133
10,0.074691,1.162928,2.600108,0.374965,0.015728,1245.6699,1245.6699,1245.4613,1245.4613,5.359166,...,False,False,False,False,False,False,False,0.005824,0.984516,0.364602


In [13]:
# Leave-one-site-out evaluation: for each site, tune a random forest on the
# rows of all other sites and score it on the held-out site.

# Define the target variable and terrain attributes
target_variable = 'Ymean_times_One_plus_Ymean_minus_CV_over_Ymean_plus_CV_over_2'
terrain_attributes = [
    'Aspect_LP10', 'FW_gs_LP10', 'hillshade', 'Midslope_LP10', 'MRRTF_LP10', 'MRVBF_LP10',
    'NegOpen_LP10', 'PosOpen_LP10', 'ProfileCurvature_LP30', 'SCA_D8_LP10', 'SlopeHeight_LP10',
    'Slope_LP10', 'SWI1E2_LP10', 'SWI10', 'SWI1E16', 'SWI1E2', 'SWI1E4', 'SWI1E8', 'TangentCurvature_LP30',
    'TPI_LP10', 'TWI_D8_LP10', 'TWI_MFD1.1_LP10', 'ValleyDepth_LP10', 'vdist_chn_network',
    'wind_shelter_LP10', 'geomorphons_LP10_1', 'geomorphons_LP10_3', 'geomorphons_LP10_4',
    'geomorphons_LP10_5', 'geomorphons_LP10_6', 'geomorphons_LP10_7', 'geomorphons_LP10_8',
    'geomorphons_LP10_9'
]

# Output directory for the pickled models and training matrices.
# Created up front so the first write does not fail on a fresh checkout.
model_dir = Path('models')
model_dir.mkdir(parents=True, exist_ok=True)

# Initialize the imputer (re-fitted inside the loop for every held-out site)
imputer = SimpleImputer(strategy='mean')

# Store results
results = {}

# List of sites
sites = data['Site'].unique()

# Define the parameter grid for GridSearchCV
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}


def _r2_and_rmse(y_true, y_pred):
    """Return the (R², RMSE) pair of predictions against observed values."""
    return r2_score(y_true, y_pred), np.sqrt(mean_squared_error(y_true, y_pred))


# Loop through each site
for site in sites:
    print(f'Processing site {site}...')
    # Split data into training and test sets based on the site
    train_data = data[data['Site'] != site]
    test_data = data[data['Site'] == site]

    X_train_full = train_data[terrain_attributes]
    y_train_full = train_data[target_variable]
    y_test = test_data[target_variable]

    # Further split the training data into training and validation sets (70/30 split).
    # The split happens BEFORE imputation so the validation rows do not
    # contribute to the imputation means (avoids preprocessing leakage).
    X_train_raw, X_val_raw, y_train, y_val = train_test_split(
        X_train_full, y_train_full, test_size=0.3, random_state=42
    )

    # Impute missing values: statistics come from the training split only and
    # are then applied unchanged to the validation and held-out-site rows.
    # NOTE(review): the folds inside GridSearchCV still share these statistics;
    # wrapping imputer + forest in a Pipeline would remove that as well.
    X_train = imputer.fit_transform(X_train_raw)
    X_val = imputer.transform(X_val_raw)
    X_test = imputer.transform(test_data[terrain_attributes])

    # Initialize the Random Forest Regressor
    rf = RandomForestRegressor(random_state=42)

    # Perform Grid Search with Cross-Validation
    grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, scoring='r2', n_jobs=6)
    grid_search.fit(X_train, y_train)

    # Get the best model from Grid Search (refit on the whole training split)
    best_rf = grid_search.best_estimator_

    # Save the best model and X_train for future use
    with open(model_dir / f'model_{target_variable}_site_{site}.pkl', 'wb') as model_file:
        pickle.dump(best_rf, model_file)

    pd.DataFrame(X_train).to_csv(model_dir / f'X_train_{target_variable}_site_{site}.csv', index=False)

    # Mean 5-fold cross-validated R² of the selected hyper-parameters.
    # The grid search already evaluated exactly this (same data, cv=5,
    # scoring='r2', fixed random_state), so reuse its score instead of
    # re-fitting five more forests with cross_val_score.
    mean_cv_score = grid_search.best_score_

    # Predict on training, validation, and test sets
    y_train_pred = best_rf.predict(X_train)
    y_val_pred = best_rf.predict(X_val)
    y_test_pred = best_rf.predict(X_test)

    # Calculate R² and RMSE for training, validation, and test sets
    train_r2, train_rmse = _r2_and_rmse(y_train, y_train_pred)
    val_r2, val_rmse = _r2_and_rmse(y_val, y_val_pred)
    test_r2, test_rmse = _r2_and_rmse(y_test, y_test_pred)

    # Store feature importances and scores
    results[site] = {
        'feature_importances': best_rf.feature_importances_,
        'cv_r2': mean_cv_score,
        'train_r2': train_r2,
        'train_rmse': train_rmse,
        'val_r2': val_r2,
        'val_rmse': val_rmse,
        'test_r2': test_r2,
        'test_rmse': test_rmse
    }

    print(f'Model for site {site}:')
    print(f'Cross-Validation R²: {mean_cv_score}')
    print(f'Training R²: {train_r2}')
    print(f'Training RMSE: {train_rmse}')
    print(f'Validation R²: {val_r2}')
    print(f'Validation RMSE: {val_rmse}')
    print(f'Test R²: {test_r2}')
    print(f'Test RMSE: {test_rmse}')
    print('Feature Importances:', best_rf.feature_importances_)
    print()

# Convert results to DataFrames for easier visualization:
# one column per held-out site, one row per terrain attribute.
importance_df = pd.DataFrame(
    {site: results[site]['feature_importances'] for site in sites},
    index=terrain_attributes,
)

# One row per held-out site, one column per metric (column -> results key).
score_columns = {
    'CV_R2': 'cv_r2',
    'Train_R2': 'train_r2',
    'Train_RMSE': 'train_rmse',
    'Val_R2': 'val_r2',
    'Val_RMSE': 'val_rmse',
    'Test_R2': 'test_r2',
    'Test_RMSE': 'test_rmse',
}
scores_df = pd.DataFrame({
    column: {site: results[site][key] for site in sites}
    for column, key in score_columns.items()
})

# Save the results to CSV files
importance_df.to_csv('feature_importances_leave_one_site_out_grid_search.csv')
scores_df.to_csv('r2_rmse_scores_leave_one_site_out_grid_search.csv')

print('Feature importances and R² and RMSE scores saved to CSV files.')

Processing site CO...




Model for site CO:
Cross-Validation R²: 0.8942799682450598
Training R²: 0.9830926731700642
Training RMSE: 0.018369871437681776
Validation R²: 0.9110391335178477
Validation RMSE: 0.042459190661022814
Test R²: -0.28100433205318853
Test RMSE: 0.15013045606682898
Feature Importances: [9.22065101e-02 1.02220692e-02 1.67722576e-02 2.89724364e-02
 1.58942757e-01 3.97646808e-02 1.22254468e-01 3.41564451e-02
 2.89221770e-02 6.92340845e-03 5.10157782e-02 2.93348123e-02
 1.87300210e-02 3.17873176e-02 3.13607157e-02 1.81486539e-02
 1.24853510e-02 7.38009543e-03 2.91549472e-02 3.80641070e-02
 6.07840502e-03 1.76887996e-02 4.94149966e-02 7.56667414e-02
 1.42138715e-02 1.20688135e-02 1.34250770e-02 9.45147944e-04
 4.27818535e-04 8.51114439e-04 3.54357820e-04 2.22089027e-03
 4.49573832e-05]

Processing site MD...




Model for site MD:
Cross-Validation R²: 0.8745036409918651
Training R²: 0.9765699520534694
Training RMSE: 0.020051669949968112
Validation R²: 0.8826490701348658
Validation RMSE: 0.044939516122436636
Test R²: -1.7934903087138316
Test RMSE: 0.2219994179752943
Feature Importances: [3.05826267e-02 1.75449427e-02 1.10449306e-02 3.63750577e-02
 4.29331442e-02 4.76258731e-02 1.79072795e-01 3.64387949e-02
 1.03354030e-01 1.00029520e-02 3.05348527e-02 1.75319154e-02
 2.89897070e-02 4.68345601e-02 1.16069267e-02 1.89042899e-02
 9.62955100e-03 9.65830118e-03 3.78244225e-02 1.88273633e-02
 2.35806216e-02 2.30986634e-02 3.89401833e-02 4.39260540e-02
 1.13396599e-01 3.25909489e-03 3.17397139e-05 4.86663002e-04
 2.81909859e-03 2.79549643e-03 1.76731452e-05 2.20231983e-03
 1.28756381e-04]

Processing site TX...




Model for site TX:
Cross-Validation R²: 0.872245958693958
Training R²: 0.977812608938558
Training RMSE: 0.020714699841843896
Validation R²: 0.8849252402242708
Validation RMSE: 0.047040011609049956
Test R²: -2.2831467740130633
Test RMSE: 0.17380661997614102
Feature Importances: [1.17395172e-01 1.19794607e-02 1.66496374e-02 2.84362673e-02
 4.49254536e-02 4.80709936e-02 5.14508847e-02 3.06903470e-02
 4.59103272e-02 1.05628304e-02 3.44550181e-02 1.92545527e-02
 3.47791089e-02 3.75205437e-02 1.73550808e-02 2.13183750e-02
 1.12372591e-02 1.13064329e-02 3.23404806e-02 1.62703396e-01
 1.77392195e-02 1.98150926e-02 4.68088859e-02 9.44314423e-02
 2.72925194e-02 1.07385509e-03 6.00591558e-04 1.16488225e-03
 1.62239083e-04 8.56648837e-04 7.05987626e-05 1.55498024e-03
 8.74231363e-05]

Feature importances and R² and RMSE scores saved to CSV files.
