# Leave-one-field-out random forest predictions

In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error
import pickle

In [4]:
data = pd.read_csv('full_dataset.csv') 
print(data.shape)

(92513, 164)


In [5]:
# Define the target variable and terrain attributes
target_variable = 'Ymean_times_One_plus_Ymean_minus_CV_over_Ymean_plus_CV_over_2'

terrain_attributes = ['Aspect_LP30',
 'L8_Band4',
 'L8_Band7',
 'L8_EVI',
 'L8_NDVI',
 'LP30',
 'SWI10',
 'TPI_LP30']

# terrain_attributes = [
#     'Aspect_LP10', 'FW_gs_LP10', 'hillshade', 'Midslope_LP10', 'MRRTF_LP10', 'MRVBF_LP10', 'DEM5',
#     'NegOpen_LP10', 'PosOpen_LP10', 'ProfileCurvature_LP30', 'SCA_D8_LP10', 'SlopeHeight_LP10', 
#     'Slope_LP10', 'SWI1E2_LP10', 'SWI10', 'SWI1E16', 'SWI1E2', 'SWI1E4', 'SWI1E8', 'TangentCurvature_LP30', 
#     'TPI_LP10', 'TWI_D8_LP10', 'TWI_MFD1.1_LP10', 'ValleyDepth_LP10', 'vdist_chn_network', 
#     'wind_shelter_LP10', 'geomorphons_LP10_1', 'geomorphons_LP10_3', 'geomorphons_LP10_4', 
#     'geomorphons_LP10_5', 'geomorphons_LP10_6', 'geomorphons_LP10_7', 'geomorphons_LP10_8', 
#     'geomorphons_LP10_9'
# ]

# Initialize the imputer
imputer = SimpleImputer(strategy='mean')

# Store results
results = {}

# List of fields
fields = data['Field'].unique()

# Loop through each field
for field in fields:
    print(f'Processing field {field}...')
    # Split data into training and test sets based on the field
    train_data = data[data['Field'] != field]
    test_data = data[data['Field'] == field]
    
    X_train_full = train_data[terrain_attributes]
    y_train_full = train_data[target_variable]
    X_test = test_data[terrain_attributes]
    y_test = test_data[target_variable]

    # Impute missing values
    X_train_full = imputer.fit_transform(X_train_full)
    X_test = imputer.transform(X_test)

    # Further split the training data into training and validation sets (70/30 split)
    X_train, X_val, y_train, y_val = train_test_split(X_train_full, y_train_full, test_size=0.3, random_state=42)

    # Initialize the Random Forest Regressor
    rf = RandomForestRegressor(n_estimators=100, random_state=42)

    # Fit the model
    rf.fit(X_train, y_train)

    # Save the model and X_train for future use
    with open(f'models/model_{target_variable}_field_{field}.pkl', 'wb') as model_file:
        pickle.dump(rf, model_file)
    
    pd.DataFrame(X_train).to_csv(f'models/X_train_{target_variable}_field_{field}.csv', index=False)

    # Perform cross-validation
    cv_scores = cross_val_score(rf, X_train, y_train, cv=5, scoring='r2')
    mean_cv_score = np.mean(cv_scores)

    # Predict on training, validation, and test sets
    y_train_pred = rf.predict(X_train)
    y_val_pred = rf.predict(X_val)
    y_test_pred = rf.predict(X_test)
    
    # Calculate R² and RMSE for training, validation, and test sets
    train_r2 = r2_score(y_train, y_train_pred)
    train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
    val_r2 = r2_score(y_val, y_val_pred)
    val_rmse = np.sqrt(mean_squared_error(y_val, y_val_pred))
    test_r2 = r2_score(y_test, y_test_pred)
    test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))

    # Store feature importances and scores
    results[field] = {
        'feature_importances': rf.feature_importances_,
        'cv_r2': mean_cv_score,
        'train_r2': train_r2,
        'train_rmse': train_rmse,
        'val_r2': val_r2,
        'val_rmse': val_rmse,
        'test_r2': test_r2,
        'test_rmse': test_rmse
    }

    print(f'Model for field {field}:')
    print(f'Cross-Validation R²: {mean_cv_score}')
    print(f'Training R²: {train_r2}')
    print(f'Training RMSE: {train_rmse}')
    print(f'Validation R²: {val_r2}')
    print(f'Validation RMSE: {val_rmse}')
    print(f'Test R²: {test_r2}')
    print(f'Test RMSE: {test_rmse}')
    print('Feature Importances:', rf.feature_importances_)
    print()

# Convert results to DataFrames for easier visualization
importance_df = pd.DataFrame({
    field: results[field]['feature_importances'] for field in fields
}, index=terrain_attributes)

scores_df = pd.DataFrame({
    'CV_R2': {field: results[field]['cv_r2'] for field in fields},
    'Train_R2': {field: results[field]['train_r2'] for field in fields},
    'Train_RMSE': {field: results[field]['train_rmse'] for field in fields},
    'Val_R2': {field: results[field]['val_r2'] for field in fields},
    'Val_RMSE': {field: results[field]['val_rmse'] for field in fields},
    'Test_R2': {field: results[field]['test_r2'] for field in fields},
    'Test_RMSE': {field: results[field]['test_rmse'] for field in fields}
})

# Save the results to CSV files
importance_df.to_csv('feature_importances_leave_one_field_out.csv')
scores_df.to_csv('r2_rmse_scores_leave_one_field_out.csv')

print('Feature importances and R² and RMSE scores saved to CSV files.')

Processing field 13-N1...
Model for field 13-N1:
Cross-Validation R²: 0.8708281521051464
Training R²: 0.9776684470886662
Training RMSE: 0.028867553193650083
Validation R²: 0.8799330271334544
Validation RMSE: 0.06705650911687466
Test R²: -0.5402846743558649
Test RMSE: 0.22122685321640947
Feature Importances: [0.0990902  0.06719567 0.059794   0.2972033  0.11836532 0.15083648
 0.09061518 0.11689986]

Processing field 13-N2...
Model for field 13-N2:
Cross-Validation R²: 0.8917092811407821
Training R²: 0.9821977822257928
Training RMSE: 0.025530969982776208
Validation R²: 0.8965787568920873
Validation RMSE: 0.06108828789988736
Test R²: -0.46930439724221196
Test RMSE: 0.2347963497306617
Feature Importances: [0.09943398 0.04575232 0.2773975  0.04450361 0.21612849 0.12843738
 0.09654907 0.09179765]

Processing field Hevelow5...
Model for field Hevelow5:
Cross-Validation R²: 0.8790122943801613
Training R²: 0.9783719641323178
Training RMSE: 0.028130414609288414
Validation R²: 0.8874662940501745
V