# Leave-one-field-out random forest predicitons with different set of predictor variables

In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error
import pickle

In [2]:
data = pd.read_csv('full_dataset.csv') 
print(data.shape)

(92488, 155)


In [3]:
# Define the target variable and terrain attributes
target_variable = 'Ymean_times_One_plus_Ymean_minus_CV_over_Ymean_plus_CV_over_2'

terrain_attributes = ['NDVI_avg',
 'Soil_om_15-30cm',
 'SWI10',
 'PosOpen_LP30',
 'TPI_LP30',
 'Aspect_LP30']

# Initialize the imputer
imputer = SimpleImputer(strategy='mean')

# Store results
results = {}

# List of fields
fields = data['Field'].unique()

# Loop through each field
for field in fields:
    print(f'Processing field {field}...')
    # Split data into training and test sets based on the field
    train_data = data[data['Field'] != field]
    test_data = data[data['Field'] == field]
    
    X_train_full = train_data[terrain_attributes]
    y_train_full = train_data[target_variable]
    X_test = test_data[terrain_attributes]
    y_test = test_data[target_variable]

    # Impute missing values
    X_train_full = imputer.fit_transform(X_train_full)
    X_test = imputer.transform(X_test)

    # Further split the training data into training and validation sets (70/30 split)
    X_train, X_val, y_train, y_val = train_test_split(X_train_full, y_train_full, test_size=0.3, random_state=42)

    # Initialize the Random Forest Regressor
    rf = RandomForestRegressor(n_estimators=300, random_state=42)

    # Fit the model
    rf.fit(X_train, y_train)

    # Save the model and X_train for future use
    with open(f'models/model_{target_variable}_field_{field}.pkl', 'wb') as model_file:
        pickle.dump(rf, model_file)
    
    pd.DataFrame(X_train).to_csv(f'models/X_train_{target_variable}_field_{field}.csv', index=False)

    # Perform cross-validation
    cv_scores = cross_val_score(rf, X_train, y_train, cv=5, scoring='r2')
    mean_cv_score = np.mean(cv_scores)

    # Predict on training, validation, and test sets
    y_train_pred = rf.predict(X_train)
    y_val_pred = rf.predict(X_val)
    y_test_pred = rf.predict(X_test)
    
    # Calculate R² and RMSE for training, validation, and test sets
    train_r2 = r2_score(y_train, y_train_pred)
    train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
    val_r2 = r2_score(y_val, y_val_pred)
    val_rmse = np.sqrt(mean_squared_error(y_val, y_val_pred))
    test_r2 = r2_score(y_test, y_test_pred)
    test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))

    # Store feature importances and scores
    results[field] = {
        'feature_importances': rf.feature_importances_,
        'cv_r2': mean_cv_score,
        'train_r2': train_r2,
        'train_rmse': train_rmse,
        'val_r2': val_r2,
        'val_rmse': val_rmse,
        'test_r2': test_r2,
        'test_rmse': test_rmse
    }

    print(f'Model for field {field}:')
    print(f'Cross-Validation R²: {mean_cv_score}')
    print(f'Training R²: {train_r2}')
    print(f'Training RMSE: {train_rmse}')
    print(f'Validation R²: {val_r2}')
    print(f'Validation RMSE: {val_rmse}')
    print(f'Test R²: {test_r2}')
    print(f'Test RMSE: {test_rmse}')
    print('Feature Importances:', rf.feature_importances_)
    print()

# Convert results to DataFrames for easier visualization
importance_df = pd.DataFrame({
    field: results[field]['feature_importances'] for field in fields
}, index=terrain_attributes)

scores_df = pd.DataFrame({
    'CV_R2': {field: results[field]['cv_r2'] for field in fields},
    'Train_R2': {field: results[field]['train_r2'] for field in fields},
    'Train_RMSE': {field: results[field]['train_rmse'] for field in fields},
    'Val_R2': {field: results[field]['val_r2'] for field in fields},
    'Val_RMSE': {field: results[field]['val_rmse'] for field in fields},
    'Test_R2': {field: results[field]['test_r2'] for field in fields},
    'Test_RMSE': {field: results[field]['test_rmse'] for field in fields}
})

Processing field 13-N1...
Model for field 13-N1:
Cross-Validation R²: 0.8760971021813596
Training R²: 0.9793437640030439
Training RMSE: 0.018717602403852802
Validation R²: 0.882777860691988
Validation RMSE: 0.04450907665520576
Test R²: 0.060384097198468156
Test RMSE: 0.1157275586953328
Feature Importances: [0.35314279 0.20739521 0.10119277 0.13243722 0.10173944 0.10409257]

Processing field 13-N2...
Model for field 13-N2:
Cross-Validation R²: 0.90586714856607
Training R²: 0.9850314510519824
Training RMSE: 0.01744562077711823
Validation R²: 0.9150860204957929
Validation RMSE: 0.041398335975733266
Test R²: -0.1267074283107903
Test RMSE: 0.12810838398594226
Feature Importances: [0.54217107 0.08712684 0.09047385 0.10947935 0.0820277  0.0887212 ]

Processing field Hevelow5...
Model for field Hevelow5:
Cross-Validation R²: 0.8853359899883204
Training R²: 0.9797427862315657
Training RMSE: 0.018327486615940212
Validation R²: 0.8925722769562229
Validation RMSE: 0.04220044898399607
Test R²: -0.1

In [10]:
# Save the results to CSV files
# importance_df.to_csv('feature_importances_leave_one_field_out_ndvi.csv')
# scores_df.to_csv('r2_rmse_scores_leave_one_field_out_ndvi.csv')

importance_df.to_csv('feature_importances_leave_one_field_out_simple.csv')
scores_df.to_csv('r2_rmse_scores_leave_one_field_out_simple.csv')

print('Feature importances and R² and RMSE scores saved to CSV files.')

Feature importances and R² and RMSE scores saved to CSV files.
