# Leave-one-field-out predictions within each state

In [1]:
import os
import pickle

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import train_test_split, cross_val_score

In [2]:
# Read the full dataset from the CSV file sitting next to the notebook.
data = pd.read_csv('full_dataset.csv')
# Echo (n_rows, n_columns) as a quick sanity check on the load.
print(data.shape)

(92513, 164)


In [3]:
# Leave-one-field-out evaluation, run separately inside every state
# (the 'Site' column). For each state, every field is held out once as the
# test set; a Random Forest is trained on a 70% random split of the rows from
# the remaining fields of that state, and the other 30% is used for validation.
#
# Side effects: writes one pickled model and one X_train CSV per
# (state, field) pair into 'models/', and prints progress and metrics.
# Produces: `results` (dict keyed by (state, field)), `importance_df`,
# `scores_df`.

# Define the target variable and terrain attributes
target_variable = 'Ymean_times_One_plus_Ymean_minus_CV_over_Ymean_plus_CV_over_2'

terrain_attributes = ['Aspect_LP30',
 'L8_Band4',
 'L8_Band7',
 'L8_EVI',
 'L8_NDVI',
 'LP30',
 'SWI10',
 'TPI_LP30']

# The model / X_train files are written under 'models/'; create the directory
# up front so the first open(..., 'wb') does not fail on a fresh checkout.
os.makedirs('models', exist_ok=True)


def r2_and_rmse(y_true, y_pred):
    """Return the pair (R², RMSE) for predictions `y_pred` against `y_true`."""
    return r2_score(y_true, y_pred), np.sqrt(mean_squared_error(y_true, y_pred))


# Initialize the imputer (re-fitted from scratch for every held-out field)
imputer = SimpleImputer(strategy='mean')

# Store results, keyed by (state, field)
results = {}

# List of states
states = data['Site'].unique()

# Loop through each state
for state in states:
    print(f'Processing state {state}...')

    # Subset data for the current state
    state_data = data[data['Site'] == state]

    # List of fields within the state
    fields = state_data['Field'].unique()

    # With a single field, holding it out leaves no training rows at all and
    # the imputer / train_test_split would raise; skip such states explicitly.
    if len(fields) < 2:
        print(f'Skipping state {state}: leave-one-field-out needs at least two fields.')
        continue

    # Loop through each field within the state
    for field in fields:
        print(f'Processing field {field} in state {state}...')

        # Split data into training and test sets based on the field within the state
        is_held_out = state_data['Field'] == field
        train_data = state_data[~is_held_out]
        test_data = state_data[is_held_out]

        X_train_full = train_data[terrain_attributes]
        y_train_full = train_data[target_variable]
        X_test = test_data[terrain_attributes]
        y_test = test_data[target_variable]

        # Impute missing values using column means of the training fields only.
        # NOTE(review): the imputer is fitted before the train/validation split
        # below, so validation rows contribute to the imputation means.
        X_train_full = imputer.fit_transform(X_train_full)
        X_test = imputer.transform(X_test)

        # Further split the training data into training and validation sets (70/30 split).
        # NOTE(review): this is a random row split, so validation rows come from
        # the same fields as the training rows; only the test set is a truly
        # unseen field.
        X_train, X_val, y_train, y_val = train_test_split(
            X_train_full, y_train_full, test_size=0.3, random_state=42
        )

        # Initialize the Random Forest Regressor
        rf = RandomForestRegressor(n_estimators=300, random_state=42)

        # Fit the model
        rf.fit(X_train, y_train)

        # Save the model and X_train for future use
        with open(f'models/model_{target_variable}_state_{state}_field_{field}.pkl', 'wb') as model_file:
            pickle.dump(rf, model_file)

        pd.DataFrame(X_train).to_csv(f'models/X_train_{target_variable}_state_{state}_field_{field}.csv', index=False)

        # Perform 5-fold cross-validation on the training split.
        # cross_val_score fits clones of `rf`, so the fitted model above is the
        # one used for the predictions below.
        cv_scores = cross_val_score(rf, X_train, y_train, cv=5, scoring='r2')
        mean_cv_score = np.mean(cv_scores)

        # Predict on training, validation, and test sets
        y_train_pred = rf.predict(X_train)
        y_val_pred = rf.predict(X_val)
        y_test_pred = rf.predict(X_test)

        # Calculate R² and RMSE for training, validation, and test sets
        train_r2, train_rmse = r2_and_rmse(y_train, y_train_pred)
        val_r2, val_rmse = r2_and_rmse(y_val, y_val_pred)
        test_r2, test_rmse = r2_and_rmse(y_test, y_test_pred)

        # Store feature importances and scores
        results[(state, field)] = {
            'feature_importances': rf.feature_importances_,
            'cv_r2': mean_cv_score,
            'train_r2': train_r2,
            'train_rmse': train_rmse,
            'val_r2': val_r2,
            'val_rmse': val_rmse,
            'test_r2': test_r2,
            'test_rmse': test_rmse
        }

        print(f'Model for field {field} in state {state}:')
        print(f'Cross-Validation R²: {mean_cv_score}')
        print(f'Training R²: {train_r2}')
        print(f'Training RMSE: {train_rmse}')
        print(f'Validation R²: {val_r2}')
        print(f'Validation RMSE: {val_rmse}')
        print(f'Test R²: {test_r2}')
        print(f'Test RMSE: {test_rmse}')
        print('Feature Importances:', rf.feature_importances_)
        print()

# Convert results to DataFrames for easier visualization.
# importance_df: one column per (state, field), one row per terrain attribute.
importance_df = pd.DataFrame({
    state_field: scores['feature_importances'] for state_field, scores in results.items()
}, index=terrain_attributes)

# scores_df: one row per '<state>_<field>', one column per metric.
score_columns = {
    'CV_R2': 'cv_r2',
    'Train_R2': 'train_r2',
    'Train_RMSE': 'train_rmse',
    'Val_R2': 'val_r2',
    'Val_RMSE': 'val_rmse',
    'Test_R2': 'test_r2',
    'Test_RMSE': 'test_rmse',
}
scores_df = pd.DataFrame({
    column: {
        f'{state_name}_{field_name}': scores[metric]
        for (state_name, field_name), scores in results.items()
    }
    for column, metric in score_columns.items()
})

Processing state CO...
Processing field 13-N1 in state CO...
Model for field 13-N1 in state CO:
Cross-Validation R²: 0.8592023456059747
Training R²: 0.9750498829829825
Training RMSE: 0.030553994292099446
Validation R²: 0.8750127713092178
Validation RMSE: 0.06869594134284182
Test R²: -0.4600267746721378
Test RMSE: 0.21538614079916735
Feature Importances: [0.07854687 0.02656597 0.06196694 0.51111942 0.01957768 0.11075065
 0.0692706  0.12220186]

Processing field 13-N2 in state CO...
Model for field 13-N2 in state CO:
Cross-Validation R²: 0.8814949061212166
Training R²: 0.9811534086392637
Training RMSE: 0.024436210577340807
Validation R²: 0.8922028987886964
Validation RMSE: 0.058716595938946334
Test R²: -0.42523683443359617
Test RMSE: 0.23124852464927015
Feature Importances: [0.10185244 0.03806765 0.34936669 0.04809652 0.16832993 0.11284283
 0.09465646 0.08678749]

Processing state MD...
Processing field Hevelow5 in state MD...
Model for field Hevelow5 in state MD:
Cross-Validation R²: 0.

In [10]:
# Write both summary tables to CSV in the working directory.
# Alternative output names left disabled in the notebook:
#   importance_df.to_csv('feature_importances_leave_one_field_out_ndvi.csv')
#   scores_df.to_csv('r2_rmse_scores_leave_one_field_out_ndvi.csv')
for frame, csv_name in (
    (importance_df, 'feature_importances_leave_one_field_out_simple.csv'),
    (scores_df, 'r2_rmse_scores_leave_one_field_out_simple.csv'),
):
    frame.to_csv(csv_name)

print('Feature importances and R² and RMSE scores saved to CSV files.')

Feature importances and R² and RMSE scores saved to CSV files.
