# Leave-one-field-out linear regression predictions

In [1]:
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
import numpy as np
import pandas as pd
import pickle

In [2]:
data = pd.read_csv('full_dataset.csv') 
print(data.shape)

(92488, 155)


In [3]:
# Define the target variable and terrain attributes
target_variable = 'Ymean_times_One_plus_Ymean_minus_CV_over_Ymean_plus_CV_over_2'

terrain_attributes = [
    'NDVI_avg',
    'Soil_om_15-30cm',
    'SWI10',
    'PosOpen_LP30',
    'TPI_LP30',
    'Aspect_LP30'
]

# Initialize the imputer
imputer = SimpleImputer(strategy='mean')

# Store results
results = {}

# List of fields
fields = data['Field'].unique()

# Loop through each field
for field in fields:
    print(f'Processing field {field}...')
    # Split data into training and test sets based on the field
    train_data = data[data['Field'] != field]
    test_data = data[data['Field'] == field]
    
    X_train_full = train_data[terrain_attributes]
    y_train_full = train_data[target_variable]
    X_test = test_data[terrain_attributes]
    y_test = test_data[target_variable]

    # Impute missing values
    X_train_full = imputer.fit_transform(X_train_full)
    X_test = imputer.transform(X_test)

    # Further split the training data into training and validation sets (70/30 split)
    X_train, X_val, y_train, y_val = train_test_split(X_train_full, y_train_full, test_size=0.3, random_state=42)

    # Initialize the Linear Regression model
    lr = LinearRegression()

    # Fit the model
    lr.fit(X_train, y_train)

    # Save the model and X_train for future use
    with open(f'models/model_{target_variable}_field_{field}.pkl', 'wb') as model_file:
        pickle.dump(lr, model_file)
    
    pd.DataFrame(X_train).to_csv(f'models/X_train_{target_variable}_field_{field}.csv', index=False)

    # Perform cross-validation
    cv_scores = cross_val_score(lr, X_train, y_train, cv=5, scoring='r2')
    mean_cv_score = np.mean(cv_scores)

    # Predict on training, validation, and test sets
    y_train_pred = lr.predict(X_train)
    y_val_pred = lr.predict(X_val)
    y_test_pred = lr.predict(X_test)
    
    # Calculate R² and RMSE for training, validation, and test sets
    train_r2 = r2_score(y_train, y_train_pred) if len(np.unique(y_train)) > 1 else np.nan
    train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
    val_r2 = r2_score(y_val, y_val_pred) if len(np.unique(y_val)) > 1 else np.nan
    val_rmse = np.sqrt(mean_squared_error(y_val, y_val_pred))
    test_r2 = r2_score(y_test, y_test_pred) if len(np.unique(y_test)) > 1 else np.nan
    test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))

    # Store results
    results[field] = {
        'cv_r2': mean_cv_score,
        'train_r2': train_r2,
        'train_rmse': train_rmse,
        'val_r2': val_r2,
        'val_rmse': val_rmse,
        'test_r2': test_r2,
        'test_rmse': test_rmse
    }

    print(f'Model for field {field}:')
    print(f'Cross-Validation R²: {mean_cv_score}')
    print(f'Training R²: {train_r2}')
    print(f'Training RMSE: {train_rmse}')
    print(f'Validation R²: {val_r2}')
    print(f'Validation RMSE: {val_rmse}')
    print(f'Test R²: {test_r2}')
    print(f'Test RMSE: {test_rmse}')
    print()

# Convert results to DataFrames for easier visualization
scores_df = pd.DataFrame({
    'CV_R2': {field: results[field]['cv_r2'] for field in fields},
    'Train_R2': {field: results[field]['train_r2'] for field in fields},
    'Train_RMSE': {field: results[field]['train_rmse'] for field in fields},
    'Val_R2': {field: results[field]['val_r2'] for field in fields},
    'Val_RMSE': {field: results[field]['val_rmse'] for field in fields},
    'Test_R2': {field: results[field]['test_r2'] for field in fields},
    'Test_RMSE': {field: results[field]['test_rmse'] for field in fields}
})

Processing field 13-N1...
Model for field 13-N1:
Cross-Validation R²: 0.22937987649947128
Training R²: 0.23040646293684242
Training RMSE: 0.11424976893512473
Validation R²: 0.2300524346405105
Validation RMSE: 0.11407076088201547
Test R²: 0.08495686695347449
Test RMSE: 0.1142042840398099

Processing field 13-N2...
Model for field 13-N2:
Cross-Validation R²: 0.4181262760930517
Training R²: 0.4193060207844215
Training RMSE: 0.10866013454823381
Validation R²: 0.4242397316534321
Validation RMSE: 0.10779885142356802
Test R²: -0.002962815205790559
Test RMSE: 0.1208688499799133

Processing field Hevelow5...
Model for field Hevelow5:
Cross-Validation R²: 0.34341962357189215
Training R²: 0.3436516133285096
Training RMSE: 0.10432304780295858
Validation R²: 0.33721007587361507
Validation RMSE: 0.10482064673487257
Test R²: -0.23874466708312303
Test RMSE: 0.16929539112550523

Processing field Kennedyville2...
Model for field Kennedyville2:
Cross-Validation R²: 0.3643947322482261
Training R²: 0.36621

In [10]:
# Save the results to CSV files
# importance_df.to_csv('feature_importances_leave_one_field_out_ndvi.csv')
# scores_df.to_csv('r2_rmse_scores_leave_one_field_out_ndvi.csv')

importance_df.to_csv('feature_importances_leave_one_field_out_simple_regression.csv')
scores_df.to_csv('r2_rmse_scores_leave_one_field_out_simple_regression.csv')

print('Feature importances and R² and RMSE scores saved to CSV files.')

Feature importances and R² and RMSE scores saved to CSV files.
