In [5]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, KFold
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, accuracy_score
import matplotlib.pyplot as plt

data = pd.read_csv('../data/Ground Water .csv')


In [7]:

# Fill NaN values in numeric columns with median
numeric_columns = data.select_dtypes(include='number').columns
data[numeric_columns] = data[numeric_columns].fillna(data[numeric_columns].median())

# Handle outliers using quantile clipping
numeric_data = data.select_dtypes(include=[np.number])
data[numeric_data.columns] = numeric_data.clip(
    lower=numeric_data.quantile(0.01), 
    upper=numeric_data.quantile(0.99), 
    axis=1
)

# Convert categorical variables to dummy variables
data = pd.get_dummies(data)

# Scale the features
scaler = StandardScaler()
data[data.select_dtypes(include=['float64']).columns] = scaler.fit_transform(
    data.select_dtypes(include=['float64'])
)

# Drop specified columns and split features/target
data = data.drop(data.columns[[1, 2]], axis=1)
X = data.iloc[:,:-3]
y = data.iloc[:, -1]

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# Initialize Random Forest model
rf_model = RandomForestRegressor(
    n_estimators=100,
    max_depth=10,
    random_state=42,
    n_jobs=-1
)

# Perform K-fold cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)
mse_scores = []
accuracy_scores = []
rmse_scores = []

for fold, (train_index, val_index) in enumerate(kf.split(X_train), 1):
    # Split data for this fold
    X_train_fold = X_train.iloc[train_index]
    X_val_fold = X_train.iloc[val_index]
    y_train_fold = y_train.iloc[train_index]
    y_val_fold = y_train.iloc[val_index]
    
    # Train the model
    rf_model.fit(X_train_fold, y_train_fold)
    
    # Make predictions
    y_pred = rf_model.predict(X_val_fold)
    
    # Calculate metrics
    mse = mean_squared_error(y_val_fold, y_pred)
    rmse = np.sqrt(mse)
    
    # Convert predictions to binary for accuracy calculation
    y_pred_binary = (y_pred >= 0.5).astype(int)
    y_val_binary = (y_val_fold >= 0.5).astype(int)
    acc = accuracy_score(y_val_binary, y_pred_binary) * 100
    
    # Store scores
    mse_scores.append(mse)
    accuracy_scores.append(acc)
    rmse_scores.append(rmse)
    
    print(f"\nFold {fold}:")
    print(f"MSE: {mse:.4f}")
    print(f"RMSE: {rmse:.4f}")
   


Fold 1:
MSE: 0.0105
RMSE: 0.1022

Fold 2:
MSE: 0.0006
RMSE: 0.0245

Fold 3:
MSE: 0.0000
RMSE: 0.0053

Fold 4:
MSE: 0.0076
RMSE: 0.0871

Fold 5:
MSE: 0.0009
RMSE: 0.0305
