In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error, r2_score

In [2]:
df = pd.read_excel("C:/Users/aray28/Desktop/SOC_META/SOC_REQ.xlsx")

In [3]:
y = df["SOC_lnRR"]
X = df.drop(columns=["SOC_lnRR"], errors="ignore")

In [5]:
# Define categorical and numerical features
categorical_cols = ["ClimateZone", "Soil_Type", "BIO_Type", "Treatment", "Crop_type"]
numerical_cols = [col for col in X.columns if col not in categorical_cols]

# Define preprocessing pipelines
numeric_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="mean"))
])

categorical_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False))  # ✅ fixed argument
])

# Combine into ColumnTransformer
preprocessor = ColumnTransformer([
    ("num", numeric_pipeline, numerical_cols),
    ("cat", categorical_pipeline, categorical_cols)
])

# Build full pipeline with ExtraTreesRegressor
model_pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("regressor", ExtraTreesRegressor(n_estimators=300, random_state=42))
])

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit model
model_pipeline.fit(X_train, y_train)

# Predict
y_pred = model_pipeline.predict(X_test)

# Evaluate
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"MAE: {mae:.4f}")
print(f"R² Score: {r2:.4f}")

MAE: 0.0803
R² Score: 0.8095


In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error, r2_score

from scipy.stats import randint


In [7]:
df = pd.read_excel("C:/Users/aray28/Desktop/SOC_META/SOC_REQ.xlsx")

In [8]:
y = df["SOC_lnRR"]
X = df.drop(columns=["SOC_lnRR"], errors="ignore")

In [9]:
# Define categorical and numerical features
categorical_cols = ["ClimateZone", "Soil_Type", "BIO_Type", "Treatment", "Crop_type"]
numerical_cols = [col for col in X.columns if col not in categorical_cols]

# Preprocessing
numeric_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="mean"))
])

categorical_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
])

preprocessor = ColumnTransformer([
    ("num", numeric_pipeline, numerical_cols),
    ("cat", categorical_pipeline, categorical_cols)
])

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Base model
base_model = Pipeline([
    ("preprocessor", preprocessor),
    ("regressor", ExtraTreesRegressor(random_state=42))
])

# Hyperparameter grid
param_dist = {
    "regressor__n_estimators": randint(200, 600),
    "regressor__max_depth": [None] + list(range(5, 30)),
    "regressor__min_samples_split": randint(2, 10),
    "regressor__min_samples_leaf": randint(1, 6),
    "regressor__max_features": ['sqrt', 'log2', None]
}

# Randomized search
random_search = RandomizedSearchCV(
    base_model,
    param_distributions=param_dist,
    n_iter=30,
    cv=5,
    scoring='r2',
    n_jobs=-1,
    verbose=1,
    random_state=42
)

# Fit
random_search.fit(X_train, y_train)

# Predict
y_pred = random_search.predict(X_test)

# Evaluate
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Best Params: {random_search.best_params_}")
print(f"MAE: {mae:.4f}")
print(f"R² Score: {r2:.4f}")

Fitting 5 folds for each of 30 candidates, totalling 150 fits
Best Params: {'regressor__max_depth': 24, 'regressor__max_features': None, 'regressor__min_samples_leaf': 2, 'regressor__min_samples_split': 4, 'regressor__n_estimators': 414}
MAE: 0.0792
R² Score: 0.8215


In [10]:
from sklearn.ensemble import VotingRegressor
from sklearn.ensemble import ExtraTreesRegressor, HistGradientBoostingRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

# Use best ExtraTrees params
et_model = ExtraTreesRegressor(
    n_estimators=414,
    max_depth=24,
    min_samples_split=4,
    min_samples_leaf=2,
    max_features=None,
    random_state=42
)

# LightGBM
lgbm_model = LGBMRegressor(
    n_estimators=400,
    learning_rate=0.05,
    max_depth=8,
    random_state=42
)

# CatBoost
cat_model = CatBoostRegressor(
    iterations=400,
    depth=6,
    learning_rate=0.05,
    random_state=42,
    verbose=0
)

# HistGradientBoosting (optional alternative)
hgb_model = HistGradientBoostingRegressor(
    max_iter=400,
    learning_rate=0.05,
    max_depth=8,
    random_state=42
)

# Create Voting Regressor Ensemble
ensemble_model = Pipeline([
    ("preprocessor", preprocessor),
    ("regressor", VotingRegressor(estimators=[
        ('et', et_model),
        ('lgbm', lgbm_model),
        ('cat', cat_model)
        # you can also add ('hgb', hgb_model) if needed
    ]))
])

# Fit ensemble
ensemble_model.fit(X_train, y_train)
y_pred_ens = ensemble_model.predict(X_test)

# Evaluate
mae = mean_absolute_error(y_test, y_pred_ens)
r2 = r2_score(y_test, y_pred_ens)

print(f"Ensemble MAE: {mae:.4f}")
print(f"Ensemble R²: {r2:.4f}")


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000488 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 685
[LightGBM] [Info] Number of data points in the train set: 639, number of used features: 33
[LightGBM] [Info] Start training from score 0.282372
Ensemble MAE: 0.0732
Ensemble R²: 0.8551


In [11]:
import optuna
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import ExtraTreesRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
import pandas as pd

In [12]:
df = pd.read_excel("C:/Users/aray28/Desktop/SOC_META/SOC_REQ.xlsx")

In [13]:
y = df["SOC_lnRR"]
X = df.drop(columns=["SOC_lnRR"])

In [15]:
categorical_cols = ["ClimateZone", "Soil_Type", "BIO_Type", "Crop_type"]
numerical_cols = [col for col in X.columns if col not in categorical_cols]

# Fill missing values
X[categorical_cols] = X[categorical_cols].fillna("Missing")
for col in numerical_cols:
    X[col] = pd.to_numeric(X[col], errors="coerce")
    X[col] = X[col].fillna(X[col].median())

# Preprocessing pipeline
numeric_pipeline = Pipeline([("imputer", SimpleImputer(strategy="mean"))])
categorical_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
])
preprocessor = ColumnTransformer([
    ("num", numeric_pipeline, numerical_cols),
    ("cat", categorical_pipeline, categorical_cols)
])

def tune_model(trial, model_name):
    if model_name == "ExtraTrees":
        model = ExtraTreesRegressor(
            n_estimators=trial.suggest_int("n_estimators", 100, 500),
            max_depth=trial.suggest_int("max_depth", 5, 30),
            min_samples_split=trial.suggest_int("min_samples_split", 2, 10),
            min_samples_leaf=trial.suggest_int("min_samples_leaf", 1, 5),
            random_state=42,
            n_jobs=-1
        )
    elif model_name == "LGBM":
        model = LGBMRegressor(
            n_estimators=trial.suggest_int("n_estimators", 100, 500),
            max_depth=trial.suggest_int("max_depth", 5, 30),
            num_leaves=trial.suggest_int("num_leaves", 20, 150),
            learning_rate=trial.suggest_float("learning_rate", 0.01, 0.3),
            subsample=trial.suggest_float("subsample", 0.5, 1.0),
            colsample_bytree=trial.suggest_float("colsample_bytree", 0.5, 1.0),
            random_state=42,
            n_jobs=-1
        )
    elif model_name == "CatBoost":
        model = CatBoostRegressor(
            iterations=trial.suggest_int("iterations", 100, 500),
            depth=trial.suggest_int("depth", 4, 10),
            learning_rate=trial.suggest_float("learning_rate", 0.01, 0.3),
            l2_leaf_reg=trial.suggest_float("l2_leaf_reg", 1, 10),
            verbose=0,
            random_state=42
        )

    pipeline = Pipeline([
        ("preprocessor", preprocessor),
        ("regressor", model)
    ])
    score = cross_val_score(pipeline, X, y, scoring="r2", cv=5, n_jobs=-1)
    return score.mean()

# Tune all models
study_et = optuna.create_study(direction="maximize", study_name="ExtraTrees")
study_et.optimize(lambda trial: tune_model(trial, "ExtraTrees"), n_trials=30)

study_lgbm = optuna.create_study(direction="maximize", study_name="LGBM")
study_lgbm.optimize(lambda trial: tune_model(trial, "LGBM"), n_trials=30)

study_cat = optuna.create_study(direction="maximize", study_name="CatBoost")
study_cat.optimize(lambda trial: tune_model(trial, "CatBoost"), n_trials=30)

print("✅ ExtraTrees Best Params:", study_et.best_params)
print("✅ LGBM Best Params:", study_lgbm.best_params)
print("✅ CatBoost Best Params:", study_cat.best_params)

[I 2025-04-13 14:32:28,210] A new study created in memory with name: ExtraTrees
[I 2025-04-13 14:32:36,649] Trial 0 finished with value: 0.1729689886060612 and parameters: {'n_estimators': 490, 'max_depth': 29, 'min_samples_split': 8, 'min_samples_leaf': 4}. Best is trial 0 with value: 0.1729689886060612.
[I 2025-04-13 14:32:40,066] Trial 1 finished with value: 0.1595602290480327 and parameters: {'n_estimators': 145, 'max_depth': 11, 'min_samples_split': 3, 'min_samples_leaf': 2}. Best is trial 0 with value: 0.1729689886060612.
[I 2025-04-13 14:32:40,698] Trial 2 finished with value: 0.1561588020221879 and parameters: {'n_estimators': 165, 'max_depth': 18, 'min_samples_split': 9, 'min_samples_leaf': 1}. Best is trial 0 with value: 0.1729689886060612.
[I 2025-04-13 14:32:41,794] Trial 3 finished with value: 0.12925675719401716 and parameters: {'n_estimators': 460, 'max_depth': 8, 'min_samples_split': 8, 'min_samples_leaf': 5}. Best is trial 0 with value: 0.1729689886060612.
[I 2025-04-1

✅ ExtraTrees Best Params: {'n_estimators': 207, 'max_depth': 7, 'min_samples_split': 7, 'min_samples_leaf': 3}
✅ LGBM Best Params: {'n_estimators': 318, 'max_depth': 21, 'num_leaves': 83, 'learning_rate': 0.010259675860639727, 'subsample': 0.6796568574188429, 'colsample_bytree': 0.5133849702171432}
✅ CatBoost Best Params: {'iterations': 233, 'depth': 7, 'learning_rate': 0.2661936925785558, 'l2_leaf_reg': 2.14690636266171}


In [18]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import ExtraTreesRegressor, StackingRegressor
from sklearn.metrics import mean_absolute_error, r2_score
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

# === 1. Load Data ===
df = pd.read_excel("C:/Users/aray28/Desktop/SOC_META/SOC_REQ.xlsx")
y = df["SOC_lnRR"]
X = df.drop(columns=["SOC_lnRR"])

# === 2. Define Categorical Columns ===
categorical_cols = ["ClimateZone", "Soil_Type", "BIO_Type", "Crop_type"]
numerical_cols = [col for col in X.columns if col not in categorical_cols]

# Fill missing values
X[categorical_cols] = X[categorical_cols].fillna("Missing")
for col in numerical_cols:
    X[col] = pd.to_numeric(X[col], errors="coerce")
    X[col] = X[col].fillna(X[col].median())


# Fill missing values
X[categorical_cols] = X[categorical_cols].fillna("Missing")
for col in numerical_cols:
    X[col] = pd.to_numeric(X[col], errors="coerce")
    X[col] = X[col].fillna(X[col].median())
    
# === 4. Preprocessing Pipeline ===
numeric_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="mean"))
])

categorical_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
])

preprocessor = ColumnTransformer([
    ("num", numeric_pipeline, numerical_cols),
    ("cat", categorical_pipeline, categorical_cols)
])

# === 5. Define Tuned Base Models ===
et_model = ExtraTreesRegressor(
    n_estimators=207,
    max_depth=7,
    min_samples_split=7,
    min_samples_leaf=3,
    random_state=42
)

lgbm_model = LGBMRegressor(
    n_estimators=318,
    max_depth=21,
    num_leaves=83,
    learning_rate=0.01026,
    subsample=0.6796,
    colsample_bytree=0.5133,
    random_state=42
)

cat_model = CatBoostRegressor(
    iterations=233,
    depth=7,
    learning_rate=0.2661,
    l2_leaf_reg=2.1469,
    verbose=0,
    random_state=42
)

# === 6. Stacking Ensemble ===
ensemble_model = StackingRegressor(
    estimators=[
        ("et", et_model),
        ("lgbm", lgbm_model),
        ("cat", cat_model)
    ],
    final_estimator=ExtraTreesRegressor(n_estimators=100, random_state=42),
    passthrough=True,
    n_jobs=-1
)

# === 7. Full Pipeline ===
model_pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("regressor", ensemble_model)
])

# === 8. Train/Test Split ===
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

# === 9. Fit and Predict ===
model_pipeline.fit(X_train, y_train)
y_pred = model_pipeline.predict(X_test)

# === 10. Evaluation ===
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"🔍 Ensemble MAE: {mae:.4f}")
print(f"✅ Ensemble R²: {r2:.4f}")    



🔍 Ensemble MAE: 0.0805
✅ Ensemble R²: 0.8106




In [19]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error, r2_score

from sklearn.ensemble import ExtraTreesRegressor, StackingRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

# === Load Data ===
# === 1. Load Data ===
df = pd.read_excel("C:/Users/aray28/Desktop/SOC_META/SOC_REQ.xlsx")
y = df["SOC_lnRR"]
X = df.drop(columns=["SOC_lnRR"])

# === 2. Define Categorical Columns ===
categorical_cols = ["ClimateZone", "Soil_Type", "BIO_Type", "Crop_type"]
numerical_cols = [col for col in X.columns if col not in categorical_cols]

# Fill missing values
X[categorical_cols] = X[categorical_cols].fillna("Missing")
for col in numerical_cols:
    X[col] = pd.to_numeric(X[col], errors="coerce")
    X[col] = X[col].fillna(X[col].median())

# === Preprocessing Pipelines ===
numeric_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="mean"))
])

categorical_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
])

preprocessor = ColumnTransformer([
    ("num", numeric_pipeline, numerical_cols),
    ("cat", categorical_pipeline, categorical_cols)
])

# === Base Models with Optuna-Tuned Parameters ===
et_model = ExtraTreesRegressor(
    n_estimators=207,
    max_depth=7,
    min_samples_split=7,
    min_samples_leaf=3,
    random_state=42
)

lgbm_model = LGBMRegressor(
    n_estimators=318,
    max_depth=21,
    num_leaves=83,
    learning_rate=0.01026,
    subsample=0.6796,
    colsample_bytree=0.5133,
    random_state=42
)

cat_model = CatBoostRegressor(
    iterations=233,
    depth=7,
    learning_rate=0.2661,
    l2_leaf_reg=2.1469,
    verbose=0,
    random_state=42
)

# === Stacking Regressor ===
stacking_model = StackingRegressor(
    estimators=[
        ('et', et_model),
        ('lgbm', lgbm_model),
        ('cat', cat_model)
    ],
    final_estimator=ExtraTreesRegressor(n_estimators=100, random_state=42),
    passthrough=True,
    n_jobs=-1
)

# === Full Pipeline ===
stack_pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("regressor", stacking_model)
])

# === Train/Test Split ===
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

# === Train Model ===
stack_pipeline.fit(X_train, y_train)

# === Predict & Evaluate ===
y_pred = stack_pipeline.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"📊 Stacked Regressor MAE: {mae:.4f}")
print(f"✅ Stacked Regressor R²: {r2:.4f}")



📊 Stacked Regressor MAE: 0.0805
✅ Stacked Regressor R²: 0.8106


