In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error
import shap

# Assuming X and Y are your features and target variable DataFrames
X = pd.read_csv('../Data11train.csv')  # Replace with your features file path
Y = pd.read_csv('../Data11test.csv')    # Replace with your target file path

best_r2_score = -np.inf
best_mae = np.inf
best_random_state = None

# SHAP values storage
shap_values_all = []
explainer = None

for random_state in range(1000, 1010):
    kfold = KFold(n_splits=5, shuffle=True, random_state=random_state)
    model = make_pipeline(StandardScaler(), GradientBoostingRegressor(random_state=random_state))
    
    r2_scores = []
    maes = []

    for train_index, test_index in kfold.split(X):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = Y.iloc[train_index], Y.iloc[test_index]

        model.fit(X_train, y_train)
        predictions = model.predict(X_test)

        r2 = model.score(X_test, y_test)
        mae = mean_absolute_error(y_test, predictions)

        r2_scores.append(r2)
        maes.append(mae)

        # SHAP analysis
        if explainer is None:
            explainer = shap.Explainer(model.steps[-1][1], X_train)
        
        shap_values = explainer(X_test)
        shap_values_all.append(shap_values)
        
        # Plotting the SHAP values
        shap.summary_plot(shap_values, X_test)

    average_r2_score = np.mean(r2_scores)
    average_mae = np.mean(maes)

    if average_r2_score > best_r2_score:
        best_r2_score = average_r2_score
        best_mae = average_mae
        best_random_state = random_state

print(f"Best Average R² Score: {best_r2_score} for Random State: {best_random_state}")
print(f"Corresponding MAE: {best_mae}")

# Optional: Aggregate and plot SHAP values for all folds
all_shap_values = np.concatenate([shap_values.values for shap_values in shap_values_all], axis=0)
all_features = pd.concat([X.iloc[test_index] for _, test_index in kfold.split(X)], axis=0)
shap.summary_plot(all_shap_values, all_features)


IndexError: positional indexers are out-of-bounds