In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.utils import Bunch

In [None]:
# Load the complete "accepted loans" CSV (every column, every row) into `df`.
# NOTE(review): the next cell re-reads this same file in chunks and rebinds `df`,
# so this full load looks redundant — confirm before removing it.
df = pd.read_csv("/home/athyrson/Code/Data/Raw Data/Lending Club/accepted_2007_to_2018Q4.csv")

In [None]:
# Columns to load, kept in three groups that are concatenated below.
la = ["issue_d", "loan_amnt", "funded_amnt", "funded_amnt_inv", "term", "int_rate"]
lb = [
    "installment", "emp_length", "annual_inc", "verification_status", "loan_status",
    "purpose", "addr_state", "dti", "delinq_2yrs",
]
lc = [
    "inq_last_6mths", "open_acc", "home_ownership", "revol_bal", "revol_util",
    "total_acc", "total_pymnt", "total_rec_prncp", "total_rec_int", "total_pymnt_inv",
    "last_pymnt_amnt", "last_fico_range_high", "last_fico_range_low",
]

selected_columns_a = [*la, *lb, *lc]

# Rows held in memory per read; tune to the available RAM.
chunksize = 100000

# Per-chunk results, concatenated once the whole file has been streamed.
filtered_chunks = []

# Stream the CSV with only the selected columns and keep the rows whose
# issue date mentions 2009 (missing dates count as "no match").
chunk_reader = pd.read_csv(
    "/home/athyrson/Code/Data/Raw Data/Lending Club/accepted_2007_to_2018Q4.csv",
    chunksize=chunksize,
    usecols=selected_columns_a,
)
for chunk in chunk_reader:
    issued_in_2009 = chunk['issue_d'].str.contains("2009", na=False)
    filtered_chunk = chunk[issued_in_2009]
    filtered_chunks.append(filtered_chunk)

# Single DataFrame holding every retained row (original row labels are kept).
df = pd.concat(filtered_chunks)


In [None]:
# Draw a reproducible (seed 42) 5% random sample of the filtered loans and
# display its (rows, columns) shape.
df_1 = df.sample(frac=0.05, random_state=42)
df_1.shape

In [None]:
import re

def preprocess_data(df):
    """
    Build the modelling frame from raw Lending Club loan records.

    Steps:
        1. ``risk_score``: row-wise mean of ``last_fico_range_high`` and
           ``last_fico_range_low``.
        2. ``target``: 0 for the four "performing" loan statuses listed below,
           1 for every other status.
        3. Keep only the columns based on Shih et al. (2022).
        4. Convert ``term`` (e.g. " 36 months") and ``emp_length``
           (e.g. "10+ years", "< 1 year") to integers.
        5. Drop every row that still has a missing value.

    The input frame is never modified; all work happens on a copy.

    Parameters:
        df (pd.DataFrame): Raw loans. Must contain ``last_fico_range_high``,
            ``last_fico_range_low``, ``loan_status`` and every column named in
            ``pearson_a`` except ``risk_score`` and ``target``.

    Returns:
        pd.DataFrame: New frame with the ``pearson_a`` columns, no missing
        values, and ``term`` / ``emp_length`` stored as int64.

    Raises:
        KeyError: If a required column is absent.
    """
    # Copy first: the caller usually passes a sample/slice of a bigger frame and
    # must not silently gain 'risk_score' / 'target' columns.
    df = df.copy()

    # Calculate risk score
    df["risk_score"] = df[["last_fico_range_high", "last_fico_range_low"]].mean(axis=1)

    # Create target variable: anything that is not one of these statuses is
    # labelled 1. NOTE(review): a missing loan_status therefore also becomes 1 —
    # confirm that is intended.
    non_default_statuses = [
        'Current',
        'Fully Paid',
        'Issued',
        'Does not meet the credit policy. Status:Fully Paid',
    ]
    df["target"] = np.where(df["loan_status"].isin(non_default_statuses), 0, 1)

    # Columns based on Shih et al. (2022); selecting them also discards the raw
    # FICO bounds and loan_status used above.
    pearson_a = ['int_rate', 'dti', 'delinq_2yrs', 'emp_length', 'annual_inc', 'inq_last_6mths', 'term',
                 'home_ownership', 'revol_util', 'risk_score', 'issue_d', 'target']
    df = df.loc[:, pearson_a]

    def _first_integer(series):
        """Return the first run of digits in each value as a number (NaN when there is none)."""
        digits = series.astype(str).str.extract(r"(\d+)", expand=False)
        return pd.to_numeric(digits, errors="coerce")

    # '< 1 year' must map to 0; plain digit extraction would turn it into 1.
    emp_length = df["emp_length"]
    emp_length = emp_length.mask(emp_length == '< 1 year', "0")

    # Values without any digits become NaN here and are removed by dropna below,
    # instead of aborting the conversion and leaving the column as strings.
    df = df.assign(
        term=_first_integer(df["term"]),
        emp_length=_first_integer(emp_length),
    )

    df = df.dropna(axis=0)

    # No NaN is left, so both columns can be stored as true integers.
    return df.astype({"term": "int64", "emp_length": "int64"})

In [None]:
# Run the preprocessing pipeline on the 5% sample to obtain the modelling frame.
df_2 = preprocess_data(df_1)

In [None]:
# Display the preprocessed frame for a visual sanity check.
df_2

In [None]:
# Keep only the risk score and the binary target for the single-feature
# analysis in the following cells.
risk_df = df_2.loc[:, ['risk_score', 'target']]

In [None]:
import matplotlib.pyplot as plt

# Side-by-side histograms of the risk score, one panel per target class.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

for panel, target_value, bar_colour in ((ax1, 0, 'blue'), (ax2, 1, 'red')):
    class_scores = risk_df[risk_df['target'] == target_value]['risk_score']
    panel.hist(
        class_scores,
        bins=60,
        edgecolor='black',
        alpha=0.7,
        color=bar_colour,
        label=f'Target {target_value}',
    )
    panel.set_xlabel('Risk Score')
    panel.set_ylabel('Frequency')
    panel.set_title(f'Histogram of Risk Scores (Target = {target_value})')
    panel.legend()

plt.tight_layout()  # keep the two panels from overlapping
plt.show()

In [None]:
from sklearn.metrics import roc_auc_score, balanced_accuracy_score, recall_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier

# Function to evaluate classifiers
def evaluate_classifiers(X_train, X_test, y_train, y_test):
    """
    Fit three baseline classifiers and score each one on the held-out split.

    Parameters:
        X_train, y_train: Training features and labels.
        X_test, y_test: Held-out features and labels.

    Returns:
        dict: ``{classifier name: {'AUC', 'Balanced Accuracy', 'Recall'}}``.
        AUC is computed from the positive-class probability, the other two
        metrics from the hard predictions.
    """
    classifiers = {
        'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
        'Decision Tree': DecisionTreeClassifier(max_depth=5, random_state=42),
        'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, learning_rate=0.01, max_depth=3, random_state=42)
    }

    results = {}
    for clf_name, clf in classifiers.items():
        clf.fit(X_train, y_train)
        positive_proba = clf.predict_proba(X_test)[:, 1]
        hard_labels = clf.predict(X_test)

        results[clf_name] = {
            'AUC': roc_auc_score(y_test, positive_proba),
            'Balanced Accuracy': balanced_accuracy_score(y_test, hard_labels),
            'Recall': recall_score(y_test, hard_labels),
        }

    return results

# Variant 1: shift the risk score so that its minimum is zero.
risk_df_zero_min = risk_df.assign(
    risk_score=risk_df['risk_score'] - risk_df['risk_score'].min()
)

# Variant 2: keep only the rows whose risk score is above 500.
risk_df_500_min = risk_df.loc[risk_df['risk_score'] > 500].copy()

# 80/20 split of each variant, same seed for both.
X_zero_min, y_zero_min = risk_df_zero_min[['risk_score']], risk_df_zero_min['target']
(X_train_zero_min, X_test_zero_min,
 y_train_zero_min, y_test_zero_min) = train_test_split(
    X_zero_min, y_zero_min, test_size=0.2, random_state=42
)

X_500_min, y_500_min = risk_df_500_min[['risk_score']], risk_df_500_min['target']
(X_train_500_min, X_test_500_min,
 y_train_500_min, y_test_500_min) = train_test_split(
    X_500_min, y_500_min, test_size=0.2, random_state=42
)

# Score the baseline classifiers on each variant.
results_zero_min = evaluate_classifiers(X_train_zero_min, X_test_zero_min, y_train_zero_min, y_test_zero_min)
results_500_min = evaluate_classifiers(X_train_500_min, X_test_500_min, y_train_500_min, y_test_500_min)

# Print both result tables with a shared layout.
report_sections = (
    ("Evaluation Metrics with Risk Score Min = 0:", results_zero_min),
    ("\nEvaluation Metrics with Risk Score > 500:", results_500_min),
)
for section_heading, section_results in report_sections:
    print(section_heading)
    for clf_name, metrics in section_results.items():
        print(f"{clf_name}:")
        for metric_name, score in metrics.items():
            print(f"  {metric_name}: {score:.4f}")

In [None]:
# Column means of both variants: the average risk score and the average of the
# 0/1 target (i.e. the share of rows labelled 1).
risk_df_500_min.mean(), risk_df_zero_min.mean()

In [None]:
import matplotlib.pyplot as plt

# Same two-panel histogram as for the full data, restricted to risk_df_500_min.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

for panel, target_value, bar_colour in ((ax1, 0, 'blue'), (ax2, 1, 'red')):
    class_scores = risk_df_500_min[risk_df_500_min['target'] == target_value]['risk_score']
    panel.hist(
        class_scores,
        bins=60,
        edgecolor='black',
        alpha=0.7,
        color=bar_colour,
        label=f'Target {target_value}',
    )
    panel.set_xlabel('Risk Score')
    panel.set_ylabel('Frequency')
    panel.set_title(f'Histogram of Risk Scores (Target = {target_value})')
    panel.legend()

plt.tight_layout()  # keep the two panels from overlapping
plt.show()

In [None]:
# Feature matrix: every column except the target, the issue date, the
# categorical home-ownership flag and the FICO-derived risk score.
excluded_columns = ['issue_d', 'home_ownership', 'risk_score']
is_feature_column = df_2.columns != 'target'
X = df_2.loc[:, is_feature_column].drop(excluded_columns, axis=1)

# Binary label.
y = df_2['target']

In [None]:
def create_bunch_df(X,y):
    """
    Package a feature frame and its target as a scikit-learn ``Bunch``.

    Parameters:
        X: Feature DataFrame (needs ``to_numpy()`` and ``columns``).
        y: Target Series (needs ``to_numpy()``).

    Returns:
        Bunch with ``data`` and ``target`` as NumPy arrays, ``feature_names``
        as a list of the column labels of ``X``, and a fixed ``DESCR`` string.
    """
    return Bunch(
        data=X.to_numpy(),
        target=y.to_numpy(),
        feature_names=X.columns.tolist(),
        DESCR="Custom dataset similar to scikit-learn fetch_datasets",
    )

In [None]:
# Wrap the data in a Bunch and read it back: from here on X and y are NumPy
# arrays (the earlier DataFrame/Series bindings are replaced) and
# feature_names holds the column labels as a list.
df_3 = create_bunch_df(X, y)
X = df_3.data
y = df_3.target
feature_names = df_3.feature_names

In [None]:
# 80/20 train/test split with a fixed seed; the second training row is shown
# as a quick look at the feature values.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train[1,:]

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import mean_squared_error, roc_auc_score

# Assuming X_train, X_test, y_train, y_test, and feature_names are already defined
n_features = X_train.shape[1]
models = []
predictions_train = np.zeros_like(y_train, dtype=np.float64)
predictions_test = np.zeros_like(y_test, dtype=np.float64)

# Train one single-feature boosting model per column and accumulate the
# positive-class probability each of them predicts.
for i in range(n_features):
    train_column = X_train[:, i].reshape(-1, 1)
    test_column = X_test[:, i].reshape(-1, 1)

    model = GradientBoostingClassifier(n_estimators=100, learning_rate=0.001, max_depth=3, random_state=42)
    model.fit(train_column, y_train)
    models.append(model)

    predictions_train += model.predict_proba(train_column)[:, 1]
    predictions_test += model.predict_proba(test_column)[:, 1]

# Average instead of sum: a sum of n_features probabilities lies in
# [0, n_features], so comparing it with 0/1 labels via MSE is meaningless.
# The average stays in [0, 1]; AUC is unaffected because the ranking is unchanged.
predictions_train /= max(n_features, 1)
predictions_test /= max(n_features, 1)

# Evaluate performance
mse = mean_squared_error(y_test, predictions_test)
auc = roc_auc_score(y_test, predictions_test)

print(f"EBM-like Model MSE: {mse:.4f}")
print(f"EBM-like Model AUC: {auc:.4f}")

# Visualize learned feature contributions: two panels per row, with exactly as
# many rows as needed (ceil division); squeeze=False keeps `axes` 2-D even for
# a single row.
n_rows = max(1, -(-n_features // 2))
fig, axes = plt.subplots(n_rows, 2, figsize=(15, 9), squeeze=False)
axes = axes.ravel()

for i, model in enumerate(models):
    x_range = np.linspace(X_train[:, i].min(), X_train[:, i].max(), 100).reshape(-1, 1)
    y_pred = model.predict_proba(x_range)[:, 1]
    axes[i].plot(x_range, y_pred, label=f"{feature_names[i]}")
    axes[i].set_title(f"Feature: {feature_names[i]}")
    axes[i].set_xlabel("Feature Value")  # X-axis: Feature value
    axes[i].set_ylabel("Predicted Probability")  # Y-axis: Predicted probability of class 1
    axes[i].legend()

# Hide the spare panel that is left over when n_features is odd.
for spare_ax in axes[n_features:]:
    spare_ax.set_visible(False)

plt.tight_layout()  # Fix layout to prevent overlapping
plt.show()

In [None]:
import numpy as np
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
from interpret.glassbox import ExplainableBoostingClassifier  # Official EBM model

# Metric helpers used below; imported here so this cell does not rely on an
# earlier cell having brought them into scope.
from sklearn.metrics import balanced_accuracy_score, recall_score

# Assuming X_train, X_test, y_train, y_test are already defined

# Select the (up to) 8 most important features according to the ANOVA F-test.
# k is capped at the column count because SelectKBest complains (error or
# warning, depending on the scikit-learn version) when k exceeds it.
selector = SelectKBest(f_classif, k=min(8, X_train.shape[1]))
X_train_selected = selector.fit_transform(X_train, y_train)
X_test_selected = selector.transform(X_test)

# Get the names of the selected features
selected_feature_indices = selector.get_support(indices=True)
selected_feature_names = [feature_names[i] for i in selected_feature_indices]
print("Selected Features:", selected_feature_names)

# Train separate boosting models for each selected feature (EBM-like model)
n_selected_features = X_train_selected.shape[1]
models_selected = []
predictions_train_selected = np.zeros_like(y_train, dtype=np.float64)
predictions_test_selected = np.zeros_like(y_test, dtype=np.float64)

for i in range(n_selected_features):
    train_column = X_train_selected[:, i].reshape(-1, 1)
    test_column = X_test_selected[:, i].reshape(-1, 1)

    model = GradientBoostingClassifier(n_estimators=100, learning_rate=0.01, max_depth=3, random_state=42)
    model.fit(train_column, y_train)
    models_selected.append(model)

    # Summing probability contributions of each feature (used as a ranking
    # score only; it is not a probability).
    predictions_train_selected += model.predict_proba(train_column)[:, 1]
    predictions_test_selected += model.predict_proba(test_column)[:, 1]

# Evaluate performance of the EBM-like model
auc_selected = roc_auc_score(y_test, predictions_test_selected)
print(f"EBM-like Model with Selected Features AUC: {auc_selected:.4f}")

# Train benchmark models with selected features
# Logistic Regression (features standardised first)
log_reg_selected = make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000, random_state=42))
log_reg_selected.fit(X_train_selected, y_train)
log_reg_pred_selected = log_reg_selected.predict_proba(X_test_selected)[:, 1]

# Decision Tree Classifier
tree_clf_selected = DecisionTreeClassifier(max_depth=5, random_state=42)
tree_clf_selected.fit(X_train_selected, y_train)
tree_clf_pred_selected = tree_clf_selected.predict_proba(X_test_selected)[:, 1]

# Gradient Boosting Classifier
gb_clf_selected = GradientBoostingClassifier(n_estimators=100, learning_rate=0.01, max_depth=3, random_state=42)
gb_clf_selected.fit(X_train_selected, y_train)
gb_clf_pred_selected = gb_clf_selected.predict_proba(X_test_selected)[:, 1]

# Official EBM Model
ebm = ExplainableBoostingClassifier(random_state=42)
ebm.fit(X_train_selected, y_train)
ebm_pred_selected = ebm.predict_proba(X_test_selected)[:, 1]


def _score_probabilities(y_true, positive_proba):
    """Return (AUC, recall, balanced accuracy); hard labels are the probabilities rounded to 0/1."""
    hard_labels = positive_proba.round()
    return (
        roc_auc_score(y_true, positive_proba),
        recall_score(y_true, hard_labels),
        balanced_accuracy_score(y_true, hard_labels),
    )


# Evaluate performance of benchmark models (each metric is computed exactly once)
log_reg_auc_selected, log_reg_recall_selected, log_reg_balanced_acc_selected = _score_probabilities(y_test, log_reg_pred_selected)
tree_clf_auc_selected, tree_clf_recall_selected, tree_clf_balanced_acc_selected = _score_probabilities(y_test, tree_clf_pred_selected)
gb_clf_auc_selected, gb_clf_recall_selected, gb_clf_balanced_acc_selected = _score_probabilities(y_test, gb_clf_pred_selected)
ebm_auc_selected, ebm_recall_selected, ebm_balanced_acc_selected = _score_probabilities(y_test, ebm_pred_selected)

print(f"Logistic Regression with Selected Features AUC: {log_reg_auc_selected:.4f}, Recall: {log_reg_recall_selected:.4f}, Balanced Accuracy: {log_reg_balanced_acc_selected:.4f}")
print(f"Decision Tree Classifier with Selected Features AUC: {tree_clf_auc_selected:.4f}, Recall: {tree_clf_recall_selected:.4f}, Balanced Accuracy: {tree_clf_balanced_acc_selected:.4f}")
print(f"Gradient Boosting Classifier with Selected Features AUC: {gb_clf_auc_selected:.4f}, Recall: {gb_clf_recall_selected:.4f}, Balanced Accuracy: {gb_clf_balanced_acc_selected:.4f}")
print(f"Official EBM Model with Selected Features AUC: {ebm_auc_selected:.4f}, Recall: {ebm_recall_selected:.4f}, Balanced Accuracy: {ebm_balanced_acc_selected:.4f}")

In [None]:
from sklearn.metrics import roc_curve, auc

# ROC curve coordinates for every model (thresholds are not needed).
fpr_ebm_selected, tpr_ebm_selected, _ = roc_curve(y_test, predictions_test_selected)
fpr_log_reg_selected, tpr_log_reg_selected, _ = roc_curve(y_test, log_reg_pred_selected)
fpr_tree_clf_selected, tpr_tree_clf_selected, _ = roc_curve(y_test, tree_clf_pred_selected)
fpr_ebm, tpr_ebm, _ = roc_curve(y_test, ebm_pred_selected)
fpr_gb_clf_selected, tpr_gb_clf_selected, _ = roc_curve(y_test, gb_clf_pred_selected)

# Area under each curve.
auc_ebm_selected = auc(fpr_ebm_selected, tpr_ebm_selected)
auc_log_reg_selected = auc(fpr_log_reg_selected, tpr_log_reg_selected)
auc_tree_clf_selected = auc(fpr_tree_clf_selected, tpr_tree_clf_selected)
auc_ebm = auc(fpr_ebm, tpr_ebm)
auc_gb_clf_selected = auc(fpr_gb_clf_selected, tpr_gb_clf_selected)

# (fpr, tpr, legend label) in drawing order.
roc_series = [
    (fpr_ebm_selected, tpr_ebm_selected,
     f'EBM-like Model with Selected Features (AUC = {auc_ebm_selected:.4f})'),
    (fpr_log_reg_selected, tpr_log_reg_selected,
     f'Logistic Regression with Selected Features (AUC = {auc_log_reg_selected:.4f})'),
    (fpr_tree_clf_selected, tpr_tree_clf_selected,
     f'Decision Tree Classifier with Selected Features (AUC = {auc_tree_clf_selected:.4f})'),
    (fpr_ebm, tpr_ebm,
     f'Official EBM Model (AUC = {auc_ebm:.4f})'),
    (fpr_gb_clf_selected, tpr_gb_clf_selected,
     f'Gradient Boosting Classifier with Selected Features (AUC = {auc_gb_clf_selected:.4f})'),
]

plt.figure(figsize=(10, 8))
for fpr_values, tpr_values, curve_label in roc_series:
    plt.plot(fpr_values, tpr_values, label=curve_label)

plt.plot([0, 1], [0, 1], 'k--')  # chance-level diagonal
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curves for Models with Selected Features')
plt.legend(loc='lower right')
plt.show()


In [None]:
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier, BaggingClassifier
from sklearn.metrics import roc_auc_score, mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import OneHotEncoder
import matplotlib.pyplot as plt

def ebm_like_fit(X_train, y_train, X_test, y_test, feature_names, categorical_features=None, n_estimators=100, learning_rate=0.01, max_depth=3):
    """
    Train an EBM-like additive model: one bagged gradient-boosting model per
    feature plus one per pair of features, combined by averaging.

    Args:
        X_train: Training feature matrix (NumPy array).
        y_train: Training labels (NumPy array).
        X_test: Testing feature matrix (NumPy array).
        y_test: Testing labels (NumPy array).
        feature_names: List of feature names, one per column of X_train.
        categorical_features: List of indices for categorical features (optional).
            These columns are one-hot encoded and appended after the remaining
            columns; the feature names are remapped accordingly.
        n_estimators: Number of estimators for GradientBoostingClassifier.
        learning_rate: Learning rate for GradientBoostingClassifier.
        max_depth: Maximum depth for GradientBoostingClassifier.

    Returns:
        predictions_test: Mean positive-class probability over all trained
            models for each test row (values in [0, 1]).
        models: List of fitted BaggingClassifier objects; the first
            ``n_features`` entries are the single-feature models, followed by
            the pairwise models in (i, j) order with i < j.

    Side effects:
        Prints MSE and AUC on the test set and shows one plot per single feature.
    """
    feature_names = list(feature_names)

    # Handle categorical features (an empty list is treated like None).
    if categorical_features is not None and len(categorical_features) > 0:
        categorical_features = list(categorical_features)
        is_categorical = np.isin(np.arange(X_train.shape[1]), categorical_features)

        # `sparse_output` replaces the removed `sparse` argument (scikit-learn >= 1.2,
        # the same versions that accept BaggingClassifier(estimator=...)).
        encoder = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
        X_train_cat = encoder.fit_transform(X_train[:, categorical_features])
        X_test_cat = encoder.transform(X_test[:, categorical_features])

        # Keep names aligned with the new column layout:
        # untouched columns first, then the one-hot columns.
        encoded_names = encoder.get_feature_names_out(
            [feature_names[k] for k in categorical_features]
        )
        feature_names = [
            name for name, is_cat in zip(feature_names, is_categorical) if not is_cat
        ] + list(encoded_names)

        X_train = np.hstack([X_train[:, ~is_categorical], X_train_cat])
        X_test = np.hstack([X_test[:, ~is_categorical], X_test_cat])

    n_features = X_train.shape[1]

    # Hyper-parameters come from the function arguments.
    boosting_params = {
        'n_estimators': n_estimators,
        'learning_rate': learning_rate,
        'max_depth': max_depth,
        'random_state': 42,
    }

    # Single features first, then every unordered pair (i < j).
    column_groups = [[i] for i in range(n_features)]
    column_groups += [[i, j] for i in range(n_features) for j in range(i + 1, n_features)]

    models = []
    predictions_test = np.zeros_like(y_test, dtype=np.float64)

    for columns in column_groups:
        # BaggingClassifier clones its base estimator, so the base model is
        # passed unfitted; fitting it beforehand would be wasted work.
        bagged_model = BaggingClassifier(
            estimator=GradientBoostingClassifier(**boosting_params),
            n_estimators=10,
            random_state=42
        )
        bagged_model.fit(X_train[:, columns], y_train)
        models.append(bagged_model)

        predictions_test += bagged_model.predict_proba(X_test[:, columns])[:, 1]

    # Average rather than sum: a sum over len(models) probabilities is not on a
    # probability scale, which made the MSE below meaningless. AUC is unchanged.
    predictions_test /= max(len(models), 1)

    # Evaluate performance
    mse = mean_squared_error(y_test, predictions_test)
    auc = roc_auc_score(y_test, predictions_test)

    print(f"EBM-like Model MSE: {mse:.4f}")
    print(f"EBM-like Model AUC: {auc:.4f}")

    # Visualize learned feature contributions: two panels per row, ceil(n/2) rows.
    n_rows = max(1, -(-n_features // 2))
    fig, axes = plt.subplots(n_rows, 2, figsize=(15, 9), squeeze=False)
    axes = axes.ravel()

    for i, model in enumerate(models[:n_features]):  # Only plot individual features
        x_range = np.linspace(X_train[:, i].min(), X_train[:, i].max(), 100).reshape(-1, 1)
        y_pred = model.predict_proba(x_range)[:, 1]
        axes[i].plot(x_range, y_pred, label=f"{feature_names[i]}")
        axes[i].set_title(f"Feature: {feature_names[i]}")
        axes[i].set_xlabel("Feature Value")
        axes[i].set_ylabel("Predicted Probability")
        axes[i].legend()

    # Hide the spare panel left over when n_features is odd.
    for spare_ax in axes[n_features:]:
        spare_ax.set_visible(False)

    plt.tight_layout()
    plt.show()

    return predictions_test, models

# Assuming X_train, X_test, y_train, y_test, and feature_names are already defined
# Runs with the default hyper-parameters and no categorical columns. This
# rebinds `predictions_test` and `models`, which an earlier cell also assigns.
predictions_test, models = ebm_like_fit(X_train, y_train, X_test, y_test, feature_names)

In [None]:
from sklearn.metrics import roc_auc_score, balanced_accuracy_score, recall_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier

# Function to evaluate classifiers
def evaluate_classifiers(X_train, X_test, y_train, y_test):
    """
    Fit three baseline classifiers and score each one on the held-out split.

    NOTE(review): this re-defines ``evaluate_classifiers``, which an earlier
    cell of this notebook already defines with the same behaviour; this later
    definition is the one in effect once the cell has run.

    Parameters:
        X_train, y_train: Training features and labels.
        X_test, y_test: Held-out features and labels.

    Returns:
        dict: ``{classifier name: {'AUC', 'Balanced Accuracy', 'Recall'}}``.
        AUC is computed from the positive-class probability, the other two
        metrics from the hard predictions.
    """
    classifiers = {
        'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
        'Decision Tree': DecisionTreeClassifier(max_depth=5, random_state=42),
        'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, learning_rate=0.01, max_depth=3, random_state=42)
    }

    results = {}
    for clf_name, clf in classifiers.items():
        clf.fit(X_train, y_train)
        positive_proba = clf.predict_proba(X_test)[:, 1]
        hard_labels = clf.predict(X_test)

        results[clf_name] = {
            'AUC': roc_auc_score(y_test, positive_proba),
            'Balanced Accuracy': balanced_accuracy_score(y_test, hard_labels),
            'Recall': recall_score(y_test, hard_labels),
        }

    return results

# NOTE(review): this cell repeats the risk-score comparison that already
# appears earlier in the notebook and rebinds the same names.

# Variant 1: shift the risk score so that its minimum is zero.
risk_df_zero_min = risk_df.assign(
    risk_score=risk_df['risk_score'] - risk_df['risk_score'].min()
)

# Variant 2: keep only the rows whose risk score is above 500.
risk_df_500_min = risk_df.loc[risk_df['risk_score'] > 500].copy()

# 80/20 split of each variant, same seed for both.
X_zero_min, y_zero_min = risk_df_zero_min[['risk_score']], risk_df_zero_min['target']
(X_train_zero_min, X_test_zero_min,
 y_train_zero_min, y_test_zero_min) = train_test_split(
    X_zero_min, y_zero_min, test_size=0.2, random_state=42
)

X_500_min, y_500_min = risk_df_500_min[['risk_score']], risk_df_500_min['target']
(X_train_500_min, X_test_500_min,
 y_train_500_min, y_test_500_min) = train_test_split(
    X_500_min, y_500_min, test_size=0.2, random_state=42
)

# Score the baseline classifiers on each variant.
results_zero_min = evaluate_classifiers(X_train_zero_min, X_test_zero_min, y_train_zero_min, y_test_zero_min)
results_500_min = evaluate_classifiers(X_train_500_min, X_test_500_min, y_train_500_min, y_test_500_min)

# Print both result tables with a shared layout.
report_sections = (
    ("Evaluation Metrics with Risk Score Min = 0:", results_zero_min),
    ("\nEvaluation Metrics with Risk Score > 500:", results_500_min),
)
for section_heading, section_results in report_sections:
    print(section_heading)
    for clf_name, metrics in section_results.items():
        print(f"{clf_name}:")
        for metric_name, score in metrics.items():
            print(f"  {metric_name}: {score:.4f}")