In [2]:
import collections
import os
from itertools import cycle

import jax.numpy as jnp
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import shap
from sklearn.datasets import load_breast_cancer
from sklearn.datasets import make_classification
from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import mutual_info_classif
from sklearn.inspection import permutation_importance
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils import Bunch

In [3]:
# Location of the raw CSV. The default is the original machine-specific path;
# set the GERMAN_CSV_PATH environment variable to run the notebook on another
# machine without editing this cell.
GERMAN_CSV_PATH = os.environ.get(
    "GERMAN_CSV_PATH",
    "/home/athyrson/Code/Data/Raw Data/German/german_raw.csv",
)
df = pd.read_csv(GERMAN_CSV_PATH)

In [4]:
# Scratch demo of collections.deque.rotate, the mechanism reorder_df uses to
# shift column positions: a negative step moves every element to the left and
# wraps the first element around to the end.
arr = np.arange(5)
d = collections.deque(arr)
d.rotate(-1)
d

deque([1, 2, 3, 4, 0])

In [5]:
def reorder_df(df, step=0, seed=42, target="GoodCustomer", test_size=0.2):
    """
    Rotates the feature columns of ``df`` and splits the result into train/test sets.

    Parameters:
        df (pd.DataFrame): Dataset holding the feature columns and the target column.
        step (int): Number of positions to rotate the feature columns by. Positive
            values shift columns to the right (the last ``step`` columns wrap around
            to the front), negative values shift them to the left, 0 keeps the order.
        seed (int): ``random_state`` forwarded to ``train_test_split``.
        target (str): Name of the target column. Defaults to "GoodCustomer".
        test_size (float or int): ``test_size`` forwarded to ``train_test_split``.

    Returns:
        tuple: ``(X_train, X_test, y_train, y_test)``.

    Raises:
        KeyError: If ``target`` is not a column of ``df``.
    """
    if target not in df.columns:
        raise KeyError(f"Target column {target!r} not found in the DataFrame.")

    X = df.drop(columns=[target])
    y = df[target]

    # Rotate column *positions* rather than labels: label-based selection would
    # return every match for a repeated column name and silently widen X.
    positions = collections.deque(range(X.shape[1]))
    positions.rotate(step)
    X = X.iloc[:, list(positions)]

    # Split dataset
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=seed
    )
    return X_train, X_test, y_train, y_test

In [6]:
def compute_mutual_information(X, y, return_as="df", n_neighbors=3, random_state=42):
    """
    Computes mutual information (MI) between features and the target variable.

    Parameters:
        X (pd.DataFrame or np.array): Feature matrix. When ``X`` has no ``columns``
            attribute (e.g. a NumPy array), features are named "x0", "x1", ...
        y (pd.Series or np.array): Target variable.
        return_as (str): 'df' to return as DataFrame, 'series' to return as Series.
        n_neighbors (int): ``n_neighbors`` forwarded to ``mutual_info_classif``.
        random_state (int): ``random_state`` forwarded to ``mutual_info_classif``.

    Returns:
        pd.DataFrame or pd.Series: Mutual information scores for each feature,
        sorted from highest to lowest.

    Raises:
        ValueError: If ``return_as`` is neither 'df' nor 'series'.
    """
    # Validate first so a typo fails fast instead of after the MI estimation.
    if return_as not in ("df", "series"):
        raise ValueError("return_as must be 'df' or 'series'.")

    mi_scores = mutual_info_classif(
        X, y, discrete_features='auto', n_neighbors=n_neighbors, random_state=random_state
    )

    # Plain arrays carry no column labels; fall back to positional names.
    if hasattr(X, "columns"):
        feature_names = X.columns
    else:
        feature_names = [f"x{i}" for i in range(len(mi_scores))]

    if return_as == "df":
        scores_df = pd.DataFrame({'Feature': feature_names, 'Mutual Information': mi_scores})
        return scores_df.sort_values(by="Mutual Information", ascending=False)
    return pd.Series(mi_scores, index=feature_names).sort_values(ascending=False)



In [7]:
# Baseline split with the original column order (step=0) and reorder_df's default seed.
# NOTE: the rotation experiment loop further down rebinds these same four names
# on every iteration, so their values depend on which cell ran last.
X_train, X_test, y_train, y_test = reorder_df(df, step=0)

In [8]:
from interpret.glassbox import ExplainableBoostingClassifier
import pandas as pd
from sklearn.metrics import roc_auc_score

# Results per rotation step: {step: {'auc': mean AUC, 'importances': mean importances}}
hist_dict = {}

# Fixed feature order for consistent indexing
zero_order = df.drop(columns=["GoodCustomer"]).columns
n_changes = 20  # number of column-rotation steps to evaluate
n_seeds = 5     # number of train/test splits averaged per rotation step

for s in range(n_changes):
    fi_list = []
    auc_list = []

    for seed_num in range(n_seeds):
        X_train, X_test, y_train, y_test = reorder_df(df, step=s, seed=seed_num)

        ebm = ExplainableBoostingClassifier()
        ebm.fit(X_train, y_train)

        # Pair each global importance score with the term name reported by the
        # explainer itself, then align to the fixed feature order. Reindexing by
        # name drops any non-feature terms (e.g. interactions) and surfaces a
        # missing feature as NaN, instead of relying on scores being positionally
        # aligned with X_train.columns.
        global_data = ebm.explain_global().data()
        fi_series = pd.Series(
            global_data['scores'], index=global_data['names'], name='EBM Importance'
        ).reindex(zero_order)
        fi_list.append(fi_series)

        # Compute AUC on the held-out split
        y_prob = ebm.predict_proba(X_test)[:, 1]
        auc_list.append(roc_auc_score(y_test, y_prob))

    # Average AUC and importances across seeds
    hist_dict[s] = {
        'auc': sum(auc_list) / len(auc_list),
        'importances': pd.concat(fi_list, axis=1).mean(axis=1).round(3),
    }


In [9]:
# Convert hist_dict to a DataFrame and make sure index is numeric
# One row per rotation step, built from hist_dict; the index is forced to be numeric
df_auc = pd.DataFrame(hist_dict).T.copy()
df_auc.index = df_auc.index.astype(int)

# Expose the step as a column, rank rows by AUC (display only; the correlation
# does not depend on row order), and cast both numeric columns to float
df_auc = (
    df_auc
    .assign(step=df_auc.index)
    .sort_values(by='auc', ascending=False)
    .astype({'auc': float, 'step': float})
)

# Pearson correlation between AUC and the step index
correlation = df_auc['auc'].corr(df_auc['step'])
print("Correlation between AUC and step index:", correlation)


Correlation between AUC and step index: 0.1028024350772334


In [10]:
# Display the per-step results table (sorted by AUC, highest first, by the cell above).
df_auc

Unnamed: 0,auc,importances,step
9,0.723081,Gender 0.047 Forei...,9.0
7,0.722528,Gender 0.047 Forei...,7.0
11,0.72243,Gender 0.047 Forei...,11.0
19,0.721835,Gender 0.047 Forei...,19.0
3,0.721371,Gender 0.047 Forei...,3.0
15,0.721191,Gender 0.047 Forei...,15.0
8,0.720757,Gender 0.047 Forei...,8.0
16,0.720319,Gender 0.047 Forei...,16.0
1,0.720232,Gender 0.047 Forei...,1.0
4,0.719279,Gender 0.047 Forei...,4.0


In [11]:
# Gather the seed-averaged importance Series of every rotation step, in step order.
imp_list = [hist_dict[step]['importances'] for step in range(n_changes)]

In [12]:
# Side-by-side view: one row per feature, one column per rotation step.
pd.concat(imp_list, axis="columns")

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
Gender,0.047,0.047,0.047,0.047,0.047,0.047,0.047,0.047,0.047,0.047,0.047,0.047,0.047,0.047,0.047,0.047,0.047,0.047,0.047,0.047
ForeignWorker,0.047,0.047,0.048,0.047,0.047,0.048,0.047,0.047,0.047,0.047,0.047,0.048,0.047,0.047,0.047,0.047,0.047,0.047,0.047,0.047
Single,0.075,0.075,0.075,0.075,0.075,0.075,0.075,0.075,0.075,0.075,0.075,0.075,0.075,0.075,0.075,0.075,0.075,0.075,0.075,0.075
Age,0.069,0.069,0.067,0.069,0.068,0.07,0.068,0.07,0.07,0.069,0.069,0.071,0.068,0.068,0.069,0.068,0.07,0.07,0.07,0.069
LoanDuration,0.26,0.26,0.26,0.26,0.258,0.26,0.258,0.261,0.261,0.263,0.262,0.261,0.263,0.26,0.26,0.259,0.263,0.262,0.261,0.263
PurposeOfLoan,0.296,0.299,0.297,0.294,0.294,0.298,0.295,0.298,0.299,0.296,0.301,0.297,0.298,0.296,0.297,0.297,0.3,0.297,0.297,0.299
LoanAmount,0.188,0.19,0.189,0.186,0.187,0.189,0.187,0.19,0.189,0.191,0.191,0.188,0.19,0.189,0.19,0.187,0.191,0.189,0.188,0.191
LoanRateAsPercentOfIncome,0.135,0.135,0.135,0.135,0.135,0.136,0.135,0.135,0.134,0.135,0.135,0.135,0.134,0.135,0.135,0.134,0.135,0.134,0.135,0.135
YearsAtCurrentHome,0.036,0.036,0.037,0.037,0.035,0.036,0.037,0.036,0.036,0.036,0.036,0.037,0.036,0.036,0.036,0.036,0.037,0.036,0.036,0.037
NumberOfOtherLoansAtBank,0.029,0.029,0.029,0.029,0.029,0.029,0.029,0.028,0.029,0.029,0.029,0.029,0.029,0.029,0.028,0.028,0.029,0.029,0.03,0.029
