In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.utils import Bunch
from sklearn.datasets import make_classification
from sklearn.feature_selection import mutual_info_classif
import numpy as np
import pandas as pd
import shap
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import mutual_info_classif
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.inspection import permutation_importance
import jax.numpy as jnp
from itertools import cycle
import collections

In [2]:
df = pd.read_csv("/home/athyrson/Code/Data/Raw Data/German/german_raw.csv")

In [3]:
def compute_mutual_information(X, y, return_as="df"):
    """
    Computes mutual information (MI) between features and the target variable.
    
    Parameters:
        X (pd.DataFrame or np.array): Feature matrix.
        y (pd.Series or np.array): Target variable.
        return_as (str): 'df' to return as DataFrame, 'series' to return as Series.
    
    Returns:
        pd.DataFrame or pd.Series: Mutual information scores for each feature.
    """
    mi_scores = mutual_info_classif(X, y, discrete_features='auto', n_neighbors=3, random_state=42)
    
    if return_as == "df":
        return pd.DataFrame({'Feature': X.columns, 'Mutual Information': mi_scores}).sort_values(by="Mutual Information", ascending=False)
    elif return_as == "series":
        return pd.Series(mi_scores, index=X.columns).sort_values(ascending=False)
    else:
        raise ValueError("return_as must be 'df' or 'series'.")



In [4]:
def create_subsets(df, step = 0, seed = 42):
    X = df.drop(columns=["GoodCustomer", "Gender", "PurposeOfLoan"])
    y = df["GoodCustomer"]

    cols_order = np.arange(len(X.columns))
    cols_order = collections.deque(cols_order)
    cols_order.rotate(step)
    X=X[[X.columns[i] for i in cols_order]]


    # Split dataset
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)


    # Compute MI and return as DataFrame
    mi_df = compute_mutual_information(X_train, y_train, return_as="df").round(3)
    
    # Step 1: Get selected feature names
    selected_features = mi_df[mi_df['Mutual Information'] > 0]['Feature'].unique()

    # Step 2: Filter training and test sets
    X_train_sel = X_train[selected_features]
    X_test_sel = X_test[selected_features]
    
    return X_train, X_test, X_train_sel, X_test_sel, y_train, y_test, mi_df

In [5]:
X_train, X_test, X_train_sel, X_test_sel, y_train, y_test, mi_df = create_subsets(df, step=0)

X_train.shape, X_train_sel.shape, X_test.shape, X_test_sel.shape

((800, 27), (800, 11), (200, 27), (200, 11))

In [6]:
from interpret.glassbox import ExplainableBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
import pandas as pd
import lightgbm as lgb

# Dicionário principal de resultados
hist_dict = {}

# Importâncias apenas para EBM
fi_dict = {
    'original': [],
    'selected': []
}

# Modelos para comparação
models = {
    'ebm': lambda seed: ExplainableBoostingClassifier(random_state=seed),
    'lr': lambda seed: make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000, random_state=seed)),
    'rf': lambda seed: RandomForestClassifier(n_estimators=100, random_state=seed),
    'lgbm': lambda seed: lgb.LGBMClassifier(n_estimators=100, random_state=seed, verbose=-1)
}

# Loop principal por seed
for seed_num in range(20):
    X_train, X_test, X_train_sel, X_test_sel, y_train, y_test, mi_df = create_subsets(df, step=0, seed=seed_num)

    for subset_key, (train, test) in {
        'original': (X_train, X_test),
        'selected': (X_train_sel, X_test_sel)
    }.items():
        for model_key, model_builder in models.items():
            model = model_builder(seed_num)
            model.fit(train, y_train)
            y_prob = model.predict_proba(test)[:, 1]
            auc = roc_auc_score(y_test, y_prob)

            dict_key = f"{model_key}_{subset_key}"
            if dict_key not in hist_dict:
                hist_dict[dict_key] = {'auc_list': []}

            hist_dict[dict_key]['auc_list'].append(auc)

            # Salvar importâncias apenas para o EBM
            if model_key == 'ebm':
                feature_importances = model.explain_global().data()['scores'][:train.shape[1]]
                ebm_df = pd.DataFrame({'Feature': train.columns, 'EBM Importance': feature_importances})
                ebm_df.set_index('Feature', inplace=True)
                fi_dict[subset_key].append(ebm_df['EBM Importance'])

# Calcular médias finais
for dict_key in hist_dict:
    hist_dict[dict_key]['auc'] = round(sum(hist_dict[dict_key]['auc_list']) / len(hist_dict[dict_key]['auc_list']), 4)

# Importâncias médias do EBM
for key in ['original', 'selected']:
    hist_dict[f'ebm_{key}']['importances'] = pd.concat(fi_dict[key], axis=1).mean(axis=1).round(3)

# Exibir resultados
for dict_key in hist_dict:
    print(f"--- {dict_key.upper()} ---")
    print(f"AUC médio: {hist_dict[dict_key]['auc']}")
    if 'importances' in hist_dict[dict_key]:
        print("Importâncias médias (EBM):")
        print(hist_dict[dict_key]['importances'].sort_values(ascending=False))
    print()


--- EBM_ORIGINAL ---
AUC médio: 0.725
Importâncias médias (EBM):
Feature
LoanDuration                       0.235
CriticalAccountOrLoansElsewhere    0.203
LoanAmount                         0.169
OwnsHouse                          0.120
YearsAtCurrentJob_geq_4            0.118
LoanRateAsPercentOfIncome          0.116
CheckingAccountBalance_geq_0       0.108
SavingsAccountBalance_geq_500      0.094
Single                             0.088
OtherLoansAtBank                   0.080
YearsAtCurrentJob_lt_1             0.076
Age                                0.073
HasTelephone                       0.070
SavingsAccountBalance_geq_100      0.063
NoCurrentLoan                      0.063
RentsHouse                         0.050
ForeignWorker                      0.048
YearsAtCurrentHome                 0.041
HasGuarantor                       0.040
CheckingAccountBalance_geq_200     0.034
HasCoapplicant                     0.027
NumberOfLiableIndividuals          0.025
JobClassIsSkilled        

In [19]:
k = hist_dict['ebm_original']['importances'] - hist_dict['ebm_selected']['importances']
k.sort_values(ascending=True)

Feature
LoanDuration                      -0.040
LoanAmount                        -0.024
RentsHouse                        -0.020
CriticalAccountOrLoansElsewhere   -0.020
Age                               -0.018
OwnsHouse                         -0.017
YearsAtCurrentJob_geq_4           -0.017
YearsAtCurrentJob_lt_1            -0.013
Single                            -0.012
SavingsAccountBalance_geq_500     -0.011
HasTelephone                      -0.010
SavingsAccountBalance_geq_100     -0.007
NoCurrentLoan                     -0.006
NumberOfOtherLoansAtBank          -0.005
HasCoapplicant                    -0.004
OtherLoansAtBank                  -0.003
ForeignWorker                     -0.003
HasGuarantor                      -0.001
Unemployed                        -0.001
YearsAtCurrentHome                 0.000
JobClassIsSkilled                  0.000
MissedPayments                     0.000
OtherLoansAtStore                  0.000
LoanRateAsPercentOfIncome          0.001
Checking

Feature
LoanDuration                       0.275
CriticalAccountOrLoansElsewhere    0.223
LoanAmount                         0.193
OwnsHouse                          0.137
YearsAtCurrentJob_geq_4            0.135
LoanRateAsPercentOfIncome          0.115
CheckingAccountBalance_geq_0       0.105
SavingsAccountBalance_geq_500      0.105
Single                             0.100
Age                                0.091
YearsAtCurrentJob_lt_1             0.089
OtherLoansAtBank                   0.083
HasTelephone                       0.080
SavingsAccountBalance_geq_100      0.070
RentsHouse                         0.070
NoCurrentLoan                      0.069
ForeignWorker                      0.051
YearsAtCurrentHome                 0.041
HasGuarantor                       0.041
HasCoapplicant                     0.031
CheckingAccountBalance_geq_200     0.029
NumberOfOtherLoansAtBank           0.024
NumberOfLiableIndividuals          0.021
JobClassIsSkilled                  0.020
Unemploy

In [None]:
hist_dict['ebm_original']['importances'].sort_values(ascending=False)

In [7]:
# Convert hist_dict to a DataFrame and make sure index is numeric
df_auc = pd.DataFrame(hist_dict).T.copy()
df_auc.index = df_auc.index.astype(int)
df_auc['step'] = df_auc.index  # or call it 's' if you prefer

# Sort for display (optional — doesn't affect correlation)
df_auc = df_auc.sort_values(by='auc', ascending=False)

# Ensure both columns are float
df_auc['auc'] = df_auc['auc'].astype(float)
df_auc['step'] = df_auc['step'].astype(float)

# Now compute the Pearson correlation between step index and AUC
correlation = df_auc['auc'].corr(df_auc['step'])
print("Correlation between AUC and step index:", correlation)


TypeError: Cannot cast Index to dtype int64

In [None]:
df_auc

Unnamed: 0,auc,importances,step
0,0.277997,Gender NaN Forei...,0.0
1,0.277997,Gender NaN Forei...,1.0
18,0.277997,Gender NaN Forei...,18.0
17,0.277997,Gender NaN Forei...,17.0
16,0.277997,Gender NaN Forei...,16.0
15,0.277997,Gender NaN Forei...,15.0
14,0.277997,Gender NaN Forei...,14.0
13,0.277997,Gender NaN Forei...,13.0
12,0.277997,Gender NaN Forei...,12.0
11,0.277997,Gender NaN Forei...,11.0


In [None]:
imp_list = [hist_dict[i]['importances'] for i in range(n_changes)]

In [None]:
pd.concat(imp_list, axis=1).loc[selected_features]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
OwnsHouse,0.124,0.124,0.124,0.124,0.124,0.124,0.124,0.124,0.124,0.124,0.124,0.124,0.124,0.124,0.124,0.124,0.124,0.124,0.124,0.124
NoCurrentLoan,0.079,0.079,0.079,0.079,0.079,0.079,0.079,0.079,0.079,0.079,0.079,0.079,0.079,0.079,0.079,0.079,0.079,0.079,0.079,0.079
LoanDuration,0.374,0.374,0.374,0.374,0.374,0.374,0.374,0.374,0.374,0.374,0.374,0.374,0.374,0.374,0.374,0.374,0.374,0.374,0.374,0.374
CheckingAccountBalance_geq_200,0.03,0.03,0.03,0.03,0.03,0.03,0.03,0.03,0.03,0.03,0.03,0.03,0.03,0.03,0.03,0.03,0.03,0.03,0.03,0.03
SavingsAccountBalance_geq_100,0.084,0.084,0.084,0.084,0.084,0.084,0.084,0.084,0.084,0.084,0.084,0.084,0.084,0.084,0.084,0.084,0.084,0.084,0.084,0.084
RentsHouse,0.069,0.069,0.069,0.069,0.069,0.069,0.069,0.069,0.069,0.069,0.069,0.069,0.069,0.069,0.069,0.069,0.069,0.069,0.069,0.069
LoanAmount,0.25,0.25,0.25,0.25,0.25,0.25,0.25,0.25,0.25,0.25,0.25,0.25,0.25,0.25,0.25,0.25,0.25,0.25,0.25,0.25
Unemployed,0.014,0.014,0.014,0.014,0.014,0.014,0.014,0.014,0.014,0.014,0.014,0.014,0.014,0.014,0.014,0.014,0.014,0.014,0.014,0.014
CriticalAccountOrLoansElsewhere,0.277,0.277,0.277,0.277,0.277,0.277,0.277,0.277,0.277,0.277,0.277,0.277,0.277,0.277,0.277,0.277,0.277,0.277,0.277,0.277
MissedPayments,0.017,0.017,0.017,0.017,0.017,0.017,0.017,0.017,0.017,0.017,0.017,0.017,0.017,0.017,0.017,0.017,0.017,0.017,0.017,0.017
