In [105]:
from sklearn.neural_network import MLPClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer


In [None]:
# Load the German Credit data set. The original call was missing the opening
# quote of the path literal, which made the whole cell a SyntaxError.
# NOTE(review): the path points at the filesystem root — confirm the real location.
df = pd.read_csv('/GermanCredit.csv')

df.head()

In [6]:
# Partition the raw columns by dtype: int64/float64 vs. object/category.
numerical_cols = list(df.select_dtypes(include=['int64', 'float64']).columns)
categorical_cols = list(df.select_dtypes(include=['object', 'category']).columns)

In [None]:
# Show which columns were detected as categorical.
categorical_cols

In [None]:
# Print the number of distinct levels and the frequency table of every categorical column.
for column_name in categorical_cols:
    level_counts = df[column_name].value_counts()
    print(f"\n▶ {column_name} ({df[column_name].nunique()} categories):")
    print(level_counts)

In [12]:
def split_personal_status(value):
    """Split a combined sex / personal-status string into its two parts.

    Parameters
    ----------
    value : str
        Combined label. Anything that does not start with ``'male'`` is
        labelled ``'female'``.

    Returns
    -------
    pandas.Series
        Two elements: the sex label, then the text after the last ``":"``
        with surrounding whitespace removed (the whole string when there is
        no ``":"``).
    """
    sex = 'male' if value.startswith('male') else 'female'
    # rpartition keeps everything after the final colon, or the full string if none.
    _, _, tail = value.rpartition(":")
    return pd.Series([sex, tail.strip()])

# Derive the two new columns from the combined field.
sex_and_status = df['personal_status_sex'].apply(split_personal_status)
df[['Sex', 'Personal_Status']] = sex_and_status

# The combined column is redundant once it has been split.
df.drop(columns=['personal_status_sex'], inplace=True)

In [14]:
# Encode the two yes/no columns as binary: yes → 1, no → 0.
# Values other than exactly 'yes' / 'no' become NaN under Series.map.
yes_no_to_binary = {'yes': 1, 'no': 0}
for binary_col in ('foreign_worker', 'telephone'):
    df[binary_col] = df[binary_col].map(yes_no_to_binary)


In [16]:
# Discard the 'Personal_Status' column created by the split; it is not used afterwards.
df.drop(columns='Personal_Status', inplace=True)

In [18]:
# Encode sex as binary: male → 1, female → 0.
sex_to_binary = {'male': 1, 'female': 0}
df['Sex'] = df['Sex'].map(sex_to_binary)


In [None]:
# Inspect the frame after the binary encodings.
df

In [22]:
# Columns that are left out of the IQR outlier capping.
exclude_cols = [
    'people_liable',
    'telephone',
    'foreign_worker',
    'credit_risk',
    'Sex',
]

In [24]:
# Numeric columns eligible for capping: every int64/float64 column not listed in exclude_cols.
numeric_candidates = df.select_dtypes(include=['int64', 'float64']).columns
numerical_cols_for_outliers = [name for name in numeric_candidates if name not in exclude_cols]

# Cap outliers with the 1.5 * IQR rule
def cap_outliers_iqr(data, col):
    """Clip ``data[col]`` in place to ``[Q1 - 1.5*IQR, Q3 + 1.5*IQR]``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame that is modified in place.
    col : str
        Name of the column to clip.

    Returns
    -------
    None
        The clipped values are written back to ``data[col]``.
    """
    q1 = data[col].quantile(0.25)
    q3 = data[col].quantile(0.75)
    whisker = 1.5 * (q3 - q1)
    data[col] = data[col].clip(lower=q1 - whisker, upper=q3 + whisker)

# Cap every eligible numeric feature; df is modified in place.
for feature_name in numerical_cols_for_outliers:
    cap_outliers_iqr(df, feature_name)



In [27]:
# Lookup tables that translate the verbose category labels of each column
# into short identifier-style codes.

# STATUS
# NOTE(review): status_mapping is defined here, but the cell that applies the
# other mappings never applies it to df['status'] — confirm whether that is intended.
status_mapping = {
    "... < 100 DM": "lt_100",
    "0 <= ... < 200 DM": "btw_0_200",
    "... >= 200 DM / salary for at least 1 year": "gte_200_salary",
    "no checking account": "no_account"  # if this value exists in the dataset
}

# CREDIT HISTORY
credit_history_mapping = {
    "existing credits paid back duly till now": "paid_duly",
    "critical account/other credits existing": "critical_account",
    "delay in paying off in the past": "delayed_payment",
    "all credits at this bank paid back duly": "bank_paid_duly",
    "no credits taken/all credits paid back duly": "no_credits"
}

# PURPOSE
purpose_mapping = {
    "domestic appliances": "domestic_appliances",
    "car (new)": "new_car",
    "radio/television": "radio_tv",
    "car (used)": "used_car",
    "others": "others",
    "retraining": "retraining",
    "education": "education",
    "repairs": "repairs",
    "furniture/equipment": "furniture",
    "business": "business"
}

# SAVINGS
savings_mapping = {
    "... < 100 DM": "lt_100",
    "unknown/no savings account": "no_savings",
    "100 <= ... < 500 DM": "btw_100_500",
    "500 <= ... < 1000 DM": "btw_500_1000",
    "... >= 1000 DM": "gte_1000"
}

# EMPLOYMENT DURATION
employment_mapping = {
    "1 <= ... < 4 years": "1_to_4_yrs",
    "... >= 7 years": "gte_7_yrs",
    "4 <= ... < 7 years": "4_to_7_yrs",
    "... < 1 year": "lt_1_yr",
    "unemployed": "unemployed"
}

# OTHER DEBTORS / GUARANTORS
other_debtors_mapping = {
    "none": "none",
    "guarantor": "guarantor",
    "co-applicant": "co_applicant"
}

# PROPERTY
property_mapping = {
    "car or other": "car_other",
    "real estate": "real_estate",
    "building society savings agreement/life insurance": "savings_insurance",
    "unknown/no property": "no_property"
}

# OTHER INSTALLMENT PLANS
installment_plans_mapping = {
    "none": "none",
    "bank": "bank",
    "stores": "stores"
}

# HOUSING
housing_mapping = {
    "own": "own",
    "rent": "rent",
    "for free": "free"
}

# JOB
job_mapping = {
    "skilled employee/official": "skilled",
    "unskilled - resident": "unskilled_resident",
    "management/self-employed/highly qualified employee/officer": "management",
    "unemployed/unskilled - non-resident": "unskilled_nonresident"
}

In [29]:
# Recode each categorical column with its lookup table, in a fixed order.
# Labels missing from a table become NaN under Series.map.
column_mappings = {
    'credit_history': credit_history_mapping,
    'purpose': purpose_mapping,
    'savings': savings_mapping,
    'employment_duration': employment_mapping,
    'other_debtors': other_debtors_mapping,
    'property': property_mapping,
    'other_installment_plans': installment_plans_mapping,
    'housing': housing_mapping,
    'job': job_mapping,
}
for column_name, value_mapping in column_mappings.items():
    df[column_name] = df[column_name].map(value_mapping)

In [None]:
# Inspect the frame after recoding the categorical labels.
df

In [33]:
# Separate the target column from the predictors.
y = df['credit_risk']
X = df.drop(columns=['credit_risk'])


In [35]:
# List the predictor columns.
X.columns

Index(['status', 'duration', 'credit_history', 'purpose', 'amount', 'savings',
       'employment_duration', 'installment_rate', 'other_debtors',
       'present_residence', 'property', 'age', 'other_installment_plans',
       'housing', 'number_credits', 'job', 'people_liable', 'telephone',
       'foreign_worker', 'Sex'],
      dtype='object')

In [37]:
# Categorical predictors (object/category dtype), re-derived from X.
categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()

In [39]:
# NOTE(review): numerical_cols still holds the list computed from the full df earlier
# in the notebook, which is why the output shown here includes 'credit_risk'.
numerical_cols

['duration',
 'amount',
 'installment_rate',
 'present_residence',
 'age',
 'number_credits',
 'people_liable',
 'credit_risk']

In [41]:
# Rebuild both column lists from the predictors only, so the target is excluded.
categorical_cols = list(X.select_dtypes(include=['object', 'category']).columns)
numerical_cols = list(X.select_dtypes(include=['int64', 'float64']).columns)

In [43]:
# 80/20 split, stratified on the target, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
# Keep an untouched copy of the test predictors (taken before encoding and scaling).
X_test_original = X_test.copy()

In [45]:
# List the columns of the preserved raw test set.
X_test_original.columns

Index(['status', 'duration', 'credit_history', 'purpose', 'amount', 'savings',
       'employment_duration', 'installment_rate', 'other_debtors',
       'present_residence', 'property', 'age', 'other_installment_plans',
       'housing', 'number_credits', 'job', 'people_liable', 'telephone',
       'foreign_worker', 'Sex'],
      dtype='object')

In [47]:
# One-hot encode the categorical predictors. The encoder is fitted on the
# training split only and then reused unchanged on the test split.
encoder = OneHotEncoder(
    drop='first',
    sparse_output=False,
    handle_unknown='ignore',
)
X_train_cat = encoder.fit_transform(X_train[categorical_cols])
X_test_cat = encoder.transform(X_test[categorical_cols])


In [49]:
# Numeric columns that are passed through without standardisation.
exclude_scaling = ['people_liable', 'telephone', 'foreign_worker', 'Sex']

# Every int64/float64 column of the training split ...
all_numerical_cols = list(X_train.select_dtypes(include=['int64', 'float64']).columns)

# ... minus the pass-through columns gives the set that is standardised.
numerical_to_scale = [name for name in all_numerical_cols if name not in exclude_scaling]

# Learn mean and variance on the training split only, then apply to both splits.
scaler = StandardScaler()
scaler.fit(X_train[numerical_to_scale])
X_train_scaled = scaler.transform(X_train[numerical_to_scale])
X_test_scaled = scaler.transform(X_test[numerical_to_scale])



In [51]:
# Wrap the scaled arrays in DataFrames that carry the original row index and column names.
X_train_scaled_df = pd.DataFrame(
    data=X_train_scaled, index=X_train.index, columns=numerical_to_scale
)
X_test_scaled_df = pd.DataFrame(
    data=X_test_scaled, index=X_test.index, columns=numerical_to_scale
)


In [53]:
# Pass-through numeric columns, copied as they are.
X_train_unscaled_df = X_train.loc[:, exclude_scaling].copy()
X_test_unscaled_df = X_test.loc[:, exclude_scaling].copy()

# Name the one-hot columns after the encoder's generated feature names.
encoded_cat_features = encoder.get_feature_names_out(categorical_cols)

X_train_cat_df = pd.DataFrame(
    data=X_train_cat, index=X_train.index, columns=encoded_cat_features
)
X_test_cat_df = pd.DataFrame(
    data=X_test_cat, index=X_test.index, columns=encoded_cat_features
)


In [55]:
# Assemble the model inputs column-wise: scaled numerics, pass-through numerics, one-hot categoricals.
train_parts = [X_train_scaled_df, X_train_unscaled_df, X_train_cat_df]
test_parts = [X_test_scaled_df, X_test_unscaled_df, X_test_cat_df]
X_train_final = pd.concat(train_parts, axis='columns')
X_test_final = pd.concat(test_parts, axis='columns')


In [None]:
# Inspect the assembled test design matrix.
X_test_final

In [59]:
# Classifier with four hidden layers (128, 64, 32, 16 units) and a fixed seed.
# It is trained incrementally with partial_fit in the following cell.
mlp = MLPClassifier(hidden_layer_sizes=(128, 64, 32, 16), max_iter=1, warm_start=True, random_state=42)


In [None]:
epochs = 200

# Train for `epochs` passes with incremental fitting.
# The class labels are only supplied on the very first partial_fit call.
classes = np.unique(y_train)

for epoch in range(epochs):
    first_call_kwargs = {'classes': classes} if epoch == 0 else {}
    mlp.partial_fit(X_train_final, y_train, **first_call_kwargs)
    # Report the 1-based epoch number and the current loss.
    print(f"Epoch {epoch+1:03d}, Loss: {mlp.loss_:.4f}")

In [63]:
# Predict class labels for the held-out test set.
y_pred = mlp.predict(X_test_final)

In [65]:
# Overall accuracy on the test split.
print("Accuracy:", accuracy_score(y_test, y_pred))

Accuracy: 0.71


In [None]:
# Per-class precision/recall/F1 and the confusion matrix for the test split.
report_text = classification_report(y_test, y_pred)
confusion = confusion_matrix(y_test, y_pred)
print("\nClassification Report:\n", report_text)
print("\nConfusion Matrix:\n", confusion)

In [67]:
# Wrap the predictions in a one-column frame that shares X_test's index so rows line up on concat.
y_pred_df = pd.DataFrame(y_pred, columns=['Predicted_Label'], index=X_test.index)

In [69]:
# Attach the predicted label to the un-encoded, un-scaled test features.
merged_df = pd.concat([X_test_original, y_pred_df], axis=1)

In [71]:
# Confirm that 'Predicted_Label' was appended to the feature columns.
merged_df.columns

Index(['status', 'duration', 'credit_history', 'purpose', 'amount', 'savings',
       'employment_duration', 'installment_rate', 'other_debtors',
       'present_residence', 'property', 'age', 'other_installment_plans',
       'housing', 'number_credits', 'job', 'people_liable', 'telephone',
       'foreign_worker', 'Sex', 'Predicted_Label'],
      dtype='object')

In [None]:
# Inspect the test features together with the model's predictions.
merged_df

In [75]:
import pandas as pd
import numpy as np
import itertools
from scipy.stats import pointbiserialr, chi2_contingency, chi2
from sklearn.impute import SimpleImputer
import statsmodels.formula.api as smf
from scipy.special import expit
from IPython.display import display

In [89]:
# Columns of merged_df treated as categorical in the association analysis.
categorical_features = [
    "status",
    "credit_history",
    "purpose",
    "savings",
    "employment_duration",
    "other_debtors",
    "property",
    "other_installment_plans",
    "housing",
    "job",
]

# The analysis explains the model's predictions, not the true labels.
target = "Predicted_Label"

# Columns of merged_df treated as numeric (including the binary-encoded ones).
numeric_features = [
    "duration",
    "amount",
    "installment_rate",
    "present_residence",
    "age",
    "number_credits",
    "people_liable",
    "telephone",
    "foreign_worker",
    "Sex",
]

In [None]:
import numpy as np
from scipy.stats import pointbiserialr, chi2_contingency
from sklearn.impute import SimpleImputer
from IPython.display import display

# 1) Main-effect ranking
def compute_main_importance(df, numeric_feats, categorical_feats, target, alpha=0.05):
    """Rank features by the strength of their marginal association with ``target``.

    Numeric features are scored with the absolute point-biserial correlation,
    categorical features with Cramér's V computed from an uncorrected
    chi-square test on the feature-by-target contingency table. A feature
    whose test p-value is not below ``alpha`` gets an effect of 0.0.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing every listed feature and ``target``. Not modified.
    numeric_feats : sequence of str
        Columns scored with ``pointbiserialr``.
    categorical_feats : sequence of str
        Columns scored with ``chi2_contingency`` / Cramér's V.
    target : str
        Name of the target column.
    alpha : float, default 0.05
        Significance threshold.

    Returns
    -------
    pandas.DataFrame
        Columns ``Feature``, ``p_value`` and ``Effect`` (both rounded to four
        decimals), sorted by ``Effect`` descending. Features with equal
        effect keep their input order (numeric first, then categorical).
        The frame is empty, with the same columns, when no features are given.
    """
    frame = df.copy()
    for cat_col in categorical_feats:
        frame[cat_col] = frame[cat_col].astype('category')

    def cramers_v(chi2_stat, tbl):
        # Cramér's V from an already computed chi-square statistic.
        n = tbl.values.sum()
        r, k = tbl.shape
        denom = n * (min(r, k) - 1)
        # A single-row/column or empty table carries no association.
        if denom <= 0:
            return 0.0
        return np.sqrt(chi2_stat / denom)

    rows = []
    for feat in numeric_feats:
        r, p = pointbiserialr(frame[feat], frame[target])
        # A NaN p-value (e.g. constant input) compares False and yields 0.0.
        eff = abs(r) if p < alpha else 0.0
        rows.append({'Feature': feat, 'p_value': round(p, 4), 'Effect': round(eff, 4)})
    for feat in categorical_feats:
        tbl = pd.crosstab(frame[feat], frame[target])
        # One chi-square call provides both the statistic and the p-value.
        chi2_stat, p, _, _ = chi2_contingency(tbl, correction=False)
        eff = cramers_v(chi2_stat, tbl) if p < alpha else 0.0
        rows.append({'Feature': feat, 'p_value': round(p, 4), 'Effect': round(eff, 4)})

    # Explicit columns keep the empty case well-formed; a stable sort makes the
    # order of tied features (typically the 0.0 effects) deterministic.
    ranking = pd.DataFrame(rows, columns=['Feature', 'p_value', 'Effect'])
    return ranking.sort_values('Effect', ascending=False, kind='mergesort').reset_index(drop=True)

# 2) Single-run Jaccard@Top-5
def compute_single_jaccard_top5(df, num_feats, cat_feats, target, alpha=0.05, random_state=None):
    """Jaccard overlap of the top-5 ranking before and after one random perturbation.

    The perturbed copy is built by adding Gaussian noise (standard deviation
    equal to 1% of the column's standard deviation) to every numeric feature,
    blanking a random 5-10% of rows for the numeric features and an
    independently drawn sample of the same size for the categorical features,
    and then imputing with the mean / most frequent value.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing the features and ``target``. Not modified.
    num_feats, cat_feats : sequence of str
        Numeric and categorical feature names.
    target : str
        Name of the target column.
    alpha : float, default 0.05
        Significance threshold forwarded to ``compute_main_importance``.
    random_state : int or None, default None
        Seed for a private ``numpy.random.RandomState``. ``None`` keeps the
        previous behaviour of drawing from NumPy's global generator.

    Returns
    -------
    tuple
        ``(jaccard, original_top5, perturbed_top5)``. ``jaccard`` is 0.0 when
        both lists are empty.
    """
    num_feats = list(num_feats)
    cat_feats = list(cat_feats)
    if random_state is None:
        # Legacy behaviour: NumPy's global generator (also pandas' default for sample()).
        rng, sample_state = np.random, None
    else:
        rng = np.random.RandomState(random_state)
        sample_state = rng

    orig = compute_main_importance(df, num_feats, cat_feats, target, alpha)['Feature'][:5].tolist()
    pert = df.copy()
    # noise + MCAR
    for c in num_feats:
        pert[c] = pert[c] + rng.normal(0, 0.01 * pert[c].std(), size=len(pert))
    frac = rng.uniform(0.05, 0.10)
    # The imputers cannot be fitted on zero columns, so skip empty feature groups.
    if num_feats:
        idx = pert.sample(frac=frac, random_state=sample_state).index
        pert.loc[idx, num_feats] = np.nan
        pert[num_feats] = SimpleImputer(strategy='mean').fit_transform(pert[num_feats])
    if cat_feats:
        idx = pert.sample(frac=frac, random_state=sample_state).index
        pert.loc[idx, cat_feats] = np.nan
        pert[cat_feats] = SimpleImputer(strategy='most_frequent').fit_transform(pert[cat_feats])
    new = compute_main_importance(pert, num_feats, cat_feats, target, alpha)['Feature'][:5].tolist()

    union = set(orig) | set(new)
    j = len(set(orig) & set(new)) / len(union) if union else 0.0
    return j, orig, new

def _resolve_rng(random_state):
    """Map ``None`` / seed / RandomState to an object exposing ``normal`` and ``choice``."""
    if random_state is None or random_state is np.random:
        return np.random  # NumPy's global generator
    if isinstance(random_state, np.random.RandomState):
        return random_state
    return np.random.RandomState(random_state)


def perturb_with_noise(df, numeric_features, categorical_features,
                       gaussian_frac=0.01, cat_noise_frac=0.05, random_state=None):
    """Return a noisy copy of ``df``; the input frame is left untouched.

    NOTE(review): the notebook called this function without defining it
    anywhere, so this implementation is reconstructed from the call site and
    the parameter names — confirm it matches the intended perturbation.

    * Each numeric feature receives additive Gaussian noise whose standard
      deviation is ``gaussian_frac`` times the column's standard deviation.
    * For each categorical feature, ``round(cat_noise_frac * n_rows)`` randomly
      chosen rows are overwritten with a value drawn uniformly from the
      column's observed (non-missing) levels.

    ``random_state`` may be ``None`` (global NumPy generator), an int seed, or
    a ``numpy.random.RandomState`` instance.
    """
    rng = _resolve_rng(random_state)
    pert = df.copy()
    n_rows = len(pert)

    for col in numeric_features:
        scale = gaussian_frac * pert[col].std()
        # Constant or single-row columns give a zero/NaN scale: nothing to jitter.
        if not (scale > 0):
            continue
        pert[col] = pert[col] + rng.normal(0, scale, size=n_rows)

    n_flip = max(0, min(n_rows, int(round(cat_noise_frac * n_rows))))
    for col in categorical_features:
        levels = np.asarray(pert[col].dropna().unique(), dtype=object)
        if n_flip == 0 or len(levels) == 0:
            continue
        positions = rng.choice(n_rows, size=n_flip, replace=False)
        values = pert[col].to_numpy(dtype=object, copy=True)
        values[positions] = rng.choice(levels, size=n_flip)
        pert[col] = values
    return pert


def evaluate_jaccard_topk_noise(df, numeric_features, categorical_features, target,
                                n_runs=50, top_k=5, alpha=0.05,
                                gaussian_frac=0.01, cat_noise_frac=0.05,
                                random_state=None):
    """Average Jaccard overlap of the top-k ranking under repeated noise perturbation.

    The top-``top_k`` features of the unperturbed data are compared with the
    top-``top_k`` features of ``n_runs`` independently perturbed copies
    produced by ``perturb_with_noise``.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing the features and ``target``.
    numeric_features, categorical_features : sequence of str
        Feature names forwarded to ``compute_main_importance``.
    target : str
        Name of the target column.
    n_runs : int, default 50
        Number of perturbed copies to evaluate.
    top_k : int, default 5
        Size of the compared ranking prefix.
    alpha : float, default 0.05
        Significance threshold forwarded to ``compute_main_importance``.
    gaussian_frac, cat_noise_frac : float
        Noise levels forwarded to ``perturb_with_noise``.
    random_state : int, RandomState or None, default None
        Seed for reproducible runs; ``None`` uses NumPy's global generator.

    Returns
    -------
    dict
        ``avg_jaccard_top_{k}`` and ``std_jaccard_top_{k}`` over the runs.
    """
    # One generator shared by all runs, so each run draws fresh noise from the same stream.
    rng = _resolve_rng(random_state)
    orig = compute_main_importance(df, numeric_features, categorical_features, target, alpha)['Feature'][:top_k].tolist()
    orig_set = set(orig)
    scores = []
    for _ in range(n_runs):
        pert = perturb_with_noise(df, numeric_features, categorical_features,
                                  gaussian_frac, cat_noise_frac, random_state=rng)
        new = compute_main_importance(pert, numeric_features, categorical_features, target, alpha)['Feature'][:top_k].tolist()
        scores.append(len(orig_set & set(new)) / len(orig_set | set(new)) if orig_set else 0.0)
    return {
        f'avg_jaccard_top_{top_k}': np.mean(scores),
        f'std_jaccard_top_{top_k}': np.std(scores)
    }

# === RUN THIS AFTER merged_df IS DEFINED ===
# Significance level shared by every call below. It has to be assigned here:
# nothing earlier in the notebook defines `alpha`, so the calls would
# otherwise raise NameError on a fresh kernel.
alpha = 0.05

# Seed NumPy's global generator so the random perturbations are repeatable.
np.random.seed(42)

# show main-effect ranking
main_df = compute_main_importance(merged_df, numeric_features, categorical_features, target, alpha)
display(main_df)

# compute one-off Jaccard@5
j, orig_top5, pert_top5 = compute_single_jaccard_top5(
    merged_df, numeric_features, categorical_features, target, alpha
)
print("Original Top-5:", orig_top5)
print("Perturbed Top-5:", pert_top5)
print(f"Jaccard@Top-5 = {j:.2f}")

# Average over 50 runs
stability = evaluate_jaccard_topk_noise(
    merged_df, numeric_features, categorical_features, target,
    n_runs=50, top_k=5, alpha=alpha,
    gaussian_frac=0.01, cat_noise_frac=0.05
)
print("\nNoise-only stability:", stability)


In [109]:
import os

# 1) Directory for saved figures. Set the PLOT_SAVE_DIR environment variable
#    to redirect the output; the fallback is the original hard-coded
#    POSIX-style absolute path, which only exists on the author's machine.
save_dir = os.environ.get("PLOT_SAVE_DIR", r"/Users/arsh/real_STAT")

# 2) Make sure it exists
os.makedirs(save_dir, exist_ok=True)

# 3) Build the full filename
filename = "MLP_German.jpg"
output_path = os.path.join(save_dir, filename)

In [None]:
import matplotlib.pyplot as plt

# Keep only the features with a non-zero effect.
plot_df = main_df.loc[main_df['Effect'] > 0]

# Horizontal bar chart of effect sizes, drawn through the explicit Figure/Axes objects.
fig, ax = plt.subplots(figsize=(8, 6))
ax.barh(plot_df['Feature'], plot_df['Effect'], color='skyblue')
ax.set_xlabel("Effect Size")
ax.set_title("Ranked Effect Sizes for Significant Features (Effect > 0)")
ax.invert_yaxis()  # largest effect at top
ax.grid(axis='x', linestyle='--', alpha=0.6)
fig.tight_layout()
fig.savefig(output_path, dpi=300)
plt.show()

print("Saved plot to:", os.path.abspath(output_path))