# German credit dataset

## Reading and preprocessing data

In [3]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

In [4]:
from ucimlrepo import fetch_ucirepo 
  
# fetch dataset 
statlog_german_credit_data = fetch_ucirepo(id=144) 
  
# data (as pandas dataframes) 
X = statlog_german_credit_data.data.features 
y = statlog_german_credit_data.data.targets 

y.loc[y['class']==2,'class'] = 0

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y.loc[y['class']==2,'class'] = 0


In [5]:
def translate_german_credit(df):
        attribute_descriptions = {
                        'A11': '< 0 DM',
                        'A12': '0 <= ... < 200 DM',
                        'A13': '>= 200 DM / salary assignments for at least 1 year',
                        'A14': 'no checking account',
                        'A30': 'no credits taken/all credits paid back duly',
                        'A31': 'all credits at this bank paid back duly',
                        'A32': 'existing credits paid back duly till now',
                        'A33': 'delay in paying off in the past',
                        'A34': 'critical account/other credits existing (not at this bank)',
                        'A40': 'car (new)',
                        'A41': 'car (used)',
                        'A42': 'furniture/equipment',
                        'A43': 'radio/television',
                        'A44': 'domestic appliances',
                        'A45': 'repairs',
                        'A46': 'education',
                        'A47': '(vacation - does not exist?)',
                        'A48': 'retraining',
                        'A49': 'business',
                        'A410': 'others',
                        'A61': '< 100 DM',
                        'A62': '100 <= ... < 500 DM',
                        'A63': '500 <= ... < 1000 DM',
                        'A64': '>= 1000 DM',
                        'A65': 'unknown/no savings account',
                        'A71': 'unemployed/unskilled - non-resident',
                        'A72': 'unskilled - resident',
                        'A73': '1 <= ... < 4 years',
                        'A74': '4 <= ... < 7 years',
                        'A75': '>= 7 years',
                        'A91': 'male: divorced/separated',
                        'A92': 'female: divorced/separated/married',
                        'A93': 'male: single',
                        'A94': 'male: married/widowed',
                        'A95': 'female: single',
                        'A101': 'none',
                        'A102': 'co-applicant',
                        'A103': 'guarantor',
                        'A121': 'real estate',
                        'A122': 'building society savings agreement/life insurance',
                        'A123': 'car or other, not in attribute 6',
                        'A124': 'unknown/no property',
                        'A141': 'bank',
                        'A142': 'stores',
                        'A143': 'none',
                        'A151': 'rent',
                        'A152': 'own',
                        'A153': 'for free',
                        'A171': 'unemployed/unskilled - non-resident',
                        'A172': 'unskilled - resident',
                        'A173': 'skilled employee / official',
                        'A174' : 'management/ self-employed/highly qualified employee/ officer',
                        'A191': 'none',
                        'A192': 'yes, registered under the customer\'s name',
                        'A201': 'yes',
                        'A202': 'no'
                }

        column_name_mapping = {
                'Attribute1': 'Status of existing checking account',
                'Attribute2': 'Duration',
                'Attribute3': 'Credit history',
                'Attribute4': 'Purpose',
                'Attribute5': 'Credit amount',
                'Attribute6': 'Savings account/bonds',
                'Attribute7': 'Present employment since',
                'Attribute8': 'Installment rate in percentage of disposable income',
                'Attribute9': 'Personal status and sex',
                'Attribute10': 'Other debtors / guarantors',
                'Attribute11': 'Present residence since',
                'Attribute12': 'Property',
                'Attribute13': 'Age',
                'Attribute14': 'Other installment plans',
                'Attribute15': 'Housing',
                'Attribute16': 'Number of existing credits at this bank',
                'Attribute17': 'Job',
                'Attribute18': 'Number of people being liable to provide maintenance for',
                'Attribute19': 'Telephone',
                'Attribute20': 'Foreign worker'
                }

        df = df.replace(attribute_descriptions)
        df.rename(columns=column_name_mapping, inplace=True)
        return df

In [6]:
X = translate_german_credit(X)

## Training a Classifier

In [7]:
categorical_features = ['Status of existing checking account', 'Credit history', 'Purpose', 'Savings account/bonds', 'Present employment since', 
                        'Personal status and sex', 'Other debtors / guarantors', 'Property', 'Other installment plans', 'Housing', 'Job', 'Telephone', 'Foreign worker']
all_features = X.columns
numerical_features = [feature for feature in all_features if feature not in categorical_features]


In [8]:


# Define transformers for the preprocessing step
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

# Create the pipeline
rf_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier())
])

# Example usage
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y['class'], test_size=0.2, random_state=42)

# Fit the pipeline
rf_pipeline.fit(X_train, y_train)

# Make predictions
y_pred = rf_pipeline.predict(X_test)

from sklearn.metrics import classification_report
# Generate the classification report
report = classification_report(y_test, y_pred)

# Print the classification report
print("Classification Report:\n", report)

Classification Report:
               precision    recall  f1-score   support

           0       0.71      0.46      0.56        59
           1       0.80      0.92      0.86       141

    accuracy                           0.79       200
   macro avg       0.76      0.69      0.71       200
weighted avg       0.78      0.79      0.77       200



In [9]:
import pickle
from pathlib import Path
directory = Path("./models")
# Create the directory if it does not exist
directory.mkdir(parents=True, exist_ok=True)


# Open the file in binary write mode and save the object
with open('./models/german_credit_model.pkl', 'wb') as file:
    pickle.dump(rf_pipeline, file)

## Creating necessary Files for LLM Explanation

In [12]:
X_train['class'] = y_train
X_test['class'] = y_test
X_train.to_csv('./data/germancredit_train_dataset.csv', index = False)
X_test.to_csv('./data/germancredit_test_dataset.csv', index = False)

In [23]:
X_test['prediction'] = y_pred
test = X_test.loc[X_test['prediction']==0]
male_test = test.loc[test['Personal status and sex'].isin(['male: divorced/separated','male: single', 'male: married/widowed'])]
male_test = translate_german_credit(male_test)
male_test = male_test.iloc[[2,4,7,16,24]]
female_test = test.loc[test['Personal status and sex'].isin(['female: divorced/separated/married','female: single'])]
female_test = translate_german_credit(female_test)
female_test = female_test.iloc[[0,2,4,6,7]]


print('Male: ', male_test.shape, male_test['Age'].values)
print('Female: ',female_test.shape, female_test['Age'].values)
male_test.drop(columns = ['class', 'prediction'], axis = 1, inplace = True)
female_test.drop(columns = ['class', 'prediction'], axis = 1, inplace = True)
male_test.to_csv('./data/german_male_test.csv', index = False)
female_test.to_csv('./data/german_female_test.csv', index = False)

Male:  (5, 22) [20 36 33 25 40]
Female:  (5, 22) [23 25 24 24 24]


## Generating Explanations

In [24]:
import pandas as pd
import pickle
# Load the model we want to explain
with open("""./models/german_credit_model.pkl""", 'rb') as file:
    model = pickle.load(file)

#Load the train and test data set
train_dataset = pd.read_csv('./data/germancredit_train_dataset.csv')
test_dataset = pd.read_csv('./data/germancredit_test_dataset.csv')

#Load the examples we will try to explain
test_male = pd.read_csv('./data/german_male_test.csv')
test_female = pd.read_csv('./data/german_female_test.csv')

dataset_info = """Here are all the features and their possible values:
Status of existing checking account: ('< 0 DM','0 <= ... < 200 DM','>= 200 DM / salary assignments for at least 1 year','no checking account')
Duration: ('...time periods...')
Credit history: ('no credits taken/all credits paid back duly', 'all credits at this bank paid back duly', 'existing credits paid back duly till now', 'delay in paying off in the past', 'critical account/other credits existing (not at this bank)')
Purpose: ('car (new)', 'car (used)', 'furniture/equipment', 'radio/television', 'domestic appliances', 'repairs', 'education', '(vacation - does not exist?)', 'retraining', 'business', 'others')
Credit amount: ('...amount ranges...')
Savings account/bonds: ('< 100 DM', '100 <= ... < 500 DM', '500 <= ... < 1000 DM', '>= 1000 DM', 'unknown/no savings account')
Present employment since: ('unemployed/unskilled - non-resident', 'unskilled - resident', '1 <= ... < 4 years', '4 <= ... < 7 years', '>= 7 years')
Installment rate in percentage of disposable income: ('...percentage rates...')
Personal status and sex: ('male: divorced/separated', 'female: divorced/separated/married', 'male: single', 'male: married/widowed', 'female: single')
Other debtors / guarantors: ('none', 'co-applicant', 'guarantor')
Present residence since: ('...years...')
Property: ('real estate', 'building society savings agreement/life insurance', 'car or other, not in attribute 6', 'unknown/no property')
Age: ('...age ranges...')
Other installment plans: ('bank', 'stores', 'none')
Housing: ('rent', 'own', 'for free')
Number of existing credits at this bank: ('...number of credits...')
Job: ('unemployed/unskilled - non-resident', 'unskilled - resident', 'skilled employee / official', 'management/ self-employed/highly qualified employee/ officer')
Number of people being liable to provide maintenance for: ('...number of people...')
Telephone: ('none', 'yes, registered under the customer\'s name')
Foreign worker: ('yes', 'no')
"""

In [18]:
train_dataset.columns

Index(['Status of existing checking account', 'Duration', 'Credit history',
       'Purpose', 'Credit amount', 'Savings account/bonds',
       'Present employment since',
       'Installment rate in percentage of disposable income',
       'Personal status and sex', 'Other debtors / guarantors',
       'Present residence since', 'Property', 'Age', 'Other installment plans',
       'Housing', 'Number of existing credits at this bank', 'Job',
       'Number of people being liable to provide maintenance for', 'Telephone',
       'Foreign worker', 'class'],
      dtype='object')

In [25]:
from llm_explainers import *
exp_m = LLMExplanation4CFs(model = model, #Load the model we want to explain
                            model_description = """ML-system that predicts wether a person has good or bad credit""", # brief explanation of the ML model
                            backend='sklearn', # Framework used to build the model (used to generate counterfactuals)
                            dataset_info=dataset_info , # string information about the dataset
                            continuous_features=['Duration','Credit amount','Installment rate in percentage of disposable income' ,'Age','Present residence since',
                                                'Number of existing credits at this bank','Number of people being liable to provide maintenance for'], # Necessary for the counterfactual generation
                            outcome_name= 'class', #Necessary for counterfactual generation
                            training_set=train_dataset, #Necessary for counterfactual generation
                            test_set= test_dataset, #Necessary to  check novelty of the evaluation example
                            llm='gpt-4o', #LLM used, works with Langchain
                            prompt_type='one', # zero or one
                            n_counterfactuals=5, #Number of counterfactuals used in the explanation 
                            user_input=False #Human in the loop helping select the causes
                           )


exp_m.fit()
counterfactuals, rules, code1, result1, explanation, code2, final_cf, code3, prediction, n_rules,rules_followed, first_rule, second_rule,third_rule, is_in_data = exp_m.explain_evaluate(user_data = test_male.iloc[[2]], verbose = False,return_all=True)

100%|██████████| 1/1 [00:00<00:00,  1.57it/s]


In [27]:
test_male.iloc[[2]]

Unnamed: 0,Status of existing checking account,Duration,Credit history,Purpose,Credit amount,Savings account/bonds,Present employment since,Installment rate in percentage of disposable income,Personal status and sex,Other debtors / guarantors,Present residence since,Property,Age,Other installment plans,Housing,Number of existing credits at this bank,Job,Number of people being liable to provide maintenance for,Telephone,Foreign worker
2,>= 200 DM / salary assignments for at least 1 ...,42,no credits taken/all credits paid back duly,business,6289,< 100 DM,unskilled - resident,2,male: divorced/separated,none,1,building society savings agreement/life insurance,33,none,own,2,skilled employee / official,1,none,yes


In [41]:
df = pd.concat([test_male.iloc[[2]],counterfactuals])
df.reset_index(inplace=True, drop=True)
df

Unnamed: 0,Status of existing checking account,Duration,Credit history,Purpose,Credit amount,Savings account/bonds,Present employment since,Installment rate in percentage of disposable income,Personal status and sex,Other debtors / guarantors,...,Property,Age,Other installment plans,Housing,Number of existing credits at this bank,Job,Number of people being liable to provide maintenance for,Telephone,Foreign worker,class
0,>= 200 DM / salary assignments for at least 1 ...,42,no credits taken/all credits paid back duly,business,6289,< 100 DM,unskilled - resident,2,male: divorced/separated,none,...,building society savings agreement/life insurance,33,none,own,2,skilled employee / official,1,none,yes,
1,no checking account,42,no credits taken/all credits paid back duly,business,6289,< 100 DM,unskilled - resident,2,male: divorced/separated,none,...,building society savings agreement/life insurance,33,none,own,2,skilled employee / official,1,none,yes,1.0
2,>= 200 DM / salary assignments for at least 1 ...,22,no credits taken/all credits paid back duly,business,6289,< 100 DM,unskilled - resident,2,male: divorced/separated,none,...,building society savings agreement/life insurance,33,none,own,2,skilled employee / official,1,none,yes,1.0
3,>= 200 DM / salary assignments for at least 1 ...,42,no credits taken/all credits paid back duly,business,6289,< 100 DM,unskilled - resident,2,male: divorced/separated,none,...,building society savings agreement/life insurance,44,none,own,2,skilled employee / official,1,none,yes,1.0
4,>= 200 DM / salary assignments for at least 1 ...,42,no credits taken/all credits paid back duly,business,6289,< 100 DM,unskilled - resident,2,male: divorced/separated,none,...,building society savings agreement/life insurance,46,none,own,2,skilled employee / official,1,none,yes,1.0
5,>= 200 DM / salary assignments for at least 1 ...,42,no credits taken/all credits paid back duly,business,6289,< 100 DM,unskilled - resident,2,male: divorced/separated,co-applicant,...,building society savings agreement/life insurance,33,none,own,2,unemployed/unskilled - non-resident,1,none,yes,0.0


In [39]:
print(df)

None


In [42]:
with open("Output.txt", "w") as text_file:
    text_file.write(explanation)

In [32]:
print(explanation)

### Explanation:

Based on the analysis of the factors that influence credit approval, it is clear that certain changes can significantly improve your chances of being classified with good credit. Here are the most impactful steps you can take:

1. **Status of Existing Checking Account**:
   - **Action**: Aim to have a checking account with a high status (e.g., with at least 200 DM in assignments for at least 1 year) or consider opening a checking account if you don't have one.
   - **Reason**: The analysis shows that having a high-status checking account or maintaining a checking account for a longer period is strongly associated with better credit outcomes. This was supported by 5 counterfactual cases.

2. **Duration of Credit**:
   - **Action**: If possible, aim for shorter credit durations.
   - **Reason**: While this rule was less frequently observed (1 counterfactual), shorter durations were associated with higher credit approval rates. For example, a duration of 22 months was mo

In [44]:
for i in range(test_male.shape[0]):
    counterfactuals, rules, code, result, explanation = exp_m.explain(user_data = test_male.iloc[[i]], verbose = False,return_all=True)
    with open(f"explanation_male{i}.txt", "w") as text_file:
        text_file.write(explanation)
    df = pd.concat([test_male.iloc[[i]],counterfactuals])
    df.reset_index(inplace=True, drop=True)
    df.to_csv(f'counterfactuals_male{i}.csv', index = False)

  0%|          | 0/1 [00:00<?, ?it/s]

100%|██████████| 1/1 [00:00<00:00,  1.30it/s]
100%|██████████| 1/1 [00:00<00:00,  1.52it/s]
100%|██████████| 1/1 [00:00<00:00,  1.45it/s]
100%|██████████| 1/1 [00:00<00:00,  1.41it/s]
100%|██████████| 1/1 [00:00<00:00,  1.53it/s]


In [45]:
for i in range(test_female.shape[0]):
    counterfactuals, rules, code, result, explanation = exp_m.explain(user_data = test_female.iloc[[i]], verbose = False,return_all=True)
    with open(f"explanation_female{i}.txt", "w") as text_file:
        text_file.write(explanation)
    df = pd.concat([test_female.iloc[[i]],counterfactuals])
    df.reset_index(inplace=True, drop=True)
    df.to_csv(f'counterfactuals_female{i}.csv', index = False)

100%|██████████| 1/1 [00:00<00:00,  1.26it/s]
100%|██████████| 1/1 [00:00<00:00,  1.44it/s]
100%|██████████| 1/1 [00:00<00:00,  1.22it/s]
100%|██████████| 1/1 [00:00<00:00,  1.29it/s]
100%|██████████| 1/1 [00:00<00:00,  1.32it/s]
