In [52]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import roc_auc_score

%matplotlib inline

In [2]:
data = pd.read_csv("../data/CreditScoring.csv")

In [44]:
def data_prep_function(data):
    df = data.copy()
    df.columns = df.columns.map(lambda x: x.lower())
    
    status_map = {
    1: "ok",
    2: "default",
    0: "unknown"
    }
    df['status'] = df['status'].map(status_map)
    home_ownership_map = {
        1: "rent",
        2: "owner",
        3: "priv",
        4: "ignore",
        5: "parents",
        6: "other",
        0: "unknown"
    }
    df['home'] = df['home'].map(home_ownership_map)
    marital_status_map = {
        1: "single",
        2: "married",
        3: "widowed",
        4: "separated",
        5: "divorced",
        0: "unknown",
    }
    df['marital'] = df['marital'].map(marital_status_map)
    records_map = {
        1 : "no_records",
        2 : "yes_records"
    }
    df['records'] = df['records'].map(records_map)
    job_status_map = {
        1: "fixed",
        2: "part-time",
        3: "freelance",
        4: "others",
        0: "unknown"
    }
    df['job'] = df['job'].map(job_status_map)
    
    df['income'] = df['income'].replace(to_replace=99999999, value = np.nan)
    df['assets'] = df['assets'].replace(to_replace=99999999, value = np.nan)
    df['debt'] = df['debt'].replace(to_replace=99999999, value = np.nan)
    
    df = df[df['status'] != 'unknown'].reset_index(drop=True)
    
    y = (df['status'] == 'default').astype(int)
    X = df.drop(columns=['status'])
    
    X['income'] = X['income'].fillna(np.median(X['income'].dropna()))
    X['assets'] = X['assets'].fillna(0)
    X['debt'] = X['debt'].fillna(0)
    
    return X, y

In [45]:
X, y = data_prep_function(data)

In [46]:
X.isnull().sum()    

seniority    0
home         0
time         0
age          0
marital      0
records      0
job          0
expenses     0
income       0
assets       0
debt         0
amount       0
price        0
dtype: int64

In [48]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [49]:
dv = DictVectorizer(sparse=False)
X_train_dict = X_train.to_dict(orient='records')
X_train_final = dv.fit_transform(X_train_dict)


In [50]:
X_train_final

array([[3.80e+01, 8.50e+02, 3.50e+03, ..., 0.00e+00, 1.00e+00, 2.40e+01],
       [4.80e+01, 8.50e+02, 5.00e+03, ..., 1.00e+00, 1.40e+01, 4.80e+01],
       [4.60e+01, 5.00e+02, 8.00e+03, ..., 0.00e+00, 1.20e+01, 3.60e+01],
       ...,
       [5.40e+01, 6.00e+02, 5.00e+03, ..., 0.00e+00, 2.00e+01, 6.00e+01],
       [3.80e+01, 3.00e+03, 1.25e+04, ..., 1.00e+00, 3.00e+00, 3.60e+01],
       [5.10e+01, 3.50e+02, 2.00e+03, ..., 0.00e+00, 1.20e+01, 1.20e+01]],
      shape=(3117, 27))

In [51]:
dt = DecisionTreeClassifier()
dt.fit(X_train_final, y_train)

In [53]:
X_test_dict = X_test.to_dict(orient='records')
X_test_final = dv.transform(X_test_dict)
y_pred = dt.predict(X_test_final)

In [54]:
roc_auc_score(y_test, y_pred)

np.float64(0.6582345669058561)

In [56]:
for depths in [1,2,3,4,5,10,15,20]:
    dt = DecisionTreeClassifier(max_depth=depths)
    dt.fit(X_train_final, y_train)
    y_pred = dt.predict(X_test_final)
    print(f"Depth: {depths}, AUC: {roc_auc_score(y_test, y_pred)}")

Depth: 1, AUC: 0.6201752264237147
Depth: 2, AUC: 0.6009406326426402
Depth: 3, AUC: 0.6597422666559166
Depth: 4, AUC: 0.6745236367545487
Depth: 5, AUC: 0.6719100217689269
Depth: 10, AUC: 0.6636633610148083
Depth: 15, AUC: 0.6591321991991186
Depth: 20, AUC: 0.6538243435727916


In [60]:
for depths in [1,2,3,4,5,10,15,20]:
    for samples in [1,2,3,4,5,6,7,10,18,20]:
        dt = DecisionTreeClassifier(max_depth=depths, min_samples_leaf=samples)
        dt.fit(X_train_final, y_train)
        y_pred = dt.predict(X_test_final)
        print(f"Depth: {depths}, Min Samples: {samples}, AUC: {roc_auc_score(y_test, y_pred)}")

Depth: 1, Min Samples: 1, AUC: 0.6201752264237147
Depth: 1, Min Samples: 2, AUC: 0.6201752264237147
Depth: 1, Min Samples: 3, AUC: 0.6201752264237147
Depth: 1, Min Samples: 4, AUC: 0.6201752264237147
Depth: 1, Min Samples: 5, AUC: 0.6201752264237147
Depth: 1, Min Samples: 6, AUC: 0.6201752264237147
Depth: 1, Min Samples: 7, AUC: 0.6201752264237147
Depth: 1, Min Samples: 10, AUC: 0.6201752264237147
Depth: 1, Min Samples: 18, AUC: 0.6201752264237147
Depth: 1, Min Samples: 20, AUC: 0.6201752264237147
Depth: 2, Min Samples: 1, AUC: 0.6009406326426402
Depth: 2, Min Samples: 2, AUC: 0.6009406326426402
Depth: 2, Min Samples: 3, AUC: 0.6009406326426402
Depth: 2, Min Samples: 4, AUC: 0.6009406326426402
Depth: 2, Min Samples: 5, AUC: 0.6009406326426402
Depth: 2, Min Samples: 6, AUC: 0.6009406326426402
Depth: 2, Min Samples: 7, AUC: 0.6009406326426402
Depth: 2, Min Samples: 10, AUC: 0.6009406326426402
Depth: 2, Min Samples: 18, AUC: 0.6009406326426402
Depth: 2, Min Samples: 20, AUC: 0.60094063264

In [61]:
dt = DecisionTreeClassifier(max_depth=10, min_samples_leaf=20)
dt.fit(X_train_final, y_train)
y_pred = dt.predict(X_test_final)
roc_auc_score(y_test, y_pred)

np.float64(0.6903679217393642)

In [62]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.81      0.89      0.85       942
           1       0.65      0.49      0.56       395

    accuracy                           0.77      1337
   macro avg       0.73      0.69      0.70      1337
weighted avg       0.76      0.77      0.76      1337

