Step 1: Data download and read

In [None]:
### Necessary imports
import pandas as pd

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import fbeta_score

import matplotlib.pyplot as plt

from sklearn.model_selection import cross_val_score
import shap

import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this would also hide pandas chained-assignment warnings and scikit-learn
# convergence warnings from later cells — confirm that is acceptable.
warnings.filterwarnings("ignore")

from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

In [None]:
# from sklearn import metrics
# sorted(metrics.SCORERS.keys())

### Data download, Split into train and test

In [None]:
# Load the German credit data and hold out 10% of the rows as a test set.
df_credit = pd.read_csv("german_credit.csv")
y = df_credit['Risk']
X = df_credit.drop(columns = ['Risk'])
# A fixed random_state makes the split (and every count, plot and score derived from it)
# identical after a kernel restart. Stratifying on the target keeps the good/bad ratio the
# same in both splits, so the train/test comparison below is not distorted by class imbalance.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size = 0.1, random_state = 42, stratify = y
)

# Re-attach the target so each split can be explored as a single frame.
df_train = pd.concat([X_train, y_train], axis = 1)
df_test = pd.concat([X_test, y_test], axis = 1)

print(df_train.shape, df_test.shape)

In [None]:
df_train.head()

Train and Test Distribution comparison 

In [None]:
# Overlay the normalised train and test histograms of 'Credit History' to check the split is representative.
df_train['Credit History'].hist(density=True, alpha=0.5, label='Train')
df_test['Credit History'].hist(density=True, alpha=0.5, label='Test')
plt.xlabel('Credit History')
plt.legend()

In [None]:
# Overlay the normalised train and test histograms of 'Age'.
df_train['Age'].hist(density=True, alpha=0.5, label='Train')
df_test['Age'].hist(density=True, alpha=0.5, label='Test')
plt.xlabel('Age')
plt.legend()

In [None]:
# Overlay the normalised train and test histograms of 'Job'.
df_train['Job'].hist(density=True, alpha=0.5, label='Train')
df_test['Job'].hist(density=True, alpha=0.5, label='Test')
plt.xlabel('Job')
plt.legend()

In [None]:
# Overlay the normalised train and test histograms of 'Duration'.
df_train['Duration'].hist(density=True, alpha=0.5, label='Train')
df_test['Duration'].hist(density=True, alpha=0.5, label='Test')
plt.xlabel('Duration')
plt.legend()

In [None]:
# Overlay the normalised train and test histograms of 'Credit amount'.
df_train['Credit amount'].hist(density=True, alpha=0.5, label='Train')
df_test['Credit amount'].hist(density=True, alpha=0.5, label='Test')
plt.xlabel('Credit amount')
plt.legend()

In [None]:
# 'Saving accounts' is categorical and uses NaN to mean "no account". Series.hist() drops
# NaN rows before plotting, which leaves that whole group out of the train/test comparison.
# Compare the normalised category shares instead, with NaN shown as its own 'no account'
# bar. The source frames are not modified here.
saving_share = pd.DataFrame({
    'Train': df_train['Saving accounts'].fillna('no account').value_counts(normalize=True),
    'Test': df_test['Saving accounts'].fillna('no account').value_counts(normalize=True),
}).fillna(0)  # a category missing from one split has a share of 0 there
saving_share.plot.bar(alpha=0.5)
plt.xlabel('Saving accounts')
plt.legend()

In [None]:
from scipy.stats import ks_2samp
# Two-sample Kolmogorov-Smirnov test of train vs test for each of the two numeric features below.
ks_2samp(df_train['Age'], df_test['Age'])
ks_2samp(df_train['Credit amount'], df_test['Credit amount'])

### Preliminary data analysis

In [None]:
df_train.head()

In [None]:
df_train.nunique()

In [None]:
df_train.Risk.value_counts() ### Good = 1 (credit worthy), Bad = 0 (not worthy)

In [None]:
df_train.Risk.value_counts()

In [None]:
df_train_summ = df_train.describe()
df_train_summ

In [None]:
# Histogram of the loan amount in the training split.
plt.hist(df_train['Credit amount'])
plt.title('Credit amount distribution')

In [None]:
### The credit amount distribution is highly skewed; count the extreme values beyond N sigma of the mean.
def extreme_count(sig_factor, feat, df=None):
    """Print how many rows have ``feat`` above ``mean + sig_factor * std``.

    Parameters
    ----------
    sig_factor : number
        How many standard deviations above the mean the cutoff sits.
    feat : str
        Name of a numeric column.
    df : pandas.DataFrame, optional
        Frame to inspect. Defaults to the notebook-level ``df_train``.

    Returns
    -------
    None
        The cutoff and the count are reported through ``print``.
    """
    if df is None:
        df = df_train
    column = df[feat]
    # Take mean/std from the frame being counted rather than from a separately stored
    # describe() snapshot, so the cutoff can never be out of sync with the data.
    sig_cutoff = column.mean() + sig_factor*column.std()
    sig_count = int((column > sig_cutoff).sum())
    print("instances of {} greater than {} sigma ({} cutoff) are {}".format(feat, sig_factor, sig_cutoff, sig_count))
    return

extreme_count(3, feat = 'Credit amount')

In [None]:
# Age histogram, followed by the count of rows more than 3 standard deviations above the mean.
plt.hist(df_train['Age'])
plt.title('Age distribution')
extreme_count(3, feat = 'Age')

In [None]:
# Duration histogram, followed by the count of rows more than 3 standard deviations above the mean.
plt.hist(df_train['Duration'])
plt.title('Duration distribution')
extreme_count(3, feat = 'Duration')

In [None]:
''' Even if there are certain instances where the above features are beyond 3 sigma of their mean value, they don't appear to be 
outliers, as it is legitimate to have certain loans with a high credit value, a longer loan duration, or older people 
seeking a loan. Hence, I am not eliminating these rows'''

### Finding Missing values, checking if they are legitimate and applying apt transformation

In [None]:
df_train.isnull().sum()

In [None]:
### A missing value is meaningful here: it means the customer has no savings account

df_train['Saving accounts'].value_counts()
df_train['Saving accounts'].unique()

### Give that group an explicit 'no account' label instead of NaN
df_train['Saving accounts'] = df_train['Saving accounts'].fillna('no account')

### The new label now appears among the categories
df_train['Saving accounts'].value_counts()
df_train['Saving accounts'].unique()

### No NaNs should remain
df_train.isnull().sum()

### Checking data types and categorical states of features for encoding

In [None]:
df_train.dtypes

In [None]:
df_train.nunique()

### Feature Selection for label and one hot encoding

In [None]:
# Flag each column of the raw data by whether it is stored as dtype 'object';
# the flagged (categorical) columns are the ones that get label-encoded.
df_dtypes = (df_credit.dtypes == 'object').to_frame('obj_type')
obj_list = df_dtypes.index[df_dtypes['obj_type'].to_numpy()]
print("Features for label encoding:", obj_list)

### Label Encoding

In [None]:
# Replace every categorical column of the training frame with integer codes.
# One encoder object is re-fitted per column, so only the last column's mapping is retained.
le_obj = LabelEncoder()
for feat in obj_list:
    le_obj.fit(df_train[feat])
    df_train[feat] = le_obj.transform(df_train[feat])

In [None]:
df_train

### Distribution of Risk variable

In [None]:
df_train.Risk.value_counts()

In [None]:
# Bad-to-good ratio in the training split (Risk is encoded as Bad = 0, Good = 1).
# Computed from the data instead of typing the two counts in by hand, so the figure
# stays correct whenever the split changes.
(df_train['Risk'] == 0).sum() / (df_train['Risk'] == 1).sum()

In [None]:
# Pairwise correlation of the training columns (categoricals were label-encoded above).
df_train.corr()
### 'Credit amount' and 'Duration' are relatively highly correlated, so 'Duration' is a candidate to drop in the feature selection step

### Ques1 : More credit history is equivalent to credit worthiness

In [None]:
# Side-by-side histogram of 'Credit History' for bad-risk (Risk == 0) and good-risk (Risk == 1) rows.
plt.hist(
    [df_train.loc[df_train['Risk'] == risk_code, 'Credit History'].values for risk_code in (0, 1)],
    alpha=0.5,
    label=['Bad Risk', 'Good Risk'],
)
plt.legend(loc='upper right')

In [None]:
df_train[df_train['Risk'] == 0]['Credit History'].value_counts()

In [None]:
df_train[df_train['Risk'] == 1]['Credit History'].value_counts()

Conclusion: As the credit history increases, the good risk increases proportionately, i.e. credit worthiness improves sharply.

Ques2 : Are young people more credit worthy?
Yes

In [None]:
# Side-by-side histogram of 'Age' for bad-risk (Risk == 0) and good-risk (Risk == 1) rows.
plt.hist(
    [df_train.loc[df_train['Risk'] == risk_code, 'Age'].values for risk_code in (0, 1)],
    alpha=0.5,
    label=['Bad Risk', 'Good Risk'],
)
plt.legend(loc='upper right')

In [None]:
# Bucket Age into three ordinal bands: 0 = 30 and under, 1 = 31 to 44, 2 = 45 and over.
#
# Whole-column assignment replaces the chained `df_train.Age[mask] = ...` form, which pandas
# does not guarantee to write through to df_train (it is a no-op under Copy-on-Write).
# The bands are derived from the untouched ages in df_credit, so re-running this cell gives
# the same result instead of collapsing every row into band 0.
#
# df_test receives the same bands: the model is trained on the banded Age taken from
# df_train, so the held-out rows must carry the same representation when they are scored.
for split_df in (df_train, df_test):
    raw_age = df_credit.loc[split_df.index, 'Age']
    # Each boolean contributes 1 once its threshold is crossed: 0, 1 or 2 in total.
    split_df['Age'] = (raw_age > 30).astype(int) + (raw_age >= 45).astype(int)

In [None]:
df_train[df_train['Risk'] == 0]['Age'].value_counts()
df_train[df_train['Risk'] == 1]['Age'].value_counts()

Ques3 : Are more savings (Saving accounts) equivalent to more credit worthiness?
Inconclusive

In [None]:
# Side-by-side histogram of the encoded 'Saving accounts' for bad-risk (Risk == 0) and good-risk (Risk == 1) rows.
plt.hist(
    [df_train.loc[df_train['Risk'] == risk_code, 'Saving accounts'].values for risk_code in (0, 1)],
    alpha=0.5,
    label=['Bad Risk', 'Good Risk'],
)
plt.legend(loc='upper right')

In [None]:
df_train[df_train['Risk'] == 0]['Saving accounts'].value_counts()
df_train[df_train['Risk'] == 1]['Saving accounts'].value_counts()

### Modelling

X_train, y_train prep

In [None]:
# Separate the prepared training frame into the feature matrix and the target vector.
X_train = df_train.drop('Risk', axis = 1)
y_train = df_train['Risk']

Test data prep

In [None]:
# Null counts before, the same NaN -> 'no account' relabelling used for the training split, null counts after.
df_test.isnull().sum()
df_test['Saving accounts'] = df_test['Saving accounts'].fillna('no account')
df_test.isnull().sum()

In [None]:
# Encode the test split with exactly the mapping learned from the training rows.
# Fitting a fresh encoder on the test rows can assign different integers to the same
# category (for example when a category is absent from the test split), which silently
# changes the meaning of the features the model is scored on.
#
# The training frame already holds integer codes at this point, so the raw string values
# are taken from the untouched df_credit, with the same NaN -> 'no account' rule applied.
raw_credit = df_credit.fillna({'Saving accounts': 'no account'})
le_obj = LabelEncoder()
for feat in obj_list:
    # Re-fitting on the raw training values reproduces the training encoding.
    le_obj.fit(raw_credit.loc[df_train.index, feat])
    # transform() raises ValueError for a category never seen in training, which is
    # preferable to silently inventing a code for it.
    df_test[feat] = le_obj.transform(raw_credit.loc[df_test.index, feat])

In [None]:
df_test.head()

In [None]:
# Column names of both prepared frames, compared in the next cell.
train_cols = df_train.columns.tolist()
test_cols = df_test.columns.tolist()

In [None]:
# Columns present in the train frame but missing from the test frame.
delta_cols = list(set(train_cols).difference(test_cols))

In [None]:
delta_cols

In [None]:
# Separate the prepared test frame into the feature matrix and the target vector.
X_test = df_test.drop('Risk', axis = 1)
y_test = df_test['Risk']

In [None]:
import numpy as np
from sklearn.metrics import fbeta_score, make_scorer
# Cross-validation scorer: F-beta with beta = 1/5, which weights precision more heavily than recall.
# NOTE(review): despite the variable name this is not an F2 (beta = 2) scorer.
ftwo_scorer = make_scorer(fbeta_score, beta=1/5)

In [None]:
### Classifying a bad customer as good is the costly mistake, i.e. the objective is to reduce FP, so precision matters most
### Hence beta = 1/5 is applied and fbeta_score is used as the evaluation metric
fbeta_mu_score, fbeta_sigma_score = [], []
# Seed the stochastic learners so the CV scores, and the model chosen from them, are the
# same on every run. LogisticRegression gets a larger iteration budget than the default:
# warnings are suppressed in this notebook, so a solver that stops before converging
# would otherwise go unnoticed.
model_list = [
    DecisionTreeClassifier(random_state=42),
    RandomForestClassifier(random_state=42),
    LogisticRegression(max_iter=1000),
]
for model in model_list:
    # 5-fold cross-validation on the training split only; the test split stays untouched.
    scores = cross_val_score(model, X_train, y_train, cv=5, scoring = ftwo_scorer)
    fbeta_mu_score.append(np.mean(scores))
    fbeta_sigma_score.append(np.std(scores))

In [None]:
fbeta_mu_score
fbeta_sigma_score

In [None]:
# Pick the candidate with the highest mean cross-validated score (first one wins on ties).
model_index = max(range(len(fbeta_mu_score)), key=fbeta_mu_score.__getitem__)
selected_model = model_list[model_index]
selected_model

In [None]:
# Fit the final model on the full training split and score it once on the held-out split.
# NOTE(review): this replaces whichever model the cross-validation above selected with a
# hand-configured shallow random forest — confirm that is intended.
# random_state pins the forest's bootstrap/feature sampling so the reported score, the
# feature importances and the SHAP values below are repeatable.
selected_model = RandomForestClassifier(max_depth=3, min_samples_leaf=50, random_state=42)
selected_model.fit(X_train, y_train)
y_pred = selected_model.predict(X_test)
fbeta_score(y_test, y_pred, beta=1/5)

In [None]:
# Per-feature importance table for the fitted model.
# Linear models expose coefficients (`coef_`), tree-based models expose
# `feature_importances_`. The attribute is checked directly rather than matching the first
# three characters of the model's repr, which breaks for any other linear estimator and
# for any class whose name merely starts with "Log".
if hasattr(selected_model, 'coef_'):
    # Coefficient magnitude is used as the importance; the sign is irrelevant for ranking.
    importance = np.abs(selected_model.coef_[0])
else:
    importance = selected_model.feature_importances_
df_feat = pd.DataFrame({'cols': X_train.columns, 'importance': importance})

In [None]:
# Rank the features once, then report the three strongest and their summed importance (x100, rounded).
top_ranked = df_feat.sort_values(by = 'importance', ascending = False).head(3)
top_3 = top_ranked['cols'].tolist()
print("top 3 features:", top_3)
print("Top 3 features' cumulative importance:", np.round(100*top_ranked['importance'].sum()))

In [None]:
df_test.head()

In [None]:
# Interpretability
### Explain the prediction for a single test row with SHAP.
### Supply the index label of a specific test row (any label present in X_test.index).

row_idx = 589
# The rows in the test split depend on the random split, so a hand-typed label is not
# guaranteed to be there; fall back to the first test row instead of raising KeyError.
if row_idx not in X_test.index:
    row_idx = X_test.index[0]
max_display  = 300
data_for_prediction = X_test.loc[row_idx]
data_for_prediction = np.array(data_for_prediction).reshape(1,-1)
# Calculate Shap values
explainer = shap.TreeExplainer(selected_model)
shap_values = explainer.shap_values(data_for_prediction)

pred_probability = selected_model.predict_proba(data_for_prediction)
# Position of the most probable class in the predict_proba output.
prediction = np.argmax(pred_probability)
print("prediction: ", prediction, pred_probability)

# Keep only the contributions towards the predicted class, as (n_samples, n_features).
# Older shap releases return a list with one array per class; newer releases return a
# single array with the class on the last axis. Handle both layouts.
if isinstance(shap_values, list):
    shap_value = shap_values[prediction]
else:
    shap_value = np.asarray(shap_values)
    if shap_value.ndim == 3:
        shap_value = shap_value[:, :, prediction]

# Order features by absolute contribution, largest first, capped at max_display.
feature_order = np.argsort(np.sum(np.abs(shap_value), axis=0))
feature_order = np.flip(feature_order[-min(max_display, len(feature_order)):], 0)

top_shape_vals = [shap_value[0][i] for i in feature_order]

# Name the features from X_train, the matrix the model was fitted on. train_cols also
# contains the target column, so it only lines up with the feature positions by accident.
top_feature_names = [X_train.columns[i] for i in feature_order]

fig = plt.figure(figsize=(8,5))
y_pos = np.arange(len(top_feature_names))

# Reverse so the largest contribution is drawn at the top of the horizontal bar chart.
values = np.flip(top_shape_vals)
features = np.flip(top_feature_names)

plt.barh(y_pos, values, 0.7, align='center')
plt.yticks(y_pos, fontsize=13)
plt.gca().set_yticklabels(features)
plt.title("Feature Importance")

In [None]:
# Draw the per-feature SHAP contributions of the explained row as a horizontal bar chart.
# Inputs are reversed so the largest contribution ends up at the top.
features = np.flip(top_feature_names)
values = np.flip(top_shape_vals)
y_pos = np.arange(len(top_feature_names))

fig = plt.figure(figsize=(8,5))
plt.barh(y_pos, values, 0.7, align='center')
plt.yticks(y_pos, fontsize=13)
plt.gca().set_yticklabels(features)
plt.title("Feature Importance")

In [None]:
# Show the test row that was explained above. Using row_idx instead of a separately typed
# label keeps this in step with the SHAP cell and avoids a KeyError when the typed label
# is not part of the current test split.
df_test.loc[row_idx, :]

### Roughwork

In [None]:
# Rough work: first attempt at loading the headerless raw file.
# The triple-quoted text below is a string expression (the analyst's note), not a comment.
'''Downloaded the data from the given link, copied the contents in a csv and removed the last 4 columns for which there wasnt
any description. Drafted cols_list to give the header to the dataframe''' 

# Hand-written column names, assigned after reading the file with header=None.
cols_list = ['Existing_Acc', 'Dur_Month', 'Credit_History', 'Credit_Purpose', 'Credit_Amt', 'Svgs', 'Employed_since', 'I_to_I', 'Gender', 'Guarantors', 'Residence_since', 'Property', 'Age', 'Installment_Plans', 'Housing', 'Existing_Credits', 'Job', 'Liable_Cnt', 'Phone', 'Foreign_Worker', 'Worthy']
df = pd.read_csv("German_data.csv", header=None)
df.columns = cols_list
df.head()

In [None]:
### Confirming the states of each attribute with the data description file for consistency check
[(k, df[k].unique()) for k in df.columns]

In [None]:
### Job has 2 unique values, while the description says 4
### "People being liable" has just 2 states, which doesn't look right
### Existing credits, Instalment plans and Housing -- same issue
### The Guarantors column appears to hold the Age values
### The Residence_since column appears to hold the Credit_Purpose values

cols_selected_till_now = ['Age', 'Property', 'Phone', 'Foreign_Worker', 'Worthy']

### Because of these discrepancies between column contents and names (the downloaded file had no column-name mapping),
### a CSV version of the data was located instead. That CSV is used for the rest of the analysis