In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
import xgboost as xgb
import shap
import matplotlib.pyplot as plt
import seaborn as sns

# Monkey patch numpy int deprecation
np.int = int

# Set modern style for visualizations
sns.set_theme(style="whitegrid")
%matplotlib inline

# Data loading and Preprocessing 

In [38]:
# Load the data with correct delimiter
df = pd.read_csv('credit_chart.csv', sep=';')

# Separate features and target
X = df.drop('Creditability', axis=1)
Y = df['Creditability']

# Featuring Engineering

In [35]:
numerical_cols = ['Duration_of_Credit_monthly', 'Credit_Amount', 'Age_years', 'No_of_Credits_at_this_Bank']
categorical_cols = [col for col in X.columns if col not in numerical_cols]

#creating label encoder for categorical variables
label_encoders={}
for col in categorical_cols:
    label_encoders[col] = LabelEncoder()
    X[col] = label_encoders[col].fit_transform(X[col])
    
# Replace zero in duration to avoid division by zero (if any)
X['Duration_of_Credit_monthly'].replace(0, np.nan, inplace=True)

# Calculate safely
X['Credit_Amount_Per_Month'] = X['Credit_Amount'] / X['Duration_of_Credit_monthly']

# Drop or fill any resulting NaNs or infinities
X['Credit_Amount_Per_Month'].replace([np.inf, -np.inf], np.nan, inplace=True)
X.dropna(subset=['Credit_Amount_Per_Month'], inplace=True)
    

#Scale numerical features
scaler = StandardScaler()
X[numerical_cols + ['Credit_Amount_Per_Month']] = scaler.fit_transform(X[numerical_cols + ['Credit_Amount_Per_Month']])

In [36]:
X[numerical_cols + ['Credit_Amount_Per_Month']]

Unnamed: 0,Duration_of_Credit_monthly,Credit_Amount,Age_years,No_of_Credits_at_this_Bank,Credit_Amount_Per_Month
0,-0.107268,-0.789704,-1.279195,-0.704660,-0.498671
1,-1.178871,-0.170524,0.040448,1.037709,0.655487
2,-0.777020,-0.863298,-1.103243,-0.704660,-0.439814
3,-0.777020,-0.410059,0.304376,1.037709,-0.054077
4,-0.777020,-0.392722,0.216400,1.037709,-0.039322
...,...,...,...,...,...
995,0.428533,-0.457824,-1.279195,-0.704660,-0.411491
996,0.428533,-0.346018,0.832234,-0.704660,-0.366712
997,0.160633,3.325541,-0.487409,-0.704660,1.343329
998,-0.777020,1.127630,1.448067,-0.704660,1.254599


In [12]:
df.columns

Index(['Creditability', 'Account_Balance', 'Duration_of_Credit_monthly',
       'Payment_Status_of_Previous_Credit', 'Purpose', 'Credit_Amount',
       'Value_Savings_Stocks', 'Length_of_current_employment',
       'Instalment_per_cent', 'Sex_Marital_Status', 'Guarantors',
       'Duration_in_Current_address', 'Most_valuable_available_asset',
       'Age_years', 'Concurrent_Credits', 'Type_of_apartment',
       'No_of_Credits_at_this_Bank', 'Occupation', 'No_of_dependents',
       'Telephone', 'Foreign_Worker'],
      dtype='object')

In [47]:
# split the data
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=45)

#Model
lr_model = LogisticRegression()
lr_model.fit(X_train, Y_train)

#Evaluate
lr_pred=lr_model.predict(X_test)
lr_pred_proba=lr_model.predict_proba(X_test)[:,1]



print("Logistic Regression Results:")
print("Accuracy:", accuracy_score(Y_test, lr_pred))
print("\nClassification Report:")
print(classification_report(Y_test, lr_pred))
print("ROC AUC Score:", roc_auc_score(Y_test, lr_pred_proba))





Logistic Regression Results:
Accuracy: 0.78

Classification Report:
              precision    recall  f1-score   support

           0       0.65      0.49      0.56        57
           1       0.82      0.90      0.85       143

    accuracy                           0.78       200
   macro avg       0.73      0.69      0.71       200
weighted avg       0.77      0.78      0.77       200

ROC AUC Score: 0.7390504232609495


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [49]:
# 2. XGBoost Model
xgb_model = xgb.XGBClassifier(random_state=42)
xgb_model.fit(X_train, Y_train)

# Evaluate XGBoost
xgb_pred = xgb_model.predict(X_test)
xgb_pred_proba = xgb_model.predict_proba(X_test)[:, 1]

print("XGBoost Results:")
print("Accuracy:", accuracy_score(Y_test, xgb_pred))
print("\nClassification Report:")
print(classification_report(Y_test, xgb_pred))
print("ROC AUC Score:", roc_auc_score(Y_test, xgb_pred_proba))

XGBoost Results:
Accuracy: 0.765

Classification Report:
              precision    recall  f1-score   support

           0       0.60      0.53      0.56        57
           1       0.82      0.86      0.84       143

    accuracy                           0.77       200
   macro avg       0.71      0.69      0.70       200
weighted avg       0.76      0.77      0.76       200

ROC AUC Score: 0.7335296282664704
