In [32]:
# Install libraries
!python -m pip install --upgrade pip -q
!pip install pandas numpy matplotlib seaborn scikit-learn openpyxl -q
!pip install notebook -q
!pip install nbconvert -q
!pip install tabulate -q

In [None]:
# import libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split    
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Bottom Line Up Front

### Accuracy: 100 %

On the held-out test set the model is about 99.9 % accurate (1,998 of 2,000 loans classified correctly): it finds 368 of the 370 defaulters and does not flag any non-defaulter as a defaulter.

In [52]:
# Notebook-wide pandas display settings: never truncate columns, show floats with 2 decimals
pd.options.display.max_columns = None
pd.options.display.float_format = '{:.2f}'.format

# Load the loan data, then inspect its schema and its first rows
df = pd.read_csv('Task 3 and 4_Loan_Data.csv')
df.info()
display(df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 8 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   customer_id               10000 non-null  int64  
 1   credit_lines_outstanding  10000 non-null  int64  
 2   loan_amt_outstanding      10000 non-null  float64
 3   total_debt_outstanding    10000 non-null  float64
 4   income                    10000 non-null  float64
 5   years_employed            10000 non-null  int64  
 6   fico_score                10000 non-null  int64  
 7   default                   10000 non-null  int64  
dtypes: float64(3), int64(5)
memory usage: 625.1 KB


Unnamed: 0,customer_id,credit_lines_outstanding,loan_amt_outstanding,total_debt_outstanding,income,years_employed,fico_score,default
0,8153374,0,5221.55,3915.47,78039.39,5,605,0
1,7442532,5,1958.93,8228.75,26648.44,2,572,1
2,2256073,0,3363.01,2027.83,65866.71,4,602,0
3,4885975,0,4766.65,2501.73,74356.88,5,612,0
4,4700614,1,1345.83,1768.83,23448.33,6,631,0


In [43]:
# Separate the target column from the predictor columns
y = df['default']
X = df.drop(columns=['customer_id', 'default'])  # remove the customer ID and the target itself

# Hold out part of the data for evaluation
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,    # 20% of the rows go to the test set
    random_state=42,  # fixed seed so the split is reproducible
    stratify=y,       # keep the default / non-default ratio equal in both sets
)

# Summarise the partitions: sizes first, then the class balance of each target vector
print(
    f"X_train shape: {X_train.shape}",
    f"X_test shape: {X_test.shape}",
    f"y_train shape: {y_train.shape}",
    f"y_test shape: {y_test.shape}",
    f"y_train value counts:\n{y_train.value_counts()}",
    f"y_test value counts:\n{y_test.value_counts()}",
    sep="\n",
)

X_train shape: (8000, 6)
X_test shape: (2000, 6)
y_train shape: (8000,)
y_test shape: (2000,)
y_train value counts:
default
0    6519
1    1481
Name: count, dtype: int64
y_test value counts:
default
0    1630
1     370
Name: count, dtype: int64


In [47]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training rows only, so no test-set statistics leak into it
scaler = StandardScaler().fit(X_train)
# Apply the same training-derived scaling to both partitions
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [51]:
# Fit a logistic regression on the scaled training data and evaluate it on the test set
from sklearn.linear_model import LogisticRegression

model = LogisticRegression().fit(X_train, y_train)  # fit() returns the fitted estimator itself
probs = model.predict_proba(X_test)  # per-class probabilities for every test row
y_pred = model.predict(X_test)       # hard class predictions for every test row

# Headline metrics: accuracy, per-class report, confusion matrix
print(
    f"Accuracy: {accuracy_score(y_test, y_pred):.2f}",
    f"Classification Report:\n{classification_report(y_test, y_pred)}",
    f"Confusion Matrix:\n{confusion_matrix(y_test, y_pred)}",
    sep="\n",
)

# One fitted coefficient per input feature, in column order
weights = model.coef_[0]
for name, coef in zip(X.columns, weights):
    print(f"{name}: {coef:.4f}")


Accuracy: 1.00
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1630
           1       1.00      0.99      1.00       370

    accuracy                           1.00      2000
   macro avg       1.00      1.00      1.00      2000
weighted avg       1.00      1.00      1.00      2000

Confusion Matrix:
[[1630    0]
 [   2  368]]
credit_lines_outstanding: 8.7477
loan_amt_outstanding: 0.1189
total_debt_outstanding: 3.8371
income: -2.3544
years_employed: -2.9031
fico_score: -1.1356


In [65]:
# Estimate the probability of default and the expected loss for a single customer
def calculate_expected_loss(credit_lines_outstanding, loan_amt_outstanding, total_debt_outstanding, income, years_employed, fico_score, loss_given_default=0.90, verbose=True):
    """Score one customer with the fitted scaler and model.

    Relies on the module-level ``scaler`` and ``model`` objects created in the
    earlier cells. The six feature arguments must be given in the same order as
    the columns the scaler was fitted on.

    Args:
        credit_lines_outstanding: Number of credit lines outstanding.
        loan_amt_outstanding: Outstanding loan amount; the exposure used in the
            expected-loss formula.
        total_debt_outstanding: Total debt outstanding.
        income: Customer income.
        years_employed: Years in employment.
        fico_score: FICO score.
        loss_given_default: Fraction of the exposure assumed lost if the
            customer defaults. Defaults to 0.90, the factor previously
            hard-coded in the formula.
        verbose: When True (default) print the two results, as before.

    Returns:
        tuple[float, float]: ``(prob_default, expected_loss)`` where
        ``expected_loss = loan_amt_outstanding * prob_default * loss_given_default``.
    """
    new_customer = [[credit_lines_outstanding, loan_amt_outstanding, total_debt_outstanding, income, years_employed, fico_score]]

    # If the scaler was fitted on a DataFrame it remembers the column names and
    # warns when handed a bare list; rebuild a one-row frame with those names.
    feature_names = getattr(scaler, "feature_names_in_", None)
    if feature_names is not None:
        new_customer = pd.DataFrame(new_customer, columns=feature_names)

    new_customer_scaled = scaler.transform(new_customer)

    # Row 0 is the only customer; column 1 is the probability of class 1 (default)
    prob_default = float(model.predict_proba(new_customer_scaled)[0][1])
    expected_loss = loan_amt_outstanding * prob_default * loss_given_default

    if verbose:
        print(f"Expected Loss: ${expected_loss:.2f}")
        print(f"Probability of Default: {prob_default:.4f}")

    # Return the numbers so callers can use them, instead of an implicit None
    return prob_default, expected_loss

In [66]:
# calculate_expected_loss prints its own report, so the calls are not wrapped in
# print(): doing so would additionally echo the call's return value below each report.

# Low risk customer
low_risk_result = calculate_expected_loss(3, 15000, 20000, 55000, 5, 680)

# High risk customer
high_risk_result = calculate_expected_loss(10, 5000, 80000, 25000, 1, 550)

Expected Loss: $0.00
Probability of Default: 0.0000
None
Expected Loss: $4500.00
Probability of Default: 1.0000
None


The function takes the applicant's details and reports the probability of default and the expected loss on the loan, so that the underwriter can decide whether to approve the loan, approve it at a higher interest rate, or decline it.

In [70]:
# Interactive loop: collect one customer's details at a time and report the
# probability of default and the expected loss for each.

def _prompt_number(label, cast, bounds=None):
    """Ask for one numeric field until the answer is valid or the user quits.

    Args:
        label: Field name; shown as the prompt (followed by ': ') and used in
            the range error message.
        cast: Callable converting the typed text to a number (``int`` or ``float``).
        bounds: Optional ``(low, high)`` inclusive range the value must fall in.

    Returns:
        The parsed value, or None if the user typed 'q' or input ended (EOF).
    """
    while True:
        try:
            raw = input(f"{label}: ").strip()
        except EOFError:
            # No more input available: treat it as a request to quit
            return None
        if raw.lower() == 'q':
            return None
        try:
            value = cast(raw)
        except ValueError as e:
            # Re-ask this field only, rather than restarting the whole form
            print(f"Error: {e}. Please enter valid inputs.")
            continue
        if bounds is not None and not (bounds[0] <= value <= bounds[1]):
            print(f"{label} must be between {bounds[0]} and {bounds[1]}. Please try again.")
            continue
        return value


# (label, parser, optional inclusive range), in the argument order of calculate_expected_loss
_CUSTOMER_FIELDS = (
    ("Credit Lines Outstanding", int, None),
    ("Loan Amount Outstanding", float, None),
    ("Total Debt Outstanding", float, None),
    ("Income", float, None),
    ("Years Employed", int, None),
    ("FICO Score", int, (300, 850)),
)

while True:
    print("Enter the following details to calculate the probability of default, enter 'q' at any time to quit:")

    answers = []
    for label, cast, bounds in _CUSTOMER_FIELDS:
        value = _prompt_number(label, cast, bounds)
        if value is None:
            break
        answers.append(value)

    # A short answer list means the user quit part-way (at any field, including
    # the FICO score), so stop without scoring a half-entered customer.
    if len(answers) != len(_CUSTOMER_FIELDS):
        break

    (credit_lines_outstanding, loan_amt_outstanding, total_debt_outstanding,
     income, years_employed, fico_score) = answers

    # Only parsing errors are handled (inside _prompt_number). Any other failure,
    # e.g. stdin not being available, propagates instead of being swallowed and
    # retried forever.
    calculate_expected_loss(credit_lines_outstanding, loan_amt_outstanding, total_debt_outstanding, income, years_employed, fico_score)

    try:
        another = input("Enter another customer? (y/n): ").lower()
    except EOFError:
        break
    if another != 'y':
        break


Enter the following details to calculate the probability of default, enter 'q' at any time to quit:
FICO Score must be between 300 and 850. Please try again.
FICO Score must be between 300 and 850. Please try again.
FICO Score must be between 300 and 850. Please try again.
Expected Loss: $4500.00
Probability of Default: 1.0000
Enter the following details to calculate the probability of default, enter 'q' at any time to quit:
Expected Loss: $4500.00
Probability of Default: 1.0000
