# Bank Loan Classification
### Contents
#### Importing and cleaning the data
#### Data exploration
#### Data visualization
#### Modeling the data
#### Results

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import colors

# Importing and cleaning the data

In [None]:
# Load the loan dataset; the first CSV column is used as the row index.
csv_path = "Visa_For_Lisa_Loan_Modelling.csv"
data = pd.read_csv(csv_path, index_col=0)
data

In [None]:
# Per-column count of missing values.
nan_count = data.isna().sum()
nan_count

In [None]:
# Report whether any cleaning is needed. When NaNs exist, list the affected
# columns instead of staying silent, since NaNs would propagate into the
# normalisation and the model further down.
if nan_count.sum() == 0:
    print("No NaNs to clean.")
else:
    print("Columns containing NaNs (count per column):")
    print(nan_count[nan_count > 0])

# Data exploration

Calculating average values for loan and no-loan groups

In [None]:
# Column-wise means for each value of the "Personal Loan" label.
loan_groups = data.groupby("Personal Loan")
means = loan_groups.mean()
means

Some categories vary more significantly than others

In [None]:
print("Absolute percentage differences between loan and no-loan means:")
# Ratio of the first group's means to the second group's, expressed as an
# absolute percentage offset from parity and sorted largest first.
group_ratio = means.iloc[0] / means.iloc[1]
differences = ((group_ratio - 1) * 100).abs().round(1).sort_values(ascending=False)
differences

In [None]:
# Keep only the features whose group means differ by more than this percentage.
# The message is built from the same value so it cannot drift from the filter.
min_difference_pct = 5
categories = list(differences[differences > min_difference_pct].keys())
print(
    f"Categories with >{min_difference_pct}% difference between loan and no-loan means:"
)
print(categories)

# Data visualization

Categories with stronger trends show clearer clustering

In [None]:
# Income against CCAvg, coloured by the "Personal Loan" label using a
# two-colour map (blue, red).
loan_cmap = colors.ListedColormap(["blue", "red"])
plt.scatter(data["Income"], data["CCAvg"], c=data["Personal Loan"], cmap=loan_cmap)
plt.xlabel("Income")
plt.ylabel("CCAvg")
plt.show()

Categories with weaker trends exhibit less obvious clustering

In [None]:
# Experience against Age, coloured by the "Personal Loan" label using a
# two-colour map (blue, red).
plt.scatter(
    data["Experience"],
    data["Age"],
    c=data["Personal Loan"],
    cmap=colors.ListedColormap(["blue", "red"]),
)
plt.xlabel("Experience")  # axis label previously misspelled
plt.ylabel("Age")
plt.show()

# Modeling the data

## Preparing the data

### Split into train and test datasets

In [None]:
# Shuffle all rows. The fixed seed makes the train/test split, and therefore
# every metric reported below, reproducible between runs.
data_shuffled = data.sample(frac=1, random_state=42)

In [None]:
# Positional split of the shuffled rows: the first train_pct share is used for
# training, the remainder for testing. Both halves use .iloc so the slicing is
# explicitly positional regardless of the index type.
train_pct = 0.8
train_n = int(train_pct * len(data_shuffled))
train_data = data_shuffled.iloc[:train_n]
test_data = data_shuffled.iloc[train_n:]
print(f"Train data shape: {train_data.shape}\nTest data shape: {test_data.shape}")

In [None]:
# Number of positive labels that ended up in the held-out split.
print("Number of personal loans in test data:")
test_data.loc[:, "Personal Loan"].sum()

In [None]:
# Number of positive labels in the full dataset, for comparison.
print("Total number of personal loans:")
data.loc[:, "Personal Loan"].sum()

### Prepare training data for the model

In [None]:
# Standardise the selected features using the training split's own statistics.
# mean and std are kept as globals so the test split can reuse them.
X = train_data[categories]
mean = X.mean()
std = X.std()
X_norm = np.array(((X - mean) / std).T)  # shape: (features, examples)

# Labels as a single row; they are not rescaled, Y_norm is just a copy of Y.
Y = np.array(train_data["Personal Loan"]).reshape(1, train_data.shape[0])
Y_norm = Y.copy()

print("Training dataset X containing (CD Account, ..., Education) in the columns:")
print(X_norm)
print("Training dataset Y containing labels of two classes (0: no loan, 1: loan)")
print(Y_norm)

print(f"The shape of X is: {X_norm.shape}")
print(f"The shape of Y is: {Y_norm.shape}")
print(f"I have m = {X_norm.shape[1]} training examples!")

## Building the model

In [None]:
# activation function
def sigmoid(z):
    """
    Logistic activation, evaluated without overflowing for large |z|.

    Arguments:
    z -- scalar or array-like of pre-activation values

    Returns:
    1 / (1 + exp(-z)) computed element-wise; a numpy array with the shape of z,
    or a numpy scalar when z is a scalar
    """
    z = np.asarray(z, dtype=float)
    # exp() is only ever taken of a non-positive number, so it cannot overflow.
    e = np.exp(-np.abs(z))
    # For z >= 0 this is the textbook form; for z < 0 the algebraically equal
    # exp(z) / (1 + exp(z)) is used instead.
    out = np.where(z >= 0, 1 / (1 + e), e / (1 + e))
    # np.where yields a 0-d array for scalar input; unwrap it to a scalar.
    return out if out.ndim else out[()]

In [None]:
def layer_sizes(X, Y):
    """
    Read the layer dimensions off the leading axis of the data arrays.

    Arguments:
    X -- input dataset of shape (input size, number of examples)
    Y -- labels of shape (output size, number of examples)

    Returns:
    (n_x, n_y) -- tuple with the size of the input layer and of the output layer
    """
    input_dim, output_dim = X.shape[0], Y.shape[0]
    return input_dim, output_dim


# Derive the layer sizes from the prepared training arrays and report them.
n_x, n_y = layer_sizes(X_norm, Y_norm)
print(f"The size of the input layer is: n_x = {n_x}")
print(f"The size of the output layer is: n_y = {n_y}")

In [None]:
def initialize_parameters(n_x, n_y):
    """
    Create the starting weights and bias for the single-layer model.

    Arguments:
    n_x -- the size of the input layer
    n_y -- the size of the output layer

    Returns:
    parameters -- python dictionary containing:
                    W -- weight matrix of shape (n_y, n_x), random normal values scaled by 0.01
                    b -- bias vector of shape (n_y, 1), all zeros
    """
    return {
        "W": 0.01 * np.random.randn(n_y, n_x),
        "b": np.zeros((n_y, 1)),
    }


# Initialise once and show the starting values.
parameters = initialize_parameters(n_x, n_y)
for name in ("W", "b"):
    print(f"{name} = {parameters[name]}")

In [None]:
def forward_propagation(X, parameters):
    """
    Run the model forward: a linear step followed by the sigmoid activation.

    Arguments:
    X -- input data of size (n_x, m)
    parameters -- python dictionary holding "W" and "b" (output of initialization function)

    Returns:
    A -- sigmoid(W X + b), the model output
    """
    weights, bias = parameters["W"], parameters["b"]
    return sigmoid(weights @ X + bias)


# Forward pass with the freshly initialised parameters.
A = forward_propagation(X_norm, parameters)

print(f"Output vector A: {A}")

In [None]:
def compute_cost(A, Y, eps=1e-15):
    """
    Computes the log loss cost function

    Arguments:
    A -- The output of the neural network of shape (n_y, number of examples)
    Y -- "true" labels vector of shape (n_y, number of examples)
    eps -- margin used to clip A into [eps, 1 - eps] before taking logarithms

    Returns:
    cost -- log loss, averaged over the number of examples

    """
    # Number of examples.
    m = Y.shape[1]

    # A saturated output (exactly 0 or 1) would give log(0) = -inf, and the
    # product 0 * -inf is nan even when the prediction is correct. Clipping
    # keeps every term finite.
    A = np.clip(A, eps, 1 - eps)

    # Compute the cost function.
    logprobs = -np.multiply(np.log(A), Y) - np.multiply(np.log(1 - A), 1 - Y)
    cost = 1 / m * np.sum(logprobs)

    return cost


# Cost of the untrained model on the training data.
print("cost =", compute_cost(A, Y_norm))

In [None]:
def backward_propagation(A, X, Y):
    """
    Compute the gradients of the cost with respect to W and b.

    Arguments:
    A -- the output of the neural network of shape (n_y, number of examples)
    X -- input data of shape (n_x, number of examples)
    Y -- "true" labels vector of shape (n_y, number of examples)

    Returns:
    grads -- python dictionary with "dW" (same shape as W) and "db" (shape (n_y, 1))
    """
    n_examples = X.shape[1]

    # Prediction error, averaged over the examples for each gradient.
    error = A - Y

    return {
        "dW": 1 / n_examples * np.dot(error, X.T),
        "db": 1 / n_examples * np.sum(error, axis=1, keepdims=True),
    }


# Gradients for the untrained model.
grads = backward_propagation(A, X_norm, Y_norm)

print(f"dW = {grads['dW']}")
print(f"db = {grads['db']}")

In [None]:
def update_parameters(parameters, grads, learning_rate=0.1):
    """
    Take one gradient descent step and return the new parameters.

    Arguments:
    parameters -- python dictionary containing "W" and "b"
    grads -- python dictionary containing "dW" and "db"
    learning_rate -- learning rate parameter for gradient descent

    Returns:
    parameters -- new python dictionary with the updated "W" and "b"
    """
    # Each parameter moves against its gradient, scaled by the learning rate.
    return {
        "W": parameters["W"] - learning_rate * grads["dW"],
        "b": parameters["b"] - learning_rate * grads["db"],
    }


# Single gradient descent step with the default learning rate.
parameters_updated = update_parameters(parameters, grads)

print(f"W updated = {parameters_updated['W']}")
print(f"b updated = {parameters_updated['b']}")

In [None]:
def nn_model(X, Y, num_iterations=10, learning_rate=0.1, print_cost=False):
    """
    Train the single-layer model with batch gradient descent.

    Arguments:
    X -- dataset of shape (n_x, number of examples)
    Y -- labels of shape (n_y, number of examples)
    num_iterations -- number of iterations in the loop
    learning_rate -- learning rate parameter for gradient descent
    print_cost -- if True, print the cost every iteration

    Returns:
    parameters -- parameters learnt by the model. They are then used to make predictions.
    """
    # Layer sizes are read once rather than calling layer_sizes per dimension.
    n_x, n_y = layer_sizes(X, Y)

    parameters = initialize_parameters(n_x, n_y)

    for i in range(num_iterations):
        # Forward propagation. Inputs: "X, parameters". Outputs: "A".
        A = forward_propagation(X, parameters)

        # The cost is only reported, never used for the update, so it is
        # computed only when it will actually be printed.
        if print_cost:
            cost = compute_cost(A, Y)
            print("Cost after iteration %i: %f" % (i, cost))

        # Backpropagation. Inputs: "A, X, Y". Outputs: "grads".
        grads = backward_propagation(A, X, Y)

        # Gradient descent parameter update.
        parameters = update_parameters(parameters, grads, learning_rate)

    return parameters

In [None]:
# Fit the model on the standardised training data, printing the cost as it goes.
parameters = nn_model(
    X_norm,
    Y_norm,
    num_iterations=1000,
    learning_rate=1,
    print_cost=True,
)
print(f"W = {parameters['W']}")
print(f"b = {parameters['b']}")

In [None]:
def predict(X_pred, parameters, threshold=0.33):
    """
    Turn model outputs into boolean class predictions.

    Arguments:
    X_pred -- input data of shape (n_x, number of examples)
    parameters -- python dictionary containing "W" and "b"
    threshold -- decision threshold; outputs strictly above it are predicted
                 as the positive class (default 0.33)

    Returns:
    boolean array with the same shape as the model output, True where the
    output exceeds the threshold
    """
    A = forward_propagation(X_pred, parameters)
    return A > threshold

# Results

## Training data

In [None]:
# Boolean predictions for the standardised training inputs.
Y_pred = predict(X_norm, parameters)

Overall accuracy

In [None]:
# Share of training examples whose predicted label matches the true label.
n_wrong = np.abs(Y_pred - Y_norm).sum()
pct_correct = round(100 * (1 - n_wrong / Y_pred.shape[1]), 1)
print(f"Accuracy: {pct_correct}%")

Loan proportion accuracy (how closely the number of predicted loans matches the number of actual loans)

In [None]:
# How closely the number of predicted loans matches the number of actual loans.
loan_ratio = Y_pred.sum() / Y_norm.sum()
pct_correct = round(100 * (1 - abs(1 - loan_ratio)), 1)
print(f"Loan proportion accuracy: {pct_correct}%")

Correctly identified loans

In [None]:
# Side-by-side table of the true labels and the 0/1 predictions.
df = pd.DataFrame({"Actual": Y_norm[0], "Prediction": Y_pred[0] * 1})

In [None]:
# Rows whose true label is 1, and the share of them that were predicted as 1.
loans = df[df["Actual"] == 1]
loan_errors = np.abs(loans["Actual"] - loans["Prediction"]).sum()
pct_correct = round(100 * (1 - loan_errors / len(loans)), 1)
print(f"Loans identified accuracy: {pct_correct}%")

Correctly identified no-loans

In [None]:
# Rows whose true label is 0, and the share of them that were predicted as 0.
no_loans = df[df["Actual"] == 0]
no_loan_errors = np.abs(no_loans["Actual"] - no_loans["Prediction"]).sum()
pct_correct = round(100 * (1 - no_loan_errors / len(no_loans)), 1)
print(f"No-loan identification accuracy: {pct_correct}%")

## Test data

In [None]:
# Standardise the test features with the mean and std already computed from the
# training data; they are deliberately not recomputed on the test split.
X_test = test_data[categories]
X_test_norm = np.array(((X_test - mean) / std).T)  # shape: (features, examples)

# Labels as a single row; they are not rescaled, Y_test_norm is just a copy.
Y_test = np.array(test_data["Personal Loan"]).reshape(1, test_data.shape[0])
Y_test_norm = Y_test.copy()

print("Testing dataset X containing (CD Account, ..., Education) in the columns:")
print(X_test_norm)
print("Testing dataset Y containing labels of two classes (0: no loan, 1: loan)")
print(Y_test_norm)

print(f"The shape of X is: {X_test_norm.shape}")
print(f"The shape of Y is: {Y_test_norm.shape}")
print(f"I have m = {X_test_norm.shape[1]} testing examples!")

In [None]:
# Boolean predictions for the standardised test inputs.
Y_test_pred = predict(X_test_norm, parameters)

Overall accuracy

In [None]:
# Share of test examples whose predicted label matches the true label.
n_wrong = np.abs(Y_test_norm - Y_test_pred).sum()
pct_correct = round(100 * (1 - n_wrong / Y_test_pred.shape[1]), 1)
print(f"Accuracy: {pct_correct}%")

Loan proportion accuracy (how closely the number of predicted loans matches the number of actual loans)

In [None]:
# How closely the number of predicted loans matches the number of actual loans.
loan_ratio = Y_test_pred.sum() / Y_test_norm.sum()
pct_correct = round(100 * (1 - abs(1 - loan_ratio)), 1)
print(f"Loan proportion accuracy: {pct_correct}%")

Correctly identified loans

In [None]:
# Side-by-side table of the true test labels and the 0/1 predictions.
df = pd.DataFrame({"Actual": Y_test_norm[0], "Prediction": Y_test_pred[0] * 1})

In [None]:
# Rows whose true label is 1, and the share of them that were predicted as 1.
loans = df[df["Actual"] == 1]
loan_errors = np.abs(loans["Actual"] - loans["Prediction"]).sum()
pct_correct = round(100 * (1 - loan_errors / len(loans)), 1)
print(f"Loans identified accuracy: {pct_correct}%")

Correctly identified no-loans

In [None]:
# Rows whose true label is 0, and the share of them that were predicted as 0.
no_loans = df[df["Actual"] == 0]
no_loan_errors = np.abs(no_loans["Actual"] - no_loans["Prediction"]).sum()
pct_correct = round(100 * (1 - no_loan_errors / len(no_loans)), 1)
print(f"No-loan identification accuracy: {pct_correct}%")