In [2]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
import numpy as np
import pandas as pd
from math import log
import os

In [3]:
# DESC:
# 1. Calculate the probability of default of the borrower
# 2. Classify the FICO scores data using logistic regression

In [4]:
# Load the loan book from disk; `default` is the label column used below.
df = pd.read_csv('Loan_Data.csv')

# Two derived ratios, each an outstanding amount divided by income.
df['payment_to_income'] = df['loan_amt_outstanding'] / df['income']
df['debt_to_income'] = df['total_debt_outstanding'] / df['income']

# Columns the classifier is trained on (the two ratios above plus three raw columns).
features = ['credit_lines_outstanding', 'debt_to_income', 'payment_to_income', 'years_employed', 'fico_score']

inputs = df[features]
labels = df['default']

# Fit on the full data set; `fit` returns the estimator itself, so `clf` is the fitted model.
model = LogisticRegression(random_state=0, solver='liblinear', tol=1e-5, max_iter=10000)
clf = model.fit(inputs, labels)

print(clf.coef_, clf.intercept_)

# Self-check on the training data: misclassification rate and AUC.
# NOTE(review): `roc_curve` receives the hard class predictions from `clf.predict`,
# not probabilities, so the AUC below is computed from thresholded labels.
# `clf.predict_proba(inputs)[:, 1]` would give a score-based AUC; left unchanged so the
# printed numbers stay comparable with the recorded output.
y_pred = clf.predict(inputs)
fpr, tpr, thresholds = metrics.roc_curve(labels, y_pred)

misclassified = (labels - y_pred).abs().sum()
print((1.0 * misclassified) / len(df))
print(metrics.auc(fpr, tpr))

[[ 8.18520373  0.54490854  0.01994244 -2.77630853 -0.02418391]] [-0.09162643]
0.0037
0.9925106069101026


In [6]:
# Scores are tallied per integer FICO value in [FICO_MIN, FICO_MAX]; slot s of
# the lists below corresponds to the score s + FICO_MIN.
FICO_MIN, FICO_MAX = 300, 850
N_SCORES = FICO_MAX - FICO_MIN + 1

x = df['default'].to_list()
y = [int(score) for score in df['fico_score'].to_list()]
n = len(x)

print(len(x), len(y))

# Fail loudly on scores outside the expected range: a score below FICO_MIN would
# otherwise wrap around through negative indexing and be tallied in the wrong slot.
bad_scores = sorted({score for score in y if not FICO_MIN <= score <= FICO_MAX})
if bad_scores:
    raise ValueError(
        f"fico_score values outside [{FICO_MIN}, {FICO_MAX}]: {bad_scores[:5]}"
    )

# default[s]: sum of the `default` flags, total[s]: number of rows, at score s + FICO_MIN.
default = [0] * N_SCORES
total = [0] * N_SCORES

for flag, score in zip(x, y):
    default[score - FICO_MIN] += flag
    total[score - FICO_MIN] += 1

# Turn the per-score tallies into running sums, so that slot s holds the count for
# all scores up to and including s + FICO_MIN. The loop starts at 1 because slot 0
# has no predecessor; starting at 0 would read slot -1, i.e. the last slot.
for i in range(1, N_SCORES):
    default[i] += default[i-1]
    total[i] += total[i-1]

def log_likelihood(n, k):
    """Binomial log-likelihood of k events in n trials, evaluated at p = k/n.

    Computes k*log(p) + (n-k)*log(1-p) with p = k/n.

    Parameters
    ----------
    n : number of observations in the bucket.
    k : number of those observations that are events (here: defaults).

    Returns
    -------
    0 when the bucket is empty (n == 0) or when p is exactly 0 or 1 (both terms
    vanish in the limit); otherwise the log-likelihood, which is negative.
    """
    if n == 0:
        # An empty bucket carries no information; avoid dividing by zero.
        return 0
    p = k/n
    if (p==0 or p==1):
        # 0*log(0) is taken as 0, so a bucket with no events or only events scores 0.
        return 0
    return k*np.log(p)+ (n-k)*np.log(1-p)

# Dynamic programme that splits the score range into `r` contiguous buckets so
# that the sum of the buckets' log-likelihoods is as large as possible.
#
# dp[i][j] = [best, prev]
#   best - largest summed log-likelihood found for covering score indices 0..j
#          with exactly i buckets, the last of which ends at index j
#          (NO_SOLUTION while no such split has been found);
#   prev - index at which the previous bucket ends in that best split.
# Index j stands for the score j + FICO_MIN. `total` and `default` are expected to
# hold running counts, so total[j] - total[k] is the number of rows in (k, j].
FICO_MIN, FICO_MAX = 300, 850
last = FICO_MAX - FICO_MIN      # index of the highest score
NO_SOLUTION = -10**18           # sentinel far below any reachable log-likelihood

r = 10                          # number of buckets

dp = [[[NO_SOLUTION, 0] for _ in range(last + 1)] for _ in range(r + 1)]

# Zero buckets contribute a log-likelihood of 0.
for j in range(last + 1):
    dp[0][j][0] = 0

for i in range(1, r + 1):
    for j in range(1, last + 1):
        if i == 1:
            # One bucket spanning indices 0..j. It is filled in only when some earlier
            # index has a different running count; because running counts never
            # decrease, comparing against index 0 is sufficient. This value does not
            # depend on the split point, so it is computed once per j.
            if total[j] != total[0]:
                dp[1][j][0] = log_likelihood(total[j], default[j])
            continue

        best, best_prev = dp[i][j]
        for k in range(j):
            if total[j] == total[k]:
                # The bucket (k, j] would be empty.
                continue
            candidate = dp[i-1][k][0] + log_likelihood(total[j] - total[k], default[j] - default[k])
            # Strict comparison: on ties the smallest k is kept.
            if best < candidate:
                best, best_prev = candidate, k
        dp[i][j] = [best, best_prev]

print(round(dp[r][last][0], 4))

# Walk the stored `prev` links back from the top score to recover the bucket
# boundaries, highest first. A separate loop variable is used so `r` keeps its value.
boundaries = []
k = last
for i in range(r, -1, -1):
    boundaries.append(k + FICO_MIN)
    k = dp[i][k][1]

l = boundaries  # original variable name, kept for any code that still refers to it

print(boundaries)

10000 10000
-4217.8245
[850, 753, 752, 732, 696, 649, 611, 580, 552, 520, 300]


In [None]:
# The provided code performs data analysis by calculating the probability of default for a given set of observations.
# The technique used for this analysis is maximum likelihood estimation.
# Maximum likelihood estimation is used because it is a common method for
# estimating the parameters of a statistical model. In this case, the parameters are the probabilities of
# default for different sets of observations. Maximum likelihood estimation seeks the parameter values
# that maximize the likelihood function for the observed data.

# The code first reads in a CSV file using Pandas. It then creates two lists, x and y, that correspond to
# the 'default' and 'fico_score' columns in the data, respectively. These lists are then used to calculate the
# cumulative default and total counts for each FICO score in the data.

# The log-likelihood function is defined to calculate the likelihood of a given set of parameters. The likelihood
# function is used to calculate the probability of observing the data given the parameter values. The code then initializes
# a three-dimensional array, dp, that is used to store the calculated log-likelihood values for different sets of observations.
# The first dimension represents the number of buckets used, the second dimension represents the FICO score (offset by 300)
# at which the last bucket ends, and the third dimension holds the best log-likelihood and the index of the previous bucket boundary.

# Finally, the code reads the maximum log-likelihood for the given data set from the dp array.
# It then prints that value and outputs the FICO score boundaries of the buckets, recovered by backtracking through dp.