In [209]:
import pandas as pd
import numpy as np
from collections import Counter

In [210]:
loans = pd.read_csv('./accepted_2007_to_2018Q4.csv')

  loans = pd.read_csv('./accepted_2007_to_2018Q4.csv')


In [211]:
status_counts = Counter(loans['loan_status'])
defaulted_loans_labels = ['Charged Off', 'Default', 'Does not meet the credit policy. Status:Charged Off']
n_loan_defaults = sum([status_counts[label] for label in defaulted_loans_labels])
n_paid_loans = status_counts['Fully Paid']
print(f"Loan defaults: {n_loan_defaults}")
print(f"Fully paid loans: {n_paid_loans}")

Loan defaults: 269360
Fully paid loans: 1076751


In [212]:
valid_labels = defaulted_loans_labels + ['Fully Paid']

default_loans = loans[loans['loan_status'].isin(defaulted_loans_labels)]
paid_loans = loans[loans['loan_status'] == 'Fully Paid'].sample(n=len(default_loans))

loans = pd.concat([paid_loans, default_loans])
loans = loans.sample(frac=1)

#### Filter out unavailable and unused features

In [213]:
unavailable_features = [
    'chargeoff_within_12_mths',
    'collection_recovery_fee',
    'dti',
    'dti_joint',
    'funded_amnt',
    'funded_amnt_inv',
    'initial_list_status',
    'issue_d',
    'last_credit_pull_d',
    'last_fico_range_high',
    'last_fico_range_low',
    'last_pymnt_amnt',
    'last_pymnt_d',
    'next_pymnt_d',
    'out_prncp',
    'out_prncp_inv',
    'pymnt_plan',
    'recoveries',
    'total_pymnt',
    'total_pymnt_inv',
    'total_rec_int',
    'total_rec_late_fee',
    'total_rec_prncp',
    'hardship_flag',
    'hardship_type',
    'hardship_reason',
    'hardship_status',
    'deferral_term',
    'hardship_amount',
    'hardship_start_date',
    'hardship_end_date',
    'payment_plan_start_date',
    'hardship_length',
    'hardship_dpd',
    'hardship_loan_status',
    'orig_projected_additional_accrued_interest',
    'hardship_payoff_balance_amount',
    'hardship_last_payment_amount',
    'debt_settlement_flag',
    'debt_settlement_flag_date',
    'settlement_status',
    'settlement_date',
    'settlement_amount',
    'settlement_percentage',
    'settlement_term',
]

print(f'Unavailable features: {len(unavailable_features)} / {len(loans.columns)}')

Unavailable features: 45 / 151


In [214]:
unused_features = [
    'addr_state',
    'desc',
    'grade',
    'emp_title',
    'id',
    'member_id',
    'policy_code',
    'sub_grade',
    'title',
    'url',
    'zip_code',
]

print(f'Unused features: {len(unused_features)} / {len(loans.columns)}')

Unused features: 11 / 151


In [215]:
features_to_drop = unavailable_features + unused_features
loans = loans.drop(columns=features_to_drop)
print(f'Remaining features: {len(loans.columns)}')

Remaining features: 95


#### Investigate remaining features

In [216]:
float_cols = loans.select_dtypes(include='float64')
object_cols = loans.select_dtypes(include='object')

print(f'Numerical features: {len(float_cols.columns)}')
print(f'Categorical features: {len(object_cols.columns)}')

possible_values = {}
for col in object_cols.columns:
    possible_values[col] = list(pd.unique(object_cols[col]))

n_na = 0
for key, val in possible_values.items():
    has_na_vals = any(pd.isna(val))
    print(f'{key} -> {val[:10]}, {len(val)}, {has_na_vals}')
    if has_na_vals:
        n_na += 1

print("Number of features with NA:", n_na) 

Numerical features: 84
Categorical features: 11
term -> [' 36 months', ' 60 months'], 2, False
emp_length -> [nan, '< 1 year', '1 year', '7 years', '10+ years', '6 years', '2 years', '8 years', '5 years', '9 years'], 12, True
home_ownership -> ['MORTGAGE', 'OWN', 'RENT', 'NONE', 'ANY', 'OTHER'], 6, False
verification_status -> ['Verified', 'Source Verified', 'Not Verified'], 3, False
loan_status -> ['Fully Paid', 'Charged Off', 'Does not meet the credit policy. Status:Charged Off', 'Default'], 4, False
purpose -> ['debt_consolidation', 'credit_card', 'home_improvement', 'other', 'car', 'moving', 'medical', 'vacation', 'house', 'renewable_energy'], 14, False
earliest_cr_line -> ['Oct-1993', 'Aug-2012', 'May-1979', 'Sep-2004', 'Feb-2006', 'Dec-1992', 'Nov-1994', 'Mar-2005', 'Aug-2000', 'Jan-1996'], 715, True
application_type -> ['Individual', 'Joint App'], 2, False
verification_status_joint -> [nan, 'Not Verified', 'Verified', 'Source Verified'], 4, True
sec_app_earliest_cr_line -> [nan,

In [217]:
individual_loans = loans_without_irrelevant_features[loans_without_irrelevant_features['application_type'] == 'Individual']
print("Individual loans:", len(individual_loans), ",", round(len(individual_loans) / len(loans_without_irrelevant_features), 2))
print("Joint loans:", len(loans_without_irrelevant_features) - len(individual_loans), ",", round((len(loans_without_irrelevant_features) - len(individual_loans)) / len(loans_without_irrelevant_features), 2))

Individual loans: 1320305 , 0.98
Joint loans: 25806 , 0.02


#### Convert text features into numerical ones

In [218]:
term_map = lambda x: 1 if x.strip() == '60 months' else 0
loans['term'] = loans['term'].map(term_map)

In [219]:
def get_employment_years(years_string):
    if type(years_string) is not str:
        return years_string

    if '<' in years_string:
        return 0
    
    if '10' in years_string:
        return 10
    
    return int(years_string[0])

loans['emp_length'] =  loans['emp_length'].map(get_employment_years)

In [220]:
home_ownership_map = lambda x: x.lower() if x not in ['OTHER', 'ANY', 'NONE'] else 'other'
loans['home_ownership'] = loans['home_ownership'].map(home_ownership_map)
categorical_home_ownership = pd.get_dummies(loans[['home_ownership']], dtype='float64')
loans = pd.concat([loans, categorical_home_ownership], axis=1)

In [221]:
# Maps date strings of the format Mon-Year to just the year, with NA vals preserved, e.g. Feb-2002 returns 2002.
map_datestr_to_year = lambda x: int(x[-4:]) if type(x) is str else x
loans['earliest_cr_line'] = loans['earliest_cr_line'].map(map_datestr_to_year)
loans['sec_app_earliest_cr_line'] = loans['sec_app_earliest_cr_line'].map(map_datestr_to_year)

In [222]:
categorical_verification_status = pd.get_dummies(loans[['verification_status']], dtype='float64')
categorical_purpose = pd.get_dummies(loans[['purpose']], dtype='float64')
categorical_joint_verification_status = pd.get_dummies(loans[['verification_status_joint']], dtype='float64', dummy_na=True)
loans = pd.concat([loans, categorical_verification_status, categorical_purpose, categorical_joint_verification_status], axis=1)

In [223]:
map_disbursement_method = lambda x: 1 if x == 'Cash' else 0
loans['disbursement_method'] = loans['disbursement_method'].map(map_disbursement_method)

In [224]:
map_application_type = lambda x: 1 if x == 'Individual' else 0
loans['application_type'] = loans['application_type'].map(map_disbursement_method)

In [225]:
map_loan_status = lambda x: 1 if x == 'Fully Paid' else 0
loans['loan_status'] = loans['loan_status'].map(map_loan_status)

In [226]:
for col in ['home_ownership', 'verification_status', 'purpose', 'verification_status_joint']:
    del loans[col]

#### Impute numerical features

In [227]:
from sklearn.impute import SimpleImputer

median_imputer = SimpleImputer(missing_values=np.nan, strategy='median')
imputed_loans = pd.DataFrame(median_imputer.fit_transform(loans), columns=loans.columns)

In [228]:
from sklearn.preprocessing import RobustScaler

robust_scaler = RobustScaler()
scaled_loans = pd.DataFrame(robust_scaler.fit_transform(imputed_loans), columns=loans.columns)

In [229]:
loans = scaled_loans
loans

Unnamed: 0,loan_amnt,term,int_rate,installment,emp_length,annual_inc,loan_status,delinq_2yrs,earliest_cr_line,fico_range_low,...,purpose_moving,purpose_other,purpose_renewable_energy,purpose_small_business,purpose_vacation,purpose_wedding,verification_status_joint_Not Verified,verification_status_joint_Source Verified,verification_status_joint_Verified,verification_status_joint_nan
0,0.504167,0.0,0.214397,0.827353,0.000000,0.800000,0.5,0.0,-0.777778,0.142857,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-0.870833,0.0,0.067293,-0.906851,-0.857143,-1.133333,-0.5,0.0,1.333333,0.714286,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-0.079167,1.0,0.658842,-0.246602,-0.714286,-0.066667,-0.5,2.0,-2.333333,0.285714,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.420833,1.0,0.566510,0.195540,0.142857,0.488889,-0.5,0.0,0.444444,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,-0.079167,0.0,0.259781,0.098254,0.142857,-0.288889,0.5,0.0,0.666667,-0.571429,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
538715,1.481250,0.0,0.522692,2.147638,-0.714286,1.044444,0.5,0.0,-0.444444,0.428571,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
538716,-0.370833,0.0,0.004695,-0.290799,0.000000,-0.803556,0.5,0.0,0.111111,-0.142857,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
538717,-0.245833,0.0,-0.658842,-0.197476,0.571429,0.822222,0.5,0.0,0.666667,0.142857,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
538718,-0.810417,0.0,-0.546166,-0.850101,-0.714286,-0.920000,-0.5,1.0,0.555556,0.571429,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Model fitting

In [230]:

X = loans.loc[:, loans.columns != 'loan_status']
y = loans['loan_status'].apply(lambda x: 0 if x == 0.5 else 1)

print(sorted(X.columns))

['acc_now_delinq', 'acc_open_past_24mths', 'all_util', 'annual_inc', 'annual_inc_joint', 'application_type', 'avg_cur_bal', 'bc_open_to_buy', 'bc_util', 'collections_12_mths_ex_med', 'delinq_2yrs', 'delinq_amnt', 'disbursement_method', 'earliest_cr_line', 'emp_length', 'fico_range_high', 'fico_range_low', 'home_ownership_mortgage', 'home_ownership_other', 'home_ownership_own', 'home_ownership_rent', 'il_util', 'inq_fi', 'inq_last_12m', 'inq_last_6mths', 'installment', 'int_rate', 'loan_amnt', 'max_bal_bc', 'mo_sin_old_il_acct', 'mo_sin_old_rev_tl_op', 'mo_sin_rcnt_rev_tl_op', 'mo_sin_rcnt_tl', 'mort_acc', 'mths_since_last_delinq', 'mths_since_last_major_derog', 'mths_since_last_record', 'mths_since_rcnt_il', 'mths_since_recent_bc', 'mths_since_recent_bc_dlq', 'mths_since_recent_inq', 'mths_since_recent_revol_delinq', 'num_accts_ever_120_pd', 'num_actv_bc_tl', 'num_actv_rev_tl', 'num_bc_sats', 'num_bc_tl', 'num_il_tl', 'num_op_rev_tl', 'num_rev_accts', 'num_rev_tl_bal_gt_0', 'num_sats',

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

# Example data
features = X.columns
for i in range(len(features)):
    for j in range(i + 1, len(features)):
        feature_1 = features[i]
        feature_2 = features[j]
        
        data = pd.DataFrame({
            feature_1: X[feature_1],
            feature_2: X[feature_2],
            "Label": y
        })
        
        # Scatter plot
        sns.scatterplot(
            data=data,
            x=feature_1,
            y=feature_2,
            hue="Label",
            palette="coolwarm"
        )
        plt.title("Feature Plot with Labels")
        plt.show()        


### Models

In [231]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [232]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = LogisticRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(accuracy)
print(conf_matrix)
print(report)

0.6011100386100386
[[31488 22499]
 [20479 33278]]
              precision    recall  f1-score   support

           0       0.61      0.58      0.59     53987
           1       0.60      0.62      0.61     53757

    accuracy                           0.60    107744
   macro avg       0.60      0.60      0.60    107744
weighted avg       0.60      0.60      0.60    107744



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [233]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = DecisionTreeClassifier()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(accuracy)
print(conf_matrix)
print(report)

0.570519008019008
[[30838 23149]
 [23125 30632]]
              precision    recall  f1-score   support

           0       0.57      0.57      0.57     53987
           1       0.57      0.57      0.57     53757

    accuracy                           0.57    107744
   macro avg       0.57      0.57      0.57    107744
weighted avg       0.57      0.57      0.57    107744



In [234]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = RandomForestClassifier()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(accuracy)
print(conf_matrix)
print(report)

0.6569739382239382
[[34983 19004]
 [17955 35802]]
              precision    recall  f1-score   support

           0       0.66      0.65      0.65     53987
           1       0.65      0.67      0.66     53757

    accuracy                           0.66    107744
   macro avg       0.66      0.66      0.66    107744
weighted avg       0.66      0.66      0.66    107744



In [235]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = AdaBoostClassifier()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(accuracy)
print(conf_matrix)
print(report)



0.6553033115533116
[[34489 19498]
 [17641 36116]]
              precision    recall  f1-score   support

           0       0.66      0.64      0.65     53987
           1       0.65      0.67      0.66     53757

    accuracy                           0.66    107744
   macro avg       0.66      0.66      0.66    107744
weighted avg       0.66      0.66      0.66    107744

