In [None]:
import joblib
import beneath
import sys
import numpy as np
from category_encoders import TargetEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

### Read historical data from Beneath

In [None]:
df = await beneath.easy_read("epg/lending-club/loans-history")

### Create target variable

In [None]:
def make_binary(outcome: object) -> bool:
    """Return True when a loan status counts as a default.

    Args:
        outcome: The loan status value to classify.

    Returns:
        True if ``outcome`` equals 'Charged Off' or 'Default', otherwise
        False (including for missing values such as None or NaN).
    """
    # The membership test already yields a bool, so return it directly.
    return outcome in ('Charged Off', 'Default')

In [None]:
# Flag every loan as defaulted (True) or not (False) based on its status.
# make_binary is passed directly; wrapping it in a lambda adds nothing.
df['loan_status_binary'] = df['loan_status'].apply(make_binary)

Set the target

In [None]:
# Target: the binary default flag, kept as a single-column DataFrame.
# NOTE: this is captured before the null rows are dropped from `df` below,
# so it still contains the rows that the feature set will not have.
Y = df.loc[:, ['loan_status_binary']]

### Preprocess the input data

Drop rows where `dti` or `revol_util` is null

In [None]:
# Keep only the rows where both 'dti' and 'revol_util' are populated.
# A single dropna replaces the two `.isna() == False` filtering passes.
df = df.dropna(subset=['dti', 'revol_util'])

Set the input features

In [None]:
# Input features used by the model, taken from the null-filtered `df`.
# NOTE(review): LogisticRegression needs numeric inputs; confirm that every
# column below (in particular 'term') is numeric in this dataset.
X = df.loc[:, [
    'term',
    'int_rate',
    'loan_amount',
    'annual_inc',
    'acc_now_delinq',
    'dti',
    'fico_range_high',
    'open_acc',
    'pub_rec',
    'revol_util',
]]

### Train and score the model

In [None]:
# Y was captured before the null rows were dropped from df, while X was built
# afterwards, so Y can hold more rows than X. Select the target rows by X's
# index labels so features and labels line up row for row; otherwise
# train_test_split rejects the inputs for having different lengths.
# (Assumes the index labels are unique -- TODO confirm for this dataset.)
target = Y.loc[X.index]

# Split dataset into train and test (70% / 30%, fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(
    X, target, test_size=0.3, random_state=2020
)

# train model
# np.ravel flattens the single-column target to the 1-D shape that
# scikit-learn estimators expect, avoiding the column-vector warning.
clf = LogisticRegression(random_state=0).fit(X_train, np.ravel(y_train))

# predict
y_pred = clf.predict(X_test)

# score (mean accuracy on the held-out test set)
clf.score(X_test, np.ravel(y_test))

### Save the model in a file

In [None]:
# Persist the fitted classifier to 'model.pkl' in the working directory.
joblib.dump(clf, filename='model.pkl')

### Future work
This is a quick and dirty model. It can certainly be improved by:

- adding more features, including categorical ones
- downsampling the majority class for training to balance the two classes (default, no default)
- doing k-fold cross-validation so the evaluation doesn't depend on (and we don't tune to) any one train/test split
- using a custom cost function to predict defaults because, in this case, false negatives are worse than false positives
- using a more advanced ML model (random forest, xgboost, etc.)
- using more input data