In [None]:
import pandas as pd

# Lift pandas' display limits: None means "no cap" for both rows and columns.
for display_option in ("display.max_rows", "display.max_columns"):
    pd.set_option(display_option, None)

In [None]:
import os
import shutil

# Start from an empty output tree.
# ignore_errors=True lets the very first run succeed when the folder does not
# exist yet (a plain rmtree raises FileNotFoundError in that case).
# If the removal silently failed for another reason, makedirs below still
# raises FileExistsError, so the problem is not hidden.
shutil.rmtree("lending-club-data", ignore_errors=True)
# makedirs creates both the parent folder and the risk-engine sub-folder.
os.makedirs("lending-club-data/risk-engine")

# Original data

In [None]:
# Load the raw loan extract. Every column is read as text, except issue_d
# which is parsed into datetimes.
df = pd.read_csv(
    "source-data/loan.csv",
    dtype=str,
    parse_dates=["issue_d"],
)
df.shape

# Enriching original data

In [None]:
# Loan statuses that count as a "bad" outcome when labelling loans.
bad_loan = [
    "Charged Off",
    "Default",
    "Does not meet the credit policy. Status:Charged Off",
    "In Grace Period",
    "Late (16-30 days)",
    "Late (31-120 days)",
]


def classification(x):
    """Label a loan record: 1.0 if its ``loan_status`` is in ``bad_loan``, else 0.0."""
    is_bad = x["loan_status"] in bad_loan
    return float(is_bad)


import datetime


def def_date(x):
    """Derive a synthetic default date for a loan record.

    Records whose ``loan_status`` is not in ``bad_loan`` get an empty string.
    For the others, the loan ``id`` modulo the term length (the first two
    characters of the stripped ``term`` field) gives a month offset, and the
    result is ``issue_d`` plus four weeks per offset month.
    """
    term_months = int(x["term"].strip()[:2])
    if x["loan_status"] not in bad_loan:
        return ""
    # Deterministic stand-in for a random draw: the id modulo the term length.
    month_offset = float(x["id"]) % term_months
    return x["issue_d"] + datetime.timedelta(weeks=(month_offset * 4))


from dateutil.relativedelta import relativedelta


def maturity_date(x):
    """Return ``issue_d`` shifted forward by the loan term, in calendar months.

    The term length is the first two characters of the stripped ``term`` field.
    """
    term_months = int(x["term"].strip()[:2])
    return x["issue_d"] + relativedelta(months=term_months)

In [None]:
# Split the issue date into its calendar components.
issue_parts = df["issue_d"].dt
df["Opening Year"] = issue_parts.year
df["Opening Month"] = issue_parts.month
df["Opening Day"] = issue_parts.day

In [None]:
# Row-wise: issue date plus the loan term (see maturity_date).
df["maturity_date"] = df.apply(maturity_date, axis=1)

In [None]:
# Row-wise: 1.0 for a bad loan status, 0.0 otherwise (see classification).
df["loan_class"] = df.apply(classification, axis=1)

In [None]:
# Row-wise: synthetic default date for bad loans, "" for the rest (see def_date).
df["default_date"] = df.apply(def_date, axis=1)

# Computing historical PDs

In [None]:
import numpy as np

# Replace missing values with empty strings before grouping.
# NOTE(review): presumably done so loans with a missing grouping key still get
# their own bucket instead of being dropped by the pivot - confirm.
df = df.replace(np.nan, "")

# Historical default rate per (sub_grade, emp_length, home_ownership) bucket.
# The string "mean" is used instead of the np.mean callable, which recent
# pandas versions flag with a FutureWarning when passed as aggfunc.
opening_pds = (
    pd.pivot_table(
        df,
        index=["sub_grade", "emp_length", "home_ownership"],
        values=["loan_class"],
        aggfunc="mean",
    )
    .reset_index()
    .rename(columns={"loan_class": "Opening PD12"})
)

# Lifetime PD is the 12-month PD scaled by 1.2, capped at 1.0 so that a bucket
# with a 100% default rate does not end up with a "probability" of 1.2.
opening_pds["Opening PDLT"] = (opening_pds["Opening PD12"] * 1.2).clip(upper=1.0)
opening_pds.head(3)

# Generating risk reports

In [None]:
import random


def loan_matured(x, reporting_date):
    """Return True when ``reporting_date`` is strictly after the record's ``maturity_date``."""
    maturity = x["maturity_date"]
    return reporting_date > maturity


def just_issued(x, reporting_date):
    """Return True when the loan was issued exactly on ``reporting_date``.

    The loan frame in this notebook stores the issue date under ``issue_d``;
    looking up ``issue_date`` on such a row raises ``KeyError``. ``issue_d`` is
    therefore preferred, and ``issue_date`` is still honoured as a fallback for
    records that only carry that key.
    """
    issue_key = "issue_d" if "issue_d" in x else "issue_date"
    return x[issue_key] == reporting_date


def initial_stage(x):
    """Starting stage for a record: 1 when ``PD12`` is below 0.05, otherwise 2."""
    if x["PD12"] < 0.05:
        return 1
    return 2


def initial_pd(x):
    """Look up the opening 12-month PD for a loan's bucket in ``opening_pds``.

    The bucket is the row of ``opening_pds`` matching the record's
    ``sub_grade``, ``emp_length`` and ``home_ownership``; the first match is
    used. A value that is not strictly positive is replaced by 0.0001.
    """
    same_bucket = (
        opening_pds["sub_grade"].eq(x["sub_grade"])
        & opening_pds["emp_length"].eq(x["emp_length"])
        & opening_pds["home_ownership"].eq(x["home_ownership"])
    )
    bucket_pd = opening_pds.loc[same_bucket, "Opening PD12"].iloc[0]
    if bucket_pd > 0:
        return bucket_pd
    return 0.0001


def initial_pdlt(x):
    """Look up the opening lifetime PD for a loan's bucket in ``opening_pds``.

    The bucket is the row of ``opening_pds`` matching the record's
    ``sub_grade``, ``emp_length`` and ``home_ownership``; the first match is used.
    """
    same_bucket = (
        opening_pds["sub_grade"].eq(x["sub_grade"])
        & opening_pds["emp_length"].eq(x["emp_length"])
        & opening_pds["home_ownership"].eq(x["home_ownership"])
    )
    return opening_pds.loc[same_bucket, "Opening PDLT"].iloc[0]


def pd_one(x, reporting_date):
    """Return the 12-month PD of a record for the current reporting date.

    A record flagged ``just_issued`` gets its bucket's opening PD from
    ``initial_pd``. Any other record gets its ``Previous PD12`` multiplied by a
    Gaussian shock (mean 1, standard deviation 0.2), bounded to [0.001, 0.9].
    ``reporting_date`` is accepted but not used.
    """
    if x["just_issued"]:
        return initial_pd(x)
    shocked = x["Previous PD12"] * random.gauss(1, 0.2)
    return max(min(shocked, 0.9), 0.001)


def pd_lt(x, reporting_date):
    """Lifetime PD: the record's ``PD12`` times 1.2, capped at 0.9.

    ``reporting_date`` is accepted but not used.
    """
    uplifted = x["PD12"] * 1.2
    return 0.9 if 0.9 < uplifted else uplifted


def stage(x, reporting_date):
    """Return the stage of a record at ``reporting_date``.

    Starting from ``Previous Stage``:
    * if ``PD12`` is more than 1.7 times ``Opening PD12``, the stage becomes
      ``min(2, previous + 1)``;
    * if ``default_date`` is a date and ``reporting_date`` has reached it, or
      if ``PD12`` is above 0.7, the stage becomes 3.
    """
    current_pd = x["PD12"]
    opening_pd = x["Opening PD12"]
    result = x["Previous Stage"]

    # Deterioration versus the opening PD. Because of the min(), a previous
    # stage of 3 comes out of this rule as 2 unless a stage-3 rule below fires.
    if current_pd / opening_pd > 1.7:
        result = min(2, result + 1)

    # default_date is "" for loans without a default, so the type check
    # guards the date comparison.
    default_date = x["default_date"]
    defaulted = isinstance(default_date, datetime.date) and reporting_date >= default_date
    if defaulted or current_pd > 0.7:
        result = 3
    return result


def dayspastdue(x, reporting_date):
    """Days elapsed between the record's ``default_date`` and ``reporting_date``.

    Returns an empty string when ``default_date`` is not a date or when the
    reporting date is still before it.
    """
    default_date = x["default_date"]
    if not isinstance(default_date, datetime.date):
        return ""
    if not (reporting_date >= default_date):
        return ""
    return (reporting_date - default_date).days

# Risk data reports

In [None]:
# Keep the full enriched frame, then continue with a sample of it.
# random_state pins the draw so a re-run selects the same loans, and the min()
# guard avoids the ValueError that sample() raises when the frame holds fewer
# than 15000 rows.
df_copy = df.copy()
df = df_copy.sample(n=min(15000, len(df_copy)), random_state=42)

In [None]:
import pandas as pd

# Portfolio carried from one reporting date to the next; starts empty.
temp = pd.DataFrame()
# One reporting date per distinct issue date, in ascending order.
reporting_dates = sorted(set(df["issue_d"]))


def diff_month(d1, d2):
    """Number of calendar months from ``d2`` to ``d1``; the day of month is ignored."""
    years_apart = d1.year - d2.year
    months_apart = d1.month - d2.month
    return years_apart * 12 + months_apart


# Walk the reporting dates in ascending order. Each pass re-rates the loans
# carried over from the previous date, adds the loans issued on this date,
# drops matured loans and writes one risk report CSV named after the date.
for rd in reporting_dates:
    # Old: loans already in the portfolio from earlier reporting dates.
    temp = temp.copy()
    if not temp.empty:
        # Copy (rather than rename) the current metrics into the "Previous ..."
        # columns, so the current columns stay in place to be overwritten below.
        temp["Previous PD12"] = temp["PD12"]
        temp["Previous PDLT"] = temp["PDLT"]
        temp["Previous EAD"] = temp["EAD"]
        temp["Previous LGD"] = temp["LGD"]
        temp["Previous Stage"] = temp["Stage"]
        # With just_issued False, pd_one shocks the previous PD12 instead of
        # looking up the opening PD.
        temp["just_issued"] = False
        temp["PD12"] = temp.apply(lambda x: pd_one(x, rd), axis=1)
        temp["PDLT"] = temp.apply(lambda x: pd_lt(x, rd), axis=1)
        # NOTE(review): random.choice and random.gauss are each called once per
        # reporting date, so every carried-over loan receives the same LGD and
        # the same EAD multiplier for that date - confirm per-loan draws were
        # not intended.
        temp["LGD"] = random.choice([0.6, 0.7, 0.8, 0.81])
        temp["EAD"] = temp["Previous EAD"] * random.gauss(1, 0.05)
        temp["Stage"] = temp.apply(lambda x: stage(x, rd), axis=1)

    # New: loans issued on this reporting date enter with their opening values.
    sub = df[df.issue_d == rd].copy()
    sub["just_issued"] = True
    sub["Opening PD12"] = sub.apply(lambda x: initial_pd(x), axis=1)
    sub["Opening PDLT"] = sub.apply(lambda x: initial_pdlt(x), axis=1)
    sub["PD12"] = sub["Opening PD12"]
    sub["PDLT"] = sub["Opening PDLT"]
    # loan_amnt was read as text, hence the cast.
    sub["EAD"] = sub["loan_amnt"].astype(float)
    sub["LGD"] = 0.8

    sub["Stage"] = sub.apply(lambda x: initial_stage(x), axis=1)

    # On the issue date the "Previous ..." values equal the current ones.
    sub["Previous PD12"] = sub["PD12"]
    sub["Previous PDLT"] = sub["PDLT"]
    sub["Previous EAD"] = sub["EAD"]
    sub["Previous LGD"] = sub["LGD"]
    sub["Previous Stage"] = sub["Stage"]

    # Carried-over and newly issued loans form the portfolio for this date.
    temp = pd.concat([temp, sub], ignore_index=True)

    temp["DaysPastDue"] = temp.apply(lambda x: dayspastdue(x, rd), axis=1)
    temp["Reporting Date"] = rd.strftime("%Y-%m-%d")

    temp["Months Since Inception"] = temp.apply(
        lambda x: diff_month(rd, x["issue_d"]), axis=1
    )
    # Loans whose maturity date is before this reporting date leave the
    # portfolio: they are neither reported now nor carried forward.
    temp["loan_matured"] = temp.apply(lambda x: loan_matured(x, rd), axis=1)
    temp = temp[temp.loan_matured == False].copy()
    print(rd, temp.shape[0])

    # Write this date's report; the file is named after the reporting date.
    temp[
        [
            "id",
            "PD12",
            "PDLT",
            "EAD",
            "LGD",
            "Stage",
            "Previous PD12",
            "Previous PDLT",
            "Previous EAD",
            "Previous LGD",
            "Previous Stage",
            "Reporting Date",
            "DaysPastDue",
            "Months Since Inception",
        ]
    ].to_csv(
        "lending-club-data/risk-engine/" + rd.strftime("%Y-%m-%d") + ".csv", index=False
    )

# Static data

In [None]:
# Attach the bucket-level opening PDs to each loan; the left join keeps every
# loan row even if its bucket has no match.
static_keys = ["sub_grade", "emp_length", "home_ownership"]
df_c = df[["id", "default_date"] + static_keys].merge(
    opening_pds,
    on=static_keys,
    how="left",
)

In [None]:
# Per-loan static attributes: id, synthetic default date and opening PDs.
static_columns = ["id", "default_date", "Opening PD12", "Opening PDLT"]
df_c[static_columns].to_csv("lending-club-data/static.csv", index=False)

In [None]:
# Columns written to loans.csv: the source loan fields followed by the derived
# opening-date parts and the maturity date.
loan_columns = [
    "id",
    "member_id",
    "loan_amnt",
    "funded_amnt",
    "funded_amnt_inv",
    "term",
    "int_rate",
    "installment",
    "grade",
    "sub_grade",
    "emp_title",
    "emp_length",
    "home_ownership",
    "annual_inc",
    "verification_status",
    "issue_d",
    "loan_status",
    "pymnt_plan",
    "url",
    "desc",
    "purpose",
    "title",
    "zip_code",
    "addr_state",
    "dti",
    "delinq_2yrs",
    "earliest_cr_line",
    "inq_last_6mths",
    "mths_since_last_delinq",
    "mths_since_last_record",
    "open_acc",
    "pub_rec",
    "revol_bal",
    "revol_util",
    "total_acc",
    "initial_list_status",
    "out_prncp",
    "out_prncp_inv",
    "total_pymnt",
    "total_pymnt_inv",
    "total_rec_prncp",
    "total_rec_int",
    "total_rec_late_fee",
    "recoveries",
    "collection_recovery_fee",
    "last_pymnt_d",
    "last_pymnt_amnt",
    "next_pymnt_d",
    "last_credit_pull_d",
    "collections_12_mths_ex_med",
    "mths_since_last_major_derog",
    "policy_code",
    "application_type",
    "annual_inc_joint",
    "dti_joint",
    "verification_status_joint",
    "acc_now_delinq",
    "tot_coll_amt",
    "tot_cur_bal",
    "open_acc_6m",
    "open_il_6m",
    "open_il_12m",
    "open_il_24m",
    "mths_since_rcnt_il",
    "total_bal_il",
    "il_util",
    "open_rv_12m",
    "open_rv_24m",
    "max_bal_bc",
    "all_util",
    "total_rev_hi_lim",
    "inq_fi",
    "total_cu_tl",
    "inq_last_12m",
    "Opening Year",
    "Opening Month",
    "Opening Day",
    "maturity_date",
]
df[loan_columns].to_csv("lending-club-data/loans.csv", index=False)

# Zipping

In [4]:
# Archive loans.csv into loans.zip (skipping notebook checkpoint files), then
# delete the uncompressed CSV.
!cd lending-club-data && zip -r "loans.zip" "loans.csv" -x "*.ipynb_checkpoints*"
!cd lending-club-data && rm loans.csv

  adding: loans.csv (deflated 75%)


In [5]:
# Archive static.csv into static.zip (skipping notebook checkpoint files), then
# delete the uncompressed CSV.
!cd lending-club-data && zip -r "static.zip" "static.csv" -x "*.ipynb_checkpoints*"
!cd lending-club-data && rm static.csv

  adding: static.csv (deflated 74%)


In [6]:
# Archive the risk-engine folder quietly (-q) into risk-engine.zip, skipping
# notebook checkpoint files, then remove the folder itself.
!cd lending-club-data && zip -r -q "risk-engine.zip" "risk-engine/" -x "*.ipynb_checkpoints*"
!cd lending-club-data && rm -r -f risk-engine/

In [None]:
# Extract risk-engine.zip again; piping `yes` into unzip answers any
# interactive prompt it raises.
!cd lending-club-data && yes | unzip risk-engine.zip

In [None]:
# !cd lending-club-data && rm -rf $(find . -type d -name .ipynb_checkpoints)

In [3]:
# List the extracted risk reports, including hidden entries.
!cd lending-club-data/risk-engine && ls -a

[1m[36m.[m[m              2009-05-01.csv 2011-02-01.csv 2012-11-01.csv 2014-08-01.csv
[1m[36m..[m[m             2009-06-01.csv 2011-03-01.csv 2012-12-01.csv 2014-09-01.csv
2007-06-01.csv 2009-07-01.csv 2011-04-01.csv 2013-01-01.csv 2014-10-01.csv
2007-08-01.csv 2009-08-01.csv 2011-05-01.csv 2013-02-01.csv 2014-11-01.csv
2007-09-01.csv 2009-09-01.csv 2011-06-01.csv 2013-03-01.csv 2014-12-01.csv
2007-10-01.csv 2009-10-01.csv 2011-07-01.csv 2013-04-01.csv 2015-01-01.csv
2007-11-01.csv 2009-11-01.csv 2011-08-01.csv 2013-05-01.csv 2015-02-01.csv
2007-12-01.csv 2009-12-01.csv 2011-09-01.csv 2013-06-01.csv 2015-03-01.csv
2008-01-01.csv 2010-01-01.csv 2011-10-01.csv 2013-07-01.csv 2015-04-01.csv
2008-02-01.csv 2010-02-01.csv 2011-11-01.csv 2013-08-01.csv 2015-05-01.csv
2008-03-01.csv 2010-03-01.csv 2011-12-01.csv 2013-09-01.csv 2015-06-01.csv
2008-04-01.csv 2010-04-01.csv 2012-01-01.csv 2013-10-01.csv 2015-07-01.csv
2008-06-01.csv 2010-05-01.csv 2012-02-01.csv 2013-11-01.csv 2015-08-01

In [None]:
# Long listing of the output folder, oldest entries first (-t with -r).
!cd lending-club-data && ls -lrt

In [None]:
# NOTE(review): ./bitvolution is not created anywhere in the cells shown here;
# confirm whether this cell belongs in this notebook.
!zip -r bitvolution.zip ./bitvolution