In [1]:
import pandas as pd
import numpy as np
import plotly.express as px

2023-03-17 17:18:41.007743: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than chunk by chunk, which avoids the "Columns (9) have mixed types"
# DtypeWarning that the default chunked parser raises for this CSV.
data = pd.read_csv("SBAnational.csv", low_memory=False)

Columns (9) have mixed types. Specify dtype option on import or set low_memory=False.


In [3]:
def get_frequent(x, thr=0.005):
    """Return the values of ``x`` whose relative frequency is at least ``thr``.

    Parameters
    ----------
    x : pandas.Series
        Values to count. Missing values are ignored, because
        ``value_counts`` drops them by default.
    thr : float, default 0.005
        Minimum share (between 0 and 1) a value needs to be kept.

    Returns
    -------
    pandas.Index
        The retained values, ordered from most to least frequent.
    """
    shares = x.value_counts(normalize=True)
    is_frequent = shares.ge(thr)
    return shares.loc[is_frequent].index


def plot_numeric_boxplots(data, target, feature):
    """Display a box plot of ``feature`` grouped by ``target``.

    Parameters
    ----------
    data : DataFrame-like
        Table passed straight to ``px.box``.
    target : str
        Column placed on the x axis (one box per distinct value).
    feature : str
        Column placed on the y axis.
    """
    box_kwargs = {"x": target, "y": feature}
    px.box(data, **box_kwargs).show()


def plot_category_props(data, x, target):
    """Display a bar chart of the mean of ``target`` for each level of ``x``.

    Parameters
    ----------
    data : pandas.DataFrame
        Table containing both columns.
    x : str
        Column to group by; its levels become the bars.
    target : str
        Column that is averaged within each group.
    """
    mean_by_level = data.groupby(x)[target].mean()
    axis_labels = {"x": x, "y": target}
    figure = px.bar(
        x=mean_by_level.index,
        y=mean_by_level.values,
        labels=axis_labels,
    )
    figure.show()

In [4]:
# A loan is flagged as defaulted exactly when a charge-off date is recorded.
data['is_default'] = data['ChgOffDate'].notna()

## Feature Cleaning

In [5]:
# Collapse rare cities into a single 'Other' level; only values that pass
# get_frequent's default frequency threshold are kept (missing values also
# end up as 'Other').
frequent_city = get_frequent(data['City'])
city_is_frequent = data['City'].isin(frequent_city)
data['City'] = data['City'].where(city_is_frequent, 'Other')

# Same treatment for the lending bank.
frequent_banks = get_frequent(data['Bank'])
bank_is_frequent = data['Bank'].isin(frequent_banks)
data['Bank'] = data['Bank'].where(bank_is_frequent, 'Other')


In [6]:
# Treat franchise codes as text labels, then keep only the frequent ones and
# collapse everything else into 'Other'.
franchise_as_text = data["FranchiseCode"].astype(str)
frequent_fr_code = get_frequent(franchise_as_text)
data["FranchiseCode"] = franchise_as_text.where(
    franchise_as_text.isin(frequent_fr_code), "Other"
)


In [7]:
# Both flags are reduced to three levels: "Y", 'N', and 'Other' for anything
# else (including missing values).
for flag_column in ('RevLineCr', 'LowDoc'):
    is_yes_or_no = data[flag_column].isin(["Y", 'N'])
    data[flag_column] = data[flag_column].where(is_yes_or_no, 'Other')

In [8]:
# Parse currency strings into floats.
# Only the dollar sign and the thousands separators are stripped. The decimal
# point must stay: removing it (as the previous version did) turns a value
# such as "$60,000.00" into 6000000.0, i.e. 100x too large.
# NOTE(review): assumes US-style formatting ("," for thousands, "." for
# decimals) — confirm against the raw file.
for money_column in ('GrAppv', 'SBA_Appv'):
    data[money_column] = (
        data[money_column]
        .str.replace(r'[$,]', '', regex=True)
        .str.strip()  # tolerate padding around the amount
        .astype(float)
    )

In [9]:
# Show how often each NewExist code occurs (value_counts leaves out missing
# values by default).
data['NewExist'].value_counts()

1.0    644869
2.0    253125
0.0      1034
Name: NewExist, dtype: int64

In [10]:
# Boolean flag: True only where NewExist equals 2; every other code, and
# missing values, become False.
data['is_new'] = data['NewExist'].eq(2)

## Feature Engineering

In [11]:
# True when the borrower's state matches the bank's state. Rows where either
# side is missing compare as False.
data['same_state'] = data['State'].eq(data['BankState'])

In [12]:
import pgeocode

# Convert ZIP codes to text once and zero-pad them to five characters.
# When the column is read as integers, codes with leading zeros are shortened
# (e.g. 2134 instead of "02134") and would not match any postal code in the
# lookup, leaving longitude/latitude empty for those rows.
# NOTE(review): assumes `Zip` holds integers or digit-only strings — confirm
# the dtype produced by read_csv.
zip_as_text = data['Zip'].astype(str).str.zfill(5)

# Query each distinct code only once, then broadcast the result back by mapping.
zip_codes = zip_as_text.unique()
nomi = pgeocode.Nominatim('us')
zip_aug = nomi.query_postal_code(zip_codes)

zip_long_map = dict(zip(zip_aug['postal_code'].values, zip_aug['longitude'].values))
zip_lat_map = dict(zip(zip_aug['postal_code'].values, zip_aug['latitude'].values))

data['longitude'] = zip_as_text.map(zip_long_map)
data['latitude'] = zip_as_text.map(zip_lat_map)

## Feature Selection

In [13]:
# Columns treated as numeric model inputs.
NUMERIC_FEATURES = [
    "Term",
    "NoEmp",
    "CreateJob",
    "RetainedJob",
    "longitude",
    "latitude",
    "GrAppv",
    "SBA_Appv",
]

# Columns treated as categorical model inputs.
# NOTE(review): `naics_first_two` is not created by any cell above this one in
# the notebook — confirm it exists before these lists are used to select columns.
# NOTE(review): `LowDoc` is cleaned earlier but is not listed here — confirm
# that leaving it out is intentional.
CATEGORICAL_FEATURES = [
    "is_new",
    "FranchiseCode",
    "UrbanRural",
    "City",
    "State",
    "Bank",
    "BankState",
    "RevLineCr",
    "naics_first_two",
    "same_state",
]

# Name of the label column.
TARGET = "is_default"


In [14]:
# `naics_first_two` is listed in CATEGORICAL_FEATURES, but no earlier cell in
# this notebook creates it, so a fresh "Restart & Run All" would fail with a
# KeyError on the selection below. Derive it here when it is missing.
# NOTE(review): assumes the raw data has a `NAICS` column and that the feature
# is the first two characters of that code — confirm against whatever
# originally produced this column.
if "naics_first_two" not in data.columns:
    data["naics_first_two"] = data["NAICS"].astype(str).str[:2]

# Keep the approval year (used later for the time-based split), the model
# features and the target.
clean_data = data[['ApprovalFY'] + NUMERIC_FEATURES + CATEGORICAL_FEATURES + [TARGET]]

In [15]:
# Drop the rows whose fiscal year is the non-numeric label '1976A' so the
# column can be cast to int. The explicit .copy() makes the filtered frame an
# independent object, so the column assignment below does not trigger pandas'
# SettingWithCopyWarning.
clean_data = clean_data[clean_data['ApprovalFY'] != '1976A'].copy()
clean_data['ApprovalFY'] = clean_data['ApprovalFY'].astype(int)
clean_data.to_parquet("loan_data_clean.parquet")

## Data Split

In [16]:
# Time-based hold-out: rows whose approval year lies above the 0.90 quantile
# of ApprovalFY form the test set; everything else stays in train_data.
approval_year = clean_data['ApprovalFY']
test_thr = np.quantile(approval_year, 0.90)
train_data = clean_data.loc[approval_year.le(test_thr)]
test_data = clean_data.loc[approval_year.gt(test_thr)]

In [17]:
# Carve the validation set out of the remaining training rows with the same
# rule: approval years above the 0.90 quantile go to val_data.
# NOTE: this cell overwrites train_data, so running it a second time without
# re-running the previous cell shrinks the training set again.
train_year = train_data['ApprovalFY']
val_thr = np.quantile(train_year, 0.90)
val_data, train_data = (
    train_data.loc[train_year.gt(val_thr)],
    train_data.loc[train_year.le(val_thr)],
)

In [104]:
# (rows, columns) of the train, validation and test partitions, in that order.
train_data.shape, val_data.shape, test_data.shape

((802301, 20), (39540, 20), (57305, 20))

In [19]:
# Persist each partition to its own parquet file, without the DataFrame index.
split_outputs = {
    'train_data.parquet': train_data,
    'val_data.parquet': val_data,
    'test_data.parquet': test_data,
}
for parquet_path, split_frame in split_outputs.items():
    split_frame.to_parquet(parquet_path, index=False)

[data split](#)