In [28]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report

from sklearn.preprocessing import FunctionTransformer, OneHotEncoder, StandardScaler, PolynomialFeatures
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestClassifier

In [29]:
SBA = pd.read_csv("SBA/SBAnational.csv", low_memory=False)
SBA.head()

Unnamed: 0,LoanNr_ChkDgt,Name,City,State,Zip,Bank,BankState,NAICS,ApprovalDate,ApprovalFY,...,RevLineCr,LowDoc,ChgOffDate,DisbursementDate,DisbursementGross,BalanceGross,MIS_Status,ChgOffPrinGr,GrAppv,SBA_Appv
0,1000014003,ABC HOBBYCRAFT,EVANSVILLE,IN,47711,FIFTH THIRD BANK,OH,451120,28-Feb-97,1997,...,N,Y,,28-Feb-99,"$60,000.00",$0.00,P I F,$0.00,"$60,000.00","$48,000.00"
1,1000024006,LANDMARK BAR & GRILLE (THE),NEW PARIS,IN,46526,1ST SOURCE BANK,IN,722410,28-Feb-97,1997,...,N,Y,,31-May-97,"$40,000.00",$0.00,P I F,$0.00,"$40,000.00","$32,000.00"
2,1000034009,"WHITLOCK DDS, TODD M.",BLOOMINGTON,IN,47401,GRANT COUNTY STATE BANK,IN,621210,28-Feb-97,1997,...,N,N,,31-Dec-97,"$287,000.00",$0.00,P I F,$0.00,"$287,000.00","$215,250.00"
3,1000044001,"BIG BUCKS PAWN & JEWELRY, LLC",BROKEN ARROW,OK,74012,1ST NATL BK & TR CO OF BROKEN,OK,0,28-Feb-97,1997,...,N,Y,,30-Jun-97,"$35,000.00",$0.00,P I F,$0.00,"$35,000.00","$28,000.00"
4,1000054004,"ANASTASIA CONFECTIONS, INC.",ORLANDO,FL,32801,FLORIDA BUS. DEVEL CORP,FL,0,28-Feb-97,1997,...,N,N,,14-May-97,"$229,000.00",$0.00,P I F,$0.00,"$229,000.00","$229,000.00"


In [75]:
features = [
    "SBA_Appv",
    "GrAppv",
    "DisbursementGross",
    "LowDoc",
    "UrbanRural",
    "FranchiseCode",
    "NewExist",
    "Term",
    "ApprovalFY",
    "NAICS",
    "State",
    "DisbursementDate"
]
outcome = ["MIS_Status"]

predictable = SBA[SBA["MIS_Status"].notnull()]
for feature in features:
    Spredictable = predictable[predictable[feature].notnull()]

approve_loan = pd.get_dummies(predictable["MIS_Status"], drop_first=True).rename(columns={"P I F": "Approve"})
predictable = pd.concat([predictable,approve_loan], axis=1)
predictable = predictable[(predictable["NewExist"] == 1.0 ) | (predictable["NewExist"] == 2.0) ]
predictable = predictable[(predictable["LowDoc"] == "Y" ) | (predictable["LowDoc"] == "N") ]
predictable = predictable[predictable["NoEmp"] <= 1500]
predictable = predictable[predictable["Term"] != 0]    
predictable["ApprovalFY"] = predictable["ApprovalFY"].apply(lambda x: 1976 if x == "1976A" else int(x)).astype(int)


(560548, 28)

In [53]:
def franchise_classifier(mat):
    rows, cols = mat.shape
    assert cols == 1  # if we don't have 2 columns, things are unexpected
    def classify(x):
        return 0  if x == 1 or x == 0 else 1

    if hasattr(mat, 'iloc'):
        return mat.iloc[:, 0].apply(classify).to_frame()
    else:
        return  mat[:, 1].apply(classify).reshape((rows, 1))

def naics_classifier(mat):
    rows, cols = mat.shape
    assert cols == 1
    def encode(x):
        mapping = {
            "0": "0",
            "31": "31-33",
            "32": "31-33",
            "33": "31-33",
            "44": "44-45",
            "48": "48-49",
            "49": "48-49",
        }
        x = str(x)
        x = "0" if x[0] == "0" else x[:2]
        return mapping.get(x, x)

    if hasattr(mat, "iloc"):
        return mat.iloc[:, 0].apply(encode).to_frame()
    else:
        return mat[:, 0].apply(encode).reshape((rows, 1))

def backed_by_real_estate(mat):
    rows, cols = mat.shape
    assert cols == 1
    def encode(x):
        return 0 if x < 240 else 1

    if hasattr(mat, "iloc"):
        return mat.iloc[:, 0].apply(encode).to_frame()
    else:
        return mat[:, 0].apply(encode).reshape((rows, 1))

def SBA_portion(mat):
    currency_cleaning = lambda x: int(float(x[1:].replace(",", "")))
    rows, cols = mat.shape
    assert cols == 2  # if we don't have 2 columns, things are unexpected
    
    if hasattr(mat, 'iloc'):
        mat.iloc[:, 0] = mat.iloc[:, 0].apply(currency_cleaning).astype(int)
        mat.iloc[:, 1] = mat.iloc[:, 1].apply(currency_cleaning).astype(int)
        res = mat.iloc[:, 0] / mat.iloc[:, 1]
        return res.to_frame()
    else:
        mat[:, 0] = mat[:, 0].apply(currency_cleaning).astype(int)
        mat[:, 1] = mat[:, 1].apply(currency_cleaning).astype(int)
        res = mat[:, 0] / mat[:, 1]
        return res.reshape((rows, 1))

def loan_active_during_recession(mat):
    rows, cols = mat.shape
    assert cols == 2  # if we don't have 2 columns, things are unexpected
    
    if hasattr(mat, 'iloc'):
        rec_date = pd.to_datetime(mat.iloc[:, 0]) + pd.to_timedelta(mat.iloc[:, 1]*30, unit="D")
        recession = pd.Series(0, index=mat.index)
        recession[(rec_date >= pd.to_datetime("2007-12-01")) & (rec_date <= pd.to_datetime("2009-06-30"))] = 1 
        return recession.to_frame()
    else:
        rec_date = pd.to_datetime(mat[:, 0]) + pd.to_timedelta(mat[:, 1]*30, unit="D")
        recession = pd.Series(0, index=mat.index)
        recession[(rec_date >= pd.to_datetime("2007-12-01")) & (rec_date <= pd.to_datetime("2009-06-30"))] = 1 
        return recession.to_frame()

def clean_currency(mat):
    rows, cols = mat.shape
    assert cols == 1  # if we don't have 2 columns, things are unexpected

    currency_cleaning = lambda x: int(float(x[1:].replace(",", "")))
    if hasattr(mat, 'iloc'):
        return mat.iloc[:, 0].apply(currency_cleaning).astype(int).to_frame()
    else:
        return  mat[:, 1].apply(currency_cleaning).astype(int).reshape((rows, 1))


disbrustment_gross_pipeline = Pipeline(
    steps=[
        ("clean_data", FunctionTransformer(clean_currency)), 
        ("scaler", StandardScaler())
        ]
)

naics_pipeline = Pipeline(
    steps=[
        ("naics_class", FunctionTransformer(naics_classifier)), 
        ("encoding", OneHotEncoder(sparse=False))
        ]
)

franchise_code_pipeline = Pipeline(
    steps=[
        ("franchise_class", FunctionTransformer(franchise_classifier)), 
        ("encoding", OneHotEncoder(sparse=False))
    ]
)


array([[1., 0.],
       [1., 0.],
       [1., 0.],
       ...,
       [1., 0.],
       [1., 0.],
       [1., 0.]])

In [None]:
train, test = train_test_split(predictable, test_size=.25, random_state=42)
train, tune = train_test_split(train, test_size=.1, random_state=42)

In [None]:
tf_pipe = ColumnTransformer(
    transformers=[
    ("naics", naics_pipeline, ["NAICS"]),
    ("urban", OneHotEncoder(sparse=False), ["UrbanRural"]),
    ("real_estate", FunctionTransformer(backed_by_real_estate), ["Term"]),
    ("sba_portion", FunctionTransformer(SBA_portion), ["SBA_Appv", "GrAppv"]),
    ("new", OneHotEncoder(sparse=False, drop="first"), ["NewExist"]),
    ("recession", FunctionTransformer(loan_active_during_recession), ["DisbursementDate", "Term"])
    ]
)
lm_pipe = Pipeline([
    ('columns', tf_pipe),
    ('model', LogisticRegression(penalty='none', max_iter=1000))

])

lm_pipe.fit(train[["UrbanRural", "NewExist", "Term", "SBA_Appv", "GrAppv", "DisbursementDate", "NAICS"]], train["Approve"])
predicted = lm_pipe.predict(tune[["UrbanRural", "NewExist", "Term", "SBA_Appv", "GrAppv", "DisbursementDate", "NAICS", "BankState"]])
print(classification_report(tune["Approve"], predicted))

In [None]:
confusion_matrix(tune["Approve"], predicted)

In [None]:
tf_pipe = ColumnTransformer(
    transformers=[
    ("naics", naics_pipeline, ["NAICS"]),
    ("real_estate", FunctionTransformer(backed_by_real_estate), ["Term"]),
    ("sba_portion", FunctionTransformer(SBA_portion), ["SBA_Appv", "GrAppv"]),
    ("recession", FunctionTransformer(loan_active_during_recession), ["DisbursementDate", "Term"]),
    ("gross_disbursment", disbrustment_gross_pipeline, ["DisbursementGross"])
    ]
)
lm_pipe = Pipeline([
    ('columns', tf_pipe),
    ('model', LogisticRegression(penalty='none', max_iter=1000))
])

lm_pipe.fit(train[["Term", "SBA_Appv", "GrAppv", "DisbursementDate", "NAICS", "DisbursementGross"]], train["Approve"])

predicted = lm_pipe.predict(tune[[ "Term", "SBA_Appv", "GrAppv", "DisbursementDate", "NAICS"]])
print(classification_report(tune["Approve"], predicted))

In [None]:
confusion_matrix(tune["Approve"], predicted)