<a href="https://colab.research.google.com/github/caiobaptistaa/Credit-Risk/blob/main/Predict.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# data manipulation and plotting
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import log_loss
from sklearn.metrics import f1_score

import joblib

from sklearn.ensemble import GradientBoostingClassifier

import string

# from Scikit-learn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA

# from feature-engine
from feature_engine.imputation import (
    MeanMedianImputer,
    CategoricalImputer,
)

from feature_engine.encoding import (
    RareLabelEncoder,
    OrdinalEncoder,
    OneHotEncoder,
    CountFrequencyEncoder,
)

from feature_engine.outliers import (
    ArbitraryOutlierCapper,

)

from feature_engine.transformation import (
        YeoJohnsonTransformer,
)


from feature_engine.selection import DropFeatures
from feature_engine.wrappers import SklearnTransformerWrapper

from sklearn.compose import ColumnTransformer

# to visualise al the columns in the dataframe
pd.pandas.set_option('display.max_columns', None)

In [2]:
from google.colab import drive
drive.mount('/content/drive')

data = pd.read_csv('/content/drive/MyDrive/SBAnational.csv', low_memory= False)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
X_train, X_rest, y_train, y_rest = train_test_split(
    data.drop(['LoanNr_ChkDgt', 'Name', 'State', 'BankState',
               'ApprovalDate', 'FranchiseCode', 'ChgOffDate',
               'DisbursementDate', 'BalanceGross', 'ChgOffPrinGr',
               'MIS_Status'
              ], axis=1), # predictive variables
    data["MIS_Status"], # target
    test_size=0.3, # portion of dataset to allocate to test set
    random_state=0, # we are setting the seed here
)

X_val, X_test, y_val, y_test = train_test_split(X_rest, y_rest, test_size = 0.5, random_state = 0)


In [4]:
target = ['MIS_Status']

features = ["City", "Zip", "Bank", "NAICS", "ApprovalFY",
            "Term", "NoEmp", "NewExist", "CreateJob",  "RetainedJob",
            "UrbanRural", "RevLineCr", "LowDoc", "DisbursementGross",
           "GrAppv", "SBA_Appv"]


cat_vars = ['City', 'Zip', 'Bank', 'NAICS']

cat_vars_with_na = ["Bank", "City"]

binary_vars = ['NewExist', 'UrbanRural', 'RevLineCr', 'LowDoc']

binary_miss = ["RevLineCr"]

binary_freq = ["NewExist", "LowDoc"]

num_vars = ["DisbursementGross", "GrAppv", "SBA_Appv"]

disc_vars = ["NoEmp", "CreateJob", "RetainedJob", "Term"]

temp_vars = ["ApprovalFY"]

#### Pipe for Target

In [5]:
y_test = pd.DataFrame(y_test)

In [6]:
sba_pipe_y = Pipeline([
    
    ###################### Missings and Special Characters ######################
    
    # Imputation Categorical - target #
    
    ('frequent_imputation - target', CategoricalImputer(
        imputation_method='frequent', variables = target)),
    
    # Imputation Categorical - features #
    
    ('Binary Imputation', OneHotEncoder(
        variables = "MIS_Status", drop_last = True)),
])

In [7]:
y_test = sba_pipe_y.fit_transform(y_test)

In [8]:
y_test.isnull().sum()

MIS_Status_P I F    0
dtype: int64

In [9]:
y_test.head()

Unnamed: 0,MIS_Status_P I F
864967,1
695944,1
776331,1
208860,1
546687,1


In [10]:
pca = PCA()

In [11]:
X_test.head()

Unnamed: 0,City,Zip,Bank,NAICS,ApprovalFY,Term,NoEmp,NewExist,CreateJob,RetainedJob,UrbanRural,RevLineCr,LowDoc,DisbursementGross,GrAppv,SBA_Appv
864967,ALVIN,77512,LOANS FROM OLD CLOSED LENDERS,0,1996,180,60,1.0,0,0,0,0,N,"$2,500,000.00","$2,500,000.00","$750,000.00"
695944,DEER PARK,77536,WELLS FARGO BANK NATL ASSOC,0,1994,180,1,2.0,0,0,0,N,N,"$105,000.00","$105,000.00","$94,500.00"
776331,SALINA,67401,BANK OF AMERICA NATL ASSOC,235110,1995,120,18,1.0,0,0,0,N,N,"$165,000.00","$165,000.00","$140,250.00"
208860,JASPER,97438,WELLS FARGO BANK NATL ASSOC,445120,2007,84,20,1.0,2,22,2,Y,N,"$313,623.00","$85,000.00","$42,500.00"
546687,ORLANDO,32825,OLD FLORIDA BANK,453220,2002,84,1,2.0,0,1,1,0,N,"$50,000.00","$50,000.00","$42,500.00"


### Predict

In [13]:
sba_pipe = joblib.load('sba_pipe.pkl')

In [14]:
y_pred = sba_pipe.predict(X_test)
y_prob = sba_pipe.predict_proba(X_test)

In [15]:
log_loss = log_loss(y_test, y_prob)

accuracy = accuracy_score(y_test, y_pred)

f1_score = f1_score(y_test, y_pred)

roc_auc_score = roc_auc_score(y_test, y_pred)

In [16]:
from tabulate import tabulate

tablefinish = [["Accuracy", accuracy],
               ["Logloss", log_loss],
                ["F1 test", f1_score], 
          ["AUC", roc_auc_score]]

head_fim = ["Scores"]

print(tabulate(tablefinish, headers = head_fim, tablefmt = "grid"))

+----------+----------+
|          |   Scores |
| Accuracy | 0.947173 |
+----------+----------+
| Logloss  | 0.138309 |
+----------+----------+
| F1 test  | 0.968183 |
+----------+----------+
| AUC      | 0.900755 |
+----------+----------+
