<a href="https://colab.research.google.com/github/caiobaptistaa/Credit-Risk/blob/main/Pipeline_Final_End_to_End.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Pipeline End-to-End

In [None]:
# data manipulation and plotting
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.neural_network import MLPClassifier
import feature_engine
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import log_loss
from sklearn.metrics import f1_score

from sklearn.neural_network import MLPClassifier

import string

# for saving the pipeline
import joblib

# from Scikit-learn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA

# from feature-engine
from feature_engine.imputation import (
    MeanMedianImputer,
    CategoricalImputer,
)

from feature_engine.encoding import (
    RareLabelEncoder,
    OrdinalEncoder,
    OneHotEncoder,
    CountFrequencyEncoder,
)

from feature_engine.outliers import (
    ArbitraryOutlierCapper,

)

from feature_engine.transformation import (
        YeoJohnsonTransformer,
)


from feature_engine.selection import DropFeatures
from feature_engine.wrappers import SklearnTransformerWrapper

from sklearn.compose import ColumnTransformer

# Show every column when a DataFrame is displayed (no column truncation).
# Uses the documented top-level `pd.set_option` API instead of reaching
# through the undocumented `pd.pandas` attribute.
pd.set_option('display.max_columns', None)

#### Load Dataset

In [None]:
# Mount Google Drive (Colab-only import) and read the SBA loans CSV from it.
from google.colab import drive
drive.mount('/content/drive')
# low_memory=False: parse the file in one pass so column dtypes are inferred
# consistently instead of chunk by chunk.
data = pd.read_csv('/content/drive/MyDrive/SBAnational.csv', low_memory= False)

# In-house transformers used later by the feature pipeline
# (BinaryAssign, SubsNum, SpecialCh, FunDec, CutDec, NPtoDF).
# NOTE(review): `preprocessors` must be importable from the working directory
# or sys.path -- confirm where the module lives before moving this import.
import preprocessors as pp

# rows and columns of the data
print(data.shape)

# visualise the first rows of the dataset
data.head()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
(899164, 27)


Unnamed: 0,LoanNr_ChkDgt,Name,City,State,Zip,Bank,BankState,NAICS,ApprovalDate,ApprovalFY,Term,NoEmp,NewExist,CreateJob,RetainedJob,FranchiseCode,UrbanRural,RevLineCr,LowDoc,ChgOffDate,DisbursementDate,DisbursementGross,BalanceGross,MIS_Status,ChgOffPrinGr,GrAppv,SBA_Appv
0,1000014003,ABC HOBBYCRAFT,EVANSVILLE,IN,47711,FIFTH THIRD BANK,OH,451120,28-Feb-97,1997,84,4,2.0,0,0,1,0,N,Y,,28-Feb-99,"$60,000.00",$0.00,P I F,$0.00,"$60,000.00","$48,000.00"
1,1000024006,LANDMARK BAR & GRILLE (THE),NEW PARIS,IN,46526,1ST SOURCE BANK,IN,722410,28-Feb-97,1997,60,2,2.0,0,0,1,0,N,Y,,31-May-97,"$40,000.00",$0.00,P I F,$0.00,"$40,000.00","$32,000.00"
2,1000034009,"WHITLOCK DDS, TODD M.",BLOOMINGTON,IN,47401,GRANT COUNTY STATE BANK,IN,621210,28-Feb-97,1997,180,7,1.0,0,0,1,0,N,N,,31-Dec-97,"$287,000.00",$0.00,P I F,$0.00,"$287,000.00","$215,250.00"
3,1000044001,"BIG BUCKS PAWN & JEWELRY, LLC",BROKEN ARROW,OK,74012,1ST NATL BK & TR CO OF BROKEN,OK,0,28-Feb-97,1997,60,2,1.0,0,0,1,0,N,Y,,30-Jun-97,"$35,000.00",$0.00,P I F,$0.00,"$35,000.00","$28,000.00"
4,1000054004,"ANASTASIA CONFECTIONS, INC.",ORLANDO,FL,32801,FLORIDA BUS. DEVEL CORP,FL,0,28-Feb-97,1997,240,14,1.0,7,7,1,0,N,N,,14-May-97,"$229,000.00",$0.00,P I F,$0.00,"$229,000.00","$229,000.00"


#### Separating into Train, Val and Test

In [None]:
# First split: 70% of the rows go to training, the remaining 30% are held out.
# The predictors are `data` minus the target and the columns the author
# excluded from modelling; the target is the raw MIS_Status column.
X_train, X_rest, y_train, y_rest = train_test_split(
    data.drop(
        columns=[
            'LoanNr_ChkDgt',
            'Name',
            'State',
            'BankState',
            'ApprovalDate',
            'FranchiseCode',
            'ChgOffDate',
            'DisbursementDate',
            'BalanceGross',
            'ChgOffPrinGr',
            'MIS_Status',
        ]
    ),
    data["MIS_Status"],
    test_size=0.3,   # hold-out share (validation + test together)
    random_state=0,  # fixed seed so the split is reproducible
)

# Second split: cut the hold-out in half -> 15% validation, 15% test overall.
X_val, X_test, y_val, y_test = train_test_split(
    X_rest,
    y_rest,
    test_size=0.5,
    random_state=0,
)

X_train.shape, X_test.shape

((629414, 16), (134875, 16))

In [None]:
# Preview the first rows of the training predictors.
X_train.head()

Unnamed: 0,City,Zip,Bank,NAICS,ApprovalFY,Term,NoEmp,NewExist,CreateJob,RetainedJob,UrbanRural,RevLineCr,LowDoc,DisbursementGross,GrAppv,SBA_Appv
724990,STOCKTON,65785,LIBERTY BANK,0,1994,120,3,1.0,0,0,0,N,Y,"$40,000.00","$40,000.00","$36,000.00"
165681,GLOUCESTER,1930,BANKGLOUCESTER,812990,2006,84,3,2.0,0,0,1,N,N,"$100,000.00","$100,000.00","$85,000.00"
34675,JACKSON,39209,REGIONS BANK,0,1981,144,1,1.0,0,0,0,N,N,"$90,000.00","$90,000.00","$81,000.00"
49636,BRICK,8724,SANTANDER BANK NATL ASSOC,621111,2005,36,1,2.0,3,1,1,N,N,"$35,000.00","$35,000.00","$17,500.00"
125242,OAKLAND,94607,WELLS FARGO BANK NATL ASSOC,0,1998,120,25,1.0,0,0,0,0,N,"$750,000.00","$750,000.00","$562,500.00"


#### Configuration

In [None]:
# Name of the target column (kept as a list because the imputer in the
# target pipeline receives it through its `variables` argument).
target = ['MIS_Status']

# Predictor columns, in the order they appear in X_train; NPtoDF uses this
# list to re-label the array produced by the scaler.
features = [
    "City",
    "Zip",
    "Bank",
    "NAICS",
    "ApprovalFY",
    "Term",
    "NoEmp",
    "NewExist",
    "CreateJob",
    "RetainedJob",
    "UrbanRural",
    "RevLineCr",
    "LowDoc",
    "DisbursementGross",
    "GrAppv",
    "SBA_Appv",
]

# Categorical columns (rare-label grouping + frequency encoding).
cat_vars = ['City', 'Zip', 'Bank', 'NAICS']

# Categorical columns with missing values, imputed with the most frequent label.
cat_vars_with_na = ["Bank", "City"]

# (Almost) binary columns handled by BinaryAssign and the ordinal encoder.
binary_vars = ['NewExist', 'UrbanRural', 'RevLineCr', 'LowDoc']

# Binary columns imputed with an explicit "missing" label.
binary_miss = ["RevLineCr"]

# Binary columns imputed with the most frequent label.
binary_freq = ["NewExist", "LowDoc"]

# Monetary columns (dollar-sign cleanup, Yeo-Johnson, PCA).
num_vars = ["DisbursementGross", "GrAppv", "SBA_Appv"]

# Discrete count-like columns.
disc_vars = ["NoEmp", "CreateJob", "RetainedJob", "Term"]

# Temporal columns (special-character cleanup, then decade bucketing).
temp_vars = ["ApprovalFY"]

##### Load special characters

In [None]:
import string

# Every ASCII letter, punctuation mark and whitespace character, first as one
# string (`a`) and then as a list of single characters (`alpha`), which is
# handed to pp.SpecialCh in the feature pipeline.
a = "".join((string.ascii_letters, string.punctuation, string.whitespace))
alpha = [character for character in a]

In [None]:
# Preview the raw (string) target labels before they are encoded.
y_train.head()

724990     P I F
165681     P I F
34675     CHGOFF
49636      P I F
125242     P I F
Name: MIS_Status, dtype: object

In [None]:
# Wrap the target Series in a one-column DataFrame so the target pipeline
# can select the 'MIS_Status' column by name.
y_train = pd.DataFrame(y_train)
y_train.head()

Unnamed: 0,MIS_Status
724990,P I F
165681,P I F
34675,CHGOFF
49636,P I F
125242,P I F


In [None]:
# Preview the training predictors again (same call as the earlier preview
# cell; X_train has not been modified in between).
X_train.head()

Unnamed: 0,City,Zip,Bank,NAICS,ApprovalFY,Term,NoEmp,NewExist,CreateJob,RetainedJob,UrbanRural,RevLineCr,LowDoc,DisbursementGross,GrAppv,SBA_Appv
724990,STOCKTON,65785,LIBERTY BANK,0,1994,120,3,1.0,0,0,0,N,Y,"$40,000.00","$40,000.00","$36,000.00"
165681,GLOUCESTER,1930,BANKGLOUCESTER,812990,2006,84,3,2.0,0,0,1,N,N,"$100,000.00","$100,000.00","$85,000.00"
34675,JACKSON,39209,REGIONS BANK,0,1981,144,1,1.0,0,0,0,N,N,"$90,000.00","$90,000.00","$81,000.00"
49636,BRICK,8724,SANTANDER BANK NATL ASSOC,621111,2005,36,1,2.0,3,1,1,N,N,"$35,000.00","$35,000.00","$17,500.00"
125242,OAKLAND,94607,WELLS FARGO BANK NATL ASSOC,0,1998,120,25,1.0,0,0,0,0,N,"$750,000.00","$750,000.00","$562,500.00"


#### Pipe for Target

In [None]:
# Target pipeline: impute missing labels, then encode MIS_Status as one
# 0/1 indicator column.
sba_pipe_y = Pipeline(
    steps=[
        # Step 1 - fill missing target labels with the most frequent label.
        (
            'frequent_imputation - target',
            CategoricalImputer(variables = target, imputation_method='frequent'),
        ),
        # Step 2 - one-hot encode the target and drop the last dummy; the
        # fitted output keeps a single column, "MIS_Status_P I F".
        (
            'Binary Imputation',
            OneHotEncoder(drop_last = True, variables = "MIS_Status"),
        ),
    ]
)

In [None]:
# Fit the target pipeline on the training labels only.
sba_pipe_y.fit(y_train)

Pipeline(steps=[('frequent_imputation - target',
                 CategoricalImputer(imputation_method='frequent',
                                    variables=['MIS_Status'])),
                ('Binary Imputation',
                 OneHotEncoder(drop_last=True, variables='MIS_Status'))])

In [None]:
# Replace the training labels with their encoded form (overwrites y_train).
y_train = sba_pipe_y.transform(y_train)

In [None]:
# Count the missing values left in each column of the encoded target.
y_train.isnull().sum()

MIS_Status_P I F    0
dtype: int64

In [None]:
# Preview the encoded training target.
y_train.head()

Unnamed: 0,MIS_Status_P I F
724990,1
165681,1
34675,0
49636,1
125242,1


In [None]:
# Encode the test and validation labels with the pipeline fitted on the
# training labels; each Series is first wrapped in a DataFrame.
y_test = sba_pipe_y.transform(pd.DataFrame(y_test))
y_val = sba_pipe_y.transform(pd.DataFrame(y_val))

In [None]:
# Preview the encoded validation target.
y_val.head()

Unnamed: 0,MIS_Status_P I F
225350,0
597175,1
29540,1
852705,1
560315,1


In [None]:
# Preview the encoded test target.
y_test.head()

Unnamed: 0,MIS_Status_P I F
864967,1
695944,1
776331,1
208860,1
546687,1


In [None]:
# PCA instance with default settings; it is plugged into the feature
# pipeline's ColumnTransformer and applied to the three columns in num_vars.
pca = PCA()

#### Pipeline for Features

In [None]:
# End-to-end feature pipeline: cleaning -> imputation -> transformation ->
# encoding -> scaling -> PCA on the monetary columns -> neural network.
# Step names are kept exactly as before so `named_steps[...]` lookups and
# `set_params` keys keep working.
sba_pipe = Pipeline([

    ###################### Missing values and special characters ######################

    # Categorical features (Bank, City): impute with the most frequent label.
    ('frequent_imputation - features', CategoricalImputer(
        imputation_method='frequent', variables=cat_vars_with_na)),

    # Binary treatment - assignment of values (in-house transformer).
    ('BinaryAssign', pp.BinaryAssign(
        variables=binary_vars)),

    # NOTE: the "(freq)" / "(miss)" suffixes in the next two step names are
    # swapped relative to what the steps do; the comments describe the
    # actual behaviour.

    # Binary features imputed with an explicit "missing" label (RevLineCr).
    ('missing_imputation - binary features (freq)', CategoricalImputer(
        imputation_method='missing', variables=binary_miss)),

    # Binary features imputed with the most frequent label (NewExist, LowDoc).
    ('frequent_imputation - binary features (miss)', CategoricalImputer(
        imputation_method='frequent', variables=binary_freq)),

    # Dollar-sign substitution on the monetary features (in-house transformer).
    ('Dollar Signs', pp.SubsNum(variables=num_vars)),

    # Special-character substitution on the temporal variables (in-house);
    # `alpha` is the list of characters passed to the transformer.
    ('SpecialCh', pp.SpecialCh(
        variables=temp_vars, alpha=alpha)),

    # Temporal variables -> decade variables (0-5) (in-house transformer).
    ('Decades', pp.FunDec(
        variables=temp_vars)),

    # Cap the upper tail of the continuous variables at fixed values.
    # NOTE(review): the origin of these caps is not shown in the notebook --
    # presumably quantiles of the training data; confirm.
    ('Outliers', ArbitraryOutlierCapper(
        max_capping_dict={
            'DisbursementGross': 1563934,
            'GrAppv': 1520000,
            'SBA_Appv': 1230000,
        },
        min_capping_dict=None)),

    # Yeo-Johnson transformation of the monetary features.
    ('Yeo-Johnson', YeoJohnsonTransformer(
        variables=num_vars)),

    # Decimal cut for Zip and NAICS (in-house transformer).
    ('Decimal Cut for Zip and NAICS', pp.CutDec(
        variables=['Zip', 'NAICS'])),

    # Group labels seen in less than 1% of the rows under a single rare label.
    ('Rare Label', RareLabelEncoder(
        tol=0.01, n_categories=1, variables=cat_vars)),

    ##### Encoders ########

    # (Almost) binary variables: arbitrary ordinal codes.
    ('Ordinal Encoder for almost-Binary vars', OrdinalEncoder(
        encoding_method="arbitrary", variables=binary_vars)),

    # Categorical variables: replace each label by its relative frequency.
    ('Frequency Encoder for Cat_vars', CountFrequencyEncoder(
        encoding_method="frequency", variables=cat_vars)),

    ##### Scaler ##########

    # Scale every column to [0, 1]; the scaler returns a NumPy array.
    ('MinMax Scaler for all variables', MinMaxScaler()),

    # NumPy array -> DataFrame, restoring the original column names.
    ('NPtoDF', pp.NPtoDF(
        variables=features)),

    # PCA on the three monetary columns; the other columns pass through
    # unchanged and are placed after the PCA components.
    ('PCA - continuous variables', ColumnTransformer(
        [('pca', pca, num_vars)], remainder='passthrough')),

    # NumPy array -> DataFrame again: the three components first, then the
    # passthrough columns in their original order.
    ('NPtoDF2', pp.NPtoDF(
        variables=["PC1", "PC2", "PC3", "City", "Zip", "Bank", "NAICS", "ApprovalFY",
                   "Term", "NoEmp", "NewExist", "CreateJob", "RetainedJob",
                   "UrbanRural", "RevLineCr", "LowDoc"])),

    # Keep only the first principal component.
    ('drop_features', DropFeatures(features_to_drop=["PC2", "PC3"])),

    # Neural network: one hidden layer of 200 units. random_state fixes the
    # weight initialisation and batch shuffling so a re-run reproduces the
    # same model (same seed as the train/validation/test split).
    ('Neural Network', MLPClassifier(
        hidden_layer_sizes=(200,), activation='relu', solver='adam',
        random_state=0)),
])

In [None]:
# Fit the full pipeline on the training data; .values.ravel() flattens the
# one-column target DataFrame into the 1-D array passed as `y`.
sba_pipe.fit(X_train, y_train.values.ravel())



Pipeline(steps=[('frequent_imputation - features',
                 CategoricalImputer(imputation_method='frequent',
                                    variables=['Bank', 'City'])),
                ('BinaryAssign',
                 BinaryAssign(variables=['NewExist', 'UrbanRural', 'RevLineCr',
                                         'LowDoc'])),
                ('missing_imputation - binary features (freq)',
                 CategoricalImputer(variables=['RevLineCr'])),
                ('frequent_imputation - binary features (miss)',
                 Catego...
                                   transformers=[('pca', PCA(),
                                                  ['DisbursementGross',
                                                   'GrAppv', 'SBA_Appv'])])),
                ('NPtoDF2',
                 NPtoDF(variables=['PC1', 'PC2', 'PC3', 'City', 'Zip', 'Bank',
                                   'NAICS', 'ApprovalFY', 'Term', 'NoEmp',
                                   

In [None]:
# Hard class predictions for the training and validation sets.
y_pred_train = sba_pipe.predict(X_train)
y_pred_val = sba_pipe.predict(X_val)

# Class probabilities for the same sets (one column per class).
y_prob_train = sba_pipe.predict_proba(X_train)
y_prob_val = sba_pipe.predict_proba(X_val)

In [None]:
# Train and validation metrics.
# Accuracy and F1 are computed from the hard predictions; log-loss and
# ROC AUC from the predicted probabilities.
#
# ROC AUC needs a continuous score: feeding it thresholded 0/1 predictions
# collapses the curve to a single operating point. Column 1 of predict_proba
# is the probability of class 1 (the "MIS_Status_P I F" indicator), since
# the classifier orders its classes as [0, 1].

# --- Train ---
log_loss_train = log_loss(y_train, y_prob_train)

accuracy_train = accuracy_score(y_train, y_pred_train)

f1_train = f1_score(y_train, y_pred_train)

rocauc_train = roc_auc_score(y_train, y_prob_train[:, 1])

# --- Validation ---
log_loss_val = log_loss(y_val, y_prob_val)

accuracy_val = accuracy_score(y_val, y_pred_val)

f1_val = f1_score(y_val, y_pred_val)

rocauc_val = roc_auc_score(y_val, y_prob_val[:, 1])



In [None]:
from tabulate import tabulate

# One row per metric: [label, train value, validation value].
tablefinish = [
    [metric_label, train_value, validation_value]
    for metric_label, train_value, validation_value in (
        ("Accuracy", accuracy_train, accuracy_val),
        ("Logloss", log_loss_train, log_loss_val),
        ("F1 test", f1_train, f1_val),
        ("AUC", rocauc_train, rocauc_val),
    )
]

# Only two headers for three columns: the label column is left untitled.
head_fim = ["Train", "Validation"]

print(tabulate(tablefinish, tablefmt="grid", headers=head_fim))

+----------+----------+--------------+
|          |    Train |   Validation |
| Accuracy | 0.905062 |     0.903222 |
+----------+----------+--------------+
| Logloss  | 0.239465 |     0.244842 |
+----------+----------+--------------+
| F1 test  | 0.943984 |     0.942897 |
+----------+----------+--------------+
| AUC      | 0.784547 |     0.780746 |
+----------+----------+--------------+
