In [18]:
import pickle
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder,MinMaxScaler,StandardScaler
from sklearn.metrics import confusion_matrix, accuracy_score,classification_report
from sklearn import metrics
import warnings
warnings.filterwarnings('ignore')

In [19]:
# Load the pickled model bundle from <current working directory>/model/loan_predict.pkl.
# The loaded object is later indexed as model[0], model[1], model[2] by get_result.
current_path = os.getcwd()
pickle_path = os.path.join(current_path, "model", "loan_predict.pkl")
# SECURITY: pickle.load can execute arbitrary code embedded in the file; only load
# pickles that come from a trusted source.
# The context manager guarantees the file handle is closed even if unpickling fails.
with open(pickle_path, "rb") as pickle_file:
    model = pickle.load(pickle_file)

In [20]:
# Score the loaded models on the CSV below and print the metrics.
# `get_result` and `model` must already exist in the kernel when this cell runs.
# NOTE(review): the file name says TRAINING data while get_result labels it
# "Testing data" - confirm this is a held-out set and not the data the models were fit on.
csv_file_path = 'dataset/Lending_TRAINING_DATA.csv'
get_result(csv_file_path=csv_file_path, model=model)

Testing data size is: (7759, 46)
accuracy of Queue ID (STP or Under Writing) classification is :  97.08725351205052
confusion matrix for Application status classification :
 [[5357   15]
 [ 211 2176]]
accuracy of Application status for Stright Through Process is :  84.26724137931035
confusion matrix for Stright Through Process is 
 [[2755    1]
 [ 875 1937]]
accuracy of Application status for Under Writing is :  89.6850753080785
confusion matrix for Under Writing is 
 [[ 582    0]
 [ 226 1383]]


In [5]:
def preprocess_data(test_df):
    """Convert the raw lending frame into the all-numeric frame used for scoring.

    The input frame is NOT modified: every step runs on a copy, so the function
    can be called repeatedly on the same frame.

    Steps, in order:
      1. Derive ``APPLICATION.STATUS`` from ``APPROVED.AMOUNT``
         (amount > 1 -> 0, missing amount -> 1, other values truncated to int).
      2. Drop identifier / unused columns, including ``APPROVED.AMOUNT`` itself.
      3. Impute and coerce ``PAN.STATUS``, ``CREDIT.CARD.NUMBER`` and the score columns.
      4. Label-encode the categorical columns.

    Args:
        test_df: pandas DataFrame containing every column referenced below.

    Returns:
        A new DataFrame with the dropped columns removed, the added
        ``APPLICATION.STATUS`` column, and the cleaned / encoded columns.

    Raises:
        KeyError: if a referenced column is missing from ``test_df``.
        ValueError: if an address score or CIBIL score holds a non-numeric value
            other than the placeholders handled below.
    """
    # Work on a copy so the caller's frame keeps its raw columns.
    df = test_df.copy()

    # Approved amount is used only for calculating accuracy: it becomes the
    # ground-truth APPLICATION.STATUS label and is dropped right after.
    df.loc[df['APPROVED.AMOUNT'] > 1, 'APPROVED.AMOUNT'] = 0
    df['APPLICATION.STATUS'] = df['APPROVED.AMOUNT'].fillna(1).astype('int')

    features_drop = ['APPLICATION.ID','DSA.ID','DEALER.ID','APP.DATE','TIME.STAMP','CITY',
                'STATE','ZIP.CODE','CREDIT.CARDS.CATEGORY','TRADER.BUSINESS.PROOF',
                'PRIMARY.ASSET.MODELNO','CAR.CATEGORY','DEDUPE.REF.ID1','LOAN.TYPE',
                'NET.TAKE.HOME.SALARY','HOUSE.SURROGATE.DOCUMENT.TYPE','APPROVED.AMOUNT',
                'NAME.SCORE','TRADER.YEAR.IN.BUSINESS']
    df = df.drop(columns=features_drop)

    # Missing PAN.STATUS takes the most frequent value found in this frame.
    most_frequent_category = df['PAN.STATUS'].mode()[0]
    df['PAN.STATUS'] = df['PAN.STATUS'].fillna(most_frequent_category)

    # Collapse CREDIT.CARD.NUMBER: values > 1 -> 1, missing -> 0.
    df.loc[df['CREDIT.CARD.NUMBER'] > 1, 'CREDIT.CARD.NUMBER'] = 1
    df['CREDIT.CARD.NUMBER'] = df['CREDIT.CARD.NUMBER'].fillna(0).astype('int')

    # Any non-numeric APPLICATION.SCORE entry (e.g. '-' or stray text that leaked
    # into the column) is coerced to NaN and then scored as 0, same as a missing value.
    df['APPLICATION.SCORE'] = (
        pd.to_numeric(df['APPLICATION.SCORE'], errors='coerce').fillna(0).astype('float')
    )

    # Address scores: 'NOT_AUTHORIZED' counts as missing; missing values take the
    # column mean computed on this frame.
    for score_column in ('OFFICE.ADDRESS.SCORE', 'RESIDENTIAL.ADDRESS.SCORE'):
        numeric_score = df[score_column].replace('NOT_AUTHORIZED',np.nan).astype('float')
        df[score_column] = numeric_score.fillna(numeric_score.mean()).astype('float')

    # CIBIL.SCORE: string placeholders map to small numeric codes, missing -> column mean.
    cibil_score = df['CIBIL.SCORE'].replace({'0':0,'-':1,'000-1':2}).astype('float')
    df['CIBIL.SCORE'] = cibil_score.fillna(cibil_score.mean())

    # Binary Features
    cat_features_binary = ['APPLICATION.STATUS','QUEUE.ID','GENDER',
                    'MARITAL.STATUS','VOTER_ID','DRIVING_LICENSE','AADHAAR','PAN','BANK_PASSBOOK']

    # Multiple categorical features
    cat_features = ['EMPLOY.CONSTITUTION','EDUCATION','CURRENT.STAGE','RESIDENCE.TYPE',
                    'PAN.STATUS','OWN.HOUSE.TYPE','PRIMARY.ASSET.CTG','PRIMARY.ASSET.MAKE','ASSET.MAKE','ASSET.CTG']

    # NOTE(review): the encoder is re-fitted on each column of THIS frame, so the
    # integer codes depend on which categories appear here. They may not match the
    # codes the models were trained with - confirm, or persist the fitted encoders.
    le=LabelEncoder()
    for feature in cat_features_binary + cat_features:
        df[feature]=le.fit_transform(df[feature])
    return df

def _report_queue_status(features, status_true, stage_model, accuracy_msg, matrix_msg):
    """Print accuracy and confusion matrix of `stage_model` on one queue's rows."""
    if len(features) == 0:
        # Nothing was routed to this queue; predicting on zero rows would raise.
        print(accuracy_msg, 'n/a (no rows were routed to this queue)')
        return
    status_pred = stage_model.predict(features)
    print(accuracy_msg, accuracy_score(status_true, status_pred) * 100)
    print(matrix_msg, confusion_matrix(status_true, status_pred))


def get_accuracy(test_df,rfc,rfc_stp,rfc_uw):
    """Print accuracy and confusion matrices for the two-stage pipeline.

    Stage 1: ``rfc`` predicts ``QUEUE.ID`` from every column except ``QUEUE.ID``
    and ``APPLICATION.STATUS``; the prediction is scored against the true ``QUEUE.ID``.
    Stage 2: rows are split by the PREDICTED queue (0 or 1). ``rfc_stp`` scores the
    rows predicted 0 and ``rfc_uw`` the rows predicted 1, each against the true
    ``APPLICATION.STATUS``.

    The input frame is not modified.

    Args:
        test_df: preprocessed DataFrame containing ``QUEUE.ID``,
            ``APPLICATION.STATUS`` and the feature columns.
        rfc: fitted estimator with ``predict`` for the queue.
        rfc_stp: fitted estimator with ``predict`` for rows predicted as queue 0.
        rfc_uw: fitted estimator with ``predict`` for rows predicted as queue 1.

    Returns:
        None. All results are printed.
    """
    features = test_df.drop(['QUEUE.ID','APPLICATION.STATUS'],axis = 1)
    queue_true = test_df['QUEUE.ID']
    status_true = test_df['APPLICATION.STATUS']

    queue_pred = np.asarray(rfc.predict(features))

    print('accuracy of Queue ID (STP or Under Writing) classification is : ',accuracy_score(queue_true,queue_pred)*100)
    # NOTE(review): this matrix is for the QUEUE.ID prediction although the message
    # says "Application status"; the text is kept unchanged to match existing output.
    print("confusion matrix for Application status classification :\n",confusion_matrix(queue_true,queue_pred))

    # Split on the predicted queue with boolean masks instead of writing the
    # predictions back into test_df, so the true QUEUE.ID column is preserved.
    stp_rows = queue_pred == 0  # Straight through process, per the encoded label 0
    _report_queue_status(
        features.loc[stp_rows], status_true.loc[stp_rows], rfc_stp,
        "accuracy of Application status for Stright Through Process is : ",
        "confusion matrix for Stright Through Process is \n",
    )

    uw_rows = queue_pred == 1  # Under Writing, per the encoded label 1
    _report_queue_status(
        features.loc[uw_rows], status_true.loc[uw_rows], rfc_uw,
        'accuracy of Application status for Under Writing is : ',
        'confusion matrix for Under Writing is \n',
    )
    
def get_result(csv_file_path,model):
    """Read a CSV, preprocess it, and print the evaluation metrics.

    Args:
        csv_file_path: path of the CSV file handed to ``pd.read_csv``.
        model: indexable object whose entries 0, 1 and 2 are passed to
            ``get_accuracy`` as ``rfc``, ``rfc_stp`` and ``rfc_uw`` respectively.

    Returns:
        None. The frame shape and all metrics are printed.
    """
    raw_df = pd.read_csv(csv_file_path)
    print('Testing data size is:',raw_df.shape)

    processed_df = preprocess_data(raw_df)
    queue_clf, stp_clf, uw_clf = model[0], model[1], model[2]
    get_accuracy(processed_df, queue_clf, stp_clf, uw_clf)
