In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

respondent IDs for each lender

In [2]:
# banks
USBank= 504713  # US Bank
Wells=451965  #Wells Fargo
Bell=19581    # Bell Bank

# online lenders
Quicken=7197000003        #Quicken Loan
AMEC=411941324             #American Mortgage & Equity Consultants Inc.
Guaranteed=364327855      # Guaranteed Rate

In [3]:
def ConfusionSpread (cnf_matrix):
#############################################
# returns the confusion matrix converted to percent of a whole 
# returned as a list positions [00,01, 10,11]
########################################33
    print("confusion matrix")
    print(cnf_matrix)
    
    sum = cnf_matrix[0][0]+cnf_matrix[0][1]+cnf_matrix[1][0]+cnf_matrix[1][1]

    cnf_list=[cnf_matrix[0][0]/sum,cnf_matrix[0][1]/sum,cnf_matrix[1][0]/sum,cnf_matrix[1][1]/sum]
    return cnf_list

In [4]:
def successrate(y_values):
    success=sum(y_values)
    total = len(y_values)
    return success/total

In [16]:
def LogRegress(X_train, y_train,X_test,y_test):
#####################################
# use Logistic Regression to train and test a model
# returning test score

    # create model
    LRmodel = LogisticRegression()
    # fit model with training data
#     print("X-train",X_train.head())
#     print("y_train",y_train.head())
    LRmodel.fit(X_train, y_train)

       # confusion matrix 
    y_pred=LRmodel.predict(X_test)
    cnf_matrix = metrics.confusion_matrix(y_test, y_pred)

    cnf_result=ConfusionSpread(cnf_matrix) # convert confusion matrix to list of percents
    print("training loan success",successrate(y_train))
    print("testing loan success",successrate(y_test))  
    print("predicted loan success",successrate(y_pred)) 
    
    # validate the model using testing data
    test_score=LRmodel.score(X_test, y_test)
#    params=LRmodel.get_params(deep=False)
#    print("test_score",test_score)
#    print("params=",params)
    return cnf_result

In [6]:
def AssignData(set, data_df):
##############################################
# choose the lender group for test
# set = 0 use all of data_df this is all lenders
# set = 1 select from data_df for 'respondent id' in usbank, wells, quicken
# set > 2 select from data_df for 'respondent id' = set
# split for train and test data
# call LogRegress

    #reduce the data used for set > 0 to a subset of lenders or a particular lender
    if set > 2 :
        data_df=data_df.loc[data_df['respondent id'] == set,:]
    elif set == 1:
        data_df=data_df.loc[(data_df['respondent id'] == USBank) | (data_df['respondent id'] == Wells) | (data_df['respondent id'] == Quicken), :]
           
        # Assign X (data) and y (target)
    X=data_df.drop(["action modified","respondent id"],axis=1)
    y=data_df["action modified"]
#    print("shape", X.shape, y.shape)  , stratify=y

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

    return LogRegress(X_train, y_train,X_test,y_test)

In [7]:
def AssignData2(set, data_df,test_df):
##############################################
# FOR USE WHEN TRAINING AND TESTING ON DIFFERENT SETS
# choose the lender group for BOTH train and test data
# set = 0 use all of data_df this is all lenders
# set = 1 select from data_df for 'respondent id' in usbank, wells quicken
# set > 2 select from data_df for 'respondent id' = set
# call LogRegress


    #reduce the data used for set > 0 to a subset of lenders or a particular lender
    if set > 2 :
        data_df=data_df.loc[data_df['respondent id'] == set,:]
        test_df=test_df.loc[test_df['respondent id'] == set,:]
    elif set == 1:
        data_df=data_df.loc[(data_df['respondent id'] == USBank) | (data_df['respondent id'] == Wells) | (data_df['respondent id'] == Quicken), :]
        test_df=test_df.loc[(test_df['respondent id'] == USBank) | (test_df['respondent id'] == Wells) | (test_df['respondent id'] == Quicken), :]
           
        # Assign X (data) and y (target) for training
    X_train=data_df.drop(["action modified","respondent id"],axis=1)
    y_train=data_df["action modified"]
    
            # Assign X (data) and y (target) for test
    X_test=test_df.drop(["action modified","respondent id"],axis=1)
    y_test=test_df["action modified"]
#    print("shape", X.shape, y.shape)


    return LogRegress(X_train, y_train,X_test,y_test)

In [14]:
def LenderTest(test_data,testname):
#################################################
# goes through the 4 tests of lender groups with one version of the test data
# all 3 usbank, wells, quicken
#  USBANK 
# Wells 
# Quicken
# returns the results in list
############################################################

    # for returning confusion matrix
    result_dict={}
    # get result for all 3
    print("all")
    result_dict['all'+ testname]=AssignData(1,test_data)
    # result for USBank 
    print("usbank")
    result_dict['usbank'+ testname]=AssignData(USBank,test_data)
        # result for Wells
    print("wells")
    result_dict['wells'+ testname]=AssignData(Wells,test_data)
    #result for Quicken
    print("quicken")
    result_dict['quicken'+ testname]=AssignData(Quicken,test_data)

    return result_dict

In [9]:
def LenderTest2(train_data,test_data,testname):
#################################################
# goes through the 4 tests of lender groups with one version of the test data
# all 3 usbank, wells, quicken
#  USBANK 
# Wells 
# Quicken
# returns the results in list
############################################################

# for returning confusion matirx
    result_dict={}
    # get result for all 3
    print("all")
    result_dict['all'+ testname]=AssignData2(1,train_data,test_data)
    # result for USBank, 
    print("usbank")
    result_dict['usbank'+ testname]=AssignData2(USBank,train_data,test_data)
        # result for Wells
    print("wells")
    result_dict['wells'+ testname]=AssignData2(Wells,train_data,test_data)
    #result for Quicken
    print("quicken")
    result_dict['quicken'+ testname]=AssignData2(Quicken,train_data,test_data)

    return result_dict

# main body of code 
read in data file

prepare training data sets

In [10]:
file="AllData2017.csv"
loan_data=pd.read_csv(file)

loan_data=loan_data[["respondent id","agency code","property type","loan purpose","loan amount",\
                     "applicant race 1","sex","lien status","loan type modified",\
                     "action modified","income cleaned","income loan ratio"]]

race_sex_less_data=loan_data.drop(["sex","applicant race 1"], axis=1)


prepare unique group test data sets 

In [11]:
# single gender  - genderless data
female_data=loan_data[loan_data["sex"] == 2]
female_data=female_data.drop(["sex","applicant race 1"],axis=1)
male_data=loan_data[loan_data["sex"] == 1]
male_data=male_data.drop(["sex","applicant race 1"],axis=1)

#single race - raceless data
asian_data=loan_data[loan_data["applicant race 1"] == 2]
asian_data=asian_data.drop(["sex","applicant race 1"],axis=1)

black_data=loan_data[loan_data["applicant race 1"] == 3]
black_data=black_data.drop(["sex","applicant race 1"],axis=1)

white_data=loan_data[loan_data["applicant race 1"] == 5]
white_data=white_data.drop(["sex","applicant race 1"],axis=1)


# run tests and concat lender sets to dataframe
run for 3 lenders, as a group and individually

all data - train and test with same data

raceless and sexless (blind) data - train and test same data

asian -train with blind training data , test with unique group test data

black -train with blind training data , test with unique group test data 

white -train with blind training data , test with unique group test data

In [17]:
    #race gender included
print("test")
result_df=pd.DataFrame(LenderTest(loan_data,'_test'), index=['true-','false-','false+','true+'])
    #blind test norace or gender
print("blind")
confusion_df=pd.DataFrame(LenderTest(race_sex_less_data,'_blindtest'), index=['true-','false-','false+','true+'])
result_df=pd.concat([result_df, confusion_df],axis=1)
    # results for the effectof gender -female
print("female")
confusion_df=pd.DataFrame(LenderTest2(race_sex_less_data,female_data,'_female'), index=['true-','false-','false+','true+'])
result_df=pd.concat([result_df, confusion_df],axis=1)
    # results for the effectof gender -Male
print("male")
confusion_df=pd.DataFrame(LenderTest2(race_sex_less_data,male_data,'_male'), index=['true-','false-','false+','true+'])
result_df=pd.concat([result_df, confusion_df],axis=1)

#results for the effect of race asian
print("asian")
confusion_df=pd.DataFrame(LenderTest2(race_sex_less_data,asian_data,'_asian'), index=['true-','false-','false+','true+'])
result_df=pd.concat([result_df, confusion_df],axis=1)
# #results for the effect of race black
print("black")
confusion_df=pd.DataFrame(LenderTest2(race_sex_less_data,black_data,'_black'), index=['true-','false-','false+','true+'])
result_df=pd.concat([result_df, confusion_df],axis=1)
# #results for the effect of race white
print("white")
confusion_df=pd.DataFrame(LenderTest2(race_sex_less_data,white_data,'_white'), index=['true-','false-','false+','true+'])
result_df=pd.concat([result_df, confusion_df],axis=1)

result_df

test
all
confusion matrix
[[ 112 1839]
 [  54 6513]]
training loan success 0.7748992290533401
testing loan success 0.7709556233857713
predicted loan success 0.9805118572434844
usbank
confusion matrix
[[  95  540]
 [  91 1948]]
training loan success 0.7518703241895262
testing loan success 0.762528047868362
predicted loan success 0.9304412864622289
wells
confusion matrix
[[  69  632]
 [  22 3026]]
training loan success 0.8114163777007202
testing loan success 0.813016804481195
predicted loan success 0.9757268604961323
quicken
confusion matrix
[[   0  565]
 [   0 1531]]
training loan success 0.7309466984884646
testing loan success 0.7304389312977099
predicted loan success 1.0
blind
all
confusion matrix
[[  71 1880]
 [  28 6539]]
training loan success 0.7748992290533401
testing loan success 0.7709556233857713
predicted loan success 0.9883775534162949
usbank
confusion matrix
[[  99  536]
 [ 110 1929]]
training loan success 0.7518703241895262
testing loan success 0.762528047868362
predicted l

Unnamed: 0,all_test,usbank_test,wells_test,quicken_test,all_blindtest,usbank_blindtest,wells_blindtest,quicken_blindtest,all_female,usbank_female,...,wells_asian,quicken_asian,all_black,usbank_black,wells_black,quicken_black,all_white,usbank_white,wells_white,quicken_white
true-,0.013149,0.035527,0.018405,0.0,0.008335,0.037023,0.014671,0.0,0.011064,0.054991,...,0.029734,0.0,0.018847,0.107383,0.032328,0.0,0.008011,0.035007,0.014483,0.0
false-,0.215896,0.201945,0.168578,0.269561,0.220709,0.200449,0.172313,0.269561,0.226715,0.215182,...,0.194053,0.267176,0.343681,0.332215,0.297414,0.307143,0.199924,0.188879,0.162501,0.261925
false+,0.00634,0.034031,0.005868,0.0,0.003287,0.041137,0.004268,0.0,0.004124,0.040347,...,0.004695,0.0,0.004435,0.053691,0.008621,0.0,0.003816,0.031968,0.004594,0.0
true+,0.764616,0.728497,0.807149,0.730439,0.767668,0.721391,0.808749,0.730439,0.758097,0.68948,...,0.771518,0.732824,0.633038,0.506711,0.661638,0.692857,0.788249,0.744147,0.818422,0.738075


In [None]:
result_df=result_df.reset_index()
result_df

In [None]:
result_df.to_csv('confpercentLogregress.csv', index=False,header=True,encoding='utf-8')