In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

respondent IDs for each lender

In [2]:
# banks
USBank= 504713  # US Bank
Wells=451965  #Wells Fargo
Bell=19581    # Bell Bank

# online lenders
Quicken=7197000003        #Quicken Loan
AMEC=411941324             #American Mortgage & Equity Consultants Inc.
Guaranteed=364327855      # Guaranteed Rate

In [3]:
def ConfusionSpread (cnf_matrix):
    print("confusion matrix")
    print(cnf_matrix)
    cnf_list=[cnf_matrix[0][0],cnf_matrix[0][1],cnf_matrix[1][0],cnf_matrix[1][1]]
    return cnf_list

In [4]:
def LogRegress(X_train, y_train,X_test,y_test):
#####################################
# use Logistic Regression to train and test a model
# returning test score

    # create model
    LRmodel = LogisticRegression()
    # fit model with training data
#     print("X-train",X_train.head())
#     print("y_train",y_train.head())
    LRmodel.fit(X_train, y_train)

       # confusion matrix 
    y_pred=LRmodel.predict(X_test)
    cnf_matrix = metrics.confusion_matrix(y_test, y_pred)

    cnf_result=ConfusionSpread(cnf_matrix)
    
    # validate the model using testing data
    test_score=LRmodel.score(X_test, y_test)
#    params=LRmodel.get_params(deep=False)
#    print("test_score",test_score)
#    print("params=",params)
    return cnf_result

In [5]:
def AssignData(set, data_df):
##############################################
# choose the lender group for test
# set = 0 use all of data_df this is all lenders
# set = 1 select from data_df for 'respondent id' in usbank, wells, quicken
# set > 2 select from data_df for 'respondent id' = set
# split for train and test data
# call LogRegress

    #reduce the data used for set > 0 to a subset of lenders or a particular lender
    if set > 2 :
        data_df=data_df.loc[data_df['respondent id'] == set,:]
    elif set == 1:
        data_df=data_df.loc[(data_df['respondent id'] == USBank) | (data_df['respondent id'] == Wells) | (data_df['respondent id'] == Quicken), :]
           
        # Assign X (data) and y (target)
    X=data_df.drop(["action modified","respondent id"],axis=1)
    y=data_df["action modified"]
#    print("shape", X.shape, y.shape)  , stratify=y

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

    return LogRegress(X_train, y_train,X_test,y_test)

In [6]:
def AssignData2(set, data_df,test_df):
##############################################
# FOR USE WHEN TRAINING AND TESTING ON DIFFERENT SETS
# choose the lender group for BOTH train and test data
# set = 0 use all of data_df this is all lenders
# set = 1 select from data_df for 'respondent id' in usbank, wells quicken
# set > 2 select from data_df for 'respondent id' = set
# call LogRegress


    #reduce the data used for set > 0 to a subset of lenders or a particular lender
    if set > 2 :
        data_df=data_df.loc[data_df['respondent id'] == set,:]
        test_df=test_df.loc[test_df['respondent id'] == set,:]
    elif set == 1:
        data_df=data_df.loc[(data_df['respondent id'] == USBank) | (data_df['respondent id'] == Wells) | (data_df['respondent id'] == Quicken), :]
        test_df=test_df.loc[(test_df['respondent id'] == USBank) | (test_df['respondent id'] == Wells) | (test_df['respondent id'] == Quicken), :]
           
        # Assign X (data) and y (target) for training
    X_train=data_df.drop(["action modified","respondent id"],axis=1)
    y_train=data_df["action modified"]
    
            # Assign X (data) and y (target) for test
    X_test=test_df.drop(["action modified","respondent id"],axis=1)
    y_test=test_df["action modified"]
#    print("shape", X.shape, y.shape)


    return LogRegress(X_train, y_train,X_test,y_test)

In [56]:
def LenderTest(test_data,testname):
#################################################
# goes through the 4 tests of lender groups with one version of the test data
# all 3 usbank, wells, quicken
#  USBANK 
# Wells 
# Quicken
# returns the results in list
############################################################

    # for returning confusion matrix
    result_dict={}
    # get result for all 3
    result_dict['all'+ testname]=AssignData(1,test_data)
    # result for USBank 
    result_dict['usbank'+ testname]=AssignData(USBank,test_data)
        # result for Wells
    result_dict['wells'+ testname]=AssignData(Wells,test_data)
    #result for Quicken
    result_dict['quicken'+ testname]=AssignData(Quicken,test_data)

    return result_dict

In [34]:
def LenderTest2(train_data,test_data,testname):
#################################################
# goes through the 4 tests of lender groups with one version of the test data
# all 3 usbank, wells, quicken
#  USBANK 
# Wells 
# Quicken
# returns the results in list
############################################################

# for returning confusion matirx
    result_dict={}
    # get result for all 3
    result_dict['all'+ testname]=AssignData2(1,train_data,test_data)
    # result for USBank, 
    result_dict['usbank'+ testname]=AssignData2(USBank,train_data,test_data)
        # result for Wells
    result_dict['wells'+ testname]=AssignData2(Wells,train_data,test_data)
    #result for Quicken
    result_dict['quicken'+ testname]=AssignData2(Quicken,train_data,test_data)

    return result_dict

# main body of code 
read in data file

prepare training data sets

In [28]:
file="AllData2017.csv"
loan_data=pd.read_csv(file)

loan_data=loan_data[["respondent id","agency code","property type","loan purpose","loan amount",\
                     "applicant race 1","sex","lien status","loan type modified",\
                     "action modified","income cleaned","income loan ratio"]]

race_sex_less_data=loan_data.drop(["sex","applicant race 1"], axis=1)


prepare unique group test data sets 

In [29]:
# single gender  - genderless data
female_data=loan_data[loan_data["sex"] == 2]
female_data=female_data.drop(["sex","applicant race 1"],axis=1)
male_data=loan_data[loan_data["sex"] == 1]
male_data=male_data.drop(["sex","applicant race 1"],axis=1)

#single race - raceless data
asian_data=loan_data[loan_data["applicant race 1"] == 2]
asian_data=asian_data.drop(["sex","applicant race 1"],axis=1)

black_data=loan_data[loan_data["applicant race 1"] == 3]
black_data=black_data.drop(["sex","applicant race 1"],axis=1)

white_data=loan_data[loan_data["applicant race 1"] == 5]
white_data=white_data.drop(["sex","applicant race 1"],axis=1)


# run tests and concat lender sets to dataframe
run for 3 lenders, as a group and individually

all data - train and test with same data

raceless and sexless (blind) data - train and test same data

asian -train with blind training data , test with unique group test data

black -train with blind training data , test with unique group test data 

white -train with blind training data , test with unique group test data

In [57]:
    #race gender included
result_df=pd.DataFrame(LenderTest(loan_data,'_test'), index=['true-','false-','false+','true+'])
    #blind test norace or gender
confusion_df=pd.DataFrame(LenderTest(race_sex_less_data,'_blindtest'), index=['true-','false-','false+','true+'])
result_df=pd.concat([result_df, confusion_df],axis=1)
    # results for the effectof gender -female
confusion_df=pd.DataFrame(LenderTest2(race_sex_less_data,female_data,'_female'), index=['true-','false-','false+','true+'])
result_df=pd.concat([result_df, confusion_df],axis=1)
    # results for the effectof gender -Male
confusion_df=pd.DataFrame(LenderTest2(race_sex_less_data,male_data,'_male'), index=['true-','false-','false+','true+'])
result_df=pd.concat([result_df, confusion_df],axis=1)

#results for the effect of race asian
confusion_df=pd.DataFrame(LenderTest2(race_sex_less_data,asian_data,'_asian'), index=['true-','false-','false+','true+'])
result_df=pd.concat([result_df, confusion_df],axis=1)
# #results for the effect of race black
confusion_df=pd.DataFrame(LenderTest2(race_sex_less_data,black_data,'_black'), index=['true-','false-','false+','true+'])
result_df=pd.concat([result_df, confusion_df],axis=1)
# #results for the effect of race white
confusion_df=pd.DataFrame(LenderTest2(race_sex_less_data,white_data,'_white'), index=['true-','false-','false+','true+'])
result_df=pd.concat([result_df, confusion_df],axis=1)

result_df

confusion matrix
[[ 112 1839]
 [  54 6513]]
confusion matrix
[[  95  540]
 [  91 1948]]
confusion matrix
[[  69  632]
 [  22 3026]]
confusion matrix
[[   0  565]
 [   0 1531]]
confusion matrix
[[  71 1880]
 [  28 6539]]
confusion matrix
[[  99  536]
 [ 110 1929]]
confusion matrix
[[  55  646]
 [  16 3032]]
confusion matrix
[[   0  565]
 [   0 1531]]
confusion matrix
[[ 110 2254]
 [  41 7537]]
confusion matrix
[[ 184  720]
 [ 135 2307]]
confusion matrix
[[  95  880]
 [  29 3620]]
confusion matrix
[[   0  485]
 [   0 1487]]
confusion matrix
[[  151  4001]
 [   69 15718]]
confusion matrix
[[ 195 1242]
 [ 178 4868]]
confusion matrix
[[ 138 1539]
 [  39 7848]]
confusion matrix
[[   0 1038]
 [   0 2854]]
confusion matrix
[[ 18 288]
 [  2 927]]
confusion matrix
[[ 21 107]
 [  8 329]]
confusion matrix
[[ 19 124]
 [  3 493]]
confusion matrix
[[ 0 35]
 [ 0 96]]
confusion matrix
[[ 17 310]
 [  4 571]]
confusion matrix
[[ 32  99]
 [ 16 151]]
confusion matrix
[[ 15 138]
 [  4 307]]
confusion matrix

Unnamed: 0,all_test,usbank_test,wells_test,quicken_test,all_blindtest,usbank_blindtest,wells_blindtest,quicken_blindtest,all_female,usbank_female,...,wells_asian,quicken_asian,all_black,usbank_black,wells_black,quicken_black,all_white,usbank_white,wells_white,quicken_white
true-,112,95,69,0,71,99,55,0,110,184,...,19,0,17,32,15,0,212,311,186,0
false-,1839,540,632,565,1880,536,646,565,2254,720,...,124,35,310,99,138,43,5291,1678,2087,1241
false+,54,91,22,0,28,110,16,0,41,135,...,3,0,4,16,4,0,101,284,59,0
true+,6513,1948,3026,1531,6539,1929,3032,1531,7537,2307,...,493,96,571,151,307,97,20861,6611,10511,3497


In [20]:
result_df.to_csv('confusionLogregress.csv', index=True,header=True,encoding='utf-8')