In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn import metrics
from sklearn.feature_selection import f_regression, mutual_info_regression
from sklearn.metrics import mean_squared_error, r2_score

Respondent IDs (values of the `respondent id` column) for each lender studied.

In [2]:
# Respondent IDs of the traditional banks.
USBank = 504713   # US Bank
Wells = 451965    # Wells Fargo
Bell = 19581      # Bell Bank

# Respondent IDs of the online lenders.
Quicken = 7197000003     # Quicken Loan
AMEC = 411941324         # American Mortgage & Equity Consultants Inc.
Guaranteed = 364327855   # Guaranteed Rate

In [3]:
def FtestMutualInfo(X, y, random_state=1):
    """Print univariate F-test and mutual-information scores for each feature.

    The F-test (``f_regression``) compares the fits of linear models: its null
    hypothesis is that the independent variable has no significance, so a
    p-value below the chosen significance level means the feature gives a
    better fit than the intercept-only model. Mutual information
    (``mutual_info_regression``) can capture any kind of dependency between
    variables, not just a linear one.

    Parameters
    ----------
    X : feature table, passed unchanged to scikit-learn (one column per feature).
    y : target values aligned with the rows of ``X``.
    random_state : seed forwarded to ``mutual_info_regression``. The estimator
        is randomised, so without a fixed seed the printed values differ from
        run to run. Defaults to 1, the same seed used for the train/test split.

    Returns
    -------
    None. The scores are only printed.
    """
    # NOTE(review): the captured notebook output shows nan F/p-values for some
    # columns, presumably features that are constant within the selected
    # lender subset -- confirm, and consider dropping such columns first.
    f_test, pval = f_regression(X, y)
    print("f_test", f_test)
    print("Pval", pval)

    # Seeded so that a Restart & Run All reproduces the same numbers.
    mi = mutual_info_regression(X, y, random_state=random_state)
    print("mi", mi)
    print("---------------------------------")

    return

In [4]:
def LinRegress(X_train, y_train, X_test, y_test):
    """Train a linear-regression model and return its score on the test data.

    Parameters
    ----------
    X_train, y_train : features and target used to fit the model.
    X_test, y_test : features and target used to validate the fitted model.

    Returns
    -------
    The value of ``LinearRegression.score(X_test, y_test)`` (scikit-learn
    documents this as the coefficient of determination, R^2).
    """
    # Create the model and fit it with the training data.
    LRmodel = LinearRegression()
    LRmodel.fit(X_train, y_train)

    # Validate the model using the testing data. ``score`` predicts
    # internally, so no separate ``predict`` call is needed here.
    test_score = LRmodel.score(X_test, y_test)
    return test_score


In [5]:
def LogRegress(X_train, y_train, X_test, y_test):
    """Train a logistic-regression model and return its score on the test data.

    Parameters
    ----------
    X_train, y_train : features and target used to fit the model.
    X_test, y_test : features and target used to validate the fitted model.

    Returns
    -------
    The value of ``LogisticRegression.score(X_test, y_test)`` (scikit-learn
    documents this as the mean accuracy on the given data).
    """
    # Create the model with scikit-learn's default settings and fit it.
    LRmodel = LogisticRegression()
    LRmodel.fit(X_train, y_train)

    # Validate the model using the testing data.
    test_score = LRmodel.score(X_test, y_test)
    return test_score

In [6]:
def AssignData(set, data_df):
    """Score a linear regression on one lender group of ``data_df``.

    ``set`` chooses which rows are used (the name shadows the builtin but is
    kept so existing callers keep working):

    * ``set > 3``  -- only rows whose 'respondent id' equals ``set``
    * ``set == 2`` -- the online lenders (Quicken, AMEC, Guaranteed)
    * ``set == 1`` -- the banks (USBank, Wells, Bell)
    * otherwise (e.g. ``0``) -- every row, i.e. all lenders

    The chosen rows are split into train and test parts with
    ``train_test_split(..., random_state=1)``, the feature diagnostics are
    printed through ``FtestMutualInfo`` on the whole selection, and the score
    returned by ``LinRegress`` is handed back.
    """
    # Build a row mask for the requested lender(s); None means "keep all".
    if set > 3:
        row_mask = data_df['respondent id'] == set
    elif set == 2:
        row_mask = data_df['respondent id'].isin([Quicken, AMEC, Guaranteed])
    elif set == 1:
        row_mask = data_df['respondent id'].isin([USBank, Wells, Bell])
    else:
        row_mask = None

    chosen = data_df if row_mask is None else data_df.loc[row_mask, :]

    # Target is "action modified"; the lender id itself is not a feature.
    target = chosen["action modified"]
    features = chosen.drop(["action modified", "respondent id"], axis=1)

    X_train, X_test, y_train, y_test = train_test_split(features, target, random_state=1)

    FtestMutualInfo(features, target)

    return LinRegress(X_train, y_train, X_test, y_test)

In [7]:
def AssignData2(set, data_df, test_df):
    """Score a linear regression trained and tested on different data sets.

    The same lender selection is applied to BOTH ``data_df`` (training rows)
    and ``test_df`` (testing rows). ``set`` chooses the lender group (the name
    shadows the builtin but is kept so existing callers keep working):

    * ``set > 3``  -- only rows whose 'respondent id' equals ``set``
    * ``set == 2`` -- the online lenders (Quicken, AMEC, Guaranteed)
    * ``set == 1`` -- the banks (USBank, Wells, Bell)
    * otherwise (e.g. ``0``) -- every row, i.e. all lenders

    Returns the score produced by ``LinRegress``.
    """
    # Work out which respondent ids to keep; None means "no filtering".
    if set > 3:
        wanted_ids = [set]
    elif set == 2:
        wanted_ids = [Quicken, AMEC, Guaranteed]
    elif set == 1:
        wanted_ids = [USBank, Wells, Bell]
    else:
        wanted_ids = None

    if wanted_ids is not None:
        data_df = data_df.loc[data_df['respondent id'].isin(wanted_ids), :]
        test_df = test_df.loc[test_df['respondent id'].isin(wanted_ids), :]

    non_feature_cols = ["action modified", "respondent id"]

    # Features (X) and target (y) for training.
    y_train = data_df["action modified"]
    X_train = data_df.drop(non_feature_cols, axis=1)

    # Features (X) and target (y) for testing.
    y_test = test_df["action modified"]
    X_test = test_df.drop(non_feature_cols, axis=1)

    return LinRegress(X_train, y_train, X_test, y_test)

In [8]:
def LenderTest(test_data):
    """Run the nine lender-group tests on one version of the data.

    Each test calls ``AssignData`` with a different selector; the scores are
    returned as a list in this order:

    0 all data, 1 all banks, 2 all online lenders,
    3 USBank, 4 Wells, 5 Bell,
    6 Quicken, 7 AMEC, 8 Guaranteed
    """
    selectors = (
        0,                          # all data
        1,                          # all banks
        2,                          # all online lenders
        USBank, Wells, Bell,        # each bank on its own
        Quicken, AMEC, Guaranteed,  # each online lender on its own
    )
    return [AssignData(selector, test_data) for selector in selectors]

In [9]:
def LenderTest2(train_data, test_data):
    """Run the nine lender-group tests with separate train and test data.

    FOR USE WHEN TRAINING AND TESTING ON DIFFERENT SETS. Each test calls
    ``AssignData2`` with a different selector; the scores are returned as a
    list in this order:

    0 all data, 1 all banks, 2 all online lenders,
    3 USBank, 4 Wells, 5 Bell,
    6 Quicken, 7 AMEC, 8 Guaranteed
    """
    selectors = (
        0,                          # all data
        1,                          # all banks
        2,                          # all online lenders
        USBank, Wells, Bell,        # each bank on its own
        Quicken, AMEC, Guaranteed,  # each online lender on its own
    )
    return [AssignData2(selector, train_data, test_data) for selector in selectors]

# Main body of code
Read in the data file.

Prepare the training data sets.

In [10]:
file = "AllData2017.csv"

# Read the loan records and keep only the columns used by the models,
# in this fixed order.
loan_data = pd.read_csv(file).loc[:, [
    "respondent id", "agency code", "property type", "loan purpose",
    "loan amount", "county", "applicant race 1", "sex", "hoepa status",
    "lien status", "loan type modified", "action modified",
    "income cleaned", "income loan ratio",
]]

# Training sets that each leave out exactly one field under study.
sexless_data = loan_data.drop(columns=["sex"])
raceless_data = loan_data.drop(columns=["applicant race 1"])
loantypeless_data = loan_data.drop(columns=["loan type modified"])
incomeratioless_data = loan_data.drop(columns=["income loan ratio"])


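Prepare the single-group test data sets: one subset per sex, race, loan type and income-ratio band, each with the grouping column removed.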

In [11]:
# Single-gender subsets, with the "sex" column removed.
female_data = loan_data.loc[loan_data["sex"] == 2].drop(columns=["sex"])
male_data = loan_data.loc[loan_data["sex"] == 1].drop(columns=["sex"])

# Single-race subsets, with the "applicant race 1" column removed.
indian_data = loan_data.loc[loan_data["applicant race 1"] == 1].drop(columns=["applicant race 1"])
asian_data = loan_data.loc[loan_data["applicant race 1"] == 2].drop(columns=["applicant race 1"])
black_data = loan_data.loc[loan_data["applicant race 1"] == 3].drop(columns=["applicant race 1"])
hawaii_data = loan_data.loc[loan_data["applicant race 1"] == 4].drop(columns=["applicant race 1"])
white_data = loan_data.loc[loan_data["applicant race 1"] == 5].drop(columns=["applicant race 1"])

# Single-loan-type subsets, with the "loan type modified" column removed.
govtbacked_data = loan_data.loc[loan_data["loan type modified"] == 0].drop(columns=["loan type modified"])
conventional_data = loan_data.loc[loan_data["loan type modified"] == 1].drop(columns=["loan type modified"])

# Income-ratio bands, with the "income loan ratio" column removed
# (income ratio = (income / loan amount) * 100).
#   low:  ratio <= 100      med: 100 < ratio < 200      high: ratio >= 200
lowincomeratio_data = (
    loan_data.loc[loan_data["income loan ratio"] <= 100]
    .drop(columns=["income loan ratio"])
)
medincomeratio_data = (
    loan_data.loc[(loan_data["income loan ratio"] > 100) & (loan_data["income loan ratio"] < 200)]
    .drop(columns=["income loan ratio"])
)
highincomeratio_data = (
    loan_data.loc[loan_data["income loan ratio"] >= 200]
    .drop(columns=["income loan ratio"])
)

# Run tests and append lender sets to the dataframe
For each training group, first run with the same data used for both training and testing.

Next, run with the training data and each single-group test data set.

In [12]:
# One row per lender grouping, in the same order LenderTest returns its scores;
# the first column holds the scores obtained with every field present.
result_df = pd.DataFrame(
    index=["all", "banks", "online", "usbank", "wells", "bell", "quicken", "amec", "guaranteed"]
).assign(all_fields=LenderTest(loan_data))

# #results for the effect of gender
# result_df["sexless"]=LenderTest(sexless_data)
# result_df["female"]=LenderTest2(sexless_data,female_data)
# result_df["male"]=LenderTest2(sexless_data,male_data)

# #results for the effect of race
# result_df["raceless"]=LenderTest(raceless_data)
# result_df["indian"]=LenderTest2(raceless_data,indian_data)
# result_df["asian"]=LenderTest2(raceless_data,asian_data)
# result_df["black"]=LenderTest2(raceless_data,black_data)
# result_df["hawaiian"]=LenderTest2(raceless_data,hawaii_data)
# result_df["white"]=LenderTest2(raceless_data,white_data)

# #results for the effect of loan type
# result_df["loantypeless"]=LenderTest(loantypeless_data)
# result_df["govtbacked"]=LenderTest2(loantypeless_data,govtbacked_data)
# result_df["conventional"]=LenderTest2(loantypeless_data,conventional_data)

# # results for the effect of income / loan ratio 
# result_df["incomeratioless"]=LenderTest(incomeratioless_data)
# result_df["low income ratio"]=LenderTest2(incomeratioless_data,lowincomeratio_data)
# result_df["med income ratio"]=LenderTest2(incomeratioless_data,medincomeratio_data)
# result_df["high income ratio"]=LenderTest2(incomeratioless_data,highincomeratio_data)
# result_df

   agency code  property type  loan purpose  loan amount  county  \
0            9              1             3           43     109   
1            9              1             1           53      33   

   applicant race 1  sex  hoepa status  lien status  loan type modified  \
0                 5    2             2            1                   1   
1                 5    2             2            1                   0   

   income cleaned  income loan ratio  
0              41                105  
1               0                  0  
f_test [1.37084300e+03 1.38599479e+02 2.86100150e+03 5.86133086e+02
 4.41096455e+00 1.71718082e+00 2.20435968e+02            nan
 1.33548457e+03 9.33953622e+00 1.50284004e+02 1.01642297e+00]
Pval [6.81720989e-296 5.96284360e-032 0.00000000e+000 1.03051087e-128
 3.57138464e-002 1.90062607e-001 9.37122566e-050             nan
 2.02629986e-288 2.24385320e-003 1.69243409e-034 3.13374075e-001]


  corr /= X_norms
  F = corr ** 2 / (1 - corr ** 2) * degrees_of_freedom
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)


mi [0.01954932 0.00169434 0.03766357 0.007057   0.         0.00130538
 0.00572312 0.00074743 0.01476851 0.         0.00801359 0.01327502]
---------------------------------
   agency code  property type  loan purpose  loan amount  county  \
0            9              1             3           43     109   
1            9              1             1           53      33   

   applicant race 1  sex  hoepa status  lien status  loan type modified  \
0                 5    2             2            1                   1   
1                 5    2             2            1                   0   

   income cleaned  income loan ratio  
0              41                105  
1               0                  0  
f_test [1.39122804e+03 1.41288261e+02 2.09168086e+03 5.13262970e+02
 6.75018050e-01 4.49626208e+01 6.65976748e+01            nan
 1.38246627e+03 4.39049762e+01 1.39408037e+02 7.30893725e+00]
Pval [2.74026524e-298 1.62176564e-032 0.00000000e+000 9.00117254e-113
 4.11313479e-001 2.

  corr /= X_norms
  F = corr ** 2 / (1 - corr ** 2) * degrees_of_freedom
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)


mi [0.02744156 0.00272106 0.04817369 0.01105474 0.00087717 0.00855039
 0.00584612 0.         0.01987591 0.         0.00325273 0.01563627]
---------------------------------
       agency code  property type  loan purpose  loan amount  county  \
32971            7              1             3          300      19   
32972            7              1             1          147      13   

       applicant race 1  sex  hoepa status  lien status  loan type modified  \
32971                 5    1             2            1                   1   
32972                 5    1             2            1                   0   

       income cleaned  income loan ratio  
32971             170                176  
32972              40                368  
f_test [         nan   0.99384046 819.50402253  72.41799899   6.50179656
 164.2633322  196.57295934          nan   8.38009787 135.0780899
  13.16643845   6.52124955]
Pval [            nan 3.18821562e-001 1.50405697e-175 1.90361302e-017
 1.07863

  corr /= X_norms
  F = corr ** 2 / (1 - corr ** 2) * degrees_of_freedom
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)


mi [0.00974434 0.         0.03459682 0.         0.         0.01582562
 0.01553463 0.         0.         0.00595611 0.         0.00750455]
---------------------------------
       agency code  property type  loan purpose  loan amount  county  \
14996            9              1             1          305      53   
14997            9              1             1          157     123   

       applicant race 1  sex  hoepa status  lien status  loan type modified  \
14996                 5    1             2            1                   0   
14997                 2    1             2            1                   0   

       income cleaned  income loan ratio  
14996              95                321  
14997              40                392  
f_test [           nan 2.17905038e+01 6.56509594e+02 2.34078039e+02
 3.91954978e-01 1.37222538e+01 6.28971370e+01            nan
 2.99319429e+02 4.35454825e+00 6.52436125e+01 1.67545905e+01]
Pval [            nan 3.07799926e-006 1.41826332e-140

  corr /= X_norms
  F = corr ** 2 / (1 - corr ** 2) * degrees_of_freedom
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)


mi [0.0011953  0.         0.03711634 0.00630931 0.00090834 0.0078656
 0.         0.         0.         0.         0.0175757  0.0201809 ]
---------------------------------
   agency code  property type  loan purpose  loan amount  county  \
0            9              1             3           43     109   
1            9              1             1           53      33   

   applicant race 1  sex  hoepa status  lien status  loan type modified  \
0                 5    2             2            1                   1   
1                 5    2             2            1                   0   

   income cleaned  income loan ratio  
0              41                105  
1               0                  0  
f_test [           nan 7.87434918e+01 4.79052009e+02 1.33944393e+02
 1.13179788e-01 3.19847060e+01 2.46875909e+01            nan
 6.95962037e+02 9.52472637e+00 6.80946487e+01 4.40955439e+01]
Pval [            nan 7.86002006e-019 1.47929002e-104 7.59815777e-031
 7.36557341e-001 1.5

  corr /= X_norms
  F = corr ** 2 / (1 - corr ** 2) * degrees_of_freedom
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)


mi [0.         0.00230866 0.02801974 0.00597971 0.00115568 0.01673973
 0.01667615 0.0004033  0.01536702 0.         0.00248658 0.02008433]
---------------------------------
       agency code  property type  loan purpose  loan amount  county  \
25690            3              1             1          213       3   
25691            3              1             3          362      19   

       applicant race 1  sex  hoepa status  lien status  loan type modified  \
25690                 2    1             2            1                   1   
25691                 5    1             2            1                   1   

       income cleaned  income loan ratio  
25690               0                  0  
25691               0                  0  
f_test [           nan 1.50015406e-01 1.33979572e+02 1.44028184e+01
 1.19088040e+00 4.01768779e-01 6.46107877e-02            nan
 5.91079484e+00 1.52404608e+01 8.54703522e+00 5.06877829e+01]
Pval [           nan 6.98531960e-01 1.02423372e-30 1.

  corr /= X_norms
  F = corr ** 2 / (1 - corr ** 2) * degrees_of_freedom
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)


mi [0.         0.01297201 0.01194903 0.00014996 0.0059416  0.
 0.         0.00771447 0.         0.         0.         0.00144829]
---------------------------------
       agency code  property type  loan purpose  loan amount  county  \
38324            7              1             3          181       3   
38325            7              1             3          422      53   

       applicant race 1  sex  hoepa status  lien status  loan type modified  \
38324                 6    3             2            1                   0   
38325                 6    3             2            1                   1   

       income cleaned  income loan ratio  
38324             101                179  
38325             238                177  
f_test [         nan          nan  36.63749587   0.33156315   0.88955416
   0.59190234   3.48708513          nan          nan 115.07960029
   0.23806843   3.76502041]
Pval [           nan            nan 1.48384623e-09 5.64755560e-01
 3.45625348e-01 4.4

  corr /= X_norms
  F = corr ** 2 / (1 - corr ** 2) * degrees_of_freedom
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)


mi [0.         0.         0.         0.00497192 0.         0.
 0.         0.         0.         0.00018479 0.00420498 0.        ]
---------------------------------
       agency code  property type  loan purpose  loan amount  county  \
32971            7              1             3          300      19   
32972            7              1             1          147      13   

       applicant race 1  sex  hoepa status  lien status  loan type modified  \
32971                 5    1             2            1                   1   
32972                 5    1             2            1                   0   

       income cleaned  income loan ratio  
32971             170                176  
32972              40                368  
f_test [        nan 33.59494947 13.98480755  2.87074555  0.17699363  2.59972592
  0.50818004         nan         nan 19.93802552  2.04164682 30.01253525]
Pval [           nan 7.17491970e-09 1.86219907e-04 9.02612662e-02
 6.73986596e-01 1.06941169e-01 4

  corr /= X_norms
  F = corr ** 2 / (1 - corr ** 2) * degrees_of_freedom
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)


mi [0.         0.00715909 0.         0.         0.         0.
 0.00200708 0.         0.02281962 0.00426166 0.         0.        ]
---------------------------------
       agency code  property type  loan purpose  loan amount  county  \
46705            7              1             3          450      53   
46706            7              1             3          175      53   

       applicant race 1  sex  hoepa status  lien status  loan type modified  \
46705                 5    1             2            1                   1   
46706                 5    2             2            1                   1   

       income cleaned  income loan ratio  
46705             345                130  
46706             187                 94  
f_test [        nan         nan 28.2968896   9.24480657  1.54162222  0.07433243
  7.27293561         nan  2.20356287  9.01981418 15.91396496 43.15042408]
Pval [           nan            nan 1.22086979e-07 2.40808458e-03
 2.14596744e-01 7.85172734e-01 7

  corr /= X_norms
  F = corr ** 2 / (1 - corr ** 2) * degrees_of_freedom
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)


mi [0.         0.         0.01897966 0.         0.         0.0113234
 0.00919095 0.         0.         0.         0.03751307 0.        ]
---------------------------------
