In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn import metrics
from sklearn.feature_selection import f_regression, mutual_info_regression
from sklearn.metrics import mean_squared_error, r2_score

respondent IDs for each lender

In [2]:
# banks
USBank= 504713  # US Bank
Wells=451965  #Wells Fargo
Bell=19581    # Bell Bank

# online lenders
Quicken=7197000003        #Quicken Loan
AMEC=411941324             #American Mortgage & Equity Consultants Inc.
Guaranteed=364327855      # Guaranteed Rate

In [3]:
def FtestMutualInfo(X,y):
# F-test in regression compares the fits of different linear models
# Null Hypothesis – There is no significance to this independent variable
#If the P value for the F-test of overall significance test is less than your significance level, 
# you can reject the null-hypothesis and conclude that your model provides a better fit than the intercept-only model.
# mutual information can capture any kind of dependency between variables
#    print(X.head(2))
    f_test, pval = f_regression(X, y)
    print("f_test",f_test)
    print("Pval",pval)
    
    mi = mutual_info_regression(X, y)
    print("mi",mi)
    print("---------------------------------")
    
    return

In [4]:
def LinRegress(X_train, y_train,X_test,y_test):
#####################################
# use Linear Regression to train and test a model
# returning coefficients

    # create model
    LRmodel = LinearRegression()
    # fit model with training data
    LRmodel.fit(X_train, y_train)

     # make predictions using the testing data
    y_pred=LRmodel.predict(X_test)
        
#     # The coefficients
    model_coeff=LRmodel.coef_
    print('Coefficients: \n', model_coeff)
#     # The mean squared error
#     print("Mean squared error: %.2f" % mean_squared_error(y_test, y_pred))
#     # Explained variance score: 1 is perfect prediction
#     print('Variance score: %.2f' % r2_score(y_test,y_pred))

#     # print params
#    params=LRmodel.get_params(deep=False)
#    print("params=",params)


    # validate the model using testing data
    test_score=LRmodel.score(X_test, y_test)
#    print("test_score",test_score)

    return model_coeff


In [5]:
def LogRegress(X_train, y_train,X_test,y_test):
#####################################
# use Logistic Regression to train and test a model
# returning test score

    # create model
    LRmodel = LogisticRegression()
    # fit model with training data
#     print("X-train",X_train.head())
#     print("y_train",y_train.head())
    LRmodel.fit(X_train, y_train)

       # confusion matrix 
#     y_pred=LRmodel.predict(X_test)
#     cnf_matrix = metrics.confusion_matrix(y_test, y_pred)
#     print("confusion matrix")
#     print(cnf_matrix)
    
    # validate the model using testing data
    test_score=LRmodel.score(X_test, y_test)
    params=LRmodel.get_params(deep=False)
#    print("test_score",test_score)
#    print("params=",params)

    return test_score

In [6]:
def AssignData(set, data_df):
##############################################
# choose the lender group for test
# set = 0 use all of data_df this is all lenders
# set = 1 select from data_df for 'respondent id' in Banks
# set = 2 select from data_df for 'respondent id' in Online
# set > 3 select from data_df for 'respondent id' = set
# split for train and test data
# call LinRegress

    #reduce the data used for set > 0 to a subset of lenders or a particular lender
    if set > 3 :
        data_df=data_df.loc[data_df['respondent id'] == set,:]
    elif set == 2:
        data_df=data_df.loc[(data_df['respondent id'] == Quicken) | (data_df['respondent id'] == AMEC) | (data_df['respondent id'] == Guaranteed), :]
    elif set == 1:
        data_df=data_df.loc[(data_df['respondent id'] == USBank) | (data_df['respondent id'] == Wells) | (data_df['respondent id'] == Bell), :]
           
        # Assign X (data) and y (target)
    X=data_df.drop(["action modified","respondent id"],axis=1)
    y=data_df["action modified"]
#    print("shape", X.shape, y.shape)  , stratify=y

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

#    FtestMutualInfo(X,y)
    
    return LinRegress(X_train, y_train,X_test,y_test)

In [7]:
def AssignData2(set, data_df,test_df):
##############################################
# FOR USE WHEN TRAINING AND TESTING ON DIFFERENT SETS
# choose the lender group for BOTH train and test data
# set = 0 use all of data_df this is all lenders
# set = 1 select from data_df for 'respondent id' in Banks
# set = 2 select from data_df for 'respondent id' in Online
# set > 3 select from data_df for 'respondent id' = set
# call LinRegress


    #reduce the data used for set > 0 to a subset of lenders or a particular lender
    if set > 3 :
        data_df=data_df.loc[data_df['respondent id'] == set,:]
        test_df=test_df.loc[test_df['respondent id'] == set,:]
    elif set == 2:
        data_df=data_df.loc[(data_df['respondent id'] == Quicken) | (data_df['respondent id'] == AMEC) | (data_df['respondent id'] == Guaranteed), :]
        test_df=test_df.loc[(test_df['respondent id'] == Quicken) | (test_df['respondent id'] == AMEC) | (test_df['respondent id'] == Guaranteed), :]
    elif set == 1:
        data_df=data_df.loc[(data_df['respondent id'] == USBank) | (data_df['respondent id'] == Wells) | (data_df['respondent id'] == Bell), :]
        test_df=test_df.loc[(test_df['respondent id'] == USBank) | (test_df['respondent id'] == Wells) | (test_df['respondent id'] == Bell), :]
           
        # Assign X (data) and y (target) for training
    X_train=data_df.drop(["action modified","respondent id"],axis=1)
    y_train=data_df["action modified"]
    
            # Assign X (data) and y (target) for test
    X_test=test_df.drop(["action modified","respondent id"],axis=1)
    y_test=test_df["action modified"]
#    print("shape", X.shape, y.shape)


    return LinRegress(X_train, y_train,X_test,y_test)

In [26]:
def LenderTest(test_data):
#################################################
# goes through the 9 tests of lender groups with one version of the test data
# 0- All data
# 1- all banks
# 2- all online lenders
# 3 USBANK 4- Wells 5- Bell 
# 6- Quicken 7- AMEC 8- Guaranteed
# returns the results in list
############################################################
    lender_df=pd.DataFrame(columns=["agency code","property type","loan purpose","loan amount",\
                                "county","applicant race 1","sex", "hoepa status","lien status",\
                                "loan type modified","income cleaned","income loan ratio"])
    # get result for all data
    result_list=pd.DataFrame([AssignData(0,test_data)],columns=["agency code","property type","loan purpose","loan amount",\
                                "county","applicant race 1","sex", "hoepa status","lien status",\
                                "loan type modified","income cleaned","income loan ratio"])
    print("LenderTest")
    print(result_list)
    lender_df=pd.concat([lender_df, result_list])
    print(lender_df)
    # get result for all banks
#    result_list.append(AssignData(1,test_data))
    #result for all online
#    result_list.append(AssignData(2,test_data))
    # result for USBank, Wells, Bell 
    result_list=pd.DataFrame([AssignData(USBank,test_data)],columns=["agency code","property type","loan purpose","loan amount",\
                                "county","applicant race 1","sex", "hoepa status","lien status",\
                                "loan type modified","income cleaned","income loan ratio"])
    lender_df=pd.concat([lender_df, result_list])
    result_list=pd.DataFrame([AssignData(Wells,test_data)],columns=["agency code","property type","loan purpose","loan amount",\
                                "county","applicant race 1","sex", "hoepa status","lien status",\
                                "loan type modified","income cleaned","income loan ratio"])
    lender_df=pd.concat([lender_df, result_list])
#    result_list.append(AssignData(Bell,test_data))
    #result for Quicken, AMEC, Guaranteed
#     result_list.append(AssignData(Quicken,test_data))
#     result_list.append(AssignData(AMEC,test_data))
#     result_list.append(AssignData(Guaranteed,test_data))
    lender_df["Lender"]=['all_lenders','usbank','wells']
    return lender_df

In [9]:
def LenderTest2(train_data,test_data):
#################################################
# FOR USE WHEN TRAINING AND TESTING ON DIFFERENT SETS
# goes through the 9 tests  with one version of the test data
# 0- All data
# 1- all banks
# 2- all online lenders
# 3 USBANK 4- Wells 5- Bell 
# 6- Quicken 7- AMEC 8- Guaranteed
# returns the results in list
############################################################
    result_list=[]
    # get result for all data
    result_list.append(AssignData2(0,train_data,test_data))
    # get result for all banks
#    result_list.append(AssignData2(1,train_data,test_data))
    #result for all online
#    result_list.append(AssignData2(2,train_data,test_data))
    # result for USBank, Wells, Bell 
    result_list.append(AssignData2(USBank,train_data,test_data))
    result_list.append(AssignData2(Wells,train_data,test_data))
#     result_list.append(AssignData2(Bell,train_data,test_data))
    #result for Quicken, AMEC, Guaranteed
#     result_list.append(AssignData2(Quicken,train_data,test_data))
#     result_list.append(AssignData2(AMEC,train_data,test_data))
#     result_list.append(AssignData2(Guaranteed,train_data,test_data))
    return result_list

# main body of code 
read in data file

prepare training data sets

In [10]:
file="AllData2017.csv"
loan_data=pd.read_csv(file)

loan_data=loan_data[["respondent id","agency code","property type","loan purpose","loan amount","county",\
                     "applicant race 1","sex", "hoepa status","lien status","loan type modified",\
                     "action modified","income cleaned","income loan ratio"]]

# sexless_data=loan_data.drop(["sex"], axis=1)
# raceless_data=loan_data.drop(["applicant race 1"], axis=1)
# loantypeless_data=loan_data.drop(["loan type modified"], axis=1) 
# incomeratioless_data=loan_data.drop(["income loan ratio"], axis=1) 


prepare unique group test data sets 

In [11]:
# single gender  - genderless data
female_data=loan_data[loan_data["sex"] == 2]
female_data=female_data.drop(["sex"],axis=1)
male_data=loan_data[loan_data["sex"] == 1]
male_data=male_data.drop(["sex"],axis=1)

#single race - raceless data
indian_data=loan_data[loan_data["applicant race 1"] == 1]
indian_data=indian_data.drop(["applicant race 1"],axis=1)
asian_data=loan_data[loan_data["applicant race 1"] == 2]
asian_data=asian_data.drop(["applicant race 1"],axis=1)
black_data=loan_data[loan_data["applicant race 1"] == 3]
black_data=black_data.drop(["applicant race 1"],axis=1)
hawaii_data=loan_data[loan_data["applicant race 1"] == 4]
hawaii_data=hawaii_data.drop(["applicant race 1"],axis=1)
white_data=loan_data[loan_data["applicant race 1"] == 5]
white_data=white_data.drop(["applicant race 1"],axis=1)

# single loan type - loantypeless data
govtbacked_data=loan_data[loan_data["loan type modified"] == 0]
govtbacked_data=govtbacked_data.drop(["loan type modified"],axis=1)
conventional_data=loan_data[loan_data["loan type modified"] == 1]
conventional_data=conventional_data.drop(["loan type modified"],axis=1)

# single income ratio group - income ratio less data  (income ratio = (income / loan amount) * 100 )
lowincomeratio_data=loan_data[loan_data["income loan ratio"] <= 100]
lowincomeratio_data=lowincomeratio_data.drop(["income loan ratio"],axis=1)
medincomeratio_data=loan_data[(loan_data["income loan ratio"] > 100) & (loan_data["income loan ratio"] < 200) ]
medincomeratio_data=medincomeratio_data.drop(["income loan ratio"],axis=1)
highincomeratio_data=loan_data[loan_data["income loan ratio"] >= 200]
highincomeratio_data=highincomeratio_data.drop(["income loan ratio"],axis=1)

# run tests and append lender sets to dataframe
for each training group run with same train and test data

next run with training data and unique group test data

In [27]:
#result_df=pd.DataFrame(index=["all","banks","online","usbank","wells","bell","quicken","amec","guaranteed"])
#result_df=pd.DataFrame(index=["all","usbank","wells"],columns=["agency code","property type","loan purpose","loan amount","county",\
#                      "applicant race 1","sex", "hoepa status","lien status","loan type modified",\
#                     "income cleaned","income loan ratio"])


result_df=LenderTest(loan_data)

#result_df["all_fields"]=LenderTest(loan_data)

# #results for the effectof gender 
# result_df["sexless"]=LenderTest(sexless_data)
# result_df["female"]=LenderTest2(sexless_data,female_data)
# result_df["male"]=LenderTest2(sexless_data,male_data)

# #results for the effect of race
# result_df["raceless"]=LenderTest(raceless_data)
# result_df["indian"]=LenderTest2(raceless_data,indian_data)
# result_df["asian"]=LenderTest2(raceless_data,asian_data)
# result_df["black"]=LenderTest2(raceless_data,black_data)
# result_df["hawaiian"]=LenderTest2(raceless_data,hawaii_data)
# result_df["white"]=LenderTest2(raceless_data,white_data)

# #results for the effect of loan type
# result_df["loantypeless"]=LenderTest(loantypeless_data)
# result_df["govtbacked"]=LenderTest2(loantypeless_data,govtbacked_data)
# result_df["conventional"]=LenderTest2(loantypeless_data,conventional_data)

# # results for the effect of income / loan ratio 
# result_df["incomeratioless"]=LenderTest(incomeratioless_data)
# result_df["low income ratio"]=LenderTest2(incomeratioless_data,lowincomeratio_data)
# result_df["med income ratio"]=LenderTest2(incomeratioless_data,medincomeratio_data)
# result_df["high income ratio"]=LenderTest2(incomeratioless_data,highincomeratio_data)
result_df

Coefficients: 
 [-2.10446878e-02 -1.95126033e-01 -8.81049758e-02  5.94675167e-05
 -5.90972071e-05  1.09153911e-02 -3.22002550e-02 -2.77555756e-16
 -1.60400753e-01  5.83617654e-02  2.74727490e-05 -2.84877275e-04]
LenderTest
   agency code  property type  loan purpose  loan amount    county  \
0    -0.021045      -0.195126     -0.088105     0.000059 -0.000059   

   applicant race 1     sex  hoepa status  lien status  loan type modified  \
0          0.010915 -0.0322 -2.775558e-16    -0.160401            0.058362   

   income cleaned  income loan ratio  
0        0.000027          -0.000285  
   agency code  property type  loan purpose  loan amount    county  \
0    -0.021045      -0.195126     -0.088105     0.000059 -0.000059   

   applicant race 1     sex  hoepa status  lien status  loan type modified  \
0          0.010915 -0.0322 -2.775558e-16    -0.160401            0.058362   

   income cleaned  income loan ratio  
0        0.000027          -0.000285  
Coefficients: 
 [ 0.00000

Unnamed: 0,agency code,property type,loan purpose,loan amount,county,applicant race 1,sex,hoepa status,lien status,loan type modified,income cleaned,income loan ratio,Lender
0,-0.021045,-0.195126,-0.088105,5.9e-05,-5.9e-05,0.010915,-0.0322,-2.775558e-16,-0.160401,0.058362,2.7e-05,-0.000285,all_lenders
0,0.0,-0.192678,-0.115177,0.000164,-2.3e-05,0.030843,-0.038665,0.0,-0.176475,0.063009,-1e-06,-0.000337,usbank
0,0.0,-0.200697,-0.085322,-5.1e-05,2.2e-05,0.019166,-0.028034,-1.387779e-16,-0.188987,0.075401,0.000141,-0.000445,wells


In [28]:
result_df.to_csv('coefficients.csv', index=False,header=True,encoding='utf-8')
