In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn import metrics
from sklearn.feature_selection import f_regression, mutual_info_regression
from sklearn.metrics import mean_squared_error, r2_score

Respondent IDs identifying each lender.

In [2]:
# Respondent IDs of the traditional banks.
USBank = 504713   # US Bank
Wells = 451965    # Wells Fargo
Bell = 19581      # Bell Bank

# Respondent IDs of the online lenders.
Quicken = 7197000003     # Quicken Loan
AMEC = 411941324         # American Mortgage & Equity Consultants Inc.
Guaranteed = 364327855   # Guaranteed Rate

In [3]:
def FtestMutualInfo(X, y, random_state=1):
    """Print univariate F-test and mutual-information scores for each feature.

    The F-test in regression compares the fits of different linear models.
    Null hypothesis: there is no significance to the independent variable.
    If the p-value of the F-test is less than your significance level, you
    can reject the null hypothesis and conclude that the model provides a
    better fit than the intercept-only model.

    Mutual information can capture any kind of dependency between variables,
    not only linear ones.

    Parameters
    ----------
    X : feature data, passed unchanged to both scorers.
    y : target values, passed unchanged to both scorers.
    random_state : seed forwarded to ``mutual_info_regression``. That
        estimator is randomized, so without a seed the printed scores differ
        from run to run. Defaults to 1, the same seed this file uses for
        ``train_test_split``. Pass ``None`` for unseeded behaviour.

    Returns
    -------
    None. The scores are only printed.
    """
    f_test, pval = f_regression(X, y)
    print("f_test",f_test)
    print("Pval",pval)

    mi = mutual_info_regression(X, y, random_state=random_state)
    print("mi",mi)
    print("---------------------------------")

    return

In [11]:
def LinRegress(X_train, y_train, X_test, y_test, return_score=False):
    """Fit a LinearRegression on the training data and return its coefficients.

    Parameters
    ----------
    X_train, y_train : data the model is fitted on.
    X_test, y_test : held-out data. Only used when ``return_score`` is True.
    return_score : when True, also score the fitted model on the test data
        with ``LinearRegression.score`` and return that value alongside the
        coefficients. Defaults to False, which skips the scoring pass
        entirely (the score was previously computed and then discarded).

    Returns
    -------
    The fitted model's ``coef_`` array, or the tuple ``(coef_, test_score)``
    when ``return_score`` is True.
    """
    # create model and fit it with the training data
    LRmodel = LinearRegression()
    LRmodel.fit(X_train, y_train)

    model_coeff = LRmodel.coef_

    if return_score:
        # validate the model using the testing data
        return model_coeff, LRmodel.score(X_test, y_test)

    return model_coeff


In [5]:
def AssignData(set, data_df):
    """Select a lender group from data_df, split it, and run LinRegress on it.

    Parameters
    ----------
    set : lender selector.
        * ``set > 2``  - keep only rows whose 'respondent id' equals ``set``.
        * ``set == 1`` - keep rows for USBank, Wells and Quicken.
        * anything else (e.g. 0) - use all of ``data_df``, i.e. all lenders.
    data_df : DataFrame containing the 'respondent id' and 'action modified'
        columns plus the feature columns.

    Returns
    -------
    Whatever LinRegress returns for the train/test split of the selection.
    """
    # Narrow the data to a single lender or to the three-lender group.
    if set > 2:
        chosen_df = data_df.loc[data_df['respondent id'] == set, :]
    elif set == 1:
        in_group = data_df['respondent id'].isin([USBank, Wells, Quicken])
        chosen_df = data_df.loc[in_group, :]
    else:
        chosen_df = data_df

    # Target is 'action modified'; features are everything except the target
    # and the lender identifier.
    target = chosen_df["action modified"]
    features = chosen_df.drop(["action modified", "respondent id"], axis=1)

    # Fixed seed so the split is reproducible.
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, random_state=1
    )

    return LinRegress(X_train, y_train, X_test, y_test)

In [6]:
def LenderTest(test_data):
    """Run AssignData for each of the 4 lender groups and tabulate the results.

    The groups, in row order, are:
      1. the 3 lenders together (USBank, Wells, Quicken)
      2. USBank
      3. Wells
      4. Quicken

    Parameters
    ----------
    test_data : one version of the test data, passed unchanged to AssignData
        for every group.

    Returns
    -------
    A DataFrame with one row per group: the 12 coefficient columns listed
    below plus a "Lender" column naming the group. Rows carry a default
    0..3 index.
    """
    # Names for the values AssignData returns, in order.
    coeff_columns = [
        "agency code", "property type", "loan purpose", "loan amount",
        "county", "applicant race 1", "sex", "hoepa status", "lien status",
        "loan type modified", "income cleaned", "income loan ratio",
    ]

    # (AssignData selector, label written to the "Lender" column)
    lender_groups = [
        (1, '3 lenders'),
        (USBank, 'usbank'),
        (Wells, 'wells'),
        (Quicken, 'quicken'),
    ]

    # Collect every row first and build the frame once, rather than
    # concatenating onto an empty DataFrame (deprecated in pandas and the
    # cause of every row sharing index 0).
    coeff_rows = [AssignData(selector, test_data) for selector, _ in lender_groups]
    lender_df = pd.DataFrame(coeff_rows, columns=coeff_columns)

    # label each row with its lender group
    lender_df["Lender"] = [label for _, label in lender_groups]
    return lender_df

# Main body of code
Read in the data file.

Prepare the training data sets.

In [7]:
# CSV file holding the loan records.
file = "AllData2017.csv"

# Read the file and keep only the lender id, the target ("action modified")
# and the feature columns used by the regressions.
loan_data = pd.read_csv(file)[[
    "respondent id",
    "agency code",
    "property type",
    "loan purpose",
    "loan amount",
    "county",
    "applicant race 1",
    "sex",
    "hoepa status",
    "lien status",
    "loan type modified",
    "action modified",
    "income cleaned",
    "income loan ratio",
]]



# Run the lender tests


In [12]:
# Run the regressions for the 3-lender group and for USBank, Wells and
# Quicken individually; one row of coefficients per group.
result_df=LenderTest(loan_data)

# Show the coefficient table as the cell output.
result_df

Unnamed: 0,agency code,property type,loan purpose,loan amount,county,applicant race 1,sex,hoepa status,lien status,loan type modified,income cleaned,income loan ratio,Lender
0,0.000834,-0.167691,-0.086701,4.5e-05,3.2e-05,0.023832,-0.029546,9.714451000000001e-17,-0.1703858,0.089063,5.8e-05,-0.000331,3 lenders
0,0.0,-0.1926777,-0.115177,0.000164,-2.3e-05,0.030843,-0.038665,0.0,-0.176475,0.063009,-1e-06,-0.000337,usbank
0,0.0,-0.200697,-0.085322,-5.1e-05,2.2e-05,0.019166,-0.028034,-1.387779e-16,-0.1889868,0.075401,0.000141,-0.000445,wells
0,0.0,-1.06035e-16,-0.037002,0.000128,-5.5e-05,0.004554,-0.0136,2.775558e-17,-6.938894e-18,0.124543,-0.000408,-0.000229,quicken


In [13]:
# Save the coefficient table to CSV with a header row; index=False leaves the
# row index out of the file.
result_df.to_csv('coefficients.csv', index=False,header=True,encoding='utf-8')
