# Linear Regression
Read the loan data file into a DataFrame and run a linear regression on the selected variables.

Train the model, retrieve the coefficients for the selected variables, and write them to a data file.

Output format: one row per variable, giving its coefficient and the variable name (coefficient / variable).

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn import metrics
from sklearn.feature_selection import f_regression, mutual_info_regression
from sklearn.metrics import mean_squared_error, r2_score

Respondent IDs for each lender.

In [2]:
# Respondent IDs of the banks
USBank, Wells, Bell = (
    504713,  # US Bank
    451965,  # Wells Fargo
    19581,   # Bell Bank
)

# Respondent IDs of the online lenders
Quicken, AMEC, Guaranteed = (
    7197000003,  # Quicken Loans
    411941324,   # American Mortgage & Equity Consultants Inc.
    364327855,   # Guaranteed Rate
)

In [3]:
def FtestMutualInfo(X, y):
    """Print F-test and mutual-information scores of the features in X against y.

    The F-test in regression compares the fits of different linear models.
    Its null hypothesis is that the independent variable has no significance;
    when the p-value is below the chosen significance level the null
    hypothesis can be rejected, i.e. the model fits better than an
    intercept-only model. Mutual information, by contrast, can capture any
    kind of dependency between variables.

    Parameters
    ----------
    X : feature data, handed unchanged to ``f_regression`` and
        ``mutual_info_regression``.
    y : target values.

    Returns
    -------
    None. The scores are only printed.
    """
    f_scores, p_values = f_regression(X, y)
    print("f_test", f_scores)
    print("Pval", p_values)

    mutual_info = mutual_info_regression(X, y)
    print("mi", mutual_info)
    print("---------------------------------")

In [4]:
def LinRegress(X_train, y_train,X_test,y_test):
    """Fit a LinearRegression model and return its coefficients, ranked.

    Parameters
    ----------
    X_train : DataFrame-like
        Training features. Its ``columns`` attribute supplies the feature
        names paired with the fitted coefficients.
    y_train : array-like
        Training target.
    X_test, y_test :
        Kept in the signature so existing callers keep working, but not used.
        Earlier versions ran ``predict`` and ``score`` on them and discarded
        both results, which only cost time.

    Returns
    -------
    list of (coefficient, feature_name) tuples
        Sorted in descending order, i.e. largest coefficient first (ties are
        broken by feature name, also descending).
    """
    model = LinearRegression()
    model.fit(X_train, y_train)

    # Pair every coefficient with its column label and rank them high-to-low.
    return sorted(zip(model.coef_, X_train.columns), reverse=True)


In [5]:
def AssignData(set, data_df):
    """Choose a lender group, split it into train/test data and run LinRegress.

    ``set`` selects rows of ``data_df`` by their 'respondent id':
      * ``set == 1``  -> only USBank, Wells and Quicken
      * ``set > 2``   -> only the lender whose respondent id equals ``set``
      * anything else (e.g. 0) -> every row, i.e. all lenders

    The 'action modified' column is the target; every remaining column except
    'respondent id' is a feature. The split uses ``random_state=1``.

    Returns whatever ``LinRegress`` returns for the resulting split.
    """
    # Narrow the data to a subset of lenders or to one particular lender.
    if set > 2:
        chosen = data_df[data_df['respondent id'] == set]
    elif set == 1:
        chosen = data_df[data_df['respondent id'].isin([USBank, Wells, Quicken])]
    else:
        chosen = data_df

    # Separate the target (y) from the features (X).
    target = chosen["action modified"]
    features = chosen.drop(["action modified", "respondent id"], axis=1)

    split = train_test_split(features, target, random_state=1)
    train_features, test_features, train_target, test_target = split

    return LinRegress(train_features, train_target, test_features, test_target)

# main body of code 
Read in the data file.

Prepare the training data set by keeping only the columns used in the regression.

In [6]:
# Input CSV with the loan records.
file = "AllData2017.csv"

# Read the whole file, then keep only the lender id, the target
# ("action modified") and the candidate predictor columns.
loan_data = pd.read_csv(file).loc[:, [
    "respondent id",
    "agency code",
    "property type",
    "loan purpose",
    "loan amount",
    "county",
    "applicant race 1",
    "sex",
    "hoepa status",
    "lien status",
    "loan type modified",
    "action modified",
    "income cleaned",
    "income loan ratio",
]]


# run test


In [7]:
# AssignData(1, ...) fits the model on the USBank / Wells / Quicken subset and
# returns (coefficient, feature name) tuples, largest coefficient first.
# Build the frame in one step: DataFrame.append was deprecated in pandas 1.4
# and removed in pandas 2.0, so the old append-then-rename form no longer runs.
feature_df = pd.DataFrame(AssignData(1, loan_data), columns=["score", "feature"])

feature_df

Unnamed: 0,score,feature
0,0.08906281,loan type modified
1,0.02383197,applicant race 1
2,0.000833723,agency code
3,5.820872e-05,income cleaned
4,4.503111e-05,loan amount
5,3.241338e-05,county
6,9.714451000000001e-17,hoepa status
7,-0.0003307327,income loan ratio
8,-0.02954571,sex
9,-0.08670141,loan purpose


In [None]:
# Write the coefficient table to CSV, with a header row and without the index column.
feature_df.to_csv(
    'LinRfeatures.csv',
    index=False,
    header=True,
    encoding='utf-8',
)
