# Random Forest
Reads in the data file and organizes it into 4 dataframes: one combining all 3 lenders of interest, and one for each individual lender (US Bank, Wells Fargo, and Quicken Loans).

Runs a Random Forest model on each dataframe and writes the following outputs to CSV files:

    confusion matrix by quadrant (true-, false+, false-, true+)/result count/lender(all, usbank, wells, quicken)

    confusion matrix by quadrant with percent (true-, false+, false-, true+)/ percent of total / lender

    model score / lender

    feature importance for the given variables score / feature

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

respondent IDs for each lender

In [2]:
# Respondent IDs identifying each lender of interest.
# Traditional banks
USBank = 504713  # US Bank
Wells = 451965  # Wells Fargo
# Online lender
Quicken = 7197000003  # Quicken Loan

In [3]:
def ConfusionSpread(lender, cnf_matrix):
    """Flatten a 2x2 confusion matrix into a four-row dataframe of counts.

    Each row is [quadrant label, count, lender], in the order
    true-, false+, false-, true+. The columns keep pandas' default
    integer labels (0, 1, 2).
    """
    # (label, row index, column index) for every quadrant of the matrix.
    quadrants = (
        ('true-', 0, 0),
        ('false+', 0, 1),
        ('false-', 1, 0),
        ('true+', 1, 1),
    )
    rows = [[label, cnf_matrix[r][c], lender] for label, r, c in quadrants]
    return pd.DataFrame(rows)

In [4]:
def ConfusionPercent(lender, cnf_matrix):
    """Flatten a 2x2 confusion matrix into a four-row dataframe of fractions.

    Each row is [quadrant label, share of all results, lender], in the
    order true-, false+, false-, true+. The share is the quadrant count
    divided by the sum of the four quadrants, so it lies between 0 and 1
    (it is not multiplied by 100). The columns keep pandas' default
    integer labels (0, 1, 2).
    """
    # Named `total` rather than `sum` so the builtin is not shadowed.
    total = (cnf_matrix[0][0] + cnf_matrix[0][1]
             + cnf_matrix[1][0] + cnf_matrix[1][1])
    cnf_df = pd.DataFrame([
        ['true-', cnf_matrix[0][0] / total, lender],
        ['false+', cnf_matrix[0][1] / total, lender],
        ['false-', cnf_matrix[1][0] / total, lender],
        ['true+', cnf_matrix[1][1] / total, lender],
    ])
    return cnf_df

In [5]:
def RForest(lender, X_train, y_train, X_test, y_test, output):
    """Train a Random Forest classifier and return one requested result.

    A 50-tree RandomForestClassifier is fitted on the training data. No
    random_state is passed to it, so every call fits a new, unseeded
    forest.

    Parameters:
        lender: label written into the returned rows (e.g. 'all').
        X_train, y_train: training features and target.
        X_test, y_test: testing features and target.
        output: selects what is returned:
            0 - confusion matrix counts (dataframe from ConfusionSpread)
            1 - confusion matrix fractions (dataframe from ConfusionPercent)
            2 - one-row dataframe [test score, lender]
            3 - list of (importance, feature name) tuples, sorted from
                most to least important

    Raises:
        ValueError: if output is not one of 0, 1, 2, 3.
    """
    # Fail before the expensive fit instead of silently returning None.
    if output not in (0, 1, 2, 3):
        raise ValueError(
            "output must be 0, 1, 2 or 3, got {!r}".format(output))

    # Create and fit the Random Forest classifier.
    RFmodel = RandomForestClassifier(n_estimators=50)
    RFmodel = RFmodel.fit(X_train, y_train)

    if output == 3:
        # sklearn computes feature importances as part of fitting.
        feature_names = X_train.columns
        return sorted(zip(RFmodel.feature_importances_, feature_names),
                      reverse=True)

    if output == 2:
        # Validate the model using the testing data.
        test_score = RFmodel.score(X_test, y_test)
        return pd.DataFrame([[test_score, lender]])

    # Both remaining outputs are views of the same confusion matrix.
    y_pred = RFmodel.predict(X_test)
    cnf_matrix = metrics.confusion_matrix(y_test, y_pred)
    if output == 0:
        return ConfusionSpread(lender, cnf_matrix)
    return ConfusionPercent(lender, cnf_matrix)

In [6]:
def AssignData(lender, data_df, output):
    """Split data_df into train/test sets and run RForest on them.

    The target is the "action modified" column; the features are every
    other column except "respondent id". The arguments lender and output
    are passed straight through to RForest, whose result is returned.
    """
    target_column = "action modified"
    features = data_df.drop(labels=[target_column, "respondent id"],
                            axis="columns")
    target = data_df[target_column]

    # A fixed random_state keeps the split the same on every call; the
    # split is not stratified on the target.
    split = train_test_split(features, target, random_state=1)
    X_train, X_test, y_train, y_test = split

    return RForest(lender, X_train, y_train, X_test, y_test, output)

# main body of code 
read in data file

prepare training data sets

In [7]:
# Load the loan file and keep only the columns used below.
file = "AllData2017.csv"
loan_data = pd.read_csv(file)
loan_data = loan_data[[
    "respondent id",
    "agency code",
    "property type",
    "loan purpose",
    "loan amount",
    "applicant race 1",
    "sex",
    "lien status",
    "loan type modified",
    "action modified",
    "income cleaned",
    "income loan ratio",
]]

# One dataframe with the three lenders combined ...
lender_df = loan_data[
    (loan_data['respondent id'] == USBank)
    | (loan_data['respondent id'] == Wells)
    | (loan_data['respondent id'] == Quicken)
]
# ... and one dataframe per individual lender.
usbank_df = loan_data[loan_data['respondent id'] == USBank]
wells_df = loan_data[loan_data['respondent id'] == Wells]
quicken_df = loan_data[loan_data['respondent id'] == Quicken]



# run tests and append results to dataframe
For each lender, train and test the model.
The results are:

confusion matrix dataframe

confusion matrix percent dataframe

test score dataframe


In [8]:
# Selector codes passed as the `output` argument of AssignData.
cnf_out = 0
cnf_percent_out = 1
success_score = 2
feature_result = 3

# Compute confusion matrix counts for the combined data and each lender.
# DataFrame.append was removed in pandas 2.0, so the per-lender frames
# are concatenated in a single pd.concat call instead.
cnf_out_df = pd.concat(
    [
        AssignData('all', lender_df, cnf_out),
        AssignData('usbank', usbank_df, cnf_out),
        AssignData('wells', wells_df, cnf_out),
        AssignData('quicken', quicken_df, cnf_out),
    ],
    ignore_index=True,
)
cnf_out_df = cnf_out_df.rename(
    columns={0: 'quadrant', 1: 'cnf_result', 2: 'lender'})
cnf_out_df

Unnamed: 0,quadrant,cnf_result,lender
0,true-,504,all
1,false+,1447,all
2,false-,704,all
3,true+,5863,all
4,true-,228,usbank
5,false+,407,usbank
6,false-,243,usbank
7,true+,1796,usbank
8,true-,196,wells
9,false+,505,wells


In [9]:
# Compute confusion matrix fractions for the combined data and each lender.
# DataFrame.append was removed in pandas 2.0, so the per-lender frames
# are concatenated in a single pd.concat call instead.
cnf_percent_out_df = pd.concat(
    [
        AssignData('all', lender_df, cnf_percent_out),
        AssignData('usbank', usbank_df, cnf_percent_out),
        AssignData('wells', wells_df, cnf_percent_out),
        AssignData('quicken', quicken_df, cnf_percent_out),
    ],
    ignore_index=True,
)
cnf_percent_out_df = cnf_percent_out_df.rename(
    columns={0: 'quadrant', 1: 'cnf_percent', 2: 'lender'})
cnf_percent_out_df

Unnamed: 0,quadrant,cnf_percent,lender
0,true-,0.059169,all
1,false+,0.169876,all
2,false-,0.079361,all
3,true+,0.691594,all
4,true-,0.081526,usbank
5,false+,0.155946,usbank
6,false-,0.091997,usbank
7,true+,0.670531,usbank
8,true-,0.053081,wells
9,false+,0.133902,wells


In [10]:
# Compute the model test score for the combined data and each lender.
# DataFrame.append was removed in pandas 2.0, so the per-lender frames
# are concatenated in a single pd.concat call instead.
success_score_df = pd.concat(
    [
        AssignData('all', lender_df, success_score),
        AssignData('usbank', usbank_df, success_score),
        AssignData('wells', wells_df, success_score),
        AssignData('quicken', quicken_df, success_score),
    ],
    ignore_index=True,
)
success_score_df = success_score_df.rename(columns={0: 'score', 1: 'lender'})
success_score_df

Unnamed: 0,score,lender
0,0.748885,all
1,0.753927,usbank
2,0.804748,wells
3,0.683683,quicken


In [11]:
# Compute and store the importance assigned to each variable used in
# training, for the combined lender data only.
# For this output AssignData returns a list of (importance, feature)
# tuples, so the dataframe is built from it directly; DataFrame.append
# was removed in pandas 2.0.
feature_df = pd.DataFrame(AssignData('all', lender_df, feature_result))
feature_df = feature_df.rename(columns={0: "score", 1: "feature"})

feature_df



Unnamed: 0,score,feature
0,0.296801,income loan ratio
1,0.288618,loan amount
2,0.256379,income cleaned
3,0.053255,loan purpose
4,0.032131,sex
5,0.028329,applicant race 1
6,0.016606,agency code
7,0.01283,lien status
8,0.010802,loan type modified
9,0.004248,property type


In [None]:
# Write the confusion matrix counts to disk, without the row index.
cnf_out_df.to_csv(
    'RFconf.csv',
    index=False,
    header=True,
    encoding='utf-8',
)

In [None]:
# Write the confusion matrix fractions to disk, without the row index.
cnf_percent_out_df.to_csv(
    'RFconfpercent.csv',
    index=False,
    header=True,
    encoding='utf-8',
)

In [None]:
# Write the per-lender model scores to disk, without the row index.
success_score_df.to_csv(
    'RFscore.csv',
    index=False,
    header=True,
    encoding='utf-8',
)

In [12]:
# Write the feature importances to disk, without the row index.
feature_df.to_csv(
    'RFfeatures.csv',
    index=False,
    header=True,
    encoding='utf-8',
)