In [1]:
import os
import pandas as pd
import numpy as np
from datetime import date
import datetime

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score


In [2]:
perfSample = pd.read_csv('perfSample.csv')
accSample = pd.read_csv('accSample.csv')

In [3]:
print(perfSample.dtypes)

loanID                                  int64
monthlyReportingPeriod                 object
servicerName                           object
currIntRate                           float64
currActualUPB                         float64
loanAge                                 int64
remainingMonthsToLegalMaturity        float64
adjustedMonthsToMaturity              float64
maturityDate                           object
msa                                     int64
currLoanDelinquencyStatus              object
modFlag                                object
zeroBalCode                           float64
zeroBalEffDate                         object
lastPaidInstallmentDate                object
foreclosureDate                        object
dispositionDate                        object
foreclosureCosts                      float64
propertyPreservationAndRepairCosts    float64
assetRecoveryCosts                    float64
miscHoldingExpnAndCredits             float64
assocTaxesForHoldingProperty      

In [4]:
# perfSample.head(2)

In [5]:
# regex , assert position at start of line, match any whitespaces, .
perfSample = perfSample.replace(r'^\s+$', np.nan, regex=True)

In [6]:
# perfSample.head(2)
perfSample = perfSample.loc[:,'loanID':'foreclosureDate']

In [7]:
# replace nan with 0
perfSample['foreclosureDate'] = perfSample['foreclosureDate'].replace(np.nan, 0)

In [8]:
perfSample['foreclosureDate'] = perfSample['foreclosureDate'].apply(lambda x: 1 if x != 0 else 0)

In [9]:
# perfSample.head(2)

In [10]:
accSample.dtypes

loanID                                  int64
origChannel                            object
sellerName                             object
origIntRate                           float64
orgUPB                                  int64
orgLoanTerm                             int64
orginationDate                         object
firstPaymentDate                       object
orgLTV                                float64
orgCombinedLTV                        float64
numBorrowers                          float64
orgDebttoIncomeRatio                  float64
borrowerCreditScoreAtOrigination      float64
firstTimeHomeBuyer                     object
loanPurpose                            object
propType                               object
numUnits                                int64
occType                                object
propertyState                          object
zipCodeShort                            int64
priMortgageInsurancePercent           float64
prodType                          

In [11]:
accSample = accSample.loc[:,'loanID':'prodType']

In [12]:
accSample.head(2)

Unnamed: 0,loanID,origChannel,sellerName,origIntRate,orgUPB,orgLoanTerm,orginationDate,firstPaymentDate,orgLTV,orgCombinedLTV,...,borrowerCreditScoreAtOrigination,firstTimeHomeBuyer,loanPurpose,propType,numUnits,occType,propertyState,zipCodeShort,priMortgageInsurancePercent,prodType
0,100007365142,R,JPMORGAN CHASE BANK NA,8.0,75000,360,12/1999,02/2000,79.0,,...,763.0,N,R,SF,1,P,PA,173,,FRM
1,100007386460,B,JPMORGAN CHASE BANK NA,7.875,55000,180,01/2000,03/2000,69.0,,...,633.0,N,R,CO,1,P,MD,208,,FRM


In [13]:
combinedSample = pd.concat([perfSample, accSample], axis=1)

In [14]:
combinedSample.head(2)

Unnamed: 0,loanID,monthlyReportingPeriod,servicerName,currIntRate,currActualUPB,loanAge,remainingMonthsToLegalMaturity,adjustedMonthsToMaturity,maturityDate,msa,...,borrowerCreditScoreAtOrigination,firstTimeHomeBuyer,loanPurpose,propType,numUnits,occType,propertyState,zipCodeShort,priMortgageInsurancePercent,prodType
0,100007365142,01/01/2000,,8.0,,0,360.0,359.0,01/2030,0,...,763.0,N,R,SF,1,P,PA,173,,FRM
1,100007365142,02/01/2000,,8.0,,1,359.0,358.0,01/2030,0,...,633.0,N,R,CO,1,P,MD,208,,FRM


In [15]:
combinedSample.isna().sum()

loanID                                   0
monthlyReportingPeriod                   0
servicerName                        489161
currIntRate                              0
currActualUPB                        81266
loanAge                                  0
remainingMonthsToLegalMaturity         379
adjustedMonthsToMaturity              4567
maturityDate                           379
msa                                      0
currLoanDelinquencyStatus              172
modFlag                                  0
zeroBalCode                         486490
zeroBalEffDate                      486490
lastPaidInstallmentDate             499827
foreclosureDate                          0
loanID                                   0
origChannel                              0
sellerName                               0
origIntRate                              0
orgUPB                                   0
orgLoanTerm                              0
orginationDate                           0
firstPaymen

In [16]:
combinedSample = combinedSample.drop(['servicerName', 'currActualUPB', 'adjustedMonthsToMaturity', 
                     'orgCombinedLTV', 'numBorrowers', 'orgDebttoIncomeRatio', 
                     'borrowerCreditScoreAtOrigination', 'priMortgageInsurancePercent',
                     'lastPaidInstallmentDate', 'zeroBalEffDate', 'zeroBalCode', 'maturityDate',
                    'monthlyReportingPeriod', 'orginationDate', 'firstPaymentDate', 'loanID'], axis=1)

In [17]:
combinedSample.isna().sum()

currIntRate                         0
loanAge                             0
remainingMonthsToLegalMaturity    379
msa                                 0
currLoanDelinquencyStatus         172
modFlag                             0
foreclosureDate                     0
origChannel                         0
sellerName                          0
origIntRate                         0
orgUPB                              0
orgLoanTerm                         0
orgLTV                              2
firstTimeHomeBuyer                  0
loanPurpose                         0
propType                            0
numUnits                            0
occType                             0
propertyState                       0
zipCodeShort                        0
prodType                            0
dtype: int64

In [18]:
# finding nan, null values
combinedSample[combinedSample.isnull().any(axis=1)]

Unnamed: 0,currIntRate,loanAge,remainingMonthsToLegalMaturity,msa,currLoanDelinquencyStatus,modFlag,foreclosureDate,origChannel,sellerName,origIntRate,...,orgLoanTerm,orgLTV,firstTimeHomeBuyer,loanPurpose,propType,numUnits,occType,propertyState,zipCodeShort,prodType
2956,8.250,41,319.0,22380,,N,1,R,CITIMORTGAGE INC.,8.250,...,360,75.0,N,R,SF,1,P,CA,935,FRM
3586,8.000,12,348.0,17820,,N,1,B,JPMORGAN CHASE BANK NATIONAL ASSOCIATION,9.250,...,360,80.0,N,R,SF,1,P,IL,600,FRM
13021,8.000,40,320.0,26420,,N,1,R,WELLS FARGO BANK N.A.,8.375,...,360,97.0,N,P,SF,1,P,TN,372,FRM
18544,7.875,52,308.0,14460,,N,1,B,JPMORGAN CHASE BANK NA,8.250,...,360,33.0,N,P,SF,1,P,IL,607,FRM
26010,9.875,67,293.0,48620,,N,1,R,OTHER,7.875,...,360,82.0,N,R,SF,1,P,OH,448,FRM
32399,8.550,8,352.0,48660,,N,0,B,JPMORGAN CHASE BANK NA,8.125,...,360,56.0,N,C,SF,1,P,CA,957,FRM
35631,8.500,75,285.0,40060,,N,1,C,JPMORGAN CHASE BANK NA,7.750,...,360,95.0,N,P,PU,1,P,TN,380,FRM
35861,7.625,72,288.0,48620,,N,1,C,OTHER,7.250,...,360,79.0,N,P,SF,1,P,IL,600,FRM
39099,8.500,69,291.0,17460,,N,1,C,JPMORGAN CHASE BANK NA,7.875,...,360,95.0,Y,P,SF,1,P,OK,731,FRM
39953,8.500,26,334.0,33100,,N,1,R,RBC MORTGAGE COMPANY,8.375,...,360,95.0,N,P,SF,1,P,MO,631,FRM


In [19]:
# removing nan, null valued rows
combinedSample['currLoanDelinquencyStatus'] = combinedSample['currLoanDelinquencyStatus'].replace(np.nan, 0)
# combinedSample[combinedSample.isnull().any(axis=1)]
combinedSample = combinedSample.dropna(how='any')
combinedSample.isna().sum()

currIntRate                       0
loanAge                           0
remainingMonthsToLegalMaturity    0
msa                               0
currLoanDelinquencyStatus         0
modFlag                           0
foreclosureDate                   0
origChannel                       0
sellerName                        0
origIntRate                       0
orgUPB                            0
orgLoanTerm                       0
orgLTV                            0
firstTimeHomeBuyer                0
loanPurpose                       0
propType                          0
numUnits                          0
occType                           0
propertyState                     0
zipCodeShort                      0
prodType                          0
dtype: int64

In [20]:
combinedSample = combinedSample[combinedSample.currLoanDelinquencyStatus != 'X']

In [21]:
# combinedSample.head(2)

In [22]:
# combinedSample.iloc[39]['currLoanDelinquencyStatus']

In [23]:
# cconverting 
combinedSample['currLoanDelinquencyStatus'] = pd.to_numeric(combinedSample['currLoanDelinquencyStatus'])

In [24]:
combinedSample.head(2)

Unnamed: 0,currIntRate,loanAge,remainingMonthsToLegalMaturity,msa,currLoanDelinquencyStatus,modFlag,foreclosureDate,origChannel,sellerName,origIntRate,...,orgLoanTerm,orgLTV,firstTimeHomeBuyer,loanPurpose,propType,numUnits,occType,propertyState,zipCodeShort,prodType
0,8.0,0,360.0,0,0,N,0,R,JPMORGAN CHASE BANK NA,8.0,...,360,79.0,N,R,SF,1,P,PA,173,FRM
1,8.0,1,359.0,0,0,N,0,B,JPMORGAN CHASE BANK NA,7.875,...,180,69.0,N,R,CO,1,P,MD,208,FRM


In [25]:
type(combinedSample.iloc[39]['currLoanDelinquencyStatus'])

numpy.int64

In [26]:
type(combinedSample.iloc[5000]['currLoanDelinquencyStatus'])

numpy.int64

In [27]:
def scaleCurrLoanDelinquencyStatus(x):
    if x < 30 :
        return 0
    elif x < 60 :
        return 1
    elif x < 90 :
        return 2
    elif x < 120 :
        return 3
    else:
        return 4
    
        

In [28]:
combinedSample['currLoanDelinquencyStatus'] = combinedSample['currLoanDelinquencyStatus'].apply(scaleCurrLoanDelinquencyStatus)

In [29]:
def dateToOrdinal(x):
    return date.toordinal(x)

In [30]:
# combinedSample['monthlyReportingPeriod'] = combinedSample['monthlyReportingPeriod'].apply(lambda x: datetime.datetime.strptime(x, "%m/%d/%y"))

In [31]:
# combinedSample['monthlyReportingPeriod'] = combinedSample['monthlyReportingPeriod'].apply(dateToOrdinal)

In [32]:
combinedSample.columns

Index(['currIntRate', 'loanAge', 'remainingMonthsToLegalMaturity', 'msa',
       'currLoanDelinquencyStatus', 'modFlag', 'foreclosureDate',
       'origChannel', 'sellerName', 'origIntRate', 'orgUPB', 'orgLoanTerm',
       'orgLTV', 'firstTimeHomeBuyer', 'loanPurpose', 'propType', 'numUnits',
       'occType', 'propertyState', 'zipCodeShort', 'prodType'],
      dtype='object')

#### one hot encoding for random forest 

In [33]:
print("Original features: \n", list(combinedSample.columns), "\n")
combinedSample = pd.get_dummies(combinedSample)
print("Features after One-hot Encoding: \n", list(combinedSample.columns), "\n")

Original features: 
 ['currIntRate', 'loanAge', 'remainingMonthsToLegalMaturity', 'msa', 'currLoanDelinquencyStatus', 'modFlag', 'foreclosureDate', 'origChannel', 'sellerName', 'origIntRate', 'orgUPB', 'orgLoanTerm', 'orgLTV', 'firstTimeHomeBuyer', 'loanPurpose', 'propType', 'numUnits', 'occType', 'propertyState', 'zipCodeShort', 'prodType'] 

Features after One-hot Encoding: 
 ['currIntRate', 'loanAge', 'remainingMonthsToLegalMaturity', 'msa', 'currLoanDelinquencyStatus', 'foreclosureDate', 'origIntRate', 'orgUPB', 'orgLoanTerm', 'orgLTV', 'numUnits', 'zipCodeShort', 'modFlag_N', 'modFlag_Y', 'origChannel_B', 'origChannel_C', 'origChannel_R', 'sellerName_AMTRUST BANK', 'sellerName_BANK OF AMERICA  N.A.', 'sellerName_BISHOPS GATE RESIDENTIAL MORTGAGE TRUST', 'sellerName_CITIMORTGAGE  INC.', 'sellerName_CITIZENS MORTGAGE CORPORATION', 'sellerName_DOWNEY SAVINGS AND LOAN ASSOCIATION  F.A.', 'sellerName_EVERBANK', 'sellerName_FIRST TENNESSEE BANK NATIONAL ASSOCIATION', 'sellerName_FLAGSTA

In [34]:
# Get All Column's name of the input file
columnHeaders = list(combinedSample.columns.values)

#'quality' is the class attribute we are predicting
classColumn = 'foreclosureDate'
riskFeature = combinedSample.drop(columns=['foreclosureDate'])
riskClass = combinedSample[classColumn]

In [35]:
combinedSample.groupby('foreclosureDate').count()

Unnamed: 0_level_0,currIntRate,loanAge,remainingMonthsToLegalMaturity,msa,currLoanDelinquencyStatus,origIntRate,orgUPB,orgLoanTerm,orgLTV,numUnits,...,propertyState_TX,propertyState_UT,propertyState_VA,propertyState_VI,propertyState_VT,propertyState_WA,propertyState_WI,propertyState_WV,propertyState_WY,prodType_FRM
foreclosureDate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,486110,486110,486110,486110,486110,486110,486110,486110,486110,486110,...,486110,486110,486110,486110,486110,486110,486110,486110,486110,486110
1,170,170,170,170,170,170,170,170,170,170,...,170,170,170,170,170,170,170,170,170,170


In [36]:
combinedSample.head(5)

Unnamed: 0,currIntRate,loanAge,remainingMonthsToLegalMaturity,msa,currLoanDelinquencyStatus,foreclosureDate,origIntRate,orgUPB,orgLoanTerm,orgLTV,...,propertyState_TX,propertyState_UT,propertyState_VA,propertyState_VI,propertyState_VT,propertyState_WA,propertyState_WI,propertyState_WV,propertyState_WY,prodType_FRM
0,8.0,0,360.0,0,0,0,8.0,75000,360,79.0,...,0,0,0,0,0,0,0,0,0,1
1,8.0,1,359.0,0,0,0,7.875,55000,180,69.0,...,0,0,0,0,0,0,0,0,0,1
2,8.0,2,358.0,0,0,0,8.0,130000,360,61.0,...,0,0,0,0,0,0,0,0,0,1
3,8.0,3,357.0,0,0,0,8.375,107000,360,95.0,...,0,0,0,0,0,0,0,0,0,1
4,8.0,4,356.0,0,0,0,8.0,37000,180,63.0,...,0,0,0,0,0,0,0,0,0,1


In [37]:

# Splitting given dataset - Train: 75%, Test: 25%
trainFeature, testFeature, trainClass, testClass = train_test_split(riskFeature, riskClass, stratify=riskClass, train_size=.75, test_size=.25)
trainAccuracy = []
testAcuracy = []

#### training the random forest classifier 

In [38]:
forest = RandomForestClassifier(n_estimators=50, max_depth=10, random_state=0, n_jobs=4)
forest.fit(trainFeature, trainClass)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=10, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=4,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [39]:
# Printing test and train accuracy of the classifier
print("Train set accuracy: {:.2f}".format(forest.score(trainFeature, trainClass)))
print("Test set accuracy: {:.2f}".format(forest.score(testFeature, testClass)))

Train set accuracy: 1.00
Test set accuracy: 1.00


In [40]:
# Confusion matrix(6x6) including 'All' for test data
prediction = forest.predict(testFeature)
print("Confusion matrix:")
print(pd.crosstab(testClass, prediction, rownames=['True'], colnames=['Predicted'], margins=True))

Confusion matrix:
Predicted       0     All
True                     
0          121528  121528
1              42      42
All        121570  121570


In [41]:
# Application of 10-fold stratified cross-validation
crossValidationScore = cross_val_score(forest, trainFeature, trainClass, cv=10)
print("Cross-validation scores: {}".format(crossValidationScore))
print("Average cross-validation score: {:.2f}".format(crossValidationScore.mean()))

Cross-validation scores: [0.99964356 0.99964356 0.99964355 0.99964355 0.99964355 0.99964355
 0.99964355 0.99964355 0.99967096 0.99967096]
Average cross-validation score: 1.00
