<img style="float: left;" src="./images/fanniemae.png">

___________
# Mortgage Loan Default Classifier

### Problem Statement:
- Predict whether a mortgage loan will default using Fannie Mae Loan Performance data set from previous quarter

____________

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.utils import resample
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.preprocessing import StandardScaler, PolynomialFeatures

In [2]:
df = pd.read_csv('./data/complete2005q1.csv')
df.head()

Unnamed: 0,LOAN IDENTIFIER,ORIGINATION CHANNEL,SELLER NAME,ORIGINAL INTEREST RATE,ORIGINAL UPB,ORIGINAL LOAN TERM,ORIGINATION DATE,FIRST PAYMENT DATE,ORIGINAL LOAN-TO-VALUE (LTV),ORIGINAL COMBINED LOAN-TO-VALUE (CLTV),...,PROPERTY TYPE,NUMBER OF UNITS,OCCUPANCY TYPE,PROPERTY STATE,ZIP CODE SHORT,PRODUCT TYPE,RELOCATION MORTGAGE INDICATOR,DEFAULT,MI,MIN CREDIT SCORE
0,100000102115,C,"CITIMORTGAGE, INC.",4.5,95000,120,02/2005,04/2005,51,51.0,...,SF,1,P,TX,750,FRM,N,0,0.0,783.0
1,100004036273,C,"BANK OF AMERICA, N.A.",6.625,139000,360,01/2005,03/2005,95,95.0,...,SF,1,S,FL,349,FRM,N,0,1.0,755.0
2,100004945019,R,"BANK OF AMERICA, N.A.",5.875,104000,360,12/2004,02/2005,68,68.0,...,PU,1,P,SC,295,FRM,N,0,0.0,689.0
3,100013634177,C,"CITIMORTGAGE, INC.",6.25,51000,240,12/2004,02/2005,69,69.0,...,SF,1,P,WI,544,FRM,N,0,0.0,788.0
4,100014052527,C,OTHER,5.625,60000,240,12/2004,02/2005,75,75.0,...,SF,1,P,IN,479,FRM,N,0,0.0,700.0


In [3]:
df.columns

Index(['LOAN IDENTIFIER', 'ORIGINATION CHANNEL', 'SELLER NAME',
       'ORIGINAL INTEREST RATE', 'ORIGINAL UPB', 'ORIGINAL LOAN TERM',
       'ORIGINATION DATE', 'FIRST PAYMENT DATE',
       'ORIGINAL LOAN-TO-VALUE (LTV)',
       'ORIGINAL COMBINED LOAN-TO-VALUE (CLTV)', 'NUMBER OF BORROWERS',
       'ORIGINAL DEBT TO INCOME RATIO', 'FIRST TIME HOME BUYER INDICATOR',
       'LOAN PURPOSE', 'PROPERTY TYPE', 'NUMBER OF UNITS', 'OCCUPANCY TYPE',
       'PROPERTY STATE', 'ZIP CODE SHORT', 'PRODUCT TYPE',
       'RELOCATION MORTGAGE INDICATOR', 'DEFAULT', 'MI', 'MIN CREDIT SCORE'],
      dtype='object')

In [4]:
categorical_features = ['ORIGINATION CHANNEL', 'SELLER NAME',
                        'FIRST TIME HOME BUYER INDICATOR',
                        'LOAN PURPOSE', 'PROPERTY TYPE', 'OCCUPANCY TYPE',
                        'PROPERTY STATE', 'ZIP CODE SHORT', 'PRODUCT TYPE',
                        'RELOCATION MORTGAGE INDICATOR']

In [5]:
df = df.drop(columns=['ORIGINATION DATE', 'FIRST PAYMENT DATE'])

In [6]:
df = pd.get_dummies(df, columns=categorical_features, drop_first=True)

In [7]:
df.shape

(279961, 990)

In [8]:
# explore choropleth

In [9]:
# balance target
# split classes
df_maj = df[df['DEFAULT'] == 0]
df_min = df[df['DEFAULT'] == 1]
 
# downsample majority
df_maj_resample = resample(df_maj, 
                           replace=False,    
                           n_samples=df_min.shape[0],
                           random_state=42)             


# concat downsample and minority
# df_resample = 
df_resample = pd.concat([df_maj_resample, df_min])
 
# Display new class counts
df_resample['DEFAULT'].value_counts()

1    11759
0    11759
Name: DEFAULT, dtype: int64

In [10]:
# train test split
X = df_resample.drop(columns=['DEFAULT'])
y = df_resample['DEFAULT']

X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    stratify=y,
                                                    shuffle=True,
                                                    random_state=42)

In [11]:
ss = StandardScaler()
ss.fit(X_train, y_train)
X_train_sc = ss.transform(X_train)
X_test_sc = ss.transform(X_test)

In [12]:
log = LogisticRegression()
log.fit(X_train_sc, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [13]:
log.score(X_train_sc, y_train)

0.7575121895906566

In [14]:
cross_val_score(log, X_train_sc, y_train, cv=5)


array([0.71655329, 0.73044218, 0.7196712 , 0.72222222, 0.72291548])

In [15]:
log.score(X_test_sc, y_test)

0.7173469387755103

In [16]:
preds = log.predict(X_test_sc)

In [17]:
from sklearn.metrics import confusion_matrix, r2_score

In [18]:
confusion_matrix(y_test, preds)

array([[2007,  933],
       [ 729, 2211]], dtype=int64)

In [19]:
tn, fp, fn, tp = confusion_matrix(y_test, preds).ravel()

In [33]:
from dsi6.metrics import ClassificationMetrics

IndentationError: unexpected indent (metrics.py, line 11)

In [21]:
# cm = metrics.ClassificationMetrics()
# cm.fit(y_test, preds)

In [22]:
# cm.describe()

In [23]:
# False negative rate = 24%

In [24]:
def false_negative_rate(fn=fn, tp=tp):
    return fn/(tp+fn)

def accuracy(tn=tn, tp=tp, y_test=y_test):
    return (tn + tp)/len(y_test)

In [25]:
false_negative_rate()

0.2479591836734694

In [26]:
preds_prob = log.predict_proba(X_test_sc)

In [27]:
preds_adj = pd.DataFrame(log.predict_proba(X_test_sc), columns=['NoDefault', 'Default'])
preds_adj['y_pred'] = preds
preds_adj['y_pred_adj'] = preds_adj['Default'].map(lambda x: 1 if x >= .33 else 0)
preds_adj.head()

Unnamed: 0,NoDefault,Default,y_pred,y_pred_adj
0,0.295208,0.704792,1,1
1,0.24385,0.75615,1,1
2,0.731485,0.268515,0,0
3,0.466654,0.533346,1,1
4,0.27921,0.72079,1,1


In [28]:
tn, fp, fn, tp = confusion_matrix(y_test, preds_adj['y_pred_adj']).ravel()

In [29]:
confusion_matrix(y_test, preds_adj['y_pred_adj'])

array([[1511, 1429],
       [ 332, 2608]], dtype=int64)

In [30]:
false_negative_rate(fn, tp)

0.11292517006802721

In [31]:
accuracy(tn, tp, y_test)

0.7005102040816327