## import necessary packages

In [30]:
import seaborn as sns
import pandas as pd
import numpy as np
import os
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Imputer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import cross_validate

from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
import matplotlib.pyplot as mp
%matplotlib inline

from sklearn import preprocessing
import xgboost as xgb
from sklearn.model_selection import RandomizedSearchCV

## Read the data

In [2]:
acqusition = pd.read_csv('/Volumes/Backup Plus/Documents/Data Science/Projects/fannieMae_project/processed_data/Acquisition_2007.txt', sep="|",index_col=False)
performance = pd.read_csv('/Volumes/Backup Plus/Documents/Data Science/Projects/fannieMae_project/processed_data/Performance_2007.txt', sep="|",index_col=False)
df=pd.merge(acqusition, performance, on='LoanID', how='left')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 88073 entries, 0 to 88072
Data columns (total 27 columns):
LoanID                88073 non-null int64
Channel               88073 non-null object
SellerName            88073 non-null object
OrInterestRate        88073 non-null float64
OrUnpaidPrinc         88073 non-null int64
OrLoanTerm            88073 non-null int64
Origination           88073 non-null object
FirstPayment          88073 non-null object
OrLTV                 88073 non-null int64
OrCLTV                88073 non-null float64
NumBorrowers          88073 non-null float64
DTIRat                88073 non-null float64
CreditScore           88073 non-null float64
FTHomeBuyer           88073 non-null object
LoanPurpose           88073 non-null object
PropertyType          88073 non-null object
NumUnits              88073 non-null int64
OccType               88073 non-null object
PropertyState         88073 non-null object
Zip                   88073 non-null int64
MortInsPerc 

In [4]:
df.isnull().sum()

LoanID                    0
Channel                   0
SellerName                0
OrInterestRate            0
OrUnpaidPrinc             0
OrLoanTerm                0
Origination               0
FirstPayment              0
OrLTV                     0
OrCLTV                    0
NumBorrowers              0
DTIRat                    0
CreditScore               0
FTHomeBuyer               0
LoanPurpose               0
PropertyType              0
NumUnits                  0
OccType                   0
PropertyState             0
Zip                       0
MortInsPerc               0
ProductType               0
CoCreditScore             0
MortInsType               0
RelocationMortgage        0
MSA                   77062
ForeclosureDate       77062
dtype: int64

In [5]:
df['Default']=df['ForeclosureDate']
df['Default'].fillna(0, inplace=True)
df.loc[df['Default'] != 0, 'Default'] = 1
df['Default'] = df['Default'].astype(int)

In [6]:
def lookup(s):
    """
    This is an extremely fast approach to datetime parsing.
    For large data, the same dates are often repeated. Rather than
    re-parse these, we store all unique dates, parse them, and
    use a lookup to convert all dates.
    """
    dates = {date:pd.to_datetime(date) for date in s.unique()}
    return s.map(dates)

In [7]:
df.rename(columns={'Origination':'loan_date',
                          'FirstPayment':'first_payment_date'}, 
                 inplace=True)

df[['loan_date', 'first_payment_date', 'ForeclosureDate']]=df[['loan_date', 'first_payment_date', 'ForeclosureDate']].apply(lookup)

In [8]:
df['first_payment_year']=df['first_payment_date'].dt.year
df['loan_year']=df['loan_date'].dt.year

In [9]:
df["Default"].mean()

0.12502128915785768

In [10]:
df.dtypes

LoanID                         int64
Channel                       object
SellerName                    object
OrInterestRate               float64
OrUnpaidPrinc                  int64
OrLoanTerm                     int64
loan_date             datetime64[ns]
first_payment_date    datetime64[ns]
OrLTV                          int64
OrCLTV                       float64
NumBorrowers                 float64
DTIRat                       float64
CreditScore                  float64
FTHomeBuyer                   object
LoanPurpose                   object
PropertyType                  object
NumUnits                       int64
OccType                       object
PropertyState                 object
Zip                            int64
MortInsPerc                  float64
ProductType                   object
CoCreditScore                float64
MortInsType                  float64
RelocationMortgage            object
MSA                          float64
ForeclosureDate       datetime64[ns]
D

In [11]:
num_col_drop=["LoanID", "Zip", "MSA", 'first_payment_year', 'loan_year']
cat_col_add=['Zip', 'first_payment_year', 'loan_year']

## processing numeric columns

In [12]:
# Create a boolean mask for numeric columns
num_mask = ((df.dtypes == int) | (df.dtypes == float))

# Get list of numeric column names
num_columns = df.columns[num_mask].tolist()
num=df[num_columns]

# drop "categorical" columns
num=num.drop(num_col_drop, axis=1)
num=num.fillna(num.mean())

## processing categorical columns

In [13]:
# Create a boolean mask for categorical columns
cat_mask = (df.dtypes == object)

# Get list of categorical column names
cat_columns = df.columns[cat_mask].tolist()

cat_columns.extend(cat_col_add)
cat=df[cat_columns]
cat=cat.astype('category')
cat=cat.fillna(cat.mode().iloc[0]) # fill na with most requent category

## concat num and cat columns and create dummies for cat

In [14]:
df_new = pd.concat([num, cat], axis=1).reset_index(drop=True)

In [15]:
df_new = pd.get_dummies(df_new, drop_first=True)
df_new.shape

(88073, 995)

In [16]:
# Get column names first
names = df_new.columns
# Create the Scaler object
scaler = preprocessing.StandardScaler()
# Fit your data on the scaler object
scaled_df = scaler.fit_transform(df_new)
scaled_df = pd.DataFrame(df_new, columns=names)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [17]:
y = scaled_df['Default'].values
X = scaled_df.drop(['Default'], axis=1).values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state=0)

## regular logistic regression

In [18]:
model = LogisticRegression(random_state=1, class_weight="balanced")
model = model.fit(X_train, y_train)
predict = model.predict(X_test)



In [19]:
print(classification_report(y_test, predict))

              precision    recall  f1-score   support

           0       0.94      0.69      0.80     19355
           1       0.24      0.70      0.35      2664

   micro avg       0.69      0.69      0.69     22019
   macro avg       0.59      0.69      0.57     22019
weighted avg       0.86      0.69      0.74     22019



In [20]:
confusion_matrix(y_test, predict)

array([[13325,  6030],
       [  811,  1853]])

In [21]:
model.score(X_test, y_test)

0.6893137744675053

## XGBClassifier

In [25]:
model=xgb.XGBClassifier(max_depth=2, objective="reg:logistic")
model.fit(X_train, y_train)
model.predict(X_test)
model.score(X_test, y_test)

0.879104409827876

## cross validation with XGBoost

In [27]:
from sklearn.model_selection import cross_val_score
# Create full pipeline
pipeline = Pipeline([
                     ("clf", xgb.XGBClassifier())
                    ])

# Perform cross-validation
cross_val_scores = cross_val_score(pipeline, X_train, y_train, scoring="roc_auc", cv=3)

# Print avg. AUC
print("3-fold AUC: ", np.mean(cross_val_scores))

3-fold AUC:  0.7600276632133482


## cross validation and parameter tuning with XGBoost

In [32]:
# Create the parameter grid
gbm_param_grid = {
    'clf__learning_rate': np.arange(0.05, 1, 0.05),
    'clf__max_depth': np.arange(3, 10, 1),
    'clf__n_estimators': np.arange(50, 200, 50)
}


In [36]:

# Perform RandomizedSearchCV
randomized_roc_auc = RandomizedSearchCV(estimator=xgb.XGBClassifier(),param_distributions=gbm_param_grid, n_iter=2, scoring="roc_auc", cv=2,verbose=1)

# Fit the estimator
randomized_roc_auc.fit(X_train,y_train)

# Compute metrics
print(randomized_roc_auc.best_score_)
print(randomized_roc_auc.best_estimator_)

Fitting 2 folds for each of 2 candidates, totalling 4 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


ValueError: Invalid parameter clf for estimator XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1). Check the list of available parameters with `estimator.get_params().keys()`.

## SGDClassifier

In [None]:
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV
# We set random_state=0 for reproducibility 
linear_classifier = SGDClassifier(random_state=0)

# Instantiate the GridSearchCV object and run the search
parameters = {'alpha':[0.00001, 0.0001, 0.001, 0.01, 0.1, 1], 
             'loss':['hinge', 'log'], 'penalty':['l1','l2']}
searcher = GridSearchCV(linear_classifier, parameters, cv=10)
searcher.fit(X_train, y_train)

# Report the best parameters and the corresponding score
print("Best CV params", searcher.best_params_)
print("Best CV accuracy", searcher.best_score_)
print("Test accuracy of best grid search hypers:", searcher.score(X_test, y_test))

accuracy: 0.879014


## XGBoost's baked in cross-validation to measure accurancy

In [None]:
# Create the DMatrix: churn_dmatrix
churn_dmatrix = xgb.DMatrix(data=X_train, label=y_train)

# Create the parameter dictionary: params
params = {"objective":"reg:logistic", "max_depth":3}

# Perform cross-validation: cv_results
cv_results = xgb.cv(dtrain=churn_dmatrix, params=params, nfold=3, num_boost_round=5, metrics="error", as_pandas=True, seed=123)

# Print cv_results
print(cv_results)

# Print the accuracy
print(((1-cv_results["test-error-mean"]).iloc[-1]))