# Feature Engineering & Kendalltau Feature Selection
Author: Taiyuan Zhang

Last Edited: 2022/08/01

---

Comments:

Please load with mean-target dataset.

In [1]:
%%time

# import necessary dependencies
import os
import re
import gc

import time

import pandas as pd
import numpy as np
pd.set_option("display.max_rows", 200)
pd.set_option("display.max_columns", None)
np.random.seed(42)

import category_encoders as ce

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set_theme(style = "whitegrid", palette = "pastel")

from imblearn.combine import SMOTETomek

from warnings import simplefilter
simplefilter(action = "ignore", category = FutureWarning)
simplefilter(action = "ignore", category = DeprecationWarning)

CPU times: total: 2.48 s
Wall time: 2.69 s


In [2]:
%%time

# import local dependencies
from scipy.stats import kendalltau

CPU times: total: 0 ns
Wall time: 0 ns


In [3]:
%%time

# config directories
raw_dir = "F:\FURP\Sample_orig\Raw_Samples"
detected_dir = "F:\FURP\Sample_orig\Mean-Target_Detected_Samples"
resampled_dir = "F:\FURP\Sample_orig\Resampled_Samples"

plots_dir = "F:\FURP\Sample_orig\Data_Plots"

# config file paths
raw_paths = os.listdir(raw_dir)

# config necessary global variables
cols_orig = [
    "Credit Score",
    "First Payment Date",
    "First Time Homebuyer Flag",
    "Maturity Date",
    "Metropolitan Statistical Area (MSA) Or Metropolitan Division",
    "Mortgage Insurance Percentage (MI %)",
    "Number of Units",
    "Occupancy Status",
    "Original Combined Loan-to-Value (CLTV)",
    "Original Debt-to-Income (DTI) Ratio",
    "Original UPB",
    "Original Loan-to-Value (LTV)",
    "Original Interest Rate",
    "Channel",
    "Prepayment Penalty Mortgage (PPM) Flag",
    "Amortization Type (Formerly Product Type)",
    "Property State",
    "Property Type",
    "Postal Code",
    "Loan Sequence Number",
    "Loan Purpose",
    "Original Loan Term",
    "Number of Borrowers",
    "Seller Name",
    "Servicer Name",
    "Super Conforming Flag",
    "Pre-HARP Loan Sequence Number",
    "Program Indicator",
    "HARP Indicator",
    "Property Valuation Method",
    "Interest Only (I/O) Indicator"
]

cat_cols = [
    'First Time Homebuyer Flag',
    'Occupancy Status',
    'Channel',
    'Prepayment Penalty Mortgage (PPM) Flag',
    'Property Type',
    'Loan Purpose',
    'Seller Name',
    'Servicer Name',
    'Program Indicator',
]

del_cols = [
    'Amortization Type (Formerly Product Type)', 
    'Super Conforming Flag', 
    'HARP Indicator', 
    'Interest Only (I/O) Indicator',
    "Property State",
    "Loan Sequence Number",
    "Pre-HARP Loan Sequence Number",
]

CPU times: total: 0 ns
Wall time: 1e+03 µs


In [4]:
%%time

# define utils data
def credit_encoder(x: int):
    if x >= 300 and x <= 600:
        return 0
    elif x >= 601 and x <= 660:
        return 1
    elif x >= 661 and x <= 780:
        return 2
    elif x >= 781 and x <= 850:
        return 3
    else:
        raise Exception("Found Unavailable Credit Score!")

CPU times: total: 0 ns
Wall time: 0 ns


In [5]:
%%time

# walk through the raw text files
for fileName in raw_paths:
    print("Detecting {}...".format(fileName))
    df = pd.read_csv(os.path.join(raw_dir, fileName),
                     sep = "|",
                     names = cols_orig,
                     low_memory = False)
    
    # apply feature engineering
    # 1. Delete columns
    for col in del_cols:
        del df[col]
        gc.collect()

    # 2. Drop rows `Credit Score` == 999 or `Credit Score` < 300 (Not Available)
    df = df.drop(df[df["Credit Score"] == 9999].index)

    # 3. filna(0) with Metropolitan Statistical Area (MSA) Or Metropolitan Division
    df["Metropolitan Statistical Area (MSA) Or Metropolitan Division"] = df["Metropolitan Statistical Area (MSA) Or Metropolitan Division"].fillna(0)

    # 4. Add Credit Rank as target, Delete Credit Score;
    df["Credit Rank"] = df["Credit Score"].apply(lambda x: credit_encoder(x))
    del df["Credit Score"]
    gc.collect()

    # 5. Add Extra Feature
    # 5-1. First Payment Year, Maturity Year
    df["First Payment Year"] = df["First Payment Date"].apply(lambda x: x // 100)
    df["Maturity Year"] = df["Maturity Date"].apply(lambda x: x // 100)

    # 5-2. Date Gap
    FPM = df["First Payment Date"].apply(lambda x: pd.to_datetime(x, format = "%Y%m"))
    MM = df["Maturity Date"].apply(lambda x: pd.to_datetime(x, format = "%Y%m"))
    df["Month Gap"] = (MM - FPM).apply(lambda x:x.days)
    del FPM
    del MM
    gc.collect()

    # 6. Apply Mean Target Encoder
    rank_onehot = pd.get_dummies(df["Credit Rank"],
                                   drop_first = True)     # to avoid lineaer dependencies
    for col in cat_cols:
        for index in rank_onehot.columns:
            te_encoder = ce.TargetEncoder(col)
            df[col + "_" + str(index)] = te_encoder.fit_transform(df[col],
                                                                  rank_onehot[index])
        del df[col]
    
    del rank_onehot
    gc.collect()
            
    # Fin. Write to parquet
    parquetName = "mean-target_detected_{}.parquet.gzip".format(fileName.replace(".txt", ""))
    df.to_parquet(os.path.join(detected_dir, parquetName),
                  compression = "gzip",
                  index = False)

Detecting sample_orig_1999.txt...
Detecting sample_orig_2000.txt...
Detecting sample_orig_2001.txt...
Detecting sample_orig_2002.txt...
Detecting sample_orig_2003.txt...
Detecting sample_orig_2004.txt...
Detecting sample_orig_2005.txt...
Detecting sample_orig_2006.txt...
Detecting sample_orig_2007.txt...
Detecting sample_orig_2008.txt...
Detecting sample_orig_2009.txt...
Detecting sample_orig_2010.txt...
Detecting sample_orig_2011.txt...
Detecting sample_orig_2012.txt...
Detecting sample_orig_2013.txt...
Detecting sample_orig_2014.txt...
Detecting sample_orig_2015.txt...
Detecting sample_orig_2016.txt...
Detecting sample_orig_2017.txt...
Detecting sample_orig_2018.txt...
Detecting sample_orig_2019.txt...
Detecting sample_orig_2020.txt...
Detecting sample_orig_2021.txt...
CPU times: total: 4min 49s
Wall time: 4min 51s


In [6]:
%%time
# config file paths
detected_paths = os.listdir(detected_dir)

# config necessary global variables
kendalltau_score = {}

CPU times: total: 0 ns
Wall time: 0 ns


In [7]:
%%time

# walk through the files
df = pd.DataFrame()

for fileName in detected_paths:
    temp = pd.read_parquet(os.path.join(detected_dir, fileName))
    df = pd.concat([df, temp])
    del temp
    gc.collect()
    
# calculate kendalltau score
Y = df["Credit Rank"]
cols = df.columns

for col in cols:
    if col != "Credit Rank":
        X = df[col]
        corr, pvalue = kendalltau(X, Y)
        del X
        gc.collect()

        kendalltau_score[col] = (corr, pvalue)
        print("{}: {}".format(col, kendalltau_score[col]))

print("-" * 30)
        
corr_cols = []

for key in kendalltau_score.keys():
    print("{}: {}".format(key, kendalltau_score[key]))
    if kendalltau_score[key][0] > 0.10 or kendalltau_score[key][0] < 0.10:
        corr_cols.append(key)

print()

for col in corr_cols:
    print(col)

  (2 * xtie * ytie) / m + x0 * y0 / (9 * m * (size - 2)))


First Payment Date: (0.16536655000175332, 0.0)
Maturity Date: (0.08468986896487356, 0.0)
Metropolitan Statistical Area (MSA) Or Metropolitan Division: (0.03859501509192453, 0.0)
Mortgage Insurance Percentage (MI %): (-0.0917241111035743, 0.0)
Number of Units: (-0.0076535782171634585, 3.2660737269434563e-17)
Original Combined Loan-to-Value (CLTV): (-0.11576680036331492, 0.0)
Original Debt-to-Income (DTI) Ratio: (-0.0950706183904821, 0.0)
Original UPB: (0.06974496582846414, 0.0)
Original Loan-to-Value (LTV): (-0.11285134034284702, 0.0)
Original Interest Rate: (-0.22010300832909416, 0.0)
Postal Code: (0.022578643117985166, 1.2941792584055904e-202)
Original Loan Term: (-0.05681336529103074, 0.0)
Number of Borrowers: (-0.039563209804786156, 0.0)
Property Valuation Method: (-0.07563268565282615, 0.0)
First Payment Year: (0.16843332969496694, 0.0)
Maturity Year: (0.08574406846095908, 0.0)
Month Gap: (-0.04807940717871184, 0.0)
First Time Homebuyer Flag_1: (-0.2036991246446356, 0.0)
First Time

In [8]:
# To Apply Feature Selection:
kbest = [
    "First Payment Date",
    "Maturity Date",
    "Metropolitan Statistical Area (MSA) Or Metropolitan Division",
    "Mortgage Insurance Percentage (MI %)",
    "Number of Units",
    "Original Combined Loan-to-Value (CLTV)",
    "Original Debt-to-Income (DTI) Ratio",
    "Original UPB",
    "Original Loan-to-Value (LTV)",
    "Original Interest Rate",
    "Postal Code",
    "Original Loan Term",
    "Number of Borrowers",
    "Property Valuation Method",
    "First Payment Year",
    "Maturity Year",
    "Month Gap",
    "First Time Homebuyer Flag_1",
    "First Time Homebuyer Flag_2",
    "First Time Homebuyer Flag_3",
    "Occupancy Status_1",
    "Occupancy Status_2",
    "Occupancy Status_3",
    "Channel_1",
    "Channel_2",
    "Channel_3",
    "Prepayment Penalty Mortgage (PPM) Flag_1",
    "Prepayment Penalty Mortgage (PPM) Flag_2",
    "Prepayment Penalty Mortgage (PPM) Flag_3",
    "Property Type_1",
    "Property Type_2",
    "Property Type_3",
    "Loan Purpose_1",
    "Loan Purpose_2",
    "Loan Purpose_3",
    "Seller Name_1",
    "Seller Name_2",
    "Seller Name_3",
    "Servicer Name_1",
    "Servicer Name_2",
    "Servicer Name_3",
    "Program Indicator_1",
    "Program Indicator_2",
    "Program Indicator_3",
]

---