# Data Preprocessing for Modeling 

In [1]:
# Imports and global NumPy random seed for the notebook
from sklearn.preprocessing import StandardScaler
from pyspark.sql.types import StringType 
from pyspark.sql.functions import col 
import numpy as np
import pandas as pd 

# Seed NumPy's global random generator (via the numpy.random module) with a fixed value
from numpy import random 
random.seed(seed=12345)

  from pandas.core.computation.check import NUMEXPR_INSTALLED
  from pandas.core import (


In [2]:
import pandas as pd
from glob import glob

# Glob pattern matching the sample CSV files
path = '/GWSB/home/g35026169/Desktop/Andrew_s Data/Sample_*'

# glob() returns matches in arbitrary order; sorting the list keeps the row
# order of the combined frame the same from run to run, which matters because
# the train/test split later in the notebook is positional.
files = sorted(glob(path))

# Fail early with a message that names the pattern instead of letting
# pd.concat raise "No objects to concatenate" on an empty list.
if not files:
    raise FileNotFoundError(f"No files matched the pattern: {path}")

# Read each CSV file into its own DataFrame
dfs = [pd.read_csv(file) for file in files]

# Stack all DataFrames into one, discarding the per-file row indices
df = pd.concat(dfs, ignore_index=True)

In [3]:
# Columns that are removed before any further preprocessing
columns_to_drop = [
    'LOAN SEQUENCE NUMBER',
    'MONTHLY REPORTING PERIOD',
    'CURRENT ACTUAL UPB',
    'OrigQuarter',
]
df = df.drop(columns=columns_to_drop)

# Preview the first rows of the reduced frame
df.head()

Unnamed: 0,CURRENT LOAN DELINQUENCY STATUS,LOAN AGE,CURRENT INTEREST RATE,ESTIMATED LOAN TO VALUE (ELTV),DEFAULT,CREDIT SCORE,FIRST TIME HOMEBUYER FLAG,OCCUPANCY STATUS,ORIGINAL INTEREST RATE,PROPERTY TYPE,LOAN PURPOSE,SELLER NAME,OrigYear,OrigDate,index_sa,UNRATE,inflation,% Change in UPB
0,0,0,7.375,Undefined,0,775,N,P,7.375,PU,N,Other sellers,2023,2023Q1,395.44,3.6,6.0,0.0
1,0,1,7.375,0.9913,0,775,N,P,7.375,PU,N,Other sellers,2023,2023Q1,397.51,3.5,5.0,-0.0035
2,0,2,7.375,0.984,0,775,N,P,7.375,PU,N,Other sellers,2023,2023Q1,400.47,3.4,4.9,-0.0035
3,0,3,7.375,0.9767,0,775,N,P,7.375,PU,N,Other sellers,2023,2023Q1,403.44,3.7,4.0,-0.0035
4,0,4,7.375,0.9715,0,775,N,P,7.375,PU,N,Other sellers,2023,2023Q1,405.61,3.6,3.0,-0.0035


In [4]:
# Order the rows chronologically by origination quarter.
# OrigDate contains many ties, and the default quicksort is not stable, so it
# may reorder rows that share a quarter. A stable sort (mergesort) keeps tied
# rows in their incoming order, which keeps the later positional train/test
# split well defined.
full_timeseries = df.sort_values(by='OrigDate', ascending=True, kind='mergesort')

In [5]:
# Discard every row whose "% Change in UPB" value is missing
full_timeseries = full_timeseries.dropna(
    axis="index",
    how="any",
    subset=["% Change in UPB"],
)

In [6]:
# Express "% Change in UPB" in percentage points instead of as a fraction.
# NOTE: this rescales the column in place, so executing the cell a second time
# multiplies by 100 again.
full_timeseries['% Change in UPB'] = full_timeseries['% Change in UPB'].mul(100)

# Preview the last rows after scaling
full_timeseries.tail()

Unnamed: 0,CURRENT LOAN DELINQUENCY STATUS,LOAN AGE,CURRENT INTEREST RATE,ESTIMATED LOAN TO VALUE (ELTV),DEFAULT,CREDIT SCORE,FIRST TIME HOMEBUYER FLAG,OCCUPANCY STATUS,ORIGINAL INTEREST RATE,PROPERTY TYPE,LOAN PURPOSE,SELLER NAME,OrigYear,OrigDate,index_sa,UNRATE,inflation,% Change in UPB
4998,2,3,7.125,0.9802,0,770,Y,P,7.125,SF,P,"ROCKET MORTGAGE, LLC",2023,2023Q1,403.44,3.7,4.0,0.0
4999,3,4,7.125,0.9749,0,770,Y,P,7.125,SF,P,"ROCKET MORTGAGE, LLC",2023,2023Q1,405.61,3.6,3.0,0.0
5000,4,5,7.125,0.9668,0,770,Y,P,7.125,SF,P,"ROCKET MORTGAGE, LLC",2023,2023Q1,409.0,3.5,3.2,0.0
5001,5,6,7.125,0.9597,0,770,Y,P,7.125,SF,P,"ROCKET MORTGAGE, LLC",2023,2023Q1,412.04,3.8,3.7,0.0
5002,6,7,7.125,0.9541,1,770,Y,P,7.125,SF,P,"ROCKET MORTGAGE, LLC",2023,2023Q1,414.98,3.8,3.7,0.12


In [7]:
# Number of rows remaining after the cleaning steps above
total_rows = full_timeseries.shape[0]
print(f"Total number of rows in the DataFrame: {total_rows}")

Total number of rows in the DataFrame: 4998


In [8]:
# Replace the existing index with a fresh 0..n-1 range, discarding the old labels
full_timeseries = full_timeseries.reset_index(drop=True)

In [9]:
# Position of the last row of the sorted, re-indexed frame.
# Row positions run from 0 to total_rows - 1, so the inclusive end index is
# total_rows - 1 (using total_rows itself would point one past the last row).
end_test_idx = total_rows - 1

print(f"End of test index (inclusive): {end_test_idx}")

End of test index (inclusive): 4997


In [10]:
# Fraction of the index range assigned to training
train_fraction = 0.8

# Truncate (not round) so the boundary is a valid integer row position
split_index = int(end_test_idx * train_fraction)

# Report where the training block ends and which range the test block covers
print(f"Training data ends at index: {split_index}")
print(f"Testing data starts at index: {split_index + 1} and ends at index: {end_test_idx}")

Training data ends at index: 3997
Testing data starts at index: 3998 and ends at index: 4997


In [11]:
# Parse ELTV as numbers: anything that cannot be parsed becomes NaN,
# and those NaNs are then replaced with the sentinel value -1.
eltv_col = 'ESTIMATED LOAN TO VALUE (ELTV)'
eltv_numeric = pd.to_numeric(full_timeseries[eltv_col], errors='coerce')
full_timeseries[eltv_col] = eltv_numeric.fillna(-1)

In [12]:
# Check whether any ELTV entry is still the textual placeholder.
# The comparison is done on the lower-cased text form of each value so that it
# also matches the capitalised spelling "Undefined" that appears in the raw data;
# an exact match against 'undefined' alone can never find that spelling.
eltv_as_text = (
    full_timeseries['ESTIMATED LOAN TO VALUE (ELTV)']
    .astype(str)
    .str.strip()
    .str.lower()
)
contains_undefined = eltv_as_text.eq('undefined').any()

print("Is 'undefined' present in the ELTV column?", contains_undefined)

Is 'undefined' present in the ELTV column? False


In [13]:
# Columns stored as 32-bit floats
float32_columns = [
    'CURRENT INTEREST RATE',
    'ESTIMATED LOAN TO VALUE (ELTV)',
    'ORIGINAL INTEREST RATE',
    'index_sa',
    'UNRATE',
    'inflation',
    '% Change in UPB',
]

# One astype call with a per-column mapping; OrigYear is stored as text
dtype_map = {column: np.float32 for column in float32_columns}
dtype_map['OrigYear'] = str
full_timeseries = full_timeseries.astype(dtype_map)

# Remove the two columns that are not passed on to the modeling step.
# 'CURRENT LOAN DELINQUENCY STATUS' is dropped without being cast first, since
# converting a column that is discarded immediately afterwards has no effect.
# errors='ignore' lets the cell be re-run after the columns are already gone.
full_timeseries = full_timeseries.drop(
    columns=['CURRENT LOAN DELINQUENCY STATUS', 'LOAN AGE'],
    errors='ignore',
)

In [14]:
# Show the dtype of every remaining column to confirm the casts above
column_types = full_timeseries.dtypes
print(column_types)

CURRENT INTEREST RATE             float32
ESTIMATED LOAN TO VALUE (ELTV)    float32
DEFAULT                             int64
CREDIT SCORE                        int64
FIRST TIME HOMEBUYER FLAG          object
OCCUPANCY STATUS                   object
ORIGINAL INTEREST RATE            float32
PROPERTY TYPE                      object
LOAN PURPOSE                       object
SELLER NAME                        object
OrigYear                           object
OrigDate                           object
index_sa                          float32
UNRATE                            float32
inflation                         float32
% Change in UPB                   float32
dtype: object


# PiML 

In [15]:
# Create a PiML experiment and hand it the preprocessed frame.
# NOTE(review): silent=True presumably suppresses the interactive loader output — confirm in the PiML docs.
from piml import Experiment
exp = Experiment()
exp.data_loader(data = full_timeseries, silent=True)

In [16]:
# Summarise the loaded data; no features are excluded and OrigYear is declared categorical
exp.data_summary(feature_exclude=[], feature_type={"OrigYear": "categorical"})

HTML(value='\n        <style>\n\n        .left-label {\n            width: 30%;\n        }\n\n        .card-pa…

<IPython.core.display.Javascript object>

Unnamed: 0,name,n_missing,mean,std,min,q1,median,q3,max
0,CURRENT INTEREST RATE,0,5.222,1.3092,1.999,4.25,5.0,6.125,46.25
1,ESTIMATED LOAN TO VALUE (ELTV),0,0.7952,0.3002,-1.0,0.713,0.8517,0.9607,2.3467
2,CREDIT SCORE,0,732.4845,403.2014,300.0,673.0,719.0,764.0,9999.0
3,ORIGINAL INTEREST RATE,0,5.3453,1.2654,1.999,4.375,5.25,6.25,10.75
4,index_sa,0,241.5146,65.9347,136.86,192.02,220.78,268.38,414.98
5,UNRATE,0,5.8833,2.1356,3.4,4.2,5.1,7.3,14.8
6,inflation,0,2.5223,1.9701,-2.1,1.4,2.1,3.2,9.1
7,% Change in UPB,0,-0.0792,0.1332,-1.0,-0.0971,-0.0451,-0.0166,1.383

Unnamed: 0,name,n_missing,n_unique,top1,top2,top3,n_others
0,DEFAULT,0,2,0.0 : 4269333,1.0 : 419831,0,0
1,FIRST TIME HOMEBUYER FLAG,0,3,N : 4071092,Y : 617879,9 : 193,0
2,OCCUPANCY STATUS,0,3,P : 4232195,I : 301236,S : 155733,0
3,PROPERTY TYPE,0,6,SF : 3745826,PU : 572753,CO : 303031,67554
4,LOAN PURPOSE,0,3,P : 1832745,N : 1628546,C : 1227873,0
5,SELLER NAME,0,96,Other sell : 2433082,WELLS FARG : 317236,COUNTRYWID : 184306,1754540
6,OrigYear,0,24,2003.0 : 316577,2004.0 : 298012,2005.0 : 284833,3789742
7,OrigDate,0,94,2003Q3 : 85785,2003Q2 : 79417,2004Q2 : 78070,4445892


Data Shape:(4689164, 16)


In [17]:
# Positional 80/20 split of the chronologically sorted frame, derived from the
# data instead of hard-coded row numbers so it stays valid when the input changes.
# Assumes the experiment holds exactly the rows of full_timeseries, in the same
# order, as passed to data_loader — confirm if rows are filtered in between.
n_rows = len(full_timeseries)
last_train_idx = int((n_rows - 1) * 0.8)

# np.arange excludes its stop value, so each stop is one past the last index
# wanted; this way no row falls between the two sets and the final row is kept.
custom_train_idx = np.arange(0, last_train_idx + 1)
custom_test_idx = np.arange(last_train_idx + 1, n_rows)

# Every row must belong to exactly one of the two sets
assert len(custom_train_idx) + len(custom_test_idx) == n_rows

exp.data_prepare(target='DEFAULT', task_type='classification', sample_weight=None,
                train_idx=custom_train_idx, test_idx=custom_test_idx, random_state = 12345)

HTML(value='\n        <style>\n\n        .left-label {\n            width: 30%;\n        }\n\n        .card-pa…

<IPython.core.display.Javascript object>

Unnamed: 0,Config,Value
0,Excluded columns,[]
1,Target variable,DEFAULT
2,Sample weight,
3,Task type,classification
4,Split method,manual
5,Test ratio,0.2
6,Random state,12345


In [18]:
# Re-seed NumPy's global generator, then open PiML's model training step
np.random.seed(12345) 
exp.model_train()

HTML(value='\n        <style>\n\n        .left-label {\n            width: 30%;\n        }\n\n        .card-pa…

<IPython.core.display.Javascript object>

VBox(children=(Box(children=(Box(children=(HTML(value="<h4 style='margin: 10px 0px;'>Choose Model</h4>"), Box(…

In [19]:
# Re-seed NumPy's global generator, then open PiML's model interpretation step
np.random.seed(12345) 
exp.model_interpret()

HTML(value='\n        <style>\n\n        .left-label {\n            width: 30%;\n        }\n\n        .card-pa…

<IPython.core.display.Javascript object>

VBox(children=(Dropdown(layout=Layout(width='20%'), options=('Select Model', 'XGB2', 'XGB2_v2'), style=Descrip…

In [None]:
# Re-seed NumPy's global generator, then open PiML's model diagnostics step
np.random.seed(12345)
exp.model_diagnose()