# Data Preprocessing for Modeling - Finding Model Hyperparameters

### Since the full dataset is too large to run through the model tuning code with the grid method, a smaller sample is created just for the purpose of finding model hyperparameters.

In [18]:
from sklearn.preprocessing import StandardScaler
from pyspark.sql.types import StringType 
from pyspark.sql.functions import col 
import numpy as np
import pandas as pd 

# Seed NumPy's legacy global random generator so NumPy-based randomness is repeatable.
# Note: after this import the bare name `random` refers to numpy.random,
# not to the standard-library `random` module.
from numpy import random 
random.seed(seed=12345)

HTML(value='\n        <style>\n\n        .left-label {\n            width: 30%;\n        }\n\n        .card-pa…

<IPython.core.display.Javascript object>

In [None]:
import pandas as pd
from glob import glob

# Path to all Sample CSV files
path = '/GWSB/home/g35026169/Desktop/Andrew_s Data/Sample_*'

# glob returns matches in arbitrary order. Sort them so the concatenated row
# order -- and therefore the seeded per-year sampling that is drawn from this
# frame afterwards -- is identical on every run and every machine.
files = sorted(glob(path))

# Fail early with the offending pattern instead of pandas' generic
# "No objects to concatenate" error when nothing matched.
if not files:
    raise FileNotFoundError(f"No files matched pattern: {path}")

# Read each CSV file and store them in a list
dfs = [pd.read_csv(file) for file in files]

# Concatenate all DataFrames in the list into one, with a fresh 0..n-1 index
df = pd.concat(dfs, ignore_index=True)

In [None]:
import pandas as pd

# Keep one row per loan sequence number so that each loan is a single sampling unit,
# regardless of how many activity rows it has
unique_loans = df.drop_duplicates(subset=['LOAN SEQUENCE NUMBER'])

# Sample 300 loans from each origination year (seeded for repeatability).
# No stratification by delinquency status is applied here.
# NOTE(review): this presumably requires every OrigYear to contain at least 300
# unique loans, since sampling is without replacement -- confirm for all years.
sampled_loans = unique_loans.groupby(['OrigYear']).sample(n=300,random_state=12345)

# Retrieve all activity rows (not just the de-duplicated one) for the sampled loans
final_sample = df[df['LOAN SEQUENCE NUMBER'].isin(sampled_loans['LOAN SEQUENCE NUMBER'])]

# Persist the sample for the hyperparameter search.
# The DataFrame index is written as well; a later cell drops the resulting 'Unnamed: 0' column.
final_sample.to_csv('/GWSB/home/g35026169/Desktop/sampleforparameter.csv')  # Saving to a CSV file

### The small sample is run through both the XGB1 and XGB2 models to obtain hyperparameters via the grid method

In [19]:
import pandas as pd
from glob import glob

# Path to the sampled-loans CSV file
# NOTE(review): the sampling cell writes this file using an absolute Desktop path,
# while this relative path resolves against the current working directory --
# confirm both point at the same file.
path = 'sampleforparameter.csv'

# The path names a single concrete file (no wildcard), so read it directly.
# A missing file now raises FileNotFoundError naming the path, rather than the
# glob + concat route's "No objects to concatenate" error.
df = pd.read_csv(path)

HTML(value='\n        <style>\n\n        .left-label {\n            width: 30%;\n        }\n\n        .card-pa…

<IPython.core.display.Javascript object>

In [20]:
# Order the records by origination quarter; the index-based train/test split
# further down relies on this ordering.
full_timeseries = df.sort_values(by='OrigDate', ascending=True)

# Drop every column that is excluded from modeling in a single pass.
# 'Unnamed: 0' is presumably the index column written when the sample CSV was saved.
# (CURRENT LOAN DELINQUENCY STATUS was previously cast to str and then dropped;
# the cast had no effect on the result and has been removed.)
full_timeseries = full_timeseries.drop(columns=[
    'LOAN SEQUENCE NUMBER',
    'MONTHLY REPORTING PERIOD',
    'CURRENT ACTUAL UPB',
    'OrigQuarter',
    'CURRENT LOAN DELINQUENCY STATUS',
    'LOAN AGE',
    'Unnamed: 0',
])

# Convert Decimal to Percentage in the % Change in UPB Column
full_timeseries['% Change in UPB'] = full_timeseries['% Change in UPB'] * 100

# Making undefined ELTV values -1: anything non-numeric is coerced to NaN,
# then replaced by the sentinel -1
full_timeseries['ESTIMATED LOAN TO VALUE (ELTV)'] = pd.to_numeric(
    full_timeseries['ESTIMATED LOAN TO VALUE (ELTV)'], errors='coerce'
).fillna(-1)

# Store the continuous features as float32
float32_columns = [
    'CURRENT INTEREST RATE',
    'ESTIMATED LOAN TO VALUE (ELTV)',
    'ORIGINAL INTEREST RATE',
    'index_sa',
    'UNRATE',
    'inflation',
    '% Change in UPB',
]
full_timeseries = full_timeseries.astype({column: np.float32 for column in float32_columns})

# NOTE(review): the data summary further down reports a CREDIT SCORE maximum of 9999,
# which looks like a missing-value sentinel left in the data -- confirm and handle
# it before modeling.

HTML(value='\n        <style>\n\n        .left-label {\n            width: 30%;\n        }\n\n        .card-pa…

<IPython.core.display.Javascript object>

In [21]:
# Preview the first rows of the frame (earliest OrigDate values after the ascending sort)
full_timeseries.head()

HTML(value='\n        <style>\n\n        .left-label {\n            width: 30%;\n        }\n\n        .card-pa…

<IPython.core.display.Javascript object>

Unnamed: 0,CURRENT INTEREST RATE,ESTIMATED LOAN TO VALUE (ELTV),DEFAULT,CREDIT SCORE,FIRST TIME HOMEBUYER FLAG,OCCUPANCY STATUS,ORIGINAL INTEREST RATE,PROPERTY TYPE,LOAN PURPOSE,SELLER NAME,OrigYear,OrigDate,index_sa,UNRATE,inflation,% Change in UPB
392871,8.0,0.5797,0,723,N,P,8.0,PU,P,CHASE MANHATTAN MORTGAGE CORPORATION,2000,2000Q1,221.190002,4.7,3.5,-6.32
405446,8.625,0.6433,0,645,N,P,8.625,SF,C,Other sellers,2000,2000Q1,183.389999,9.5,1.1,-14.46
405445,8.625,0.6371,0,645,N,P,8.625,SF,C,Other sellers,2000,2000Q1,185.169998,9.5,1.1,-14.46
405444,8.625,0.6365,1,645,N,P,8.625,SF,C,Other sellers,2000,2000Q1,185.350006,9.4,1.2,-14.46
405443,8.625,0.6349,0,645,N,P,8.625,SF,C,Other sellers,2000,2000Q1,186.529999,9.4,1.1,-14.14


In [22]:
# Preview the last rows of the frame (latest OrigDate values after the ascending sort)
full_timeseries.tail()

HTML(value='\n        <style>\n\n        .left-label {\n            width: 30%;\n        }\n\n        .card-pa…

<IPython.core.display.Javascript object>

Unnamed: 0,CURRENT INTEREST RATE,ESTIMATED LOAN TO VALUE (ELTV),DEFAULT,CREDIT SCORE,FIRST TIME HOMEBUYER FLAG,OCCUPANCY STATUS,ORIGINAL INTEREST RATE,PROPERTY TYPE,LOAN PURPOSE,SELLER NAME,OrigYear,OrigDate,index_sa,UNRATE,inflation,% Change in UPB
452942,6.5,0.9864,0,806,N,P,6.5,CO,P,"GUARANTEED RATE, INC.",2023,2023Q2,409.0,3.5,3.2,0.0
452941,6.5,0.9947,0,806,N,P,6.5,CO,P,"GUARANTEED RATE, INC.",2023,2023Q2,405.609985,3.6,3.0,0.0
452940,6.5,-1.0,0,806,N,P,6.5,CO,P,"GUARANTEED RATE, INC.",2023,2023Q2,403.440002,3.7,4.0,0.0
452948,6.875,0.9745,0,723,N,P,6.875,SF,P,Other sellers,2023,2023Q2,412.040009,3.8,3.7,-0.47
453089,6.475,-1.0,0,687,N,P,6.475,PU,P,"GUARANTEED RATE, INC.",2023,2023Q2,403.440002,3.7,4.0,0.0


In [23]:
# Row count before null handling; compared with the post-dropna count later on
total_rows = full_timeseries.shape[0]
print(f"Total number of rows in the DataFrame: {total_rows}")

HTML(value='\n        <style>\n\n        .left-label {\n            width: 30%;\n        }\n\n        .card-pa…

<IPython.core.display.Javascript object>

Total number of rows in the DataFrame: 466921


In [24]:
# Count missing values in every column
nulls_per_column = full_timeseries.isna().sum()
print(nulls_per_column)

HTML(value='\n        <style>\n\n        .left-label {\n            width: 30%;\n        }\n\n        .card-pa…

<IPython.core.display.Javascript object>

CURRENT INTEREST RATE              0
ESTIMATED LOAN TO VALUE (ELTV)     0
DEFAULT                            0
CREDIT SCORE                       0
FIRST TIME HOMEBUYER FLAG          0
OCCUPANCY STATUS                   0
ORIGINAL INTEREST RATE             0
PROPERTY TYPE                      0
LOAN PURPOSE                       0
SELLER NAME                        0
OrigYear                           0
OrigDate                           0
index_sa                           0
UNRATE                             0
inflation                          0
% Change in UPB                   99
dtype: int64


In [25]:
# Keep only rows that have a value for % Change in UPB, then re-check the null counts
full_timeseries = full_timeseries[full_timeseries['% Change in UPB'].notna()]
nulls_per_column = full_timeseries.isna().sum()
print(nulls_per_column)

HTML(value='\n        <style>\n\n        .left-label {\n            width: 30%;\n        }\n\n        .card-pa…

<IPython.core.display.Javascript object>

CURRENT INTEREST RATE             0
ESTIMATED LOAN TO VALUE (ELTV)    0
DEFAULT                           0
CREDIT SCORE                      0
FIRST TIME HOMEBUYER FLAG         0
OCCUPANCY STATUS                  0
ORIGINAL INTEREST RATE            0
PROPERTY TYPE                     0
LOAN PURPOSE                      0
SELLER NAME                       0
OrigYear                          0
OrigDate                          0
index_sa                          0
UNRATE                            0
inflation                         0
% Change in UPB                   0
dtype: int64


In [26]:
# Row count after the rows with a missing % Change in UPB were removed
total_rows2 = full_timeseries.shape[0]
print(f"Total number of rows in the DataFrame: {total_rows2}")

HTML(value='\n        <style>\n\n        .left-label {\n            width: 30%;\n        }\n\n        .card-pa…

<IPython.core.display.Javascript object>

Total number of rows in the DataFrame: 466822


In [27]:
# Number of rows removed by the null filter (row count before minus row count after)
row_difference = total_rows - total_rows2
print(f"Difference between total_rows and total_rows2: {row_difference}")

HTML(value='\n        <style>\n\n        .left-label {\n            width: 30%;\n        }\n\n        .card-pa…

<IPython.core.display.Javascript object>

Difference between total_rows and total_rows2: 99


In [28]:
# Sanity check that no literal 'undefined' string remains in the ELTV column.
# The preprocessing cell already coerced this column to numeric (non-numeric -> -1)
# and cast it to float32, so this is expected to print False; the placeholder
# values now appear as -1 instead.
contains_undefined = full_timeseries['ESTIMATED LOAN TO VALUE (ELTV)'].isin(['undefined']).any()

print("Is 'undefined' present in the ELTV column?", contains_undefined)

HTML(value='\n        <style>\n\n        .left-label {\n            width: 30%;\n        }\n\n        .card-pa…

<IPython.core.display.Javascript object>

Is 'undefined' present in the ELTV column? False


In [29]:
# List the dtype of every remaining column to confirm the float32 casts took effect
column_types = full_timeseries.dtypes
print(column_types)

HTML(value='\n        <style>\n\n        .left-label {\n            width: 30%;\n        }\n\n        .card-pa…

<IPython.core.display.Javascript object>

CURRENT INTEREST RATE             float32
ESTIMATED LOAN TO VALUE (ELTV)    float32
DEFAULT                             int64
CREDIT SCORE                        int64
FIRST TIME HOMEBUYER FLAG          object
OCCUPANCY STATUS                   object
ORIGINAL INTEREST RATE            float32
PROPERTY TYPE                      object
LOAN PURPOSE                       object
SELLER NAME                        object
OrigYear                            int64
OrigDate                           object
index_sa                          float32
UNRATE                            float32
inflation                         float32
% Change in UPB                   float32
dtype: object


In [30]:
# Reset the index so row labels 0..n-1 follow the OrigDate sort order
full_timeseries = full_timeseries.reset_index(drop=True)

# Locate the rows originated in 2023. The comparison is done numerically so this
# cell can be re-run after OrigYear has been cast to str below; comparing the
# string column with the integer 2023 would match nothing and break the split.
is_2023 = pd.to_numeric(full_timeseries['OrigYear']) == 2023
if not is_2023.any():
    raise ValueError("No rows with OrigYear == 2023; cannot locate the end of the test window.")

# Last row label that belongs to a 2023 origination
end_test_idx = full_timeseries.index[is_2023.to_numpy()].max()

print(f"End of test index (inclusive): {end_test_idx}")

# Calculate the index that represents 80% of the data up to the end of 2023
split_index = int(end_test_idx * 0.8)

# Print the calculated split index
print(f"Training data ends at index: {split_index}")
print(f"Testing data starts at index: {split_index + 1} and ends at index: {end_test_idx}")

# Cast OrigYear to str before slicing, so train_df / test_df carry the same
# dtype as the full_timeseries frame that is handed to PiML.
full_timeseries['OrigYear'] = full_timeseries['OrigYear'].astype(str)

# Optionally, split the DataFrame into training and testing sets based on the calculated index.
# .loc slicing is label-based and inclusive on both ends.
train_df = full_timeseries.loc[:split_index]
test_df = full_timeseries.loc[split_index + 1:end_test_idx]

HTML(value='\n        <style>\n\n        .left-label {\n            width: 30%;\n        }\n\n        .card-pa…

<IPython.core.display.Javascript object>

End of test index (inclusive): 466821
Training data ends at index: 373456
Testing data starts at index: 373457 and ends at index: 466821


# PiML 

In [31]:
# Create a PiML experiment and load the preprocessed frame into it.
# NOTE(review): silent=True presumably suppresses the loader's preview output --
# confirm against the PiML documentation.
from piml import Experiment
exp = Experiment()
exp.data_loader(data = full_timeseries, silent=True)

HTML(value='\n        <style>\n\n        .left-label {\n            width: 30%;\n        }\n\n        .card-pa…

<IPython.core.display.Javascript object>

In [32]:
# Show PiML's data summary, treating "OrigYear" as a categorical variable and excluding no features

exp.data_summary(feature_exclude=[], feature_type={"OrigYear": "categorical"})

HTML(value='\n        <style>\n\n        .left-label {\n            width: 30%;\n        }\n\n        .card-pa…

<IPython.core.display.Javascript object>

Unnamed: 0,name,n_missing,mean,std,min,q1,median,q3,max
0,CURRENT INTEREST RATE,0,5.2096,1.2885,2.0,4.25,5.0,6.125,10.625
1,ESTIMATED LOAN TO VALUE (ELTV),0,0.7971,0.3025,-1.0,0.717,0.855,0.9626,1.551
2,CREDIT SCORE,0,735.3731,435.0033,300.0,673.0,718.0,765.0,9999.0
3,ORIGINAL INTEREST RATE,0,5.3307,1.2459,2.125,4.375,5.25,6.25,10.625
4,index_sa,0,242.4418,66.8261,136.86,192.35,220.82,269.28,414.98
5,UNRATE,0,5.8731,2.1375,3.4,4.2,5.1,7.3,14.8
6,inflation,0,2.5367,1.9783,-2.1,1.4,2.1,3.3,9.1
7,% Change in UPB,0,-7.7493,13.1241,-100.0,-9.52,-4.4,-1.6,49.27

Unnamed: 0,name,n_missing,n_unique,top1,top2,top3,n_others
0,DEFAULT,0,2,0.0 : 425034,1.0 : 41788,0,0
1,FIRST TIME HOMEBUYER FLAG,0,2,N : 406332,Y : 60490,0,0
2,OCCUPANCY STATUS,0,3,P : 422890,I : 28711,S : 15221,0
3,PROPERTY TYPE,0,6,SF : 372015,PU : 56700,CO : 29711,8396
4,LOAN PURPOSE,0,3,P : 185155,N : 159877,C : 121790,0
5,SELLER NAME,0,81,Other sell : 238879,WELLS FARG : 30795,COUNTRYWID : 18296,178852
6,OrigYear,0,24,2003.0 : 32063,2004.0 : 29120,2005.0 : 28174,377465
7,OrigDate,0,94,2003Q3 : 9055,2005Q1 : 8253,2003Q2 : 8225,441289


Data Shape:(466822, 16)


In [33]:
np.random.seed(12345)

# Build the manual split from the boundaries computed in the split cell instead of
# hard-coded literals. np.arange excludes its stop value, so "+ 1" is required to
# keep the last training row (split_index) and the last test row (end_test_idx);
# without it those two rows would belong to neither set.
custom_train_idx = np.arange(0, split_index + 1)
custom_test_idx = np.arange(split_index + 1, end_test_idx + 1)

# Register DEFAULT as the classification target with the time-ordered manual split
exp.data_prepare(target='DEFAULT', task_type='classification', sample_weight=None,
                train_idx=custom_train_idx, test_idx=custom_test_idx, random_state = 12345)

HTML(value='\n        <style>\n\n        .left-label {\n            width: 30%;\n        }\n\n        .card-pa…

<IPython.core.display.Javascript object>

Unnamed: 0,Config,Value
0,Excluded columns,[]
1,Target variable,DEFAULT
2,Sample weight,
3,Task type,classification
4,Split method,manual
5,Test ratio,0.2
6,Random state,12345


In [34]:
# Re-seed NumPy, then open PiML's model-training interface.
# NOTE(review): called without arguments this renders an interactive "Choose Model"
# widget (see the cell output), so whatever is trained here depends on manual
# selections that are not recorded in code.
np.random.seed(12345)
exp.model_train()

HTML(value='\n        <style>\n\n        .left-label {\n            width: 30%;\n        }\n\n        .card-pa…

<IPython.core.display.Javascript object>

VBox(children=(Box(children=(Box(children=(HTML(value="<h4 style='margin: 10px 0px;'>Choose Model</h4>"), Box(…

In [None]:
np.random.seed(12345)

# Hyperparameter grid for XGB2: three candidate values per parameter,
# i.e. 3 * 3 * 3 * 3 = 81 combinations evaluated by the grid search.
parameters = {'n_estimators': [100, 500, 1000],
              'eta': [0.01, 0.1, 0.5],
              'reg_lambda': [0.0, 0.5, 1.0],
              'reg_alpha': [0.01, 0.5, .99]}

# The experiment was prepared with task_type='classification' (target DEFAULT),
# so candidates are scored with classification metrics. MSE / MAE are regression
# metrics and are not meaningful for ranking a classifier.
result = exp.model_tune("XGB2", method="grid", parameters=parameters, metric=['AUC', 'LogLoss'], test_ratio=0.2, random_state = 12345)

# Display the table of tuning results
result.data

HTML(value='\n        <style>\n\n        .left-label {\n            width: 30%;\n        }\n\n        .card-pa…

<IPython.core.display.Javascript object>