In [1]:
import numpy as np
import pandas as pd

path_to_csv = "../data/mortgage_transformed.csv"
df = pd.read_csv(path_to_csv)
df.set_index(df.pop("Unnamed: 0"), inplace=True)

X = df.dropna(axis=0)
y = X.pop("result")

# A model that incorporates what we've learned thus far

We will use most features from the original datasets, while adding new ones. We will also try to normalize the data to improve model performance

In [59]:
# custom features
X["time_since_start"] = X.time - X.orig_time
X["time_until_end"] = X.mat_time - X.time
X["mortgage_duration"] = X.mat_time - X.orig_time
X["balance_change"] = X.balance_time-X.balance_orig_time
X["LTV_change"] = X.LTV_time - X.LTV_orig_time
X["interest_change"] = X.interest_rate_time - X.Interest_Rate_orig_time
# X["interest_change_as_fraction"] = X.interest_change / X.Interest_Rate_orig_time
X["hpi_change"] = X.hpi_time - X.hpi_orig_time

In [60]:
# select useful columns
used_columns = [
    "time_since_start", "time_until_end", "mortgage_duration",
    "balance_time", "balance_orig_time", "balance_change",
    "LTV_time", "LTV_orig_time", "LTV_change",
    "interest_rate_time", "Interest_Rate_orig_time", "interest_change",
    #"interest_change_as_fraction",
    "hpi_time", "hpi_orig_time",
    "hpi_change", "uer_time"
]

In [75]:
# normalize the data
from sklearn.preprocessing import normalize
X_data = X.loc[:, used_columns]
X_norm = normalize(X_data)

In [83]:
# split the data for testing the model
from sklearn.model_selection import train_test_split

X_train, X_valid, y_train, y_valid = train_test_split(
    X_data, y,
    train_size=0.7,
    test_size=0.3)

In [79]:
# we will try to select a better model using accuracy scoring
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

def score_model(X_train, y_train, X_valid, y_valid, n_estimators, learning_rate=0.1):
    model = XGBClassifier(
                n_estimators = n_estimators,
                learning_rate = learning_rate,
                n_jobs = 5,
                eval_metric = 'mlogloss'
            )
    
    model.fit(X_train, y_train)
    predictions = model.predict(X_valid)
    
    accuracy = accuracy_score(predictions, y_valid)
    return accuracy

In [82]:
# shut up warnings
import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')

# we will try different number of estimators to see, which is the best fit
estimator_count = [10, 25, 36, 50, 100]

last_accuracy = 0
for n_estimators in estimator_count:
    accuracy = score_model(
        X_train, y_train, X_valid, y_valid,
        n_estimators=n_estimators,
        learning_rate=0.5 # let's speed things up
    )
    print(f"The accuracy with n_estimators is {n_estimators} with an accuracy of {accuracy}")

The accuracy with n_estimators is 10 with an accuracy of 0.6596495095337815
The accuracy with n_estimators is 25 with an accuracy of 0.6605312465557147
The accuracy with n_estimators is 36 with an accuracy of 0.6578309269260443
The accuracy with n_estimators is 50 with an accuracy of 0.6552959329879864
The accuracy with n_estimators is 100 with an accuracy of 0.6543039788383115


In [157]:
some_model = XGBClassifier(
    n_estimators = 24,
    learning_rate = 0.3,
    reg_lambda=0.9,
    reg_alpha=0.1,
    objective='reg:logistic',
    eval_metric = 'error',
)
some_model.fit(X_train, y_train)
predictions = some_model.predict(X_valid)
accuracy = accuracy_score(predictions, y_valid)
print(f"Accuracy is {accuracy}")

Accuracy is 0.6707814394356884
