# ML Classifiers: KNN Optimization

### Contents:

**Machine Learning Classifiers:** KNN

**Feature Selection:** Kendall's tau

**Resampling Method:** Random Oversampling

In [1]:
%%time

# import necessary dependencies
import os
import re
import gc

import time

import pandas as pd
import numpy as np
pd.set_option("display.max_rows", 200)
pd.set_option("display.max_columns", None)
np.random.seed(42)

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set_theme(style = "whitegrid", palette = "pastel")

from warnings import simplefilter
simplefilter(action = "ignore", category = FutureWarning)
simplefilter(action = "ignore", category = DeprecationWarning)

# _______________________________________________________________ #

from sklearn.model_selection import train_test_split, KFold, GridSearchCV

from imblearn.over_sampling import RandomOverSampler

from sklearn.preprocessing import StandardScaler, label_binarize

from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, balanced_accuracy_score
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import roc_auc_score, roc_curve, auc
from sklearn.metrics import confusion_matrix

CPU times: total: 2.3 s
Wall time: 2.8 s


In [2]:
from sklearn.neighbors import KNeighborsClassifier

In [3]:
%%time

# config directories
# Raw strings keep the Windows backslashes literal. In a normal string,
# sequences such as "\F", "\#" and "\S" are invalid escapes that newer Python
# versions warn about; the resulting path values are unchanged.
detected_dir = r"F:\FURP\#Single Family Loan Level Classification\Sample_orig\Mean-Target_Detected_Samples"

plots_dir = r"F:\FURP\#Single Family Loan Level Classification\Sample_orig\Opt_Plots"

# config file paths
# os.listdir returns entries in arbitrary order; sorting fixes the order in
# which the files are concatenated, so the seeded train/test split sees the
# same row order on every run.
detected_paths = sorted(os.listdir(detected_dir))

# global settings shared by the rest of the notebook
model_name = "KNN+Kendall+Random-Optimized"

# Feature columns selected as model inputs. The plain columns are listed
# explicitly; the "<column>_1" .. "<column>_3" columns are generated per
# prefix, in the same order as the prefixes appear below.
kbest = [
    "First Payment Date",
    "Maturity Date",
    "Metropolitan Statistical Area (MSA) Or Metropolitan Division",
    "Mortgage Insurance Percentage (MI %)",
    "Number of Units",
    "Original Combined Loan-to-Value (CLTV)",
    "Original Debt-to-Income (DTI) Ratio",
    "Original UPB",
    "Original Loan-to-Value (LTV)",
    "Original Interest Rate",
    "Postal Code",
    "Original Loan Term",
    "Number of Borrowers",
    "Property Valuation Method",
    "First Payment Year",
    "Maturity Year",
    "Month Gap",
] + [
    f"{prefix}_{level}"
    for prefix in (
        "First Time Homebuyer Flag",
        "Occupancy Status",
        "Channel",
        "Prepayment Penalty Mortgage (PPM) Flag",
        "Property Type",
        "Loan Purpose",
        "Seller Name",
        "Servicer Name",
        "Program Indicator",
    )
    for level in (1, 2, 3)
]

CPU times: total: 0 ns
Wall time: 998 µs


In [4]:
%%time

# walk through the files
# Read every sample file first and concatenate once: calling pd.concat inside
# the loop re-copies all rows accumulated so far on every iteration.
if not detected_paths:
    raise FileNotFoundError(f"No sample files found in {detected_dir!r}")

frames = []
for fileName in detected_paths:
    frames.append(pd.read_parquet(os.path.join(detected_dir, fileName)))

df = pd.concat(frames)
del frames
gc.collect()

# data preprocessing
# kbest = None means "use every column except the target".
if kbest is None:
    X = df.drop("Credit Rank", axis = 1)
else:
    X = df[kbest]

Y = df["Credit Rank"]

del df
gc.collect()
X_cols = X.columns

# train test split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y,
                                                    test_size = 0.2,
                                                    random_state = 42)
del X
del Y
gc.collect()

# apply random oversampling
# Only the training part is resampled; the test part keeps its original
# class distribution.
ros = RandomOverSampler(random_state = 42)
X_train, Y_train = ros.fit_resample(X_train, Y_train)

# apply feature scaling
# The scaler is fitted on the (oversampled) training data only and then
# applied unchanged to the test data.
sc = StandardScaler()
sc_scaler = sc.fit(X_train)
X_train = sc_scaler.transform(X_train)
X_train = pd.DataFrame(X_train, columns = X_cols)

# NOTE: the rebuilt X_test gets a fresh default index while Y_test keeps the
# index labels it had before the split, so the two should be compared by
# position, not aligned by index.
X_test = sc_scaler.transform(X_test)
X_test = pd.DataFrame(X_test, columns = X_cols)

CPU times: total: 10.1 s
Wall time: 10.3 s


In [None]:
%%time

# KNN Classifier
# n_jobs = -1 lets the neighbour queries use every available core. It only
# affects how the search is executed, not which neighbours are found, so the
# scores are unchanged. The recorded single-threaded run needed roughly
# 72 minutes per fit for p=1.
knn_clf = KNeighborsClassifier(n_jobs = -1)

# Candidate settings: k and the weighting scheme are fixed, only the Minkowski
# power p varies (1 = Manhattan, 2 = Euclidean).
param_grid = {
    "n_neighbors": [3],
    "weights": ["distance"],
    "p": [1, 2, 3, 4]
}

# NOTE(review): X_train was randomly oversampled before this search, so copies
# of the same row can end up in both the fitting and the validation part of a
# fold. With weights="distance" an exact copy is a zero-distance neighbour, so
# the cross-validated accuracy is likely optimistic. Consider resampling inside
# each fold (e.g. an imblearn Pipeline passed to GridSearchCV) - TODO confirm.
# cv is left at its default; the log of the recorded run reports 5 folds.
knn_opt = GridSearchCV(
    knn_clf,
    param_grid,
    scoring = "accuracy",
    verbose = 3
)

knn_opt.fit(X_train, Y_train)

Fitting 5 folds for each of 4 candidates, totalling 20 fits
[CV 1/5] END n_neighbors=3, p=1, weights=distance;, score=0.845 total time=71.9min
[CV 2/5] END n_neighbors=3, p=1, weights=distance;, score=0.844 total time=72.2min
[CV 3/5] END n_neighbors=3, p=1, weights=distance;, score=0.868 total time=72.2min
[CV 4/5] END n_neighbors=3, p=1, weights=distance;, score=0.872 total time=72.3min
[CV 5/5] END n_neighbors=3, p=1, weights=distance;, score=0.872 total time=72.3min
[CV 1/5] END n_neighbors=3, p=2, weights=distance;, score=0.843 total time=15.3min
[CV 2/5] END n_neighbors=3, p=2, weights=distance;, score=0.844 total time=15.2min
[CV 3/5] END n_neighbors=3, p=2, weights=distance;, score=0.866 total time=15.3min
[CV 4/5] END n_neighbors=3, p=2, weights=distance;, score=0.871 total time=15.3min
[CV 5/5] END n_neighbors=3, p=2, weights=distance;, score=0.871 total time=15.2min


In [None]:
%%time

# Summarise the grid search: the tunable parameter names, the winning
# combination, and the search object itself, each separated by a blank line.
print(knn_opt.get_params().keys(), end = "\n\n")
print(knn_opt.best_params_, end = "\n\n")
print(knn_opt)

---