In [102]:
import pickle
from warnings import simplefilter

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn import set_config
from sklearn.experimental import enable_iterative_imputer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, RobustScaler
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import (
    GradientBoostingClassifier, RandomForestClassifier,
    BaggingClassifier, AdaBoostClassifier,
)
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_selection import RFECV
from sklearn.model_selection import StratifiedKFold, train_test_split, GridSearchCV

from util import drop_empty_features, print_metrics


# Make every sklearn transformer return a pandas DataFrame.
set_config(transform_output="pandas")

# Shared seed passed as random_state throughout the notebook.
rs = 5

# Silence FutureWarning messages for the rest of the session.
simplefilter("ignore", FutureWarning)

In [103]:
# Load the raw training table (path is relative to the working directory).
data = pd.read_csv(filepath_or_buffer="data/train.csv")

In [104]:
# Name of the label column and the label values as a Series.
target_label = "completion_status"
target = data.loc[:, target_label]

In [105]:
# Drop empty features
# NOTE(review): drop_empty_features comes from the local `util` module, so its exact
# rule is not visible here -- presumably it removes columns that hold no values.
# `target` was extracted before this call, so it is assumed not to remove rows -- confirm.
data = drop_empty_features(data)

In [106]:
# Make all text lowercase.
# Only genuine strings are lowered: the mask also selects bool columns, and object
# columns may hold non-string values; calling str.lower on such a value raises
# TypeError. Missing values are skipped via na_action='ignore'.
filt = (data.dtypes == "object") | (data.dtypes == "bool")
data.loc[:, filt] = data.loc[:, filt].applymap(
    lambda value: value.lower() if isinstance(value, str) else value,
    na_action='ignore',
)

In [107]:
# Hold out 10% of the rows for testing; the label column is excluded from the features.
feature_mask = data.columns != target_label
features = data.loc[:, feature_mask]

X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.1, random_state=rs
)

In [108]:
# Names of the int64/float64 feature columns in the training split.
num_columns = X_train.select_dtypes(include=["int64", "float64"]).columns.tolist()

In [109]:
# Encode ordinal data with the ordered categories, then impute

# Each ordinal column paired with its category order; a category's position in
# the list is the value the encoder assigns to it.
ord_categories = {
    'RATE_owner_1': ['a', 'b', 'c', 'r', 'd'],
    'RATE_ID_FOR_years_in_business': ['a', 'b', 'c', 'd'],
    'RATE_ID_FOR_judgement_lien_percent': ['a', 'b', 'c', 'd'],
    'RATE_ID_FOR_num_negative_days': ['a', 'b', 'c', 'd'],
    'RATE_ID_FOR_num_deposits': ['a', 'b', 'c', 'r'],
    'RATE_ID_FOR_current_position': ['a', 'c', 'd'],
    'RATE_owner_4': ['a', 'b', 'c', 'r', 'd'],
}
ord_columns = list(ord_categories)
categories = list(ord_categories.values())

# Values outside the listed categories are encoded as -1.
ord_enc = OrdinalEncoder(
    handle_unknown='use_encoded_value',
    unknown_value=-1,
    categories=categories,
)

# Missing entries are filled from the single nearest neighbour.
knn_imp = KNNImputer(n_neighbors=1)

ord_knn_pipeline = Pipeline(steps=[('ord_enc', ord_enc), ('knn_imp', knn_imp)])

# Fit on the training split only; the test split reuses the fitted pipeline.
ord_train = ord_knn_pipeline.fit_transform(X_train[ord_columns])
X_train[ord_columns] = ord_train

ord_test = ord_knn_pipeline.transform(X_test[ord_columns])
X_test[ord_columns] = ord_test

In [110]:
# Persist the fitted ordinal pipeline.
with open("./saves/objects/ord_pipeline.p", mode="wb") as out_file:
    pickle.dump(ord_knn_pipeline, out_file)

In [111]:
# Impute binary data with the most frequent value, then encode

bin_columns = [
    'funded_last_30',
    'RATE_ID_FOR_judgement_lien_amount',
    'RATE_ID_FOR_monthly_gross',
    'RATE_ID_FOR_average_ledger',
    'RATE_ID_FOR_fc_margin',
    'RATE_ID_FOR_tax_lien_amount',
    'RATE_ID_FOR_tax_lien_percent',
    'RATE_ID_FOR_tax_lien_count',
]

simple_imp = SimpleImputer(strategy='most_frequent')

# drop='if_binary' keeps a single indicator column for a two-category feature.
bin_enc = OneHotEncoder(
    sparse_output=False,
    handle_unknown='ignore',
    drop='if_binary',
)

simple_bin_pipeline = Pipeline(steps=[('simple_imp', simple_imp), ('bin_enc', bin_enc)])

# Fit on the training split only; the encoded columns are written back under the
# original column names.
bin_train = simple_bin_pipeline.fit_transform(X_train[bin_columns])
X_train[bin_columns] = bin_train

bin_test = simple_bin_pipeline.transform(X_test[bin_columns])
X_test[bin_columns] = bin_test

In [112]:
# Persist the fitted binary pipeline.
with open("./saves/objects/bin_pipeline.p", mode="wb") as out_file:
    pickle.dump(simple_bin_pipeline, out_file)

In [113]:
# Impute nominal data with the most frequent value, then one-hot encode

oh_columns = ['location', 'INPUT_VALUE_ID_FOR_industry_type']

simple_imp2 = SimpleImputer(strategy='most_frequent')

oh_enc = OneHotEncoder(
    sparse_output=False,
    handle_unknown='ignore',
    drop='first',
)

simple_oh_pipeline = Pipeline(steps=[('simple_imp2', simple_imp2), ('oh_enc', oh_enc)])

# Replace the raw nominal columns with their encoded columns. The pipeline is
# fitted on the training split only and reused for the test split.
oh_data = simple_oh_pipeline.fit_transform(X_train[oh_columns])
X_train = pd.concat([X_train.drop(columns=oh_columns), oh_data], axis=1)

oh_data = simple_oh_pipeline.transform(X_test[oh_columns])
X_test = pd.concat([X_test.drop(columns=oh_columns), oh_data], axis=1)

In [114]:
# Persist the fitted one-hot pipeline.
with open("./saves/objects/oh_pipeline.p", mode="wb") as out_file:
    pickle.dump(simple_oh_pipeline, out_file)

In [115]:
# Impute numerical data, then scale

knn_imp2 = KNNImputer()

# RobustScaler takes quantile_range in percent (0-100). (3.0, 97.0) scales each
# column by its 3rd-97th percentile spread; a fractional pair such as (0.03, 0.97)
# would instead pick the 0.03rd-0.97th percentiles, a sliver at the bottom of the data.
robust_scaler = RobustScaler(quantile_range=(3.0, 97.0))

knn_robust_pipeline = Pipeline(
    [
        ('knn_imp2', knn_imp2),
        ('robust_scaler', robust_scaler),
    ]
)

# Fit on the training split only; the test split reuses the fitted pipeline.
X_train[num_columns] = knn_robust_pipeline.fit_transform(X_train[num_columns])

X_test[num_columns] = knn_robust_pipeline.transform(X_test[num_columns])

In [116]:
# Persist the fitted numeric pipeline.
with open("./saves/objects/num_pipeline.p", mode="wb") as out_file:
    pickle.dump(knn_robust_pipeline, out_file)

In [117]:
# Persist each column-name list to its own pickle file.
column_list_files = [
    ('./saves/objects/num_columns.p', num_columns),
    ('./saves/objects/ord_columns.p', ord_columns),
    ('./saves/objects/bin_columns.p', bin_columns),
    ('./saves/objects/oh_columns.p', oh_columns),
]

for save_path, column_names in column_list_files:
    with open(save_path, 'wb') as out_file:
        pickle.dump(column_names, out_file)

In [118]:
# Drop univariate column from both splits
univariate_columns = ['RATE_ID_FOR_location']
X_train = X_train.drop(columns=univariate_columns)
X_test = X_test.drop(columns=univariate_columns)

In [119]:
# Perform Recursive Feature Elimination to choose the optimal features

# One feature is removed per step; candidates are scored by accuracy over
# 5 stratified folds, with an AdaBoost classifier as the ranking estimator.
clf = AdaBoostClassifier(random_state=rs)
cv = StratifiedKFold(n_splits=5)

rfecv = RFECV(estimator=clf, step=1, cv=cv, scoring="accuracy", n_jobs=-1)
rfecv.fit(X_train, y_train)

# Keep only the columns flagged by the fitted selector, in both splits.
selected_mask = rfecv.support_
X_train = X_train.loc[:, selected_mask]
X_test = X_test.loc[:, selected_mask]

In [120]:
# Persist the names of the features that survived selection.
final_columns = list(X_train.columns)
with open('./saves/objects/final_columns.p', mode='wb') as out_file:
    pickle.dump(final_columns, out_file)

In [121]:
# RandomForest

rfc = RandomForestClassifier(random_state=rs)

# Candidate forest sizes and tree depths.
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [5, 10, 25],
}

# 2-fold grid search over every combination in param_grid.
grid_search = GridSearchCV(rfc, param_grid, cv=2, verbose=3)
grid_search.fit(X_train, y_train)

# Keep the best refitted forest and score it on the held-out split.
rfc = grid_search.best_estimator_
y_pred = rfc.predict(X_test)
print_metrics(y_test, y_pred)

Fitting 2 folds for each of 9 candidates, totalling 18 fits
[CV 1/2] END .....max_depth=5, n_estimators=100;, score=0.638 total time=   0.4s
[CV 2/2] END .....max_depth=5, n_estimators=100;, score=0.665 total time=   0.4s
[CV 1/2] END .....max_depth=5, n_estimators=200;, score=0.636 total time=   0.8s
[CV 2/2] END .....max_depth=5, n_estimators=200;, score=0.663 total time=   0.8s
[CV 1/2] END .....max_depth=5, n_estimators=300;, score=0.644 total time=   1.1s
[CV 2/2] END .....max_depth=5, n_estimators=300;, score=0.668 total time=   1.1s
[CV 1/2] END ....max_depth=10, n_estimators=100;, score=0.815 total time=   0.5s
[CV 2/2] END ....max_depth=10, n_estimators=100;, score=0.844 total time=   0.5s
[CV 1/2] END ....max_depth=10, n_estimators=200;, score=0.822 total time=   0.9s
[CV 2/2] END ....max_depth=10, n_estimators=200;, score=0.853 total time=   0.9s
[CV 1/2] END ....max_depth=10, n_estimators=300;, score=0.816 total time=   1.3s
[CV 2/2] END ....max_depth=10, n_estimators=300;,

In [122]:
# Persist the tuned random forest.
with open("./saves/objects/random_forest_model.p", mode="wb") as out_file:
    pickle.dump(rfc, out_file)

In [123]:
# AdaBoostClassifier with a DecisionTreeClassifier base
# The base learner is passed as `estimator`: `base_estimator` is the deprecated
# spelling (removed in scikit-learn 1.4), and nested grid keys use the same prefix.
param_grid = {
    'estimator__max_depth': [5, 10, 25],
    'n_estimators': [100, 200, 300]
}

# Define the base classifier
base_classifier = DecisionTreeClassifier(random_state=rs)

# Define the boosting classifier
boosting_classifier = AdaBoostClassifier(estimator=base_classifier, random_state=rs)

# Perform a 2-fold grid search on the boosting classifier
grid_search = GridSearchCV(estimator=boosting_classifier, param_grid=param_grid, cv=2, verbose=3)
grid_search.fit(X_train, y_train)

# Get the best boosting classifier
boosting_classifier = grid_search.best_estimator_

# Make predictions on the held-out split using the best boosting classifier
y_pred = boosting_classifier.predict(X_test)

print_metrics(y_test, y_pred)

Fitting 2 folds for each of 9 candidates, totalling 18 fits
[CV 1/2] END base_estimator__max_depth=5, n_estimators=100;, score=0.853 total time=   1.1s
[CV 2/2] END base_estimator__max_depth=5, n_estimators=100;, score=0.876 total time=   1.1s
[CV 1/2] END base_estimator__max_depth=5, n_estimators=200;, score=0.870 total time=   2.3s
[CV 2/2] END base_estimator__max_depth=5, n_estimators=200;, score=0.881 total time=   2.3s
[CV 1/2] END base_estimator__max_depth=5, n_estimators=300;, score=0.872 total time=   3.4s
[CV 2/2] END base_estimator__max_depth=5, n_estimators=300;, score=0.887 total time=   3.7s
[CV 1/2] END base_estimator__max_depth=10, n_estimators=100;, score=0.886 total time=   1.5s
[CV 2/2] END base_estimator__max_depth=10, n_estimators=100;, score=0.892 total time=   1.4s
[CV 1/2] END base_estimator__max_depth=10, n_estimators=200;, score=0.888 total time=   2.8s
[CV 2/2] END base_estimator__max_depth=10, n_estimators=200;, score=0.896 total time=   2.8s
[CV 1/2] END bas

In [124]:
# Persist the tuned AdaBoost model.
with open("./saves/objects/adaboost_model.p", mode="wb") as out_file:
    pickle.dump(boosting_classifier, out_file)

In [125]:
# BaggingClassifier with a DecisionTreeClassifier base
# The base learner is passed as `estimator`: `base_estimator` is the deprecated
# spelling (removed in scikit-learn 1.4), and nested grid keys use the same prefix.

param_grid = {
    'estimator__max_depth': [5, 10, 25],
    'n_estimators': [100, 200, 300]
}

# Define the base classifier
base_classifier = DecisionTreeClassifier(random_state=rs)

# Define the bagging classifier (n_estimators here is overridden by the grid search)
bagging_classifier = BaggingClassifier(estimator=base_classifier, n_estimators=100, random_state=rs)

# Perform a 2-fold grid search on the bagging classifier
grid_search = GridSearchCV(estimator=bagging_classifier, param_grid=param_grid, cv=2, verbose=3)
grid_search.fit(X_train, y_train)

# Get the best bagging classifier
bagging_classifier = grid_search.best_estimator_

# Make predictions on the held-out split using the best bagging classifier
y_pred = bagging_classifier.predict(X_test)

print_metrics(y_test, y_pred)

Fitting 2 folds for each of 9 candidates, totalling 18 fits
[CV 1/2] END base_estimator__max_depth=5, n_estimators=100;, score=0.692 total time=   0.7s
[CV 2/2] END base_estimator__max_depth=5, n_estimators=100;, score=0.699 total time=   0.7s
[CV 1/2] END base_estimator__max_depth=5, n_estimators=200;, score=0.679 total time=   1.4s
[CV 2/2] END base_estimator__max_depth=5, n_estimators=200;, score=0.709 total time=   1.4s
[CV 1/2] END base_estimator__max_depth=5, n_estimators=300;, score=0.677 total time=   2.0s
[CV 2/2] END base_estimator__max_depth=5, n_estimators=300;, score=0.704 total time=   2.1s
[CV 1/2] END base_estimator__max_depth=10, n_estimators=100;, score=0.850 total time=   0.9s
[CV 2/2] END base_estimator__max_depth=10, n_estimators=100;, score=0.848 total time=   0.9s
[CV 1/2] END base_estimator__max_depth=10, n_estimators=200;, score=0.850 total time=   1.9s
[CV 2/2] END base_estimator__max_depth=10, n_estimators=200;, score=0.853 total time=   1.8s
[CV 1/2] END bas

In [126]:
# Persist the tuned bagging model.
with open("./saves/objects/bagging_model.p", mode="wb") as out_file:
    pickle.dump(bagging_classifier, out_file)

In [127]:
# GradientBoostingClassifier

gb_clf = GradientBoostingClassifier(random_state=rs)

# Candidate learning rates and ensemble sizes.
param_grid = {
    'learning_rate': [0.01, 0.8],
    'n_estimators': [100, 200],
}

# 2-fold grid search over every combination in param_grid.
grid_search = GridSearchCV(gb_clf, param_grid, cv=2, verbose=3)
grid_search.fit(X_train, y_train)

# Keep the best refitted gradient boosting classifier
gb_clf = grid_search.best_estimator_

# Score the best gradient boosting classifier on the held-out split
y_pred = gb_clf.predict(X_test)
print_metrics(y_test, y_pred)

Fitting 2 folds for each of 4 candidates, totalling 8 fits
[CV 1/2] END learning_rate=0.01, n_estimators=100;, score=0.589 total time=   2.0s
[CV 2/2] END learning_rate=0.01, n_estimators=100;, score=0.623 total time=   1.9s
[CV 1/2] END learning_rate=0.01, n_estimators=200;, score=0.704 total time=   3.9s
[CV 2/2] END learning_rate=0.01, n_estimators=200;, score=0.682 total time=   3.8s
[CV 1/2] END learning_rate=0.8, n_estimators=100;, score=0.893 total time=   1.8s
[CV 2/2] END learning_rate=0.8, n_estimators=100;, score=0.889 total time=   1.9s
[CV 1/2] END learning_rate=0.8, n_estimators=200;, score=0.904 total time=   3.4s
[CV 2/2] END learning_rate=0.8, n_estimators=200;, score=0.904 total time=   3.4s
Accuracy: 98.34 %
Precision: 98.97 %
Recall: 97.92 %
F1 score: 98.39 %
Confusion matrix:
[[55  0  0  0]
 [ 0 84  0  0]
 [ 1  2 33  0]
 [ 0  0  0  6]]



In [128]:
# Persist the tuned gradient boosting model.
with open("./saves/objects/gradientboost_model.p", mode="wb") as out_file:
    pickle.dump(gb_clf, out_file)