In [1]:
# ! pip install xgboost category_encoders

# ETL

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.preprocessing import RobustScaler, FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import (
    classification_report,
    plot_precision_recall_curve,
    precision_recall_curve,
    average_precision_score,
)
from xgboost import XGBClassifier
from category_encoders.woe import WOEEncoder
from sklearn.preprocessing import RobustScaler
import numpy as np

In [3]:
# Load the two raw inputs: one table of applications and one table of credit records.
application = pd.read_parquet("data/application.parquet")
credit_record = pd.read_parquet("data/credit_record.parquet")

## Creating label

### Month balance & status

Month balance: 

Months are counted backwards from the month in which the data was extracted: 0 is the current month, -1 is the previous month, and so on.

Status:

- 0: 1-29 days past due
- 1: 30-59 days past due
- 2: 60-89 days overdue
- 3: 90-119 days overdue
- 4: 120-149 days overdue
- 5: overdue or bad debts, write-offs for more than 150 days
- C: paid off that month
- X: no loan for the month

<font color="red">I will assume that any payment overdue by 60 or more days corresponds to a default.</font>

In [4]:
# Every record whose status is anything other than '0', '1', 'C' or 'X' counts as a default;
# collect the distinct customer IDs that have at least one such record.
default_records = credit_record.query("STATUS not in ['0','1','C','X']")
defaulted_user_ids = default_records["ID"].unique()

In [5]:
# One row per customer: keep the last record found for each ID and renumber the index from 0.
credit_record_unique = (
    credit_record
    .drop_duplicates(subset=["ID"], keep="last", ignore_index=True)
    .copy()
)

In [None]:
# LABEL is 1 for customers with at least one default record, 0 otherwise.
# It must be derived from this frame's own ID column: a Series built from `credit_record["ID"]`
# carries the row index of the full history, which no longer lines up with the de-duplicated
# frame (its index was reset by `ignore_index=True`), so labels would land on the wrong IDs.
credit_record_unique["LABEL"] = credit_record_unique["ID"].isin(defaulted_user_ids).astype(int)

In [None]:
# Share of each LABEL value among the unique customers (fractions, not counts).
credit_record_unique["LABEL"].value_counts(normalize=True)

### Merge applications with credit records

In [None]:
# Attach the label to each application; the inner join keeps only IDs present in both tables.
customer_labels = credit_record_unique[["ID", "LABEL"]]
df = pd.merge(application, customer_labels, how="inner", on="ID")

## Feature types

In [None]:
# Columns that are never used as model inputs: the row identifier and the target.
excluded_features = ["ID", "LABEL"]

In [None]:
# Features kept out of the numeric/categorical bucketing; none are defined at the moment.
special_features = list()

In [None]:
# Bucket every candidate column (all but the first and last column of `df`) by dtype:
# object columns are treated as categorical, everything else as numeric.
candidate_columns = [
    column
    for column in df.iloc[:, 1:-1].columns
    if not (column in excluded_features or column in special_features)
]
is_object_column = {column: df[column].dtype == "object" for column in candidate_columns}
categorical_features = [column for column in candidate_columns if is_object_column[column]]
numeric_features = [column for column in candidate_columns if not is_object_column[column]]

In [None]:
# Display the columns that were bucketed as numeric.
numeric_features

In [None]:
# Display the columns that were bucketed as categorical (object dtype).
categorical_features

In [None]:
# Full list of model inputs, in the order numeric -> categorical -> special.
features = [*numeric_features, *categorical_features, *special_features]

## Split datasets

In [None]:
# Positional split: predictors are every column between the first and the last one,
# the target is the last column (assumes ID comes first and LABEL last — confirm).
X, y = df.iloc[:, 1:-1], df.iloc[:, -1]

In [None]:
# Hold out 15% of the rows for testing; the fixed seed makes the split reproducible.
# NOTE(review): the split is not stratified on y — consider `stratify=y` if the positive class is rare.
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.15, random_state=42)

In [None]:
# Class balance of the training target (fractions, not counts).
y_train.value_counts(normalize=True)

In [None]:
# Hyper-parameter candidates sampled by the randomized search.
param_grid = {
    "learning_rate": [0.0001, 0.001, 0.01, 0.1, 0.2, 0.3],
    "min_child_weight": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    "gamma": [0.5, 1, 1.5, 2, 5],
    # Explicit candidates covering [0.7, 1.0]. Drawing them with `np.random.uniform(1, .7, 1)`
    # produced a single, unseeded value at definition time, so `subsample` was never actually
    # searched and its value changed from one run to the next.
    "subsample": [0.7, 0.8, 0.9, 1.0],
    "colsample_bytree": [0.3, 0.4, 0.5, 0.7],
    "max_depth": [3, 4, 5, 6, 7],
    "n_estimators": np.arange(100, 500, 10),
}

## Additional definitions

In [None]:
def replace_values_in_string(text, args_dict):
    """Replace every occurrence of each key of ``args_dict`` in ``text`` with its value.

    All keys are substituted in a single pass and the longest matching key wins, so a key
    that is a prefix of another one (``"x1"`` vs ``"x10"``) cannot corrupt the longer key,
    and an inserted value is never scanned again for further keys.

    Args:
        text (str): String in which to perform the substitutions.
        args_dict (dict): Mapping of substring -> replacement; values are converted with ``str``.
    Returns:
        str: ``text`` with the substitutions applied (returned unchanged when ``args_dict`` is empty).
    Raises:
        None.
    """
    import re  # local import keeps this helper usable on its own

    if not args_dict:
        return text
    # Regex alternation picks the first alternative that matches, hence longest keys first.
    keys_longest_first = sorted(args_dict, key=len, reverse=True)
    pattern = "|".join(re.escape(key) for key in keys_longest_first)
    return re.sub(pattern, lambda match: str(args_dict[match.group(0)]), text)

In [None]:
class ModifiedColumnTransformer(ColumnTransformer):
    """ColumnTransformer that also records the column names produced by its transformations.

    The names are stored in ``final_features`` every time the instance is fitted
    (``fit`` or ``fit_transform``); the attribute is ``None`` until then.

    Args:
        transformers (list): List of transformers that are going to be set for the ColumnTransformer inheriting parent
    Returns:
        None.
    Raises:
        None.
    """
    def __init__(self, transformers):
        super().__init__(transformers=transformers)
        self.final_features = None

    def fit(self, X, y=None):
        """Fit the parent ColumnTransformer, refresh ``final_features`` and return ``self``.

        Returning the instance follows the scikit-learn estimator convention, so chained
        calls such as ``transformer.fit(X).transform(X)`` work.
        """
        super().fit(X, y=y)
        self.final_features = ModifiedColumnTransformer.get_all_column_names(self)
        return self

    def transform(self, X, y=None):
        """Apply the fitted transformers to ``X``; ``y`` is accepted but not used."""
        return super().transform(X)

    def fit_transform(self, X, y=None):
        """Fit on ``X``, refresh ``final_features`` and return the transformed data."""
        result = super().fit_transform(X, y=y)
        self.final_features = ModifiedColumnTransformer.get_all_column_names(self)
        return result

    @staticmethod
    def get_all_column_names(column_transformer) -> list:
        """Extracts the name of the resulting columns of a ColumnTransformer after all the transformations
        Args:
            column_transformer (ColumnTranformer): ColumnTransformer fitted instance from which to extract the column
                names
        Returns:
            col_name (list): List containing the column names based on the order of the ColumnTransformer transformers
        Raises:
            None.
        """
        col_name = []
        for transformer_in_columns in column_transformer.transformers_:
            fitted = transformer_in_columns[1]
            raw_col_name = transformer_in_columns[2]
            # Columns routed to "drop" (e.g. the remainder) never reach the output, so they
            # must not contribute any name.
            if isinstance(fitted, str) and fitted == "drop":
                continue
            # For a pipeline, the last step is the one that decides the output names.
            transformer = fitted.steps[-1][1] if isinstance(fitted, Pipeline) else fitted
            names = ModifiedColumnTransformer._resolve_feature_names(transformer, raw_col_name)
            if isinstance(names, np.ndarray):
                col_name += names.tolist()
            elif isinstance(names, list):
                col_name += names
            elif isinstance(names, str):
                col_name.append(names)
        return col_name

    @staticmethod
    def _resolve_feature_names(transformer, raw_col_name):
        """Return the output names of one transformer, falling back to its raw input columns."""
        try:
            names = transformer.get_feature_names()
        except AttributeError:  # if no 'get_feature_names' function, use raw column name
            return raw_col_name
        # Names that are not strings cannot contain the "x<i>" placeholders; use the raw columns instead.
        if not all(isinstance(name, str) for name in names):
            return raw_col_name
        # A single column given as a string must be treated as one column, not as its characters.
        columns = [raw_col_name] if isinstance(raw_col_name, str) else raw_col_name
        # Register the highest positions first so that "x10" is substituted before its prefix "x1".
        placeholders = {
            f"x{position}": column
            for position, column in reversed(list(enumerate(columns)))
        }
        return [replace_values_in_string(name, placeholders) for name in names]

## Pipeline

In [None]:
# Selector that keeps only the modelling features and drops every other column.
included_fields_transformer = Pipeline(
    [
        (
            "selector",
            ColumnTransformer(
                [("selector", "passthrough", features)],
                remainder="drop",
            ),
        ),
    ]
)

# Categorical branch: the imputer swaps NaN for NaN (missing values are left as they are),
# then the columns are weight-of-evidence encoded.
categorical_transformer = Pipeline(
    [
        ("imputer", SimpleImputer(missing_values=np.nan, strategy="constant", fill_value=np.nan)),
        ("encoder", WOEEncoder()),
    ]
)

# Numeric branch: same NaN-for-NaN imputer, no further transformation.
numeric_transformer = Pipeline(
    [
        ("imputer", SimpleImputer(missing_values=np.nan, strategy="constant", fill_value=np.nan)),
    ]
)

# Route each group of columns through its branch; categorical outputs come first.
preprocessor = ModifiedColumnTransformer(
    [
        ("categorical", categorical_transformer, categorical_features),
        ("numeric", numeric_transformer, numeric_features),
    ]
)

## XGBoost

In [None]:
# Base gradient-boosted classifier with a binary logistic objective.
clf_xgb = XGBClassifier(
    use_label_encoder=False,
    objective="binary:logistic",
)

In [None]:
# Randomized search: 100 sampled configurations, 5-fold CV, ranked by average precision.
xgb_random_search = RandomizedSearchCV(
    estimator=clf_xgb,
    param_distributions=param_grid,
    n_iter=100,
    scoring="average_precision",
    cv=5,
    random_state=42,
)

# Full model: preprocessing followed by the hyper-parameter search.
clf_xgb_rs = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("rs", xgb_random_search),
    ]
)

### Training

In [None]:
# Fit the preprocessing step and run the randomized hyper-parameter search on the training split.
clf_xgb_rs.fit(X_train, y_train)

### Evaluation

In [None]:
# Best cross-validated score found by the search (scored with average precision).
clf_xgb_rs["rs"].best_score_

In [None]:
# Predicted class probabilities for the held-out test split; column 1 is used as the positive class below.
y_test_proba = clf_xgb_rs.predict_proba(X_test)

In [None]:
# Predict 0 when the positive-class probability is at most 0.5, otherwise 1.
positive_class_proba = y_test_proba[:, 1]
y_test_pred = np.where(positive_class_proba <= 0.5, 0, 1)

In [None]:
# Per-class precision, recall and F1 on the test split at the 0.5 threshold.
print(classification_report(y_test,y_test_pred))

In [None]:
# Average precision on the test split, computed from the class-1 probabilities (threshold-independent).
average_precision_score(y_test,y_test_proba[:,1])