# Cognitive NPL Resolution Engine

## Install & Import

In [21]:
import os
import pandas as pd
import numpy as np
import joblib
import mlflow


from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler, OneHotEncoder
from pandas.api.types import is_numeric_dtype

import category_encoders as ce
import warnings
warnings.filterwarnings('ignore')

In [2]:
# MLflow tracking: runs are written to a local file store one directory above
# the notebook, under the "CreditScoring" experiment.
mlflow.set_tracking_uri(uri="file:../mlruns")

# Kept as the cell's last expression so the notebook still echoes the Experiment.
mlflow.set_experiment(experiment_name="CreditScoring")

<Experiment: artifact_location='file:///Users/tri/Documents/code/P02/notebook/../mlruns/574172305759250793', creation_time=1771514225988, experiment_id='574172305759250793', last_update_time=1771514225988, lifecycle_stage='active', name='CreditScoring', tags={'mlflow.experimentKind': 'custom_model_development'}>

### Configuration & Setup

In [17]:
# Central pipeline configuration: directory layout, file names, split settings
# and the NAICS 2-digit codes used to flag B2C sectors.
CONFIG = {
    # Data directories (relative to the notebook's working directory).
    "data_paths": {
        "raw": "../data/01-raw/",
        "processed": "../data/02-processed/",
        "features": "../data/03-features/",
    },
    # Where fitted preprocessors are persisted.
    "model_paths": {
        "artifacts": "../models/",
    },
    # File names, resolved against the directories above.
    "files": {
        "input_data": "data_clean.parquet",
        "train_out": "feature_train.parquet",
        "test_out": "feature_test.parquet",
    },
    "target_col": "is_default",
    "test_size": 0.2,
    "random_state": 42,
    "b2c_naics_codes": ["44", "45", "61", "62", "71", "72", "81"],
}

# Only the output directories are created; input directories are left untouched.
for _output_dir in (
    CONFIG["data_paths"]["features"],
    CONFIG["model_paths"]["artifacts"],
):
    os.makedirs(_output_dir, exist_ok=True)

## Feature Engineering

### Feature Construction / Derivation

In [13]:
def create_target_variable(df):
    """Ensure the frame carries a binary ``is_default`` target column.

    If ``is_default`` already exists the frame is returned unchanged (same
    object, no copy). Otherwise only rows whose ``loanstatus`` is ``'PIF'`` or
    ``'CHGOFF'`` are kept (on a copy), and ``is_default`` is set to 1 for
    ``'CHGOFF'`` and 0 for ``'PIF'``.

    Args:
        df: DataFrame with a ``loanstatus`` column (required only when
            ``is_default`` is absent).

    Returns:
        DataFrame containing an ``is_default`` column.

    Raises:
        KeyError: if neither ``is_default`` nor ``loanstatus`` is present.
    """
    if 'is_default' in df.columns:
        return df

    resolved = df[df['loanstatus'].isin(['PIF', 'CHGOFF'])].copy()
    # Vectorised comparison instead of a per-row lambda: faster, and the
    # column is integer-typed even when no rows survive the filter.
    resolved['is_default'] = (resolved['loanstatus'] == 'CHGOFF').astype(int)
    return resolved

def engineer_features(df, b2c_naics_codes=None):
    """Derive the hypothesis flags (H1-H6) and auxiliary features on a copy of ``df``.

    Columns added: ``gdp_pct_change``, ``is_sector_in_recession``, ``is_b2c``,
    ``unrate_b2c_impact``, ``is_cheap_money_era``, ``DPRIME_Lag_7``,
    ``is_microloan``, ``is_high_risk_state``, ``is_short_term``,
    ``log_grossapproval``, ``loan_to_jobs_ratio`` and, when ``franchisename``
    is present, ``is_franchise``.

    The rate flag is named ``is_cheap_money_era`` because it is 1 when
    ``DPRIME <= 4.0``; this is also the name ``preprocess_features`` looks for
    in its binary pass-through list.

    Args:
        df: Loan-level frame. Reads ``Industry_BEA``, ``Quarter_Date``,
            ``GDP_Value``, ``NAICS_2``, ``UNRATE``, ``DPRIME``,
            ``grossapproval``, ``borrstate``, ``terminmonths``,
            ``jobssupported``, ``approvaldate`` and optionally
            ``franchisename``.
        b2c_naics_codes: Iterable of 2-digit NAICS codes treated as B2C.
            Defaults to ``CONFIG['b2c_naics_codes']``.

    Returns:
        A new DataFrame, sorted by ``approvaldate`` with a fresh RangeIndex.
        The input frame is not modified.
    """
    if b2c_naics_codes is None:
        b2c_naics_codes = CONFIG['b2c_naics_codes']
    b2c_codes = [str(code) for code in b2c_naics_codes]

    df = df.copy()

    # H1: quarter-over-quarter GDP change per industry. The lookup table is
    # de-duplicated on the merge keys (not keys + value) so that conflicting
    # GDP values for one (industry, quarter) cannot fan out loan rows in the
    # left merge below; the first value seen wins.
    gdp_keys = ['Industry_BEA', 'Quarter_Date']
    gdp_map = (
        df[gdp_keys + ['GDP_Value']]
        .drop_duplicates(subset=gdp_keys)
        .sort_values(gdp_keys)
    )
    gdp_map['gdp_pct_change'] = gdp_map.groupby('Industry_BEA')['GDP_Value'].pct_change()
    df = df.merge(gdp_map[gdp_keys + ['gdp_pct_change']], on=gdp_keys, how='left')
    # A missing pct change (first quarter of each industry) compares False -> 0.
    df['is_sector_in_recession'] = (df['gdp_pct_change'] < 0).astype(int)  # H1

    # H2: unemployment only "counts" for consumer-facing sectors.
    # NOTE(review): assumes NAICS_2 renders as a bare two-digit string under
    # astype(str) (int or str column, not float like 44.0) — TODO confirm.
    df['is_b2c'] = df['NAICS_2'].astype(str).isin(b2c_codes).astype(int)
    df['unrate_b2c_impact'] = df['UNRATE'] * df['is_b2c']

    # H3: prime rate at or below 4.0 marks the low-rate ("cheap money") regime.
    df['is_cheap_money_era'] = (df['DPRIME'] <= 4.0).astype(int)  # H3

    # Prime rate lagged by 7 distinct quarters present in the data. One row per
    # quarter is enforced so the shift counts quarters and the merge stays
    # many-to-one.
    prime_map = (
        df[['Quarter_Date', 'DPRIME']]
        .drop_duplicates(subset='Quarter_Date')
        .sort_values('Quarter_Date')
    )
    prime_map['DPRIME_Lag_7'] = prime_map['DPRIME'].shift(7)
    df = df.merge(prime_map[['Quarter_Date', 'DPRIME_Lag_7']], on='Quarter_Date', how='left')

    df['is_microloan'] = (df['grossapproval'] < 50000).astype(int)  # H4

    # Hard-coded state list; its provenance is not shown in this notebook.
    high_risk_states = ['HI', 'NY', 'DE']
    df['is_high_risk_state'] = df['borrstate'].isin(high_risk_states).astype(int)  # H5

    df['is_short_term'] = (df['terminmonths'] < 36).astype(int)  # H6
    df['log_grossapproval'] = np.log1p(df['grossapproval'])
    # +1 in the denominator avoids division by zero when no jobs are reported.
    df['loan_to_jobs_ratio'] = df['grossapproval'] / (df['jobssupported'] + 1)

    if 'franchisename' in df.columns:
        df['is_franchise'] = df['franchisename'].notna().astype(int)

    # Restore chronological order by loan approval date; the out-of-time split
    # downstream slices rows positionally.
    df = df.sort_values('approvaldate').reset_index(drop=True)

    return df

### Split Dataset

In [20]:
def split_data(df, target_col, test_size=None):
    """Split the engineered frame chronologically (out-of-time) into train/test.

    Identifier, free-text, date and post-outcome columns are dropped, rows
    missing the target or the lag-derived features are removed, and the
    remaining rows are cut positionally: the earliest ``1 - test_size``
    fraction by ``approvaldate`` becomes train, the rest test.

    Args:
        df: Output of ``engineer_features``; must contain ``target_col``,
            ``approvaldate``, ``gdp_pct_change`` and ``DPRIME_Lag_7``.
        target_col: Name of the target column.
        test_size: Fraction of (latest) rows held out for testing. Defaults to
            ``CONFIG['test_size']``.

    Returns:
        ``(X_train, X_test, y_train, y_test)``. The X frames exclude
        ``target_col`` and ``approvaldate``; original index labels are kept.
    """
    if test_size is None:
        test_size = CONFIG['test_size']

    # Drop columns that are irrelevant or would leak the outcome (data leakage).
    # Note the comma after 'asofdate': without it Python concatenates the two
    # adjacent literals into 'asofdatesubpgmdesc' and neither column is dropped.
    drop_cols = ['l2locid', 'borrname', 'borrstreet', 'borrcity', 'borrzip',
                 'bankname', 'bankstreet', 'bankcity', 'bankzip',
                 'firstdisbursementdate', 'paidinfulldate', 'chargeoffdate',
                 'observation_date', 'observation_date_unrate', 'Quarter_Date',
                 'loanstatus', 'naicsdescription', 'franchisename', 'asofdate',
                 'subpgmdesc']

    df_model = df.drop(columns=[c for c in drop_cols if c in df.columns])

    # Rows without a target, or without the features that are NaN at the start
    # of each series (pct change / 7-quarter lag), cannot be used.
    df_model = df_model.dropna(subset=[target_col, 'gdp_pct_change', 'DPRIME_Lag_7'])

    # The positional cut below is only "out-of-time" if rows are in approval
    # order. A stable sort leaves an already-sorted frame untouched.
    df_model = df_model.sort_values('approvaldate', kind='stable')

    split_idx = int(len(df_model) * (1 - test_size))

    train_df = df_model.iloc[:split_idx]
    test_df = df_model.iloc[split_idx:]

    X_train = train_df.drop(columns=[target_col, 'approvaldate'])
    y_train = train_df[target_col]

    X_test = test_df.drop(columns=[target_col, 'approvaldate'])
    y_test = test_df[target_col]

    return X_train, X_test, y_train, y_test

### Scaling & Encoding

In [23]:
def preprocess_features(X_train, X_test, y_train):
    """Encode and scale features, fitting every transformer on the train split only.

    Column handling:
      * high-cardinality categoricals -> ``category_encoders.TargetEncoder``
      * low-cardinality categoricals  -> ``OneHotEncoder(drop='first')``
      * 0/1 flags                     -> passed through unchanged
      * every other numeric column    -> ``RobustScaler``

    Columns that are neither listed below nor numeric are omitted from the
    output. Each fitted transformer is pickled into
    ``CONFIG['model_paths']['artifacts']``.

    Args:
        X_train: Training features.
        X_test: Test features with the same columns as ``X_train``.
        y_train: Training target, used only to fit the target encoder.

    Returns:
        ``(X_train_final, X_test_final)``, indexed like the inputs, with
        columns ordered scaled numerics, target-encoded, one-hot, binary flags.
    """
    artifacts_dir = CONFIG['model_paths']['artifacts']

    cat_cols_high_card = ['borrstate', 'bankstate', 'projectstate', 'Industry_BEA', 'NAICS_2']
    cat_cols_low_card = ['program', 'processingmethod', 'collateralind']
    # The rate-regime flag is accepted under both names ('is_high_rate_era' is
    # what engineer_features historically emitted). Without this, the flag
    # falls into num_cols and is robust-scaled instead of passed through.
    binary_cols = ['is_b2c', 'is_cheap_money_era', 'is_high_rate_era', 'is_microloan',
                   'is_high_risk_state', 'is_short_term', 'is_sector_in_recession', 'is_franchise']

    # Keep only the columns that actually exist in this dataset.
    cat_cols_high_card = [c for c in cat_cols_high_card if c in X_train.columns]
    cat_cols_low_card = [c for c in cat_cols_low_card if c in X_train.columns]
    binary_cols = [c for c in binary_cols if c in X_train.columns]
    handled_cols = set(cat_cols_high_card + cat_cols_low_card + binary_cols)
    num_cols = [c for c in X_train.columns
                if c not in handled_cols and is_numeric_dtype(X_train[c])]

    # Target Encoding (High Cardinality)
    target_enc = ce.TargetEncoder(cols=cat_cols_high_card)
    X_train_te = target_enc.fit_transform(X_train[cat_cols_high_card], y_train)
    X_test_te = target_enc.transform(X_test[cat_cols_high_card])
    joblib.dump(target_enc, f"{artifacts_dir}target_encoder.pkl")

    # One Hot Encoding (Low Cardinality)
    # NOTE(review): with drop='first' + handle_unknown='ignore', an unseen test
    # category encodes as all zeros, i.e. the same as the dropped category.
    ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore', drop='first')
    X_train_ohe = pd.DataFrame(ohe.fit_transform(X_train[cat_cols_low_card]), columns=ohe.get_feature_names_out())
    X_test_ohe = pd.DataFrame(ohe.transform(X_test[cat_cols_low_card]), columns=ohe.get_feature_names_out())
    # Re-attach the original index so the axis=1 concat below aligns row-wise.
    X_train_ohe.index = X_train.index
    X_test_ohe.index = X_test.index
    joblib.dump(ohe, f"{artifacts_dir}ohe_encoder.pkl")

    # Scaling
    scaler = RobustScaler()
    X_train_num = pd.DataFrame(scaler.fit_transform(X_train[num_cols]), columns=num_cols)
    X_test_num = pd.DataFrame(scaler.transform(X_test[num_cols]), columns=num_cols)
    X_train_num.index = X_train.index
    X_test_num.index = X_test.index
    joblib.dump(scaler, f"{artifacts_dir}robust_scaler.pkl")

    X_train_final = pd.concat([X_train_num, X_train_te, X_train_ohe, X_train[binary_cols]], axis=1)
    X_test_final = pd.concat([X_test_num, X_test_te, X_test_ohe, X_test[binary_cols]], axis=1)

    return X_train_final, X_test_final

In [24]:
# End-to-end preprocessing run: load -> engineer -> OOT split -> encode/scale ->
# persist, with parameters, metrics and artifacts tracked in a single MLflow run.
with mlflow.start_run(run_name='preprocessing_v1') as run:

    print("[INFO] Loading data...")
    df_raw = pd.read_parquet(f"{CONFIG['data_paths']['processed']}{CONFIG['files']['input_data']}")
    df_raw = create_target_variable(df_raw)

    # Snapshot the column set so the derived features can be identified below.
    initial_cols = set(df_raw.columns)

    print("[INFO] Engineer Features...")
    df_fe = engineer_features(df_raw)

    # Sorted so the logged parameter is identical across runs; plain set
    # iteration order depends on per-process string hashing.
    new_features = sorted(set(df_fe.columns) - initial_cols)
    mlflow.log_param("Feature Derivative", ",".join(new_features))
    mlflow.log_metric("Total Feature Derivative", len(new_features))
    mlflow.log_param("NAICS_Code", CONFIG['b2c_naics_codes'])

    print("[INFO] Splitting OOT (Out-of-Time)...")
    X_train, X_test, y_train, y_test = split_data(df_fe, CONFIG['target_col'])

    mlflow.log_param("test_size", CONFIG['test_size'])
    mlflow.log_param("split_strategy","Out-of-Time/Chronological")
    mlflow.log_metric("train_samples", len(X_train))
    mlflow.log_metric("test_samples", len(X_test))

    # Default rates per split make target drift between periods visible.
    mlflow.log_metric("train_default_rate", y_train.mean())
    mlflow.log_metric("test_default_rate", y_test.mean())

    print("[INFO] Preprocessing (Encoding & Scaling)...")
    X_train_final, X_test_final = preprocess_features(X_train, X_test, y_train)

    mlflow.log_param("Scaler", "RobustScaler")
    mlflow.log_param("encoder_high_cardinality", "TargetEncoder")
    mlflow.log_param("encoder_low_cardinality","OneHotEncoder")
    mlflow.log_metric("final_feature_count", X_train_final.shape[1])

    # Re-attach the target positionally (.values) so index labels are ignored.
    train_final = X_train_final.copy()
    train_final['is_default'] = y_train.values

    test_final = X_test_final.copy()
    test_final['is_default'] = y_test.values

    print("[INFO] Saving artifacts to /03-features/...")

    train_path = f"{CONFIG['data_paths']['features']}{CONFIG['files']['train_out']}"
    test_path = f"{CONFIG['data_paths']['features']}{CONFIG['files']['test_out']}"

    train_final.to_parquet(train_path, index=False)
    test_final.to_parquet(test_path, index=False)

    print("[INFO] Logging Mlflow Artifacts..")
    mlflow.log_artifact(train_path, artifact_path="datasets")
    mlflow.log_artifact(test_path, artifact_path="datasets")

    # Uploads everything currently in the artifacts directory, not only the
    # preprocessors written by this run.
    mlflow.log_artifacts(CONFIG['model_paths']['artifacts'], artifact_path='preprocessors')

    print(f"[SUCCESS] Feature Engineering & Preprocessing Completed! MLflow RUN ID: {run.info.run_id}")

[INFO] Loading data...
[INFO] Engineer Features...
[INFO] Splitting OOT (Out-of-Time)...
[INFO] Preprocessing (Encoding & Scaling)...
[INFO] Saving artifacts to /03-features/...
[INFO] Logging Mlflow Artifacts..
[SUCCESS] Feature Engineering & Preprocessing Completed! MLflow RUN ID: 554458798cf34f41943c57f1c2f6c8ff
