In [1]:
# Import libraries
import pandas as pd
import numpy as np
from IPython.display import display

In [2]:
# Load preprocessed datasets
def load_datasets(data_dir="../data"):
    app = pd.read_csv(f"{data_dir}/application_train.csv")
    bureau = pd.read_csv(f"{data_dir}/bureau.csv")
    install = pd.read_csv(f"{data_dir}/installments_payments.csv")
    try:
        credit = pd.read_csv(f"{data_dir}/credit_card_balance.csv")
    except FileNotFoundError:
        credit = None
    return app, bureau, install, credit

app, bureau, install, credit = load_datasets()


In [4]:
# Define function to derive FICO-style features
def derive_fico_features(app, bureau, install, credit=None):
    """Derive FICO-style traditional credit features."""
    
    # --- Repayment Simulation ---
    install["DAYS_ENTRY_PAYMENT"] = install["DAYS_ENTRY_PAYMENT"].fillna(0)
    install["dpd"] = (install["DAYS_ENTRY_PAYMENT"] - install["DAYS_INSTALMENT"]).clip(lower=0)
    install["on_time"] = (install["dpd"] == 0).astype(int)

    repay_feat = install.groupby("SK_ID_CURR").agg({
        "dpd": ["mean", "max"],
        "on_time": "mean",
        "SK_ID_PREV": "count"
    })
    repay_feat.columns = ["dpd_mean", "dpd_max", "on_time_ratio", "num_payments"]
    repay_feat.reset_index(inplace=True)

    # --- Amounts Owed ---
    bureau["CREDIT_ACTIVE_FLAG"] = (bureau["CREDIT_ACTIVE"] == "Active").astype(int)
    owed_feat = bureau.groupby("SK_ID_CURR").agg({
        "AMT_CREDIT_SUM": "sum",
        "AMT_CREDIT_SUM_DEBT": "sum",
        "CREDIT_ACTIVE_FLAG": "sum"
    }).reset_index()
    owed_feat["total_utilization"] = (
        owed_feat["AMT_CREDIT_SUM_DEBT"] / owed_feat["AMT_CREDIT_SUM"].replace(0, np.nan)
    ).clip(upper=1.5)

    # --- Optional credit card utilization ---
    if credit is not None:
        card_util = credit.groupby("SK_ID_CURR").agg({
            "AMT_BALANCE": "mean",
            "AMT_CREDIT_LIMIT_ACTUAL": "mean"
        }).reset_index()
        card_util["credit_card_utilization"] = (
            card_util["AMT_BALANCE"] / card_util["AMT_CREDIT_LIMIT_ACTUAL"].replace(0, np.nan)
        ).clip(upper=1.5)
        owed_feat = owed_feat.merge(card_util[["SK_ID_CURR", "credit_card_utilization"]], on="SK_ID_CURR", how="left")
    else:
        owed_feat["credit_card_utilization"] = np.nan

    # --- Credit History Length ---
    bureau["CREDIT_AGE_MONTHS"] = abs(bureau["DAYS_CREDIT"]) / 30.44
    hist_feat = bureau.groupby("SK_ID_CURR").agg({
        "CREDIT_AGE_MONTHS": ["min", "max", "mean"]
    }).reset_index()
    hist_feat.columns = ["SK_ID_CURR", "oldest_account_m", "newest_account_m", "aaoa_m"]

    # --- Merge ---
    fico = app[["SK_ID_CURR", "TARGET"]]
    fico = fico.merge(repay_feat, on="SK_ID_CURR", how="left")
    fico = fico.merge(owed_feat, on="SK_ID_CURR", how="left")
    fico = fico.merge(hist_feat, on="SK_ID_CURR", how="left")

    # --- Handle missing / invalid values ---
    fico = fico.replace([np.inf, -np.inf], np.nan)
    fico = fico.fillna(0)
    fico["thin_file_flag"] = (fico["AMT_CREDIT_SUM"] == 0).astype(int)

    return fico


In [5]:
# Derive features
fico = derive_fico_features(app, bureau, install, credit)
print("FICO-style features created successfully.")
display(fico.head())


FICO-style features created successfully.


Unnamed: 0,SK_ID_CURR,TARGET,dpd_mean,dpd_max,on_time_ratio,num_payments,AMT_CREDIT_SUM,AMT_CREDIT_SUM_DEBT,CREDIT_ACTIVE_FLAG,total_utilization,credit_card_utilization,oldest_account_m,newest_account_m,aaoa_m,thin_file_flag
0,100002,1,0.0,0.0,1.0,19.0,865055.565,245781.0,2.0,0.284122,0.0,3.383706,47.207622,28.712221,0
1,100003,0,0.0,0.0,1.0,25.0,1017400.5,0.0,1.0,0.0,0.0,19.908016,84.954008,46.016754,0
2,100004,0,0.0,0.0,1.0,3.0,189037.8,0.0,0.0,0.0,0.0,13.403417,43.561104,28.48226,0
3,100006,0,0.0,0.0,1.0,16.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
4,100007,0,0.954545,12.0,0.757576,66.0,146250.0,0.0,0.0,0.0,0.0,37.746386,37.746386,37.746386,0


In [6]:
# Save output
output_path = "../output/fico_style_features.csv"
fico.to_csv(output_path, index=False)
print(f"FICO-style feature table saved to {output_path}")


FICO-style feature table saved to ../output/fico_style_features.csv
