In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder



In [3]:
def clean_data(df):
    df = df.copy()
    df.columns = df.columns.str.strip().str.lower().str.replace(" ", "_")

    # Handle missing or invalid billing/payment values
    bill_cols = [f"bill_amt{i}" for i in range(1, 7)]
    pay_cols = [f"pay_amt{i}" for i in range(1, 7)]

    df[bill_cols] = df[bill_cols].apply(pd.to_numeric, errors='coerce')
    df[pay_cols] = df[pay_cols].apply(pd.to_numeric, errors='coerce')

    # Fill missing values with 0 (or median if you prefer)
    df[bill_cols] = df[bill_cols].fillna(0)
    df[pay_cols] = df[pay_cols].fillna(0)

    return df

In [5]:
# Load the dataset (correct UCI URL, Excel format)
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00350/default%20of%20credit%20card%20clients.xls"
df = pd.read_excel(url, header=1)

In [6]:
def engineer_features(df):
    df = df.copy()

    bill_cols = [f"bill_amt{i}" for i in range(1, 7)]
    pay_cols = [f"pay_amt{i}" for i in range(1, 7)]

    df["avg_bill_amt"] = df[bill_cols].mean(axis=1)
    df["avg_pay_amt"] = df[pay_cols].mean(axis=1)
    df["pay_bill_ratio"] = df["avg_pay_amt"] / df["avg_bill_amt"]
    df["pay_bill_ratio"].replace([np.inf, -np.inf], 0, inplace=True)
    df["pay_bill_ratio"].fillna(0, inplace=True)

    df["age_group"] = pd.cut(df["age"], bins=[20, 30, 40, 50, 60, 100],
                             labels=["20s", "30s", "40s", "50s", "60+"],
                             right=False)

    df = pd.get_dummies(df, columns=["sex", "education", "marriage", "age_group"], drop_first=True)

    return df

In [7]:
df_clean = clean_data(df)
df_final = engineer_features(df_clean)

print("✅ Final shape after feature engineering:", df_final.shape)
df_final.head()

✅ Final shape after feature engineering: (30000, 39)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["pay_bill_ratio"].replace([np.inf, -np.inf], 0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["pay_bill_ratio"].fillna(0, inplace=True)


Unnamed: 0,id,limit_bal,age,pay_0,pay_2,pay_3,pay_4,pay_5,pay_6,bill_amt1,...,education_4,education_5,education_6,marriage_1,marriage_2,marriage_3,age_group_30s,age_group_40s,age_group_50s,age_group_60+
0,1,20000,24,2,2,-1,-1,-2,-2,3913,...,False,False,False,True,False,False,False,False,False,False
1,2,120000,26,-1,2,0,0,0,2,2682,...,False,False,False,False,True,False,False,False,False,False
2,3,90000,34,0,0,0,0,0,0,29239,...,False,False,False,False,True,False,True,False,False,False
3,4,50000,37,0,0,0,0,0,0,46990,...,False,False,False,True,False,False,True,False,False,False
4,5,50000,57,-1,0,-1,0,0,0,8617,...,False,False,False,True,False,False,False,False,True,False
