In [23]:
import pandas as pd
import numpy as np
from ucimlrepo import fetch_ucirepo

# Download the "Default of Credit Card Clients" dataset (UCI id 350).
default_of_credit_card_clients = fetch_ucirepo(id=350)

# Pull the feature frame and the target frame out of the fetched bundle.
X, y = (
    default_of_credit_card_clients.data.features,
    default_of_credit_card_clients.data.targets,
)

# Put the target column to the right of the features in a single frame.
df = pd.concat((X, y), axis="columns")

print("df.shape:", df.shape)
df.head()


df.shape: (30000, 24)


Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,X8,X9,X10,...,X15,X16,X17,X18,X19,X20,X21,X22,X23,Y
0,20000,2,2,1,24,2,2,-1,-1,-2,...,0,0,0,0,689,0,0,0,0,1
1,120000,2,2,2,26,-1,2,0,0,0,...,3272,3455,3261,0,1000,1000,1000,0,2000,1
2,90000,2,2,2,34,0,0,0,0,0,...,14331,14948,15549,1518,1500,1000,1000,1000,5000,0
3,50000,2,2,1,37,0,0,0,0,0,...,28314,28959,29547,2000,2019,1200,1100,1069,1000,0
4,50000,1,2,1,57,-1,0,-1,0,0,...,20940,19146,19131,2000,36681,10000,9000,689,679,0


The dataset is used to analyze and predict credit card default behavior to support research in credit risk assessment and financial decision-making.

- **Target:** Y
- **Type:** Binary (0 = No default, 1 = Default)
- **Description:** Indicates whether a client defaulted on their payment in the following month.

- **Source:** UCI Machine Learning Repository  
- **Dataset:** Default of Credit Card Clients (Yeh, 2009)  
- **Access:** ucimlrepo.fetch_ucirepo(id=350)
- **License:** Public for research and educational use

### Feature Summary
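The loaded frame uses the generic column names `X1`–`X23`; the descriptive names below are the original variable names.

- `LIMIT_BAL` (`X1`, numerical): Credit limit  
- `AGE` (`X5`, numerical): Client age  
- `SEX`, `EDUCATION`, `MARRIAGE` (`X2`–`X4`, categorical): Demographics  
- `PAY_0`, `PAY_2`–`PAY_6` (`X6`–`X11`, ordinal codes): Repayment status for each of the six months (the original naming has no `PAY_1`)  
- `BILL_AMT1`–`BILL_AMT6` (`X12`–`X17`, numerical): Bill amounts  
- `PAY_AMT1`–`PAY_AMT6` (`X18`–`X23`, numerical): Payment amounts  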

### Limitations and Risks
- Data is from 2005 and may be outdated  
- Clients are from Taiwan only (limited generalizability)  
- Demographic features may introduce bias  
- Class imbalance between default and non-default cases


In [24]:
# Per-column count of missing values, largest first.
print("missingness", df.isna().sum().sort_values(ascending=False), "", sep="\n")

# Number of rows that exactly repeat an earlier row.
print("duplicates", df.duplicated().sum(), "", sep="\n")

# Share of each class in the target; displayed as the cell's output value.
print("target distribution")
df["Y"].value_counts(normalize=True)






missingness
X1     0
X2     0
X23    0
X22    0
X21    0
X20    0
X19    0
X18    0
X17    0
X16    0
X15    0
X14    0
X13    0
X12    0
X11    0
X10    0
X9     0
X8     0
X7     0
X6     0
X5     0
X4     0
X3     0
Y      0
dtype: int64

duplicates
35

target distribution


Y
0    0.7788
1    0.2212
Name: proportion, dtype: float64

## Leakage-Risk Note

### Plausible Leakage Vectors

1. Variables such as recent repayment status and payment amounts are closely tied to a client’s payment behavior near the time of default. If these features include information from the same period or after the target outcome, they could leak future information into the model.

2. The dataset contains 35 duplicate records. If duplicates appear across training and test splits, the model may effectively “memorize” outcomes, leading to overly optimistic performance estimates.

### Leakage Prevention Strategies

- Ensure that all features used for training strictly precede the prediction window for Y. Avoid using features that summarize behavior after the default decision point.
- Remove duplicate rows prior to splitting the data into training and test sets to prevent information leakage across splits.



In [25]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

# Drop exact duplicate rows before splitting so that an identical record
# cannot land on both sides of the train/test boundary.
df_dedup = df.drop_duplicates().reset_index(drop=True)
initial_n, dedup_n = len(df), len(df_dedup)
duplicates_removed = initial_n - dedup_n

print(f"Initial rows: {initial_n}, after dedup: {dedup_n}, duplicates removed: {duplicates_removed}")

# The target is column "Y"; every other column is a feature.
y = df_dedup["Y"]
X = df_dedup.drop(columns="Y")

test_size = 0.20
random_state = 42

# Stratify on y so both partitions keep the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=test_size,
    random_state=random_state,
)

print("Shapes:")
print(" X_train:", X_train.shape)
print(" X_test: ", X_test.shape)

# Report the class balance of each partition with the same formatting.
for heading, target_part in (
    (" y_train distribution:\n", y_train),
    (" y_test distribution:\n", y_test),
):
    print(heading, target_part.value_counts(normalize=True))

Initial rows: 30000, after dedup: 29965, duplicates removed: 35
Shapes:
 X_train: (23972, 23)
 X_test:  (5993, 23)
 y_train distribution:
 Y
0    0.778742
1    0.221258
Name: proportion, dtype: float64
 y_test distribution:
 Y
0    0.778742
1    0.221258
Name: proportion, dtype: float64


In [26]:

# Standardise the features, then fit a logistic-regression classifier.
# `fit` returns the pipeline itself, so construction and training are chained.
pipeline = make_pipeline(
    StandardScaler(),
    LogisticRegression(max_iter=1000, random_state=random_state),
).fit(X_train, y_train)

# Mean accuracy on the held-out test split.
test_score = pipeline.score(X_test, y_test)
print(f"Test accuracy: {test_score:.4f}")

Test accuracy: 0.8129


In [27]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Neighbourhood size for kNN. Defined once so the estimator and the printed
# label cannot drift apart when the value is tuned.
n_neighbors = 20

# kNN is distance-based, so the features are standardised before the classifier.
knn_pipeline = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("knn", KNeighborsClassifier(n_neighbors=n_neighbors))
])

knn_pipeline.fit(X_train, y_train)

# Predictions on the held-out split; later cells reuse `y_pred`.
y_pred = knn_pipeline.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f"kNN Accuracy (k={n_neighbors}): {accuracy:.4f}")


kNN Accuracy (k=20): 0.8118


In [28]:
from sklearn.metrics import accuracy_score

# `y_pred` still holds the kNN test-set predictions from the previous cell,
# so this reports the kNN accuracy again.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Test Accuracy: {accuracy:.4f}")


Test Accuracy: 0.8118


In [29]:
from sklearn.metrics import confusion_matrix

# Confusion matrix for the kNN predictions in `y_pred` (rows = actual class,
# columns = predicted class). The class order is pinned explicitly so the matrix
# always lines up with the hand-written axis labels below
# (0 = no default, 1 = default) and stays 2x2 even if a class is absent.
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])

pd.DataFrame(
    cm,
    index=["Actual: No Default", "Actual: Default"],
    columns=["Predicted: No Default", "Predicted: Default"]
)



Unnamed: 0,Predicted: No Default,Predicted: Default
Actual: No Default,4451,216
Actual: Default,912,414
