## Identifying Features

### Categorical Features
-  sex
-  education
-  marriage
-  pay_1, pay_2, pay_3, pay_4, pay_5, pay_6 (the pay_* columns hold payment status codes, so we treat them as categories)

### Numerical Features
- limit_bal
- age
- bill_amt1, bill_amt2, bill_amt3, bill_amt4, bill_amt5, bill_amt6
- pay_amt1, pay_amt2, pay_amt3, pay_amt4, pay_amt5, pay_amt6

### Target Feature
- default_next_month

In [1]:
# Imports
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Configuration
RND = 42  # random seed (passed to train_test_split below)

# Artifact paths (presumably output locations for the processed data and the fitted preprocessor).
# NOTE(review): unlike the input path below, these are not prefixed with "../" -
# confirm which working directory they are meant to resolve against.
PROCESSED_DATA_FILE = "data/processed_data.csv"
PREPROCESSOR_FILE = "models/preprocessor.joblib"

# Load the cleaned dataset
data = pd.read_csv("../data/cleaned_data.csv")

In [2]:
# Quick overview of the loaded frame: size, first rows, null counts, class balance
print("Shape:", data.shape)
display(data.head())

missing_per_column = data.isnull().sum()
print("\nMissing values in each column:\n", missing_per_column)

# normalize=True reports class shares rather than raw counts
target_share = data['default_next_month'].value_counts(normalize=True)
print("\nTarget distribution:\n", target_share)


Shape: (30000, 25)


Unnamed: 0,id,limit_bal,sex,education,marriage,age,pay_1,pay_2,pay_3,pay_4,...,bill_amt4,bill_amt5,bill_amt6,pay_amt1,pay_amt2,pay_amt3,pay_amt4,pay_amt5,pay_amt6,default_next_month
0,1,20000,2,2,1,24,2,2,-1,-1,...,0,0,0,0,689,0,0,0,0,1
1,2,120000,2,2,2,26,-1,2,0,0,...,3272,3455,3261,0,1000,1000,1000,0,2000,1
2,3,90000,2,2,2,34,0,0,0,0,...,14331,14948,15549,1518,1500,1000,1000,1000,5000,0
3,4,50000,2,2,1,37,0,0,0,0,...,28314,28959,29547,2000,2019,1200,1100,1069,1000,0
4,5,50000,1,2,1,57,-1,0,-1,0,...,20940,19146,19131,2000,36681,10000,9000,689,679,0



Missing values in each column:
 id                    0
limit_bal             0
sex                   0
education             0
marriage              0
age                   0
pay_1                 0
pay_2                 0
pay_3                 0
pay_4                 0
pay_5                 0
pay_6                 0
bill_amt1             0
bill_amt2             0
bill_amt3             0
bill_amt4             0
bill_amt5             0
bill_amt6             0
pay_amt1              0
pay_amt2              0
pay_amt3              0
pay_amt4              0
pay_amt5              0
pay_amt6              0
default_next_month    0
dtype: int64

Target distribution:
 default_next_month
0    0.7788
1    0.2212
Name: proportion, dtype: float64


In [3]:
# Name of the label column
target = 'default_next_month'

# The monthly history columns all carry a 1..6 suffix
month_ids = range(1, 7)

# Payment status columns (pay_1 .. pay_6)
pay_cols = [f'pay_{i}' for i in month_ids]

# Billing amount and payment amount columns
bill_cols = [f'bill_amt{i}' for i in month_ids]
pay_amt_cols = [f'pay_amt{i}' for i in month_ids]

# Group the columns by their type
numerical_features = ['limit_bal', 'age', *bill_cols, *pay_amt_cols]
categorical_features = ['sex', 'education', 'marriage', *pay_cols]

print("Numerical:", numerical_features)
print("Categorical:", categorical_features)


Numerical: ['limit_bal', 'age', 'bill_amt1', 'bill_amt2', 'bill_amt3', 'bill_amt4', 'bill_amt5', 'bill_amt6', 'pay_amt1', 'pay_amt2', 'pay_amt3', 'pay_amt4', 'pay_amt5', 'pay_amt6']
Categorical: ['sex', 'education', 'marriage', 'pay_1', 'pay_2', 'pay_3', 'pay_4', 'pay_5', 'pay_6']


In [4]:
# Number of distinct values in each categorical column
print("Categorical features unique values:")
categorical_cardinality = data[categorical_features].nunique()
categorical_cardinality

Categorical features unique values:


sex           2
education     7
marriage      4
pay_1        11
pay_2        11
pay_3        11
pay_4        11
pay_5        10
pay_6        10
dtype: int64

In [5]:
# Standardize the numerical columns
scaler = StandardScaler()

# One-hot encode the categorical columns as a dense array, dropping the first
# level of each feature.
# NOTE(review): with handle_unknown='ignore' a category unseen during fit is
# encoded as all zeros, which is the same encoding as the dropped first level -
# confirm this ambiguity is acceptable for rare pay_* codes.
onehotencode = OneHotEncoder(
    drop='first',
    handle_unknown='ignore',
    sparse_output=False,
)

# Route each column group to its transformer; columns not listed here
# (e.g. `id`) are dropped by ColumnTransformer's default remainder setting.
column_steps = [
    ("numerical", scaler, numerical_features),
    ("categorical", onehotencode, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_steps)

# Single-step pipeline wrapping the column transformer
preprocessing_pipeline = Pipeline(steps=[("preprocessor", preprocessor)])

In [6]:
# Separate the label from the remaining columns
y = data[target]
X = data.drop(columns=[target])

# 80/20 hold-out split, stratified on the label so both parts keep the same
# class ratio; seeded with RND for reproducibility
split_options = dict(test_size=0.20, random_state=RND, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print(f"Training set: {X_train.shape}, Test set: {X_test.shape}")
print(f"Target ratio - Train: {y_train.mean():.3f}, Test: {y_test.mean():.3f}")

Training set: (24000, 24), Test set: (6000, 24)
Target ratio - Train: 0.221, Test: 0.221


In [7]:
# Fit the preprocessor on the training split only, then apply the fitted
# transformer to the test split so no test statistics leak into the fit
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

# Rebuild column labels in the transformer order: numerical block first,
# followed by the one-hot columns reported by the fitted encoder
fitted_encoder = preprocessor.named_transformers_['categorical']
numerical_columns = numerical_features
categorical_columns = fitted_encoder.get_feature_names_out(categorical_features)

processed_columns = [*numerical_columns, *categorical_columns]

# Wrap the arrays back into DataFrames, keeping each split's original row index
x_train_data, x_test_data = (
    pd.DataFrame(values, columns=processed_columns, index=split.index)
    for values, split in ((X_train_processed, X_train), (X_test_processed, X_test))
)

print("Processed training data shape:", x_train_data.shape)
x_train_data.head()

Processed training data shape: (24000, 82)


Unnamed: 0,limit_bal,age,bill_amt1,bill_amt2,bill_amt3,bill_amt4,bill_amt5,bill_amt6,pay_amt1,pay_amt2,...,pay_5_8,pay_6_-1,pay_6_0,pay_6_2,pay_6_3,pay_6_4,pay_6_5,pay_6_6,pay_6_7,pay_6_8
22788,-0.056866,-0.264558,1.505547,1.745089,1.778869,1.891679,2.020839,2.096346,0.580657,-0.290332,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
29006,-0.134081,-0.155804,-0.695165,-0.691138,-0.68509,-0.673845,-0.662488,-0.652126,-0.344969,-0.290332,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16950,-1.21509,1.58426,-0.5568,-0.576836,-0.561615,-0.547887,-0.526789,-0.510492,-0.348128,-0.227081,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
22280,0.406423,-0.699574,2.11919,2.287237,2.460102,2.738001,1.509166,1.613118,0.215287,0.154056,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11346,1.101358,-0.373312,-0.695165,-0.691886,-0.680324,-0.668755,-0.666365,-0.656089,-0.348251,-0.274236,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
# Save the processed splits to CSV, each with its label column appended.
#
# The labelled frames are built as new objects (DataFrame.assign returns a copy)
# instead of writing the target into x_train_data / x_test_data in place.
# That keeps the feature frames feature-only, so a later
# `fit(x_train_data, y_train)` cannot accidentally train on the label.
# assign aligns the Series on the row index, exactly like `frame[target] = y`.
train_processed = x_train_data.assign(**{target: y_train})
test_processed = x_test_data.assign(**{target: y_test})

# The row index is written as the first, unnamed CSV column (to_csv default);
# read the files back with index_col=0 to restore it.
train_processed.to_csv("../data/train_processed.csv")
test_processed.to_csv("../data/test_processed.csv")