# feature engineering

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, RocCurveDisplay
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier


In [5]:
# Load the cleaned application table from the processed-data folder.
df = pd.read_csv(filepath_or_buffer="../data/processed/cleaned_application.csv")

# Sanity check: dimensions first, then a preview of the leading rows.
print("Shape:", df.shape)
df.head(n=5)


Shape: (307511, 103)


Unnamed: 0,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,...,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR,YEARS_BIRTH
0,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,351000.0,...,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0,25.920548
1,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,1129500.0,...,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,45.931507
2,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,135000.0,...,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,52.180822
3,0,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,297000.0,...,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0,52.068493
4,0,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,513000.0,...,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,54.608219


In [7]:
# Per-column count of missing values, largest first; show the ten worst columns.
(
    df.isnull()
    .sum()
    .sort_values(ascending=False)
    .iloc[:10]
)


TARGET                0
NAME_CONTRACT_TYPE    0
CODE_GENDER           0
FLAG_OWN_CAR          0
FLAG_OWN_REALTY       0
CNT_CHILDREN          0
AMT_INCOME_TOTAL      0
AMT_CREDIT            0
AMT_ANNUITY           0
AMT_GOODS_PRICE       0
dtype: int64

In [8]:
# Separate the feature matrix from the label column.
X = df.drop(columns="TARGET")
y = df["TARGET"]


In [10]:
# 80/20 split, stratified on the label so both parts keep the same class
# proportions; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# One shape per line: training features, then test features.
print(X_train.shape, X_test.shape, sep="\n")


(246008, 102)
(61503, 102)


## encoding:

### step 1: identify categorical columns

In [11]:
# Names of the object-dtype (string) columns that need encoding.
cat_cols = list(X_train.select_dtypes(include="object").columns)
cat_cols

['NAME_CONTRACT_TYPE',
 'CODE_GENDER',
 'FLAG_OWN_CAR',
 'FLAG_OWN_REALTY',
 'NAME_TYPE_SUITE',
 'NAME_INCOME_TYPE',
 'NAME_EDUCATION_TYPE',
 'NAME_FAMILY_STATUS',
 'NAME_HOUSING_TYPE',
 'OCCUPATION_TYPE',
 'WEEKDAY_APPR_PROCESS_START',
 'ORGANIZATION_TYPE',
 'HOUSETYPE_MODE',
 'WALLSMATERIAL_MODE',
 'EMERGENCYSTATE_MODE']

### step 2: import LabelEncoder

In [12]:
from sklearn.preprocessing import LabelEncoder

### step 3: create a dictionary to store the encoders

In [13]:
# Maps each categorical column name to the encoder fitted for it.
encoders = dict()

### step 4:  fit LabelEncoder on training data only 

In [15]:
# Encode each categorical column as integer codes. Every encoder is fitted on
# the training split only and then reused on the test split, so no information
# from the test data leaks into the mapping.
for col in cat_cols:
    # Idempotency guard: cat_cols was built from object-dtype columns, so a
    # numeric dtype here means this cell already ran. Re-fitting would learn an
    # int -> int mapping and overwrite the string classes kept in `encoders`.
    if pd.api.types.is_numeric_dtype(X_train[col]):
        continue
    encoder = LabelEncoder()
    X_train[col] = encoder.fit_transform(X_train[col])
    # transform (never fit_transform) on the test split; note that it raises
    # ValueError if the test split holds a category absent from training.
    X_test[col] = encoder.transform(X_test[col])
    encoders[col] = encoder  # kept so new data can be encoded the same way later

## additional features:
**(Kaggle winners used these)**

In [17]:
# Debt-to-income ratio: loan annuity relative to total income.
# Written to the split frames as well as df: X_train / X_test were separated
# from df earlier and are what gets saved, so a column added to df alone
# would never reach the output files.
for frame in (df, X_train, X_test):
    frame["RATIO_ANNUITY_INCOME"] = frame["AMT_ANNUITY"] / frame["AMT_INCOME_TOTAL"]


In [18]:
# Credit-to-income ratio: credit amount relative to total income.
# Written to the split frames as well as df: X_train / X_test were separated
# from df earlier and are what gets saved, so a column added to df alone
# would never reach the output files.
for frame in (df, X_train, X_test):
    frame["RATIO_CREDIT_INCOME"] = frame["AMT_CREDIT"] / frame["AMT_INCOME_TOTAL"]


In [19]:
# Credit term length: credit amount divided by the annuity.
# Written to the split frames as well as df: X_train / X_test were separated
# from df earlier and are what gets saved, so a column added to df alone
# would never reach the output files.
for frame in (df, X_train, X_test):
    frame["CREDIT_TERM"] = frame["AMT_CREDIT"] / frame["AMT_ANNUITY"]


In [None]:
# Income per household member. Written to the split frames as well as df so the
# feature is present in the X_train / X_test files saved at the end.
# NOTE(review): this cell has no execution count, and CNT_FAM_MEMBERS is not
# visible in the truncated preview of df -- confirm the column exists.
# NOTE(review): the "+ 1" is kept as written; confirm whether CNT_FAM_MEMBERS
# already counts the applicant.
for frame in (df, X_train, X_test):
    frame["INCOME_PER_PERSON"] = frame["AMT_INCOME_TOTAL"] / (frame["CNT_FAM_MEMBERS"] + 1)


## saving 

In [21]:
# Persist the targets and features as four separate CSV files; index=False
# keeps the pandas row index out of the files.
y_train.to_csv(path_or_buf="../data/processed/y_train.csv", index=False)
y_test.to_csv(path_or_buf="../data/processed/y_test.csv", index=False)
X_train.to_csv(path_or_buf="../data/processed/X_train.csv", index=False)
X_test.to_csv(path_or_buf="../data/processed/X_test.csv", index=False)


# Summary of Feature Engineering

- Loaded the cleaned dataset and confirmed no missing values remain

- Converted categorical features using LabelEncoder

- Created additional features (ratios)

- Saved train/test splits