# Feature Engineering
In this notebook, we create meaningful derived features from the cleaned dataset to enhance predictive power.

Objectives:
- Create financial stress indicators
- Build transaction behavior features
- Encode categorical variables
- Prepare dataset for modeling

In [1]:
import pandas as pd
import numpy as np

# Read the cleaned dataset from the project's data directory.
cleaned_path = "../data/cleaned_bank_data.csv"
df = pd.read_csv(cleaned_path)

# Report the dimensions, then preview the first rows as the cell output.
print("Dataset Shape:", df.shape)
df.head()


Dataset Shape: (10127, 21)


Unnamed: 0,CLIENTNUM,Attrition_Flag,Customer_Age,Gender,Dependent_count,Education_Level,Marital_Status,Income_Category,Card_Category,Months_on_book,...,Months_Inactive_12_mon,Contacts_Count_12_mon,Credit_Limit,Total_Revolving_Bal,Avg_Open_To_Buy,Total_Amt_Chng_Q4_Q1,Total_Trans_Amt,Total_Trans_Ct,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio
0,768805383,Existing Customer,45,M,3,High School,Married,$60K - $80K,Blue,39,...,1,3,12691.0,777,11914.0,1.335,1144,42,1.625,0.061
1,818770008,Existing Customer,49,F,5,Graduate,Single,Less than $40K,Blue,44,...,1,2,8256.0,864,7392.0,1.541,1291,33,3.714,0.105
2,713982108,Existing Customer,51,M,3,Graduate,Married,$80K - $120K,Blue,36,...,1,0,3418.0,0,3418.0,2.594,1887,20,2.333,0.0
3,769911858,Existing Customer,40,F,4,High School,Unknown,Less than $40K,Blue,34,...,4,1,3313.0,2517,796.0,1.405,1171,20,2.333,0.76
4,709106358,Existing Customer,40,M,3,Uneducated,Married,$60K - $80K,Blue,21,...,1,0,4716.0,0,4716.0,2.175,816,28,2.5,0.0


## Credit Utilization Derived Features
Credit utilization indicates how much of the available credit is being used.

High utilization often signals financial stress.
Low utilization may indicate disengagement.

We will:
- Recalculate utilization
- Create risk flags
- Create utilization category
- Compute credit headroom

In [2]:
required_cols = {'Credit_Limit', 'Total_Revolving_Bal'}

if required_cols.issubset(df.columns):

    # Recalculate utilization as revolving balance / credit limit.
    # Rows whose limit is not positive have no meaningful ratio and get NaN.
    df['calculated_utilization'] = np.where(
        df['Credit_Limit'] > 0,
        df['Total_Revolving_Bal'] / df['Credit_Limit'],
        np.nan
    )

    # Clip unrealistic values into [0, 1] (NaN stays NaN)
    df['calculated_utilization'] = df['calculated_utilization'].clip(0, 1)

    # High utilization (>=80%); NaN compares False, so it is flagged 0
    df['High_Utilization_Flag'] = np.where(
        df['calculated_utilization'] >= 0.8, 1, 0
    )

    # Low utilization (<=20%); NaN compares False, so it is flagged 0
    df['Low_Utilization_Flag'] = np.where(
        df['calculated_utilization'] <= 0.2, 1, 0
    )

    # Utilization Category
    # np.select returns the choice of the FIRST condition that matches, and
    # Series.between is inclusive on both ends. The High band is therefore
    # tested before Medium so a ratio of exactly 0.8 is labelled 'High',
    # in agreement with High_Utilization_Flag. Resulting bands:
    #   Low: <= 0.2   Medium: (0.2, 0.8)   High: >= 0.8
    # Rows with undefined (NaN) utilization match no condition and are
    # labelled 'Unknown' rather than being folded into a real band.
    df['Utilization_Category'] = np.select(
        [
            df['calculated_utilization'] <= 0.2,
            df['calculated_utilization'] >= 0.8,
            df['calculated_utilization'].between(0.2, 0.8)
        ],
        [
            'Low',
            'High',
            'Medium'
        ],
        default='Unknown'
    )

    # Credit Headroom: unused portion of the credit limit
    df['Credit_Headroom'] = df['Credit_Limit'] - df['Total_Revolving_Bal']

else:
    print("Required columns not found.")


## Transaction Behavior Features
Customer transaction behavior provides insight into engagement levels.

We will derive:

- Average transaction value
- Transaction intensity
- Activity rate

In [5]:
required_cols = {'Total_Trans_Amt', 'Total_Trans_Ct'}
if required_cols.issubset(df.columns):

    # Average transaction value: total amount divided by the number of
    # transactions. Rows with no transactions get NaN instead of the result
    # of a division by zero.
    df['avg_transaction_value'] = np.where(
        df['Total_Trans_Ct'] > 0, df['Total_Trans_Amt'] / df['Total_Trans_Ct'], np.nan
    )

    # Transaction intensity: amount transacted per month on book.
    # Only computed when the tenure column is available.
    if 'Months_on_book' in df.columns:
        df['Transaction_Intensity'] = np.where(
            df['Months_on_book'] > 0, df['Total_Trans_Amt'] / df['Months_on_book'], np.nan
        )

    # Inactivity rate: inactive months as a fraction of the 12-month window.
    if 'Months_Inactive_12_mon' in df.columns:
        df['Inactivity_Rate'] = df['Months_Inactive_12_mon'] / 12
else:
    # Reported when the transaction amount/count columns themselves are absent.
    print('transaction column missing')

## Customer Engagement Features
We will create behavioral flags:
- High Contact Frequency
- Inactive Customer Flag
- High Value Customer Flag


In [10]:
# High contact frequency: contacted at least 4 times in the last 12 months.
if 'Contacts_Count_12_mon' in df.columns:
    frequent_contact = df['Contacts_Count_12_mon'] >= 4
    df['High_Contact_Flag'] = np.where(frequent_contact, 1, 0)

# Inactive customer: more than 3 inactive months in the last 12 months.
if 'Months_Inactive_12_mon' in df.columns:
    long_inactive = df['Months_Inactive_12_mon'] > 3
    df['Inactive_Customer_Flag'] = np.where(long_inactive, 1, 0)

# High-value customer: credit limit at or above the 75th percentile.
if 'Credit_Limit' in df.columns:
    credit_limit = df['Credit_Limit']
    threshold = credit_limit.quantile(0.75)
    df['High_Value_Customer_Flag'] = np.where(credit_limit >= threshold, 1, 0)


## Target Encoding
Convert target variable into binary format for modeling.

Attrited Customer → 1
Existing Customer → 0

In [11]:
# Binary target: 1 for attrited customers, 0 for existing customers.
# Any label not listed in the mapping becomes NaN.
churn_codes = {'Attrited Customer': 1, 'Existing Customer': 0}
df['Churn_Binary'] = df['Attrition_Flag'].map(churn_codes)

## Encode Categorical Variables
Machine learning models require numerical input.

We will:

- Apply one-hot encoding to the nominal (categorical) variables
- Drop the first category of each variable to avoid multicollinearity

In [12]:
# Select the text-valued columns. Both 'object' and 'string' are requested so
# the selection is explicit on pandas 2 (text stored as object dtype) and on
# pandas 3 (text stored as the dedicated string dtype); asking for 'object'
# alone relies on deprecated backward-compatibility behaviour in pandas 3.
categorical_cols = df.select_dtypes(include=['object', 'string']).columns

# Remove target column from encoding
categorical_cols = categorical_cols.drop('Attrition_Flag')

# One-hot encode; drop_first removes one level per variable so the dummy
# columns of a variable are not perfectly collinear.
df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

See https://pandas.pydata.org/docs/user_guide/migration-3-strings.html#string-migration-select-dtypes for details on how to write code that works with pandas 2 and 3.
  categorical_cols = df.select_dtypes(include=['object']).columns


## Final Validation
We verify:
- No missing values
- No duplicates
- Final dataset shape

In [13]:
# Gather the summary figures first, then report them.
final_shape = df.shape
missing_total = df.isnull().sum().sum()   # NaN cells across the whole frame
duplicate_total = df.duplicated().sum()   # fully repeated rows

print("Final Dataset Shape:", final_shape)

print("\nMissing Values:\n", missing_total)

print("\nDuplicate Rows:", duplicate_total)

# Preview of the engineered frame as the cell output.
df.head()

Final Dataset Shape: (10127, 47)

Missing Values:
 0

Duplicate Rows: 0


Unnamed: 0,CLIENTNUM,Attrition_Flag,Customer_Age,Dependent_count,Months_on_book,Total_Relationship_Count,Months_Inactive_12_mon,Contacts_Count_12_mon,Credit_Limit,Total_Revolving_Bal,...,Income_Category_$40K - $60K,Income_Category_$60K - $80K,Income_Category_$80K - $120K,Income_Category_Less than $40K,Income_Category_Unknown,Card_Category_Gold,Card_Category_Platinum,Card_Category_Silver,Utilization_Category_Low,Utilization_Category_Medium
0,768805383,Existing Customer,45,3,39,5,1,3,12691.0,777,...,False,True,False,False,False,False,False,False,True,False
1,818770008,Existing Customer,49,5,44,6,1,2,8256.0,864,...,False,False,False,True,False,False,False,False,True,False
2,713982108,Existing Customer,51,3,36,4,1,0,3418.0,0,...,False,False,True,False,False,False,False,False,True,False
3,769911858,Existing Customer,40,4,34,3,4,1,3313.0,2517,...,False,False,False,True,False,False,False,False,False,True
4,709106358,Existing Customer,40,3,21,5,1,0,4716.0,0,...,False,True,False,False,False,False,False,False,True,False


In [14]:
# Save the engineered dataset as CSV, leaving the DataFrame index out of the file.
engineered_path = "engineered_bank_data.csv"
df.to_csv(engineered_path, index=False)