In [48]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder
from typing import List
from sklearn.model_selection import train_test_split
from Model_Utils.feature_nan_imputation import DataImputer
from Model_Utils.feature_outlier_handling import OutlierHandler
import pandas as pd
import numpy as np

In [49]:
# Read the raw churn export and discard the CLIENTNUM column in one step.
df = pd.read_csv("Data/raw_data/extracted_files/bank_churners.csv").drop(columns=["CLIENTNUM"])

# Features are every column except the label. The label is kept as a
# one-column DataFrame (list selector) rather than a Series.
X = df.drop(columns=["Attrition_Flag"])
y = df.loc[:, ["Attrition_Flag"]]

# Preview the first five label rows.
y.head(n=5)

Unnamed: 0,Attrition_Flag
0,Existing Customer
1,Existing Customer
2,Existing Customer
3,Existing Customer
4,Existing Customer


In [50]:

# Stratified 60 / 20 / 20 split: hold out 40% of the rows first, then cut that
# hold-out in half to obtain the validation and test sets. Both calls stratify
# on the label and use the same fixed seed.
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=0.4,
    stratify=y,
    random_state=42,
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    stratify=y_temp,
    random_state=42,
)

In [51]:
# Preview the first five rows of the training labels.
y_train.head(n=5)

Unnamed: 0,Attrition_Flag
1082,Existing Customer
70,Existing Customer
1822,Existing Customer
8058,Existing Customer
3102,Existing Customer


In [52]:
# One imputer for the features (numeric_strategy="mean") and one for the label
# frame (constructor defaults).
x_imp = DataImputer(numeric_strategy="mean")
y_imp = DataImputer()

# Each imputer is fitted on the training split only and then applied to the
# validation and test splits.
X_train_impute = x_imp.fit_transform(X_train)
y_train_impute = y_imp.fit_transform(y_train)
X_val_impute = x_imp.transform(X_val)
y_val_impute = y_imp.transform(y_val)
X_test_impute = x_imp.transform(X_test)
y_test_impute = y_imp.transform(y_test)

# Outlier handling follows the same fit-on-train / apply-to-others pattern.
# NOTE(review): 'yeo' presumably selects a Yeo-Johnson style transform inside
# OutlierHandler -- confirm against Model_Utils.feature_outlier_handling.
outlier_handler = OutlierHandler(iqr_threshold=1.5)
X_train_transformed = outlier_handler.fit_transform(X_train_impute, 'yeo')
X_val_transformed = outlier_handler.transform(X_val_impute, 'yeo')
X_test_transformed = outlier_handler.transform(X_test_impute, 'yeo')


No numeric columns to impute.


In [53]:
# Preview the first five rows of the outlier-handled test features.
X_test_transformed.head(n=5)

Unnamed: 0,Customer_Age,Gender,Dependent_count,Education_Level,Marital_Status,Income_Category,Card_Category,Months_on_book,Total_Relationship_Count,Months_Inactive_12_mon,Contacts_Count_12_mon,Credit_Limit,Total_Revolving_Bal,Avg_Open_To_Buy,Total_Amt_Chng_Q4_Q1,Total_Trans_Amt,Total_Trans_Ct,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio
7002,35.346344,F,2.0,Graduate,Married,Less than $40K,Blue,37.425502,3.0,1.686329,2.040506,2.984663,0.0,8.688394,0.391201,5.040089,37.770937,0.544588,0.0
5427,41.198527,F,2.0,Unknown,Single,Less than $40K,Blue,54.112989,6.0,0.763339,4.127308,3.050488,2517.0,5.748274,0.390787,4.964433,33.727422,0.519159,0.94
5802,46.071084,M,3.0,Graduate,Unknown,$120K +,Blue,54.112989,3.0,1.281876,3.079838,3.151732,795.0,11.326019,0.444408,5.058946,41.038439,0.412962,0.085
8361,44.122495,F,5.0,Unknown,Divorced,$40K - $60K,Blue,49.026861,2.0,1.281876,3.079838,2.984663,0.0,8.688394,0.440844,5.063697,36.43895,0.5094,0.0
2037,46.071084,F,2.0,Graduate,Single,Less than $40K,Blue,54.112989,3.0,1.686329,3.079838,3.050488,1390.0,8.531831,0.438326,4.765508,29.166533,0.350159,0.519


In [54]:
# Columns encoded as ordered integer codes vs. columns expanded into one-hot
# indicator columns.
ordinal_cols = ["Card_Category", "Education_Level", "Income_Category"]
ohe_cols = ["Gender", "Marital_Status"]

# Explicit category order per ordinal column. The position in each list is the
# integer code OrdinalEncoder assigns, so "Unknown" (where present) maps to 0.
categories: dict = {
    "Education_Level": ["Unknown", "Uneducated", "High School", "College", "Graduate", "Post-Graduate", "Doctorate"],
    "Income_Category": ["Unknown", "Less than $40K", "$40K - $60K", "$60K - $80K", "$80K - $120K", "$120K +"],
    "Card_Category": ["Blue", "Silver", "Gold", "Platinum"]
}

# The category lists are looked up in the same order as `ordinal_cols` so each
# list lines up with its column. Columns not named in either group are passed
# through unchanged (remainder='passthrough').
encoding_transformer = ColumnTransformer([
            ('ord', OrdinalEncoder(categories=[categories[col] for col in ordinal_cols]), ordinal_cols),
            ('ohe', OneHotEncoder(sparse_output=False, handle_unknown='ignore'), ohe_cols)
        ], remainder='passthrough')


# Encode first, then standardise every resulting column (encoded columns
# included, since the scaler sees the transformer's full output).
full_pipeline = Pipeline([
            ('encode', encoding_transformer),
            ('scale', StandardScaler())
        ])

# Feed the imputed / outlier-handled frames into the pipeline. Previously the
# raw splits (X_train, X_val, X_test) were used here, which silently discarded
# the preprocessing done in the preceding cell.
# The pipeline is fitted on the training split only; validation and test are
# transformed with the fitted parameters.
X_train_final = full_pipeline.fit_transform(X_train_transformed)
X_val_final = full_pipeline.transform(X_val_transformed)
X_test_final = full_pipeline.transform(X_test_transformed)
feature_names = full_pipeline.get_feature_names_out()

In [55]:
# Wrap the train / validation / test arrays (in that order) in DataFrames that
# carry the pipeline's output feature names as column labels.
df1, df2, df3 = (
    pd.DataFrame(split_values, columns=feature_names)
    for split_values in (X_train_final, X_val_final, X_test_final)
)

In [56]:
# Summarise the encoded validation frame: column names, non-null counts, dtypes.
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2025 entries, 0 to 2024
Data columns (total 23 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   ord__Card_Category                   2025 non-null   float64
 1   ord__Education_Level                 2025 non-null   float64
 2   ord__Income_Category                 2025 non-null   float64
 3   ohe__Gender_F                        2025 non-null   float64
 4   ohe__Gender_M                        2025 non-null   float64
 5   ohe__Marital_Status_Divorced         2025 non-null   float64
 6   ohe__Marital_Status_Married          2025 non-null   float64
 7   ohe__Marital_Status_Single           2025 non-null   float64
 8   ohe__Marital_Status_Unknown          2025 non-null   float64
 9   remainder__Customer_Age              2025 non-null   float64
 10  remainder__Dependent_count           2025 non-null   float64
 11  remainder__Months_on_book     

In [None]:
import pandas as pd 


Unnamed: 0,ord__Card_Category,ord__Education_Level,ord__Income_Category,ohe__Gender_F,ohe__Gender_M,ohe__Marital_Status_Divorced,ohe__Marital_Status_Married,ohe__Marital_Status_Single,ohe__Marital_Status_Unknown,remainder__Customer_Age,...,remainder__Months_Inactive_12_mon,remainder__Contacts_Count_12_mon,remainder__Credit_Limit,remainder__Total_Revolving_Bal,remainder__Avg_Open_To_Buy,remainder__Total_Amt_Chng_Q4_Q1,remainder__Total_Trans_Amt,remainder__Total_Trans_Ct,remainder__Total_Ct_Chng_Q4_Q1,remainder__Avg_Utilization_Ratio
0,-0.256861,1.411052,-0.071017,0.953358,-0.953358,-0.281458,-0.923582,-0.802736,3.52739,0.348015,...,-1.475167,1.389526,-0.064908,-1.425138,0.17677,1.05609,-1.414426,-1.692976,-2e-05,-0.996743
1,-0.256861,0.827738,1.957982,-1.048924,1.048924,-0.281458,-0.923582,1.24574,-0.283496,0.599846,...,-0.259478,0.503153,1.709003,0.744616,1.742536,2.044286,-1.275771,-0.999294,1.408271,-0.810736
2,-0.256861,-1.505519,0.605316,-1.048924,1.048924,-0.281458,1.082741,-0.802736,-0.283496,-1.033105,...,-1.475167,-0.396038,-0.045389,0.642466,-0.153218,1.957972,-0.66096,-1.28793,-0.096044,0.323537
3,-0.256861,-1.505519,-0.74735,0.953358,-0.953358,-0.281458,-0.923582,1.24574,-0.283496,1.482784,...,0.714075,-1.312765,-0.277266,0.891071,-0.474223,0.338848,0.442109,0.190881,0.833779,0.801318
4,-0.256861,0.827738,0.605316,-1.048924,1.048924,-0.281458,-0.923582,1.24574,-0.283496,-1.408466,...,0.714075,0.503153,0.616834,-0.540252,0.611772,-0.9867,-0.788981,-0.18974,-0.44005,-0.697674


In [4]:
# Reload the previously saved splits from disk: features first, then labels,
# each in train / validation / test order.
X_train, X_val, X_test = map(
    pd.read_csv,
    (
        "Data/processed_data/X_train.csv",
        "Data/processed_data/X_val.csv",
        "Data/processed_data/X_test.csv",
    ),
)
y_train, y_val, y_test = map(
    pd.read_csv,
    (
        "Data/processed_data/y_train.csv",
        "Data/processed_data/y_val.csv",
        "Data/processed_data/y_test.csv",
    ),
)

Unnamed: 0,Customer_Age,Gender,Dependent_count,Education_Level,Marital_Status,Income_Category,Card_Category,Months_on_book,Total_Relationship_Count,Months_Inactive_12_mon,Contacts_Count_12_mon,Credit_Limit,Total_Revolving_Bal,Avg_Open_To_Buy,Total_Amt_Chng_Q4_Q1,Total_Trans_Amt,Total_Trans_Ct,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio
7002,35.346344,F,2.0,Graduate,Married,Less than $40K,Blue,37.425502,3.0,1.686329,2.040506,2.984663,0.0,8.688394,0.391201,5.040089,37.770937,0.544588,0.0
5427,41.198527,F,2.0,Unknown,Single,Less than $40K,Blue,54.112989,6.0,0.763339,4.127308,3.050488,2517.0,5.748274,0.390787,4.964433,33.727422,0.519159,0.94
5802,46.071084,M,3.0,Graduate,Unknown,$120K +,Blue,54.112989,3.0,1.281876,3.079838,3.151732,795.0,11.326019,0.444408,5.058946,41.038439,0.412962,0.085
8361,44.122495,F,5.0,Unknown,Divorced,$40K - $60K,Blue,49.026861,2.0,1.281876,3.079838,2.984663,0.0,8.688394,0.440844,5.063697,36.43895,0.5094,0.0
2037,46.071084,F,2.0,Graduate,Single,Less than $40K,Blue,54.112989,3.0,1.686329,3.079838,3.050488,1390.0,8.531831,0.438326,4.765508,29.166533,0.350159,0.519
