In [10]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.feature_selection import (VarianceThreshold, SelectKBest, chi2, f_classif, mutual_info_classif,
                                       SelectFromModel, RFE, SequentialFeatureSelector)
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [5]:
X_train = pd.read_csv('data/processed/X_train.csv')
y_train = pd.read_csv('data/processed/y_train.csv')

In [16]:
# Identify categorical and numerical columns
categorical_columns = X_train.drop(columns='CLERK_TYPE').select_dtypes(include=['object', 'category']).columns
numerical_columns = X_train.drop(columns='ID_CLIENT').select_dtypes(include=['int64', 'float64']).columns

# Define preprocessing for numerical and categorical features
preprocessor = ColumnTransformer(
    transformers=[
        ('num', Pipeline([
            ('imputer', SimpleImputer(strategy='mean')),  # Impute missing values with mean for numerical features
            #('scaler', StandardScaler())  # Scale numerical features
        ]), numerical_columns),
        ('cat', Pipeline([
            ('imputer', SimpleImputer(strategy='most_frequent')),  # Impute missing values with mode for categorical features
            ('encoder', OneHotEncoder(handle_unknown='ignore'))  # Encode categorical features
        ]), categorical_columns)
    ]
)

# Define the number of top features to select
k = 15

# Create a pipeline for preprocessing, feature selection, and model training
pipeline = Pipeline([
    ('preprocessing', preprocessor),  # Preprocess the data
    ('feature_selection', SelectKBest(score_func=chi2, k=k)),  # Select top k features
    ('classifier', ExtraTreesClassifier(random_state=42))  # Extra Trees Classifier
])

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Fit the pipeline on the training data
pipeline.fit(X_train, y_train.values.ravel())

# Evaluate the pipeline on the validation set
y_pred = pipeline.predict(X_val)
print(classification_report(y_val, y_pred))

# Get the selected feature names
selected_features = pipeline.named_steps['feature_selection'].get_support(indices=True)
preprocessed_feature_names = pipeline.named_steps['preprocessing'].get_feature_names_out()
selected_feature_names = preprocessed_feature_names[selected_features]
print("Selected Features:", selected_feature_names)

              precision    recall  f1-score   support

           0       0.75      0.94      0.83      1955
           1       0.34      0.10      0.15       667

    accuracy                           0.72      2622
   macro avg       0.55      0.52      0.49      2622
weighted avg       0.65      0.72      0.66      2622

Selected Features: ['num__PAYMENT_DAY' 'num__MONTHS_IN_RESIDENCE'
 'num__PERSONAL_MONTHLY_INCOME' 'num__OTHER_INCOMES'
 'num__PERSONAL_ASSETS_VALUE' 'num__OCCUPATION_TYPE' 'num__AGE'
 'num__RESIDENCIAL_ZIP_3' 'num__PROFESSIONAL_ZIP_3'
 'cat__RESIDENCIAL_CITY_CACHOEIRAS DE MACACU'
 'cat__RESIDENCIAL_CITY_Duque de Caxias' 'cat__RESIDENCIAL_CITY_SALVADOR'
 'cat__FLAG_RESIDENCIAL_PHONE_N' 'cat__RESIDENCIAL_PHONE_AREA_CODE_ '
 'cat__RESIDENCIAL_PHONE_AREA_CODE_58']


In [7]:
X_train.RESIDENCIAL_PHONE_AREA_CODE.value_counts()

RESIDENCIAL_PHONE_AREA_CODE
       5230
5      2914
97     1870
107    1864
54     1478
       ... 
106       1
64        1
60        1
8         1
59        1
Name: count, Length: 94, dtype: int64

In [12]:
X_train.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
ID_CLIENT,16384.0,24958.226562,14441.337506,5.0,12342.0,25016.5,37505.25,49989.0
PAYMENT_DAY,16384.0,12.887085,6.629545,1.0,10.0,10.0,15.0,25.0
QUANT_ADDITIONAL_CARDS,16384.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
POSTAL_ADDRESS_TYPE,16384.0,1.006042,0.077501,1.0,1.0,1.0,1.0,2.0
MARITAL_STATUS,16384.0,2.161255,1.3344,0.0,1.0,2.0,2.0,7.0
QUANT_DEPENDANTS,16384.0,0.652954,1.233144,0.0,0.0,0.0,1.0,53.0
EDUCATION_LEVEL,16384.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
NACIONALITY,16384.0,0.963196,0.198389,0.0,1.0,1.0,1.0,2.0
RESIDENCE_TYPE,15921.0,1.246404,0.861457,0.0,1.0,1.0,1.0,5.0
MONTHS_IN_RESIDENCE,15120.0,9.728042,10.720125,0.0,1.0,6.0,15.0,228.0
