In [1]:
import pandas as pd
import openpyxl
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [7]:
# Load the raw dataset from the local CSV export (relative to the notebook's working directory).
raw_csv_path = "./citizens_data/BrownDSI_masked_capstone_data.csv_20250401031515"
df = pd.read_csv(raw_csv_path)

In [8]:
# Drop the columns flagged as problematic.
# errors='ignore' keeps this cell idempotent: because it rebinds `df`, re-running
# it after the columns are already gone would otherwise raise KeyError.
df = df.drop(
    columns=['drawee_avg', 'drawee_max', 'drawee_min', 'RDI_DT', 'RETURN_REASON', 'over_draft_amount'],
    errors='ignore',
)

In [18]:
# Split the frame into predictors (X) and the label column (Y).
target_col = 'return_target'
X = df.drop(columns=[target_col])
Y = df[target_col]

In [22]:
# Feature groups, one list per preprocessing branch.

# Categorical features (one-hot encoded).
cat_ftrs = [
    'onus_ind',
    'treasury_check_ind',
    'heloc_ind',
]

# No ordinal features are configured.

# Numeric features that get a log1p transform before scaling.
log_num_ftrs = [
    'rdis',
    'max_deposit_amount30d',
    'total_deposit_item_count',
]

# Numeric features that are only standardized.
num_ftrs = [
    'drawee_sum',
    'drawee_cnt',
]

In [26]:
# Assemble the preprocessing ColumnTransformer: log+scale, scale, and one-hot branches.
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
# enable_iterative_imputer has to be imported before IterativeImputer.
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import (
    FunctionTransformer,
    OneHotEncoder,
    OrdinalEncoder,
    StandardScaler,
)

random_state = 42

# Log-scaled numeric branch: apply log1p, then standardize.
# Note: this branch has no imputation step.
log_numeric_transformer = Pipeline([
    ('log', FunctionTransformer(np.log1p, feature_names_out='one-to-one')),
    ('scaler', StandardScaler()),
])

# Plain numeric branch: standardize only (also no imputation step).
numeric_transformer = Pipeline([
    ('scaler', StandardScaler()),
])

# Categorical branch: missing entries are first replaced with the string 'other',
# then the columns are one-hot encoded into a dense array.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='other')),
    ('onehot', OneHotEncoder(sparse_output=False, handle_unknown='ignore')),
])

# Ordinal encoding is not part of this preprocessor (OrdinalEncoder is imported but unused here).

# Route each feature list to its branch.
preprocessor = ColumnTransformer(
    transformers=[
        ('log_num', log_numeric_transformer, log_num_ftrs),
        ('num', numeric_transformer, num_ftrs),
        ('cat', categorical_transformer, cat_ftrs),
    ]
)

In [27]:
# Fit the preprocessor on X and transform X in a single step.
# NOTE(review): this fits on every row of X; if a train/test split is made later,
# fit on the training portion only to avoid leakage — confirm against downstream cells.
X_prep = preprocessor.fit_transform(X)

In [28]:
# Display the preprocessed feature matrix.
X_prep

array([[        nan,  0.62259073, -0.61815274, ...,  0.        ,
         1.        ,  0.        ],
       [        nan, -0.1330338 , -0.61815274, ...,  0.        ,
         1.        ,  0.        ],
       [ 1.03013261, -1.24534128,  0.10148317, ...,  0.        ,
         1.        ,  0.        ],
       ...,
       [        nan, -1.24534128,  0.10148317, ...,  0.        ,
         1.        ,  0.        ],
       [        nan, -1.24534128, -0.61815274, ...,  0.        ,
         1.        ,  0.        ],
       [        nan,  1.02314574,  1.3317089 , ...,  0.        ,
         1.        ,  0.        ]])