In [None]:
def load_data(
    path='../data/kzhan176/citizens_data/BrownDSI_masked_capstone_data.csv_20250401031515',
):
    '''
    Read the capstone data set into a DataFrame.

    Parameters
    ----------
    path : str, path object or file-like, optional
        Anything ``pd.read_csv`` accepts. Defaults to the masked capstone
        CSV location used by this notebook, so ``load_data()`` with no
        arguments behaves as before.

    Returns
    -------
    pandas.DataFrame
        The parsed CSV contents.
    '''
    return pd.read_csv(path)

In [None]:
def process_calgorial_features(df):
    '''
    Encode the categorical columns of ``df`` and return the encoded frame.

    one hot encoding (pd.get_dummies):
    - masked_bank_num
    - masked_account_type
    - masked_product_code_grouped
    - onus_ind
    - treasury_check_ind
    - heloc_ind
      (missing values in the three *_ind columns become an 'other' level)

    binary encoding:
    - oao_flg ('Y' -> 1, anything else, including missing, -> 0)

    ordinal encoding:
    - bucket_days_since_open (unmapped or missing buckets -> -1)
    - deposit_quarter (derived from deposit_dt)
    - deposit_dayofweek (derived from deposit_dt)

    deposit_dt is dropped once the date parts have been extracted.
    All other columns are carried through unchanged, and the caller's
    frame is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the encodings above applied.
    '''
    one_hot_features = ['masked_bank_num', 'masked_account_type', 'masked_product_code_grouped']
    indicator_features = ['onus_ind', 'treasury_check_ind', 'heloc_ind']

    # get_dummies returns a new frame, so every assignment below works on
    # that new frame rather than on the caller's df.
    df = pd.get_dummies(df, columns=one_hot_features)

    df['oao_flg'] = (df['oao_flg'] == 'Y').astype(int)

    # Missing indicator values get their own 'other' level before encoding.
    # The previous ColumnTransformer here returned an ndarray holding only
    # these three columns, which discarded the rest of the frame and broke
    # every column lookup that followed.
    df[indicator_features] = df[indicator_features].astype(object).fillna('other')
    df = pd.get_dummies(df, columns=indicator_features)

    bucket_order = {
        '0-1000': 0,
        '1000-2000': 1,
        '2000-5000': 2,
        '5000+': 3,
    }
    # Buckets missing from the mapping (or missing values) are coded as -1.
    df['bucket_days_since_open'] = (
        df['bucket_days_since_open'].map(bucket_order).fillna(-1).astype(int)
    )

    deposit_dt = pd.to_datetime(df['deposit_dt'])
    df['deposit_quarter'] = deposit_dt.dt.quarter
    df['deposit_dayofweek'] = deposit_dt.dt.dayofweek
    df = df.drop(columns='deposit_dt')

    return df

    
    


def process_continous_features(df):
    '''
    Transform the continuous columns of ``df`` and return the result.

    - total_deposit_amount, item_amt: log1p of positive values; values that
      are not strictly positive (including missing) become 0.
    - relationship_balance: the sentinel -99999999 is flagged in a new
      relationship_balance_new_account column (1/0), replaced by the median
      of the remaining values, shifted so the minimum becomes 1 when the
      minimum is <= 0, then log1p-transformed.
    - rdis, drawee_sum, drawee_cnt: missing values filled with 0.
    - rdis, max_deposit_amount30d, total_deposit_item_count: log1p followed
      by RobustScaler.
    - drawee_sum, drawee_cnt: StandardScaler.

    The scalers are fitted on the frame passed in. Missing values in
    max_deposit_amount30d and total_deposit_item_count are not imputed here.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the columns above transformed; every other
        column is carried through unchanged.
    '''
    # Work on a copy so the caller's frame is not modified.
    df = df.copy()

    # total_deposit_amount, item_amt
    # Series has no applymap (it is DataFrame-only); use a vectorised form.
    for col in ('total_deposit_amount', 'item_amt'):
        amounts = df[col]
        df[col] = np.log1p(amounts.where(amounts > 0, 0))

    # relationship_balance
    missing_balance_sentinel = -99999999
    df['relationship_balance_new_account'] = (
        df['relationship_balance'] == missing_balance_sentinel
    ).astype(int)

    balance = df['relationship_balance'].replace(missing_balance_sentinel, np.nan)
    balance = balance.fillna(balance.median())
    min_val = balance.min()
    # Lift non-positive balances so the smallest value is 1 before log1p.
    shift = 1 - min_val if min_val <= 0 else 0
    df['relationship_balance'] = np.log1p(balance + shift)

    # rdis, max_deposit_amount30d, total_deposit_item_count
    # drawee_sum, drawee_cnt
    num_ftrs = ['drawee_sum', 'drawee_cnt']
    log_num_ftrs = ['rdis', 'max_deposit_amount30d', 'total_deposit_item_count']

    for col in ('rdis', 'drawee_sum', 'drawee_cnt'):
        df[col] = df[col].fillna(0)

    numeric_transformer = Pipeline(steps=[
        ('scaler', StandardScaler())])

    log_numeric_transformer = Pipeline(steps=[
        ('log', FunctionTransformer(np.log1p, feature_names_out='one-to-one')),
        ('scaler', RobustScaler())])

    preprocessor = ColumnTransformer(
        transformers=[
            ('log_num', log_numeric_transformer, log_num_ftrs),
            ('num', numeric_transformer, num_ftrs)]
    )

    # The transformer outputs only the listed columns, in transformer order.
    # Write them back in that order instead of replacing the whole frame,
    # so the remaining columns survive. np.asarray avoids label alignment
    # if the transformer has been configured to emit a DataFrame.
    scaled_cols = log_num_ftrs + num_ftrs
    df[scaled_cols] = np.asarray(preprocessor.fit_transform(df))

    return df


def normalization():
    '''
    Robust-scale the log-transformed amount columns of the module-level ``df``.

    For each source column a fresh RobustScaler is fitted and its output is
    stored in a new ``<source>_scaled`` column:
    - total_deposit_amount_log -> total_deposit_amount_log_scaled
    - item_amt_log             -> item_amt_log_scaled

    Takes no arguments and returns nothing; ``df`` is modified in place.
    '''
    # NOTE(review): `df` is resolved from module scope, and the *_log source
    # columns are not created by the functions visible alongside this one --
    # confirm where they come from before relying on this step.
    column_pairs = (
        ('total_deposit_amount_log', 'total_deposit_amount_log_scaled'),
        ('item_amt_log', 'item_amt_log_scaled'),
    )
    for source_col, scaled_col in column_pairs:
        # One scaler per column, each fitted independently.
        scaler = RobustScaler()
        df[scaled_col] = scaler.fit_transform(df[[source_col]])
    







def feature_engineer(df, top_n=8):
    '''
    Drop unused identifier columns and group rare product codes.

    Steps:
    - drop masked_dep_acct_num, masked_id and channel;
    - add masked_product_code_grouped, which is ``prod_<code>`` for the
      ``top_n`` most frequent values of masked_product_code and 'Other'
      for every other value (including missing);
    - drop masked_product_code.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns named above; a missing column raises
        KeyError from pandas.
    top_n : int, optional
        Number of most frequent product codes kept as their own level.
        Defaults to 8.

    Returns
    -------
    pandas.DataFrame
        A new frame; the caller's frame is not modified.
    '''
    # drop unused columns (drop returns a new frame)
    cols_to_drop = ['masked_dep_acct_num', 'masked_id', 'channel']
    df = df.drop(columns=cols_to_drop)

    # process masked_product_code
    codes = df['masked_product_code']
    top_product_codes = codes.value_counts().nlargest(top_n).index
    is_top = codes.isin(top_product_codes)
    df['masked_product_code_grouped'] = (
        'prod_' + codes.astype(str)
    ).where(is_top, 'Other')
    df = df.drop(columns='masked_product_code')

    # Without this return the engineered frame was silently discarded.
    return df


