# Feature Engineering for Clustering


# Steps
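- Create per-merchant cost metrics: mean and standard deviation of `cost_percent` (`proc_cost / amount`), plus the share of each merchant's transactions per `cost_type_ID`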

[WIP]

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
from sklearn.preprocessing import LabelEncoder
import warnings
import json
import os
import io
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Read the three temporal splits (train / validate / test) in one pass.
# Each CSV is expected to already contain the cost columns (see the printed column list).
df_train, df_validate, df_test = [
    pd.read_csv(_csv_path)
    for _csv_path in (
        '../Dataset Creation/5411_temporal_splits/5411_train_with_costs.csv',
        '../Dataset Creation/5411_temporal_splits/5411_validate_with_costs.csv',
        '../Dataset Creation/5411_temporal_splits/5411_test_with_costs.csv',
    )
]

# Per-split row count and merchant coverage
print(f"Train set loaded: {len(df_train)} transactions, {df_train['merchant_id'].nunique()} unique merchants")
print(f"Validate set loaded: {len(df_validate)} transactions, {df_validate['merchant_id'].nunique()} unique merchants")
print(f"Test set loaded: {len(df_test)} transactions, {df_test['merchant_id'].nunique()} unique merchants")

# Overall row count, then the schema shared by the splits (shown for train)
print(f"\nTotal transactions: {len(df_train) + len(df_validate) + len(df_test)}")
print(f"\nTrain columns: {df_train.columns.tolist()}")


Train set loaded: 167323 transactions, 2848 unique merchants
Validate set loaded: 166883 transactions, 2828 unique merchants
Test set loaded: 138806 transactions, 2626 unique merchants

Total transactions: 473012

Train columns: ['transaction_id', 'date', 'amount', 'use_chip', 'merchant_id', 'mcc', 'errors', 'card_brand', 'card_type', 'mcc_description', 'cost_type_ID', 'proc_cost']


In [27]:
# Find transactions with 0 amount (which would cause division by zero in cost_percent)

zero_amount_train = df_train[df_train['amount'] == 0]
zero_amount_validate = df_validate[df_validate['amount'] == 0]
zero_amount_test = df_test[df_test['amount'] == 0]

# Pair every split label with its zero-amount rows so the report below treats
# all three splits the same way.
_zero_amount_by_split = {
    'Train': zero_amount_train,
    'Validate': zero_amount_validate,
    'Test': zero_amount_test,
}

# Per-split counts followed by the grand total
for _split_name, _zero_rows in _zero_amount_by_split.items():
    print(f"{_split_name} set: {len(_zero_rows)} transactions with amount=0")
print(f"\nTotal transactions with amount=0: {sum(len(_rows) for _rows in _zero_amount_by_split.values())}")

# Display the offending rows for EVERY split that has any, not just validate,
# so a zero-amount row appearing in train or test cannot go unnoticed.
for _split_name, _zero_rows in _zero_amount_by_split.items():
    if len(_zero_rows) > 0:
        print(f"\n{_split_name} set transactions with amount=0:")
        print(_zero_rows[['transaction_id', 'merchant_id', 'amount', 'proc_cost', 'date']])


Train set: 0 transactions with amount=0
Validate set: 1 transactions with amount=0
Test set: 0 transactions with amount=0

Total transactions with amount=0: 1

Validate set transactions with amount=0:
       transaction_id  merchant_id  amount  proc_cost                 date
53416        21153715        50783     0.0       0.06  2018-04-27 11:34:00


In [18]:
# Add the cost_percent feature (proc_cost as a fraction of amount) to each split.
# Rows with amount == 0 get cost_percent = 0 instead of a division-by-zero result.
# NOTE(review): a 0 for those rows still enters the per-merchant mean/std computed
# later; confirm that is preferred over excluding them.

for _split in (df_train, df_validate, df_test):
    _is_zero_amount = _split['amount'] == 0
    _split['cost_percent'] = np.where(_is_zero_amount, 0, _split['proc_cost'] / _split['amount'])

df_train.head()

Unnamed: 0,transaction_id,date,amount,use_chip,merchant_id,mcc,errors,card_brand,card_type,mcc_description,cost_type_ID,proc_cost,cost_percent
0,18881494,2017-01-01 00:07:00,2.54,Chip Transaction,75781,5411.0,,Mastercard,Debit,"Grocery Stores, Supermarkets",26.0,0.209972,0.082666
2,18881527,2017-01-01 00:42:00,25.13,Chip Transaction,83229,5411.0,,Mastercard,Debit,"Grocery Stores, Supermarkets",39.0,0.476534,0.018963
3,18881539,2017-01-01 00:51:00,1.95,Chip Transaction,93881,5411.0,,Visa,Prepaid,"Grocery Stores, Supermarkets",1.0,0.103735,0.053197
4,18881574,2017-01-01 01:31:00,3.99,Chip Transaction,75781,5411.0,,Mastercard,Debit,"Grocery Stores, Supermarkets",26.0,0.227082,0.056913
5,18881577,2017-01-01 01:33:00,9.61,Chip Transaction,75781,5411.0,,Mastercard,Debit,"Grocery Stores, Supermarkets",38.0,0.293398,0.03053


In [19]:
# Remove transactions that have no processing cost (NaN proc_cost) from every split.

df_train, df_validate, df_test = [
    _split.dropna(subset=['proc_cost'])
    for _split in (df_train, df_validate, df_test)
]

df_train.info()

<class 'pandas.DataFrame'>
Index: 153992 entries, 0 to 167322
Data columns (total 13 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   transaction_id   153992 non-null  int64  
 1   date             153992 non-null  str    
 2   amount           153992 non-null  float64
 3   use_chip         153992 non-null  str    
 4   merchant_id      153992 non-null  int64  
 5   mcc              153992 non-null  float64
 6   errors           1751 non-null    str    
 7   card_brand       153992 non-null  str    
 8   card_type        153992 non-null  str    
 9   mcc_description  153992 non-null  str    
 10  cost_type_ID     153992 non-null  float64
 11  proc_cost        153992 non-null  float64
 12  cost_percent     153992 non-null  float64
dtypes: float64(5), int64(2), str(6)
memory usage: 27.8 MB


In [20]:
# Collapse each split to one row per merchant_id holding the mean cost_percent
# of that merchant's transactions.

df_train_agg = df_train.groupby('merchant_id', as_index=False)['cost_percent'].mean()
df_validate_agg = df_validate.groupby('merchant_id', as_index=False)['cost_percent'].mean()
df_test_agg = df_test.groupby('merchant_id', as_index=False)['cost_percent'].mean()

df_train_agg.head()

Unnamed: 0,merchant_id,cost_percent
0,34,0.054254
1,106,0.03156
2,135,0.037166
3,144,0.028724
4,152,0.029105


In [21]:
# Per-merchant cost_type_ID mix:
#   share = (merchant's transactions with cost_type_ID X) / (merchant's total transactions)
# A cost_type_ID the merchant never used gets a share of 0.

# Reference list of cost_type_IDs, taken from the lookup CSV (NaNs dropped, de-duplicated)
_cost_type_table = pd.read_csv('../Dataset Creation/cost_type_id_18feb.csv')
all_cost_ids = _cost_type_table['cost_type_ID'].dropna().unique()



def create_cost_type_features(df, all_possible_cost_ids):
    """
    Build one row per merchant describing its cost_type_ID mix.

    Parameters
    ----------
    df : DataFrame with at least 'merchant_id' and 'cost_type_ID' columns
        (one row per transaction).
    all_possible_cost_ids : iterable of cost_type_ID values that must appear as
        feature columns even when no transaction in ``df`` uses them.

    Returns
    -------
    DataFrame with columns 'merchant_id', one 'pct_cost_type_<id>' column per
    cost_type_ID (fraction of the merchant's transactions, 0 when unused),
    and 'total_transactions' (the merchant's transaction count).
    IDs present in ``df`` but absent from ``all_possible_cost_ids`` are kept.
    """
    # merchant x cost_type_ID table of transaction counts; unseen pairs -> 0
    counts = (
        df.groupby(['merchant_id', 'cost_type_ID'])
        .size()
        .unstack(fill_value=0)
    )

    # Reference IDs never observed in this frame (de-duplicated, order kept)
    unseen_ids = list(dict.fromkeys(
        cid for cid in all_possible_cost_ids if cid not in counts.columns
    ))

    # Add the unseen IDs as all-zero columns and put every column in ID order
    ordered_ids = sorted([*counts.columns, *unseen_ids])
    counts = counts.reindex(columns=ordered_ids, fill_value=0)

    # Row totals = number of transactions per merchant
    n_per_merchant = counts.sum(axis=1)

    # Turn counts into per-merchant fractions
    shares = counts.div(n_per_merchant, axis=0)
    shares.columns = [f'pct_cost_type_{int(cid)}' for cid in shares.columns]

    # Keep the raw volume next to the fractions
    shares['total_transactions'] = n_per_merchant

    # merchant_id back from index to a regular column
    return shares.reset_index()

# Build the cost_type_ID share table for each split against the same reference ID list
df_train_cost_features, df_validate_cost_features, df_test_cost_features = [
    create_cost_type_features(_split, all_cost_ids)
    for _split in (df_train, df_validate, df_test)
]

# shape[1]-2 excludes the two non-share columns: merchant_id and total_transactions
print(f"Train: {df_train_cost_features.shape[0]} merchants, {df_train_cost_features.shape[1]-2} cost_type features (from {len(all_cost_ids)} possible)")
print(f"Validate: {df_validate_cost_features.shape[0]} merchants, {df_validate_cost_features.shape[1]-2} cost_type features (from {len(all_cost_ids)} possible)")
print(f"Test: {df_test_cost_features.shape[0]} merchants, {df_test_cost_features.shape[1]-2} cost_type features (from {len(all_cost_ids)} possible)")

# Column listing followed by a preview of the train table
print(f"\nFeature columns: {df_train_cost_features.columns.tolist()}")
print(f"\nSample:")
df_train_cost_features.head()


Train: 2775 merchants, 61 cost_type features (from 61 possible)
Validate: 2755 merchants, 61 cost_type features (from 61 possible)
Test: 2554 merchants, 61 cost_type features (from 61 possible)

Feature columns: ['merchant_id', 'pct_cost_type_1', 'pct_cost_type_2', 'pct_cost_type_3', 'pct_cost_type_4', 'pct_cost_type_5', 'pct_cost_type_6', 'pct_cost_type_7', 'pct_cost_type_8', 'pct_cost_type_9', 'pct_cost_type_10', 'pct_cost_type_11', 'pct_cost_type_12', 'pct_cost_type_13', 'pct_cost_type_14', 'pct_cost_type_15', 'pct_cost_type_16', 'pct_cost_type_17', 'pct_cost_type_18', 'pct_cost_type_19', 'pct_cost_type_20', 'pct_cost_type_21', 'pct_cost_type_22', 'pct_cost_type_23', 'pct_cost_type_24', 'pct_cost_type_25', 'pct_cost_type_26', 'pct_cost_type_27', 'pct_cost_type_28', 'pct_cost_type_29', 'pct_cost_type_30', 'pct_cost_type_31', 'pct_cost_type_32', 'pct_cost_type_33', 'pct_cost_type_34', 'pct_cost_type_35', 'pct_cost_type_36', 'pct_cost_type_37', 'pct_cost_type_38', 'pct_cost_type_39', '

Unnamed: 0,merchant_id,pct_cost_type_1,pct_cost_type_2,pct_cost_type_3,pct_cost_type_4,pct_cost_type_5,pct_cost_type_6,pct_cost_type_7,pct_cost_type_8,pct_cost_type_9,...,pct_cost_type_53,pct_cost_type_54,pct_cost_type_55,pct_cost_type_56,pct_cost_type_57,pct_cost_type_58,pct_cost_type_59,pct_cost_type_60,pct_cost_type_61,total_transactions
0,34,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2
1,106,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,98
2,135,0.295775,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,71
3,144,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10
4,152,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2


In [22]:
# Attach the cost_type_ID share columns to the per-merchant mean cost_percent table.
# Left join on merchant_id: every merchant of the aggregated table is kept.

df_train_features = df_train_agg.merge(df_train_cost_features, on='merchant_id', how='left')
df_validate_features = df_validate_agg.merge(df_validate_cost_features, on='merchant_id', how='left')
df_test_features = df_test_agg.merge(df_test_cost_features, on='merchant_id', how='left')

df_test_features.info()

<class 'pandas.DataFrame'>
RangeIndex: 2554 entries, 0 to 2553
Data columns (total 64 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   merchant_id         2554 non-null   int64  
 1   cost_percent        2554 non-null   float64
 2   pct_cost_type_1     2554 non-null   float64
 3   pct_cost_type_2     2554 non-null   float64
 4   pct_cost_type_3     2554 non-null   float64
 5   pct_cost_type_4     2554 non-null   float64
 6   pct_cost_type_5     2554 non-null   float64
 7   pct_cost_type_6     2554 non-null   float64
 8   pct_cost_type_7     2554 non-null   float64
 9   pct_cost_type_8     2554 non-null   float64
 10  pct_cost_type_9     2554 non-null   float64
 11  pct_cost_type_10    2554 non-null   float64
 12  pct_cost_type_11    2554 non-null   float64
 13  pct_cost_type_12    2554 non-null   float64
 14  pct_cost_type_13    2554 non-null   float64
 15  pct_cost_type_14    2554 non-null   float64
 16  pct_cost_type_15 

In [23]:
# add stdev of cost_percent per merchant as a feature

def add_cost_percent_stdev(df, df_features):
    """
    Append the per-merchant standard deviation of cost_percent to a feature table.

    Parameters
    ----------
    df : transaction-level DataFrame with 'merchant_id' and 'cost_percent'.
    df_features : merchant-level DataFrame with a 'merchant_id' column.

    Returns
    -------
    A new DataFrame: ``df_features`` left-joined with a 'cost_percent_stdev'
    column. Merchants missing from ``df`` get NaN. pandas' default sample
    standard deviation (ddof=1) is used, so a merchant with a single
    transaction also gets NaN.

    The function is safe to call repeatedly on its own output: an existing
    'cost_percent_stdev' column is replaced rather than duplicated. Without
    this, re-running the calling cell would produce 'cost_percent_stdev_x' /
    'cost_percent_stdev_y' columns and break later column selections.
    """
    stdev_df = (
        df.groupby('merchant_id')['cost_percent']
        .std()
        .rename('cost_percent_stdev')
        .reset_index()
    )
    # Drop a stale result from a previous run so the merge cannot create suffixed duplicates
    base_features = df_features.drop(columns=['cost_percent_stdev'], errors='ignore')
    return pd.merge(base_features, stdev_df, on='merchant_id', how='left')

# Extend each split's merchant-level table with the stdev column computed from
# that split's own transactions.
df_train_features, df_validate_features, df_test_features = [
    add_cost_percent_stdev(_transactions, _features)
    for _transactions, _features in (
        (df_train, df_train_features),
        (df_validate, df_validate_features),
        (df_test, df_test_features),
    )
]

df_train_features.info()



<class 'pandas.DataFrame'>
RangeIndex: 2775 entries, 0 to 2774
Data columns (total 65 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   merchant_id         2775 non-null   int64  
 1   cost_percent        2775 non-null   float64
 2   pct_cost_type_1     2775 non-null   float64
 3   pct_cost_type_2     2775 non-null   float64
 4   pct_cost_type_3     2775 non-null   float64
 5   pct_cost_type_4     2775 non-null   float64
 6   pct_cost_type_5     2775 non-null   float64
 7   pct_cost_type_6     2775 non-null   float64
 8   pct_cost_type_7     2775 non-null   float64
 9   pct_cost_type_8     2775 non-null   float64
 10  pct_cost_type_9     2775 non-null   float64
 11  pct_cost_type_10    2775 non-null   float64
 12  pct_cost_type_11    2775 non-null   float64
 13  pct_cost_type_12    2775 non-null   float64
 14  pct_cost_type_13    2775 non-null   float64
 15  pct_cost_type_14    2775 non-null   float64
 16  pct_cost_type_15 

In [24]:
# Base metrics for clustering: keep only the merchant key plus the mean and
# spread of cost_percent.

df_train_base = df_train_features.loc[:, ['merchant_id', 'cost_percent', 'cost_percent_stdev']]
df_validate_base = df_validate_features.loc[:, ['merchant_id', 'cost_percent', 'cost_percent_stdev']]
df_test_base = df_test_features.loc[:, ['merchant_id', 'cost_percent', 'cost_percent_stdev']]

df_train_base.head()

Unnamed: 0,merchant_id,cost_percent,cost_percent_stdev
0,34,0.054254,0.006133
1,106,0.03156,0.014677
2,135,0.037166,0.014822
3,144,0.028724,0.006158
4,152,0.029105,0.000798


In [25]:
# apply z score normalization to the cost_percent and cost_percent_stdev features

def z_score_normalize(df, columns, reference=None):
    """
    Return a copy of ``df`` with the given columns standardized to z-scores.

    Parameters
    ----------
    df : DataFrame to normalize. It is never modified.
    columns : iterable of column names to standardize.
    reference : optional DataFrame that supplies the mean and standard
        deviation for each column. Defaults to ``df`` itself, which is the
        original behavior. Pass the training frame here to scale validate/test
        data with the training statistics so all splits share one scale.

    Returns
    -------
    DataFrame with each listed column replaced by ``(value - mean) / std``.
    If the standard deviation is 0 or NaN (no variation, or too few values),
    the whole column is set to 0.0. NaN entries in a normally scaled column
    stay NaN.
    """
    stats_source = df if reference is None else reference
    normalized = df.copy()  # Avoid modifying original dataframe
    for col in columns:
        mean = stats_source[col].mean()
        std = stats_source[col].std()
        if pd.isna(std) or std == 0:
            # No usable spread: every value maps to 0 (float, to match scaled columns)
            normalized[col] = 0.0
        else:
            normalized[col] = (df[col] - mean) / std
    return normalized

# Standardize the two base metrics in each split's feature table.
# NOTE(review): every split is scaled with its OWN mean/std here. If clusters
# fitted on train are later applied to validate/test, the training statistics
# should be reused instead - confirm the intended downstream use.
df_train_features_zscored, df_validate_features_zscored, df_test_features_zscored = [
    z_score_normalize(_features, ['cost_percent', 'cost_percent_stdev'])
    for _features in (df_train_features, df_validate_features, df_test_features)
]

# Base metrics for clustering, z-scored variant: merchant key + the two standardized columns

df_train_base_zscored = df_train_features_zscored.loc[:, ['merchant_id', 'cost_percent', 'cost_percent_stdev']]
df_validate_base_zscored = df_validate_features_zscored.loc[:, ['merchant_id', 'cost_percent', 'cost_percent_stdev']]
df_test_base_zscored = df_test_features_zscored.loc[:, ['merchant_id', 'cost_percent', 'cost_percent_stdev']]

df_train_base_zscored.head()

Unnamed: 0,merchant_id,cost_percent,cost_percent_stdev
0,34,0.012929,-0.269064
1,106,-0.137961,-0.192923
2,135,-0.100683,-0.191632
3,144,-0.156815,-0.26884
4,152,-0.154279,-0.316606


In [26]:
# Write the merchant-level base feature tables (raw and z-scored) for each
# split to CSV files in the current working directory, without the index.

_csv_outputs = {
    'df_train_base.csv': df_train_base,
    'df_validate_base.csv': df_validate_base,
    'df_test_base.csv': df_test_base,
    'df_train_base_zscored.csv': df_train_base_zscored,
    'df_validate_base_zscored.csv': df_validate_base_zscored,
    'df_test_base_zscored.csv': df_test_base_zscored,
}
for _csv_name, _frame in _csv_outputs.items():
    _frame.to_csv(_csv_name, index=False)

# Display the transaction-level train frame (pandas shows a truncated view)
df_train

Unnamed: 0,transaction_id,date,amount,use_chip,merchant_id,mcc,errors,card_brand,card_type,mcc_description,cost_type_ID,proc_cost,cost_percent
0,18881494,2017-01-01 00:07:00,2.54,Chip Transaction,75781,5411.0,,Mastercard,Debit,"Grocery Stores, Supermarkets",26.0,0.209972,0.082666
2,18881527,2017-01-01 00:42:00,25.13,Chip Transaction,83229,5411.0,,Mastercard,Debit,"Grocery Stores, Supermarkets",39.0,0.476534,0.018963
3,18881539,2017-01-01 00:51:00,1.95,Chip Transaction,93881,5411.0,,Visa,Prepaid,"Grocery Stores, Supermarkets",1.0,0.103735,0.053197
4,18881574,2017-01-01 01:31:00,3.99,Chip Transaction,75781,5411.0,,Mastercard,Debit,"Grocery Stores, Supermarkets",26.0,0.227082,0.056913
5,18881577,2017-01-01 01:33:00,9.61,Chip Transaction,75781,5411.0,,Mastercard,Debit,"Grocery Stores, Supermarkets",38.0,0.293398,0.030530
...,...,...,...,...,...,...,...,...,...,...,...,...,...
167318,20604727,2017-12-31 22:33:00,7.92,Chip Transaction,510,5411.0,,Mastercard,Credit,"Grocery Stores, Supermarkets",36.0,0.267016,0.033714
167319,20604739,2017-12-31 22:36:00,13.99,Chip Transaction,8702,5411.0,,Mastercard,Debit,"Grocery Stores, Supermarkets",38.0,0.345082,0.024666
167320,20604757,2017-12-31 22:45:00,0.12,Chip Transaction,44886,5411.0,,Visa,Prepaid,"Grocery Stores, Supermarkets",1.0,0.072076,0.600633
167321,20604773,2017-12-31 22:52:00,56.46,Swipe Transaction,5408,5411.0,,Mastercard,Debit,"Grocery Stores, Supermarkets",39.0,0.846228,0.014988
