In [None]:
import pandas as pd
import datasets
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
from sklearn.preprocessing import LabelEncoder
import warnings
import json
import os
import io
from datasets import load_dataset

In [None]:
# Fetch the transactions dataset from the Hugging Face Hub.
dataset_name = "thiru1711/Financial_Transactions"
ds = load_dataset(dataset_name)

# List every column of the train split with its declared type. Features that
# expose no `dtype` attribute are shown through their string form instead.
for feature_name, feature_type in ds['train'].features.items():
    if hasattr(feature_type, 'dtype'):
        type_label = feature_type.dtype
    else:
        type_label = str(feature_type)
    print(f"Column: {feature_name}, Data Type: {type_label}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


master_transactions.parquet:   0%|          | 0.00/1.02G [00:00<?, ?B/s]

Generating train split:   0%|          | 0/13305915 [00:00<?, ? examples/s]

Column: transaction_id, Data Type: string
Column: date, Data Type: timestamp[ns]
Column: card_id, Data Type: string
Column: amount, Data Type: float32
Column: use_chip, Data Type: string
Column: merchant_id, Data Type: int64
Column: merchant_city, Data Type: string
Column: merchant_state, Data Type: string
Column: zip, Data Type: float64
Column: mcc, Data Type: string
Column: errors, Data Type: string
Column: is_fraud, Data Type: int64
Column: card_brand, Data Type: string
Column: card_type, Data Type: string
Column: card_number, Data Type: int64
Column: expires, Data Type: string
Column: cvv, Data Type: int16
Column: has_chip, Data Type: string
Column: num_cards_issued, Data Type: int64
Column: credit_limit, Data Type: float32
Column: acct_open_date, Data Type: string
Column: year_pin_last_changed, Data Type: int64
Column: card_on_dark_web, Data Type: string
Column: current_age, Data Type: int64
Column: retirement_age, Data Type: int64
Column: birth_year, Data Type: int64
Column: birt

In [None]:
# Convert the train split into a pandas DataFrame; every later cell works on `df`.
df = ds['train'].to_pandas()

# Alternative: read the parquet file directly from the Hub instead of going through `ds`.
# df = pd.read_parquet("hf://datasets/thiru1711/Financial_Transactions/master_transactions.parquet")

In [None]:
# Columns the costing analysis does not use, grouped by the reason they are removed.
drop_cols = (
    # PII / security fields
    ['card_number', 'cvv', 'expires', 'address']
    # Cardholder demographics and financial profile
    + ['current_age', 'retirement_age', 'birth_year', 'birth_month', 'gender',
       'latitude', 'longitude', 'per_capita_income', 'yearly_income',
       'total_debt', 'credit_score', 'num_credit_cards', 'credit_limit']
    # Account metadata
    + ['card_id', 'acct_open_date', 'year_pin_last_changed',
       'card_on_dark_web', 'num_cards_issued']
    # Merchant location
    + ['merchant_state', 'zip', 'merchant_city']
    # Card capability flag (not a geographical field).
    # NOTE(review): the column listings and tables saved in this notebook's
    # outputs still show `has_chip`, so those outputs appear to predate this
    # entry -- confirm the column is really meant to be dropped.
    + ['has_chip']
)

# Raises KeyError if any listed column is missing, e.g. when the cell is re-run
# on a frame that has already been trimmed.
df = df.drop(columns=drop_cols)


In [None]:
# Discard the rows flagged as fraud (is_fraud == 1); everything else is kept.
df = df.loc[df['is_fraud'].ne(1)]

In [None]:
# Clean the transaction frame in one chain so that no column is assigned onto a
# previously filtered frame (which is what triggers SettingWithCopyWarning):
#   1. parse `mcc` as a number (unparseable codes become NaN) and relabel the
#      "Debit (Prepaid)" card type as "Prepaid";
#   2. drop the rows whose `mcc` could not be parsed;
#   3. keep only non-fraud rows, then drop the `is_fraud` column itself, since
#      it is constant afterwards.
# The cell needs `is_fraud` to be present, so it cannot be re-run on its own output.
df = (
    df
    .assign(
        mcc=lambda frame: pd.to_numeric(frame['mcc'], errors='coerce'),
        card_type=lambda frame: frame['card_type'].replace('Debit (Prepaid)', 'Prepaid'),
    )
    .dropna(subset=['mcc'])
    .loc[lambda frame: frame['is_fraud'] != 1]
    .drop(columns='is_fraud')
)

# Report what is left, one line per column with its pandas dtype.
print("Remaining columns")
for feature_name, feature_type in df.dtypes.items():
    print(f"Column: {feature_name}, Data Type: {feature_type}")

Remaining columns
Column: transaction_id, Data Type: object
Column: date, Data Type: datetime64[ns]
Column: amount, Data Type: float32
Column: merchant_id, Data Type: int64
Column: mcc, Data Type: int64
Column: errors, Data Type: object
Column: card_brand, Data Type: object
Column: card_type, Data Type: object
Column: has_chip, Data Type: object
Column: mcc_description, Data Type: object


In [None]:
# Load the merchant list used later to restrict `df` to the training set.
# NOTE(review): absolute Colab path -- the file has to exist under /content
# before this cell runs.
train_merchant_ids = pd.read_csv('/content/4121_train.csv')
train_merchant_ids

In [None]:
# Label the merchant list's only column `merchant_id` so it lines up with the
# column of the same name in `df`. Raises ValueError if the file had more than
# one column.
train_merchant_ids = train_merchant_ids.set_axis(['merchant_id'], axis=1)
train_merchant_ids

Unnamed: 0,merchant_id
0,5248
1,14528
2,43293
3,59935
4,60600
5,86438


In [None]:
# Normalise the prepaid label: 'Debit (Prepaid)' is reported simply as 'Prepaid'.
df['card_type'] = df['card_type'].replace({'Debit (Prepaid)': 'Prepaid'})
df

Unnamed: 0,transaction_id,date,amount,merchant_id,mcc,errors,card_brand,card_type,has_chip,mcc_description
0,7475327,2010-01-01 00:01:00,-77.000000,59935,5499,,Mastercard,Prepaid,YES,Miscellaneous Food Stores
1,7475328,2010-01-01 00:02:00,14.570000,67570,5311,,Mastercard,Credit,YES,Department Stores
2,7475329,2010-01-01 00:02:00,80.000000,27092,4829,,Mastercard,Debit,YES,Money Transfer
3,7475331,2010-01-01 00:05:00,200.000000,27092,4829,,Mastercard,Debit,NO,Money Transfer
4,7475332,2010-01-01 00:06:00,46.410000,13051,5813,,Visa,Debit,YES,Drinking Places (Alcoholic Beverages)
...,...,...,...,...,...,...,...,...,...,...
13305910,23761868,2019-10-31 23:56:00,1.110000,86438,5499,,Mastercard,Debit,YES,Miscellaneous Food Stores
13305911,23761869,2019-10-31 23:56:00,12.800000,39261,5815,,Mastercard,Debit,YES,"Digital Goods - Media, Books, Apps"
13305912,23761870,2019-10-31 23:57:00,40.439999,2925,4900,,Mastercard,Debit,YES,"Utilities - Electric, Gas, Water, Sanitary"
13305913,23761873,2019-10-31 23:58:00,4.000000,46284,5411,,Visa,Debit,YES,"Grocery Stores, Supermarkets"


In [None]:
# Compare the merchant_id columns of both frames before filtering on them.
df_merchant_ids = df['merchant_id']
train_ids = train_merchant_ids['merchant_id']

print(f"df merchant_id sample: {df_merchant_ids.head()}")
print(f"df merchant_id dtype: {df_merchant_ids.dtype}")
print(f"\ntrain_merchant_ids sample: {train_ids.head()}")
print(f"train_merchant_ids dtype: {train_ids.dtype}")

# One membership mask answers both questions: is there any overlap, and how
# many transactions belong to a listed merchant.
in_train_set = df_merchant_ids.isin(train_ids)
print(f"\nAny overlap? {in_train_set.any()}")
print(f"Number of matches: {in_train_set.sum()}")


df merchant_id sample: 0    59935
1    67570
2    27092
3    27092
4    13051
Name: merchant_id, dtype: int64
df merchant_id dtype: int64

train_merchant_ids sample: 0     5248
1    14528
2    43293
3    59935
4    60600
Name: merchant_id, dtype: int64
train_merchant_ids dtype: int64

Any overlap? True
Number of matches: 1460744


In [None]:
# Keep only the transactions whose merchant appears in the training list.
# The original row index is retained (no reset happens here).
# NOTE(review): the tables saved further down show a fresh 0..N index, so a
# reset_index was presumably run in a cell that is no longer in the notebook.
is_train_merchant = df['merchant_id'].isin(train_merchant_ids['merchant_id'])
df = df.loc[is_train_merchant]
df

Unnamed: 0,transaction_id,date,amount,merchant_id,mcc,errors,card_brand,card_type,has_chip,mcc_description
0,7475327,2010-01-01 00:01:00,-77.000000,59935,5499,,Mastercard,Prepaid,YES,Miscellaneous Food Stores
6,7475334,2010-01-01 00:09:00,77.000000,59935,5499,,Mastercard,Prepaid,YES,Miscellaneous Food Stores
12,7475340,2010-01-01 00:26:00,39.630001,59935,5499,,Mastercard,Prepaid,YES,Miscellaneous Food Stores
36,7475366,2010-01-01 01:03:00,78.000000,43293,5499,,Mastercard,Credit,YES,Miscellaneous Food Stores
47,7475379,2010-01-01 01:22:00,1.850000,14528,5499,,Visa,Debit,YES,Miscellaneous Food Stores
...,...,...,...,...,...,...,...,...,...,...
13305845,23761786,2019-10-31 22:48:00,0.760000,86438,5499,,Mastercard,Debit,YES,Miscellaneous Food Stores
13305865,23761812,2019-10-31 22:58:00,2.220000,86438,5499,,Mastercard,Debit,NO,Miscellaneous Food Stores
13305900,23761857,2019-10-31 23:46:00,1.540000,14528,5499,,Visa,Debit,YES,Miscellaneous Food Stores
13305901,23761858,2019-10-31 23:46:00,0.970000,43293,5499,,Visa,Debit,YES,Miscellaneous Food Stores


Transaction Costing Engine

In [None]:
# Load the fee-rule table; each row is one cost_type_ID with its card network,
# card type, amount band, optional mcc and fee columns (see the output below).
# NOTE(review): absolute Colab path -- the file has to exist under /content
# before this cell runs.
cost_type_df = pd.read_csv('/content/cost_type_id.csv')
cost_type_df

Unnamed: 0,cost_type_ID,card_network,card_type,fee_program,min_transaction_amt,max_transaction_amt,mcc,card_fee_percent,card_fee_dollars,network_fee_percent,network_fee_dollars,subtotal_fee_percent,subtotal_fee_dollars
0,1,Visa,Prepaid,Small Ticket Fee Program (All),0.000,5.000000e+00,,1.60%,$0.05,0.13%,$0.02,1.73%,$0.07
1,2,Visa,Debit,Small Ticket Fee Program (All),0.000,5.000000e+00,,0.05%,$0.21,0.13%,$0.02,0.18%,$0.23
2,3,Visa,Credit,Small Ticket Fee Program (All),0.000,1.818000e+00,,0.00%,$0.04,0.14%,$0.02,0.14%,$0.06
3,4,Visa,Credit,Small Ticket Fee Program (All),1.818,5.000000e+00,,2.20%,$0.00,0.14%,$0.02,2.34%,$0.02
4,5,Visa,Super Premium Credit,Small Ticket Fee Program (All),0.000,1.818000e+00,,0.00%,$0.04,0.14%,$0.02,0.14%,$0.06
...,...,...,...,...,...,...,...,...,...,...,...,...,...
56,57,Mastercard,Prepaid,Industry Fee Program (All),1000.000,1.000000e+08,5411.0,1.05%,$0.15,0.14%,$0.03,1.19%,$0.18
57,58,Mastercard,Credit,Industry Fee Program (All),1000.000,1.000000e+08,5812.0,1.80%,$0.10,0.14%,$0.03,1.94%,$0.13
58,59,Mastercard,Super Premium Credit,Industry Fee Program (All),1000.000,1.000000e+08,5812.0,2.00%,$0.10,0.14%,$0.03,2.14%,$0.13
59,60,Mastercard,Debit,Industry Fee Program (All),1000.000,1.000000e+08,5812.0,1.19%,$0.10,0.14%,$0.03,1.33%,$0.13


In [None]:
# Amounts up to this value are costed with the general rules (rules that have no
# mcc); anything above it is costed with the mcc-specific rules.
SMALL_TICKET_MAX_AMOUNT = 5


def _match_cost_rules(transactions, rules, extra_keys=()):
    """Join transactions to cost rules and keep one in-band rule per transaction.

    Parameters
    ----------
    transactions : pandas.DataFrame
        Must contain `transaction_id`, `card_brand`, `card_type`, `amount` and
        every column named in `extra_keys`.
    rules : pandas.DataFrame
        Rows of `cost_type_df`; must contain `card_network`, `card_type`,
        `min_transaction_amt`, `max_transaction_amt`, `cost_type_ID` and every
        column named in `extra_keys`.
    extra_keys : sequence of str
        Additional columns, present in both frames, to join on.

    Returns
    -------
    pandas.DataFrame
        The merged rows whose amount lies inside the rule's inclusive
        [min_transaction_amt, max_transaction_amt] band, at most one row per
        `transaction_id`. Columns that exist in both frames keep the
        transaction's name; the rule's copy gets the `_cost` suffix.
    """
    merged = transactions.merge(
        rules,
        left_on=['card_brand', 'card_type', *extra_keys],
        right_on=['card_network', 'card_type', *extra_keys],
        how='left',
        suffixes=('', '_cost'),
    )
    in_band = (
        (merged['amount'] >= merged['min_transaction_amt'])
        & (merged['amount'] <= merged['max_transaction_amt'])
        & merged['cost_type_ID'].notna()
    )
    # Both band edges are inclusive, so an amount that sits exactly on a shared
    # boundary matches two rules. Sorting on cost_type_ID as well makes the
    # choice deterministic: the lowest cost_type_ID wins.
    return (
        merged[in_band]
        .sort_values(['transaction_id', 'cost_type_ID'])
        .drop_duplicates('transaction_id', keep='first')
    )


# Use float mcc on both sides so the merge keys compare equal (the rule table
# holds NaN for "any mcc", which forces float there).
df = df.assign(mcc=df['mcc'].astype(float))
cost_type_df['mcc'] = pd.to_numeric(cost_type_df['mcc'], errors='coerce')

# Only these card brands are looked up in the rule table.
df_filtered = df[df['card_brand'].isin(['Visa', 'Mastercard'])].copy()

# Rules without an mcc apply to any merchant category; the rest are mcc-specific.
cost_general = cost_type_df[cost_type_df['mcc'].isna()].copy()
cost_specific = cost_type_df[cost_type_df['mcc'].notna()].copy()

# Small tickets: general rules, matched on card brand and card type.
df_small = _match_cost_rules(
    df_filtered[df_filtered['amount'] <= SMALL_TICKET_MAX_AMOUNT],
    cost_general,
)

# Larger tickets: mcc-specific rules only. A transaction whose mcc has no rule
# stays unmatched and is reported under "Other" below.
df_large = _match_cost_rules(
    df_filtered[df_filtered['amount'] > SMALL_TICKET_MAX_AMOUNT],
    cost_specific,
    extra_keys=('mcc',),
)

# transaction_id -> cost_type_ID for every transaction that found a rule.
result_map = pd.concat([
    df_small[['transaction_id', 'cost_type_ID']],
    df_large[['transaction_id', 'cost_type_ID']]
]).set_index('transaction_id')['cost_type_ID']

# Replace any cost_type_ID left over from an earlier run of this cell, then map
# the fresh result; transactions without a rule get NaN.
df = (
    df
    .drop(columns='cost_type_ID', errors='ignore')
    .assign(cost_type_ID=lambda frame: frame['transaction_id'].map(result_map))
)

# Summary. The breakdown only looks at unmatched rows and puts each row in
# exactly one bucket, so the buckets add up to the unmatched total and "Other"
# can never be negative.
is_unmatched = df['cost_type_ID'].isna()
is_negative = df['amount'] < 0
unmatched_negative = (is_unmatched & is_negative).sum()
unmatched_amex = (is_unmatched & ~is_negative & (df['card_brand'] == 'Amex')).sum()
unmatched_discover = (is_unmatched & ~is_negative & (df['card_brand'] == 'Discover')).sum()
unmatched_other = is_unmatched.sum() - unmatched_negative - unmatched_amex - unmatched_discover

print(f"Total transactions: {len(df)}")
print(f"Rows with cost_type_ID: {df['cost_type_ID'].notna().sum()}")
print(f"Rows without match: {is_unmatched.sum()}")
print("\nBreakdown of unmatched (each row counted once):")
print(f"  - Negative amounts: {unmatched_negative}")
print(f"  - Amex cards (non-negative amount): {unmatched_amex}")
print(f"  - Discover cards (non-negative amount): {unmatched_discover}")
print(f"  - Other: {unmatched_other}")


Total transactions: 1460744
Rows with cost_type_ID: 1078095
Rows without match: 382649

Breakdown of unmatched:
  - Negative amounts: 290527
  - Amex cards: 84912
  - Discover cards: 33184
  - Other: -25974


In [None]:
# Show the frame again; it now carries the cost_type_ID column (NaN where no rule matched).
df

Unnamed: 0,transaction_id,date,amount,merchant_id,mcc,errors,card_brand,card_type,has_chip,mcc_description,cost_type_ID
0,7475327,2010-01-01 00:01:00,-77.000000,59935,5499.0,,Mastercard,Prepaid,YES,Miscellaneous Food Stores,
1,7475334,2010-01-01 00:09:00,77.000000,59935,5499.0,,Mastercard,Prepaid,YES,Miscellaneous Food Stores,31.0
2,7475340,2010-01-01 00:26:00,39.630001,59935,5499.0,,Mastercard,Prepaid,YES,Miscellaneous Food Stores,31.0
3,7475366,2010-01-01 01:03:00,78.000000,43293,5499.0,,Mastercard,Credit,YES,Miscellaneous Food Stores,28.0
4,7475379,2010-01-01 01:22:00,1.850000,14528,5499.0,,Visa,Debit,YES,Miscellaneous Food Stores,2.0
...,...,...,...,...,...,...,...,...,...,...,...
1460739,23761786,2019-10-31 22:48:00,0.760000,86438,5499.0,,Mastercard,Debit,YES,Miscellaneous Food Stores,26.0
1460740,23761812,2019-10-31 22:58:00,2.220000,86438,5499.0,,Mastercard,Debit,NO,Miscellaneous Food Stores,26.0
1460741,23761857,2019-10-31 23:46:00,1.540000,14528,5499.0,,Visa,Debit,YES,Miscellaneous Food Stores,2.0
1460742,23761858,2019-10-31 23:46:00,0.970000,43293,5499.0,,Visa,Debit,YES,Miscellaneous Food Stores,2.0




In [None]:
# Parse the fee strings into numbers and keep them as extra columns on the rule
# table: "1.73%" becomes 0.0173 and "$0.07" becomes 0.07.
percent_text = cost_type_df['subtotal_fee_percent'].str.rstrip('%')
cost_type_df['subtotal_fee_percent_clean'] = percent_text.astype(float) / 100
dollars_text = cost_type_df['subtotal_fee_dollars'].str.lstrip('$')
cost_type_df['subtotal_fee_dollars_clean'] = dollars_text.astype(float)

# cost_type_ID -> fee lookups, one for the flat part and one for the percentage part.
rule_ids = cost_type_df['cost_type_ID']
fee_dollars_map = dict(zip(rule_ids, cost_type_df['subtotal_fee_dollars_clean']))
fee_percent_map = dict(zip(rule_ids, cost_type_df['subtotal_fee_percent_clean']))

# Processing cost = flat fee + percentage fee * amount. Rows without a
# cost_type_ID find no entry in either lookup and end up with NaN.
flat_fee = df['cost_type_ID'].map(fee_dollars_map)
percent_fee = df['cost_type_ID'].map(fee_percent_map) * df['amount']
df['proc_cost'] = flat_fee + percent_fee

costed_rows = df['proc_cost'].notna().sum()
print(f"✓ proc_cost calculated for {costed_rows:,} transactions")


✓ proc_cost calculated for 1,078,095 transactions


In [None]:
# Write the first 25 costed rows, without the index, as a small preview file in
# the current working directory.
# NOTE(review): the file name has no ".csv" extension -- confirm that is intended.
df.head(25).to_csv('preview_transactions_costed', index=False)