In [1]:
import pandas as pd
import datasets
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
from sklearn.preprocessing import LabelEncoder
import warnings
import json
import os
import io
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the dataset identified by `dataset_name` through datasets.load_dataset.
dataset_name = "thiru1711/Financial_Transactions"
ds = load_dataset(dataset_name)

# List every column of the 'train' split together with its declared type.
# Features that expose a `dtype` attribute report it; anything else is
# rendered with str().
train_features = ds['train'].features
for feature_name in train_features:
    feature_type = train_features[feature_name]
    shown_type = feature_type.dtype if hasattr(feature_type, 'dtype') else str(feature_type)
    print(f"Column: {feature_name}, Data Type: {shown_type}")


Column: transaction_id, Data Type: string
Column: date, Data Type: timestamp[ns]
Column: card_id, Data Type: string
Column: amount, Data Type: float32
Column: use_chip, Data Type: string
Column: merchant_id, Data Type: int64
Column: merchant_city, Data Type: string
Column: merchant_state, Data Type: string
Column: zip, Data Type: float64
Column: mcc, Data Type: string
Column: errors, Data Type: string
Column: is_fraud, Data Type: int64
Column: card_brand, Data Type: string
Column: card_type, Data Type: string
Column: card_number, Data Type: int64
Column: expires, Data Type: string
Column: cvv, Data Type: int16
Column: has_chip, Data Type: string
Column: num_cards_issued, Data Type: int64
Column: credit_limit, Data Type: float32
Column: acct_open_date, Data Type: string
Column: year_pin_last_changed, Data Type: int64
Column: card_on_dark_web, Data Type: string
Column: current_age, Data Type: int64
Column: retirement_age, Data Type: int64
Column: birth_year, Data Type: int64
Column: birt

In [3]:
# Materialise the 'train' split as a pandas DataFrame.
df = ds['train'].to_pandas()

# Columns that are not needed downstream, grouped by the reason for removal.
pii_security_cols = ['card_number', 'cvv', 'expires', 'address']

cardholder_demographic_cols = [
    'current_age', 'retirement_age', 'birth_year', 'birth_month', 'gender',
    'latitude', 'longitude', 'per_capita_income', 'yearly_income',
    'total_debt', 'credit_score', 'num_credit_cards', 'credit_limit',
]

account_metadata_cols = [
    'card_id', 'acct_open_date', 'year_pin_last_changed',
    'card_on_dark_web', 'num_cards_issued',
]

# Merchant location fields, plus the card's 'has_chip' flag.
merchant_geo_and_chip_cols = ['merchant_state', 'zip', 'merchant_city', 'has_chip']

drop_cols = (
    pii_security_cols
    + cardholder_demographic_cols
    + account_metadata_cols
    + merchant_geo_and_chip_cols
)

df = df.drop(columns=drop_cols)


In [4]:
# Parse 'mcc' as a number; values that cannot be parsed become NaN and the
# rows carrying them are discarded. `assign` builds a new frame instead of
# writing a column into the existing one.
df = df.assign(mcc=pd.to_numeric(df['mcc'], errors='coerce')).dropna(subset=['mcc'])

# Remove transactions flagged as fraud (is_fraud == 1), then drop the
# 'is_fraud' column itself. The guard makes the cell safe to re-run: on a
# second execution the column is already gone, and indexing it would raise
# KeyError.
if 'is_fraud' in df.columns:
    df = df[df['is_fraud'] != 1].drop(columns='is_fraud')

# Rename "Debit (Prepaid)" to "Prepaid" in the card_type column.
df = df.assign(card_type=df['card_type'].replace('Debit (Prepaid)', 'Prepaid'))

# Report the columns that survive the cleaning steps, with their dtypes.
print("Remaining columns")
for feature_name, feature_type in df.dtypes.items():
    print(f"Column: {feature_name}, Data Type: {feature_type}")

Remaining columns
Column: transaction_id, Data Type: str
Column: date, Data Type: datetime64[ns]
Column: amount, Data Type: float32
Column: use_chip, Data Type: str
Column: merchant_id, Data Type: int64
Column: mcc, Data Type: int64
Column: errors, Data Type: str
Column: card_brand, Data Type: str
Column: card_type, Data Type: str
Column: mcc_description, Data Type: str


In [6]:
# Keep only MCC 5411 (Grocery Stores) transactions, make sure 'date' is a
# datetime column, and derive a helper 'year' column from it. `assign`
# returns a new frame, so `df` itself is left untouched.
is_mcc_5411 = df['mcc'] == 5411
df_5411 = df.loc[is_mcc_5411].assign(
    date=lambda frame: pd.to_datetime(frame['date']),
    year=lambda frame: frame['date'].dt.year,
)

transactions_per_year = df_5411['year'].value_counts().sort_index()
print(f"Total transactions for MCC 5411: {len(df_5411)}")
print(f"Year distribution:\n{transactions_per_year}")


Total transactions for MCC 5411: 1592159
Year distribution:
year
2010    147876
2011    154463
2012    158898
2013    162232
2014    163821
2015    165844
2016    166013
2017    167323
2018    166883
2019    138806
Name: count, dtype: int64


In [8]:
# For every merchant, collect the set of years in which it has transactions.
merchant_years = (
    df_5411.groupby('merchant_id')['year']
    .apply(set)
    .reset_index(name='years_set')
)


def _merchants_seen_only_in(year):
    """Return the rows of `merchant_years` whose year set is exactly {year}."""
    return merchant_years[merchant_years['years_set'] == {year}]


# Merchants whose transactions all fall in a single year: 2017 feeds the
# train list, 2018 the validate list, and 2019 the test list.
merchants_only_2017 = _merchants_seen_only_in(2017)
merchants_only_2018 = _merchants_seen_only_in(2018)
merchants_only_2019 = _merchants_seen_only_in(2019)

train_merchant_ids = merchants_only_2017['merchant_id'].tolist()
validate_merchant_ids = merchants_only_2018['merchant_id'].tolist()
test_merchant_ids = merchants_only_2019['merchant_id'].tolist()

print(f"Total merchants in MCC 5411: {len(merchant_years)}")
for only_year, merchant_ids in (
    (2017, train_merchant_ids),
    (2018, validate_merchant_ids),
    (2019, test_merchant_ids),
):
    print(f"Merchants appearing ONLY in {only_year}: {len(merchant_ids)}")


Total merchants in MCC 5411: 8158
Merchants appearing ONLY in 2017: 327
Merchants appearing ONLY in 2018: 362
Merchants appearing ONLY in 2019: 300


In [9]:
# Build one split per merchant ID list: each split holds every MCC 5411
# transaction belonging to the merchants in that list.
in_train = df_5411['merchant_id'].isin(train_merchant_ids)        # 2017-only merchants
in_validate = df_5411['merchant_id'].isin(validate_merchant_ids)  # 2018-only merchants
in_test = df_5411['merchant_id'].isin(test_merchant_ids)          # 2019-only merchants

df_train = df_5411.loc[in_train].copy()
df_validate = df_5411.loc[in_validate].copy()
df_test = df_5411.loc[in_test].copy()

# Summarise the size of each split and the combined total.
for split_label, split_frame in (
    ("Train set (2017 only merchants)", df_train),
    ("Validate set (2018 only merchants)", df_validate),
    ("Test set (2019 only merchants)", df_test),
):
    print(f"{split_label}: {len(split_frame)} transactions, {split_frame['merchant_id'].nunique()} unique merchants")

total_transactions = len(df_train) + len(df_validate) + len(df_test)
print(f"\nTotal: {total_transactions} transactions")


Train set (2017 only merchants): 708 transactions, 327 unique merchants
Validate set (2018 only merchants): 801 transactions, 362 unique merchants
Test set (2019 only merchants): 653 transactions, 300 unique merchants

Total: 2162 transactions


In [10]:
# Drop the temporary 'year' helper column from every split.
# errors='ignore' keeps the cell re-runnable: once the column has been
# removed, running the cell again is a no-op instead of raising KeyError.
df_train = df_train.drop(columns=['year'], errors='ignore')
df_validate = df_validate.drop(columns=['year'], errors='ignore')
df_test = df_test.drop(columns=['year'], errors='ignore')

print("Year column removed from all splits")
print(f"Train columns: {df_train.columns.tolist()}")


Year column removed from all splits
Train columns: ['transaction_id', 'date', 'amount', 'use_chip', 'merchant_id', 'mcc', 'errors', 'card_brand', 'card_type', 'mcc_description']


In [11]:
# Make sure the target directory exists (no error if it is already there).
output_dir = '5411_splits'
os.makedirs(output_dir, exist_ok=True)

# Write each split to its own CSV file, without the DataFrame index.
frames_by_split = {'train': df_train, 'validate': df_validate, 'test': df_test}
for split_name, split_frame in frames_by_split.items():
    split_frame.to_csv(f'{output_dir}/5411_{split_name}.csv', index=False)

# Report what was written.
print(f"Datasets saved to {output_dir}/ directory")
for split_name, split_frame in frames_by_split.items():
    print(f"- 5411_{split_name}.csv: {len(split_frame)} rows")


Datasets saved to 5411_splits/ directory
- 5411_train.csv: 708 rows
- 5411_validate.csv: 801 rows
- 5411_test.csv: 653 rows


In [12]:
# Verification: check that no merchant appears in more than one split.
train_merchants = set(df_train['merchant_id'].unique())
validate_merchants = set(df_validate['merchant_id'].unique())
test_merchants = set(df_test['merchant_id'].unique())

print(f"Train merchants: {len(train_merchants)}")
print(f"Validate merchants: {len(validate_merchants)}")
print(f"Test merchants: {len(test_merchants)}")

# Merchants shared by each pair of splits.
overlap_train_validate = train_merchants & validate_merchants
overlap_train_test = train_merchants & test_merchants
overlap_validate_test = validate_merchants & test_merchants

print(f"\nOverlap between train and validate: {len(overlap_train_validate)}")
print(f"Overlap between train and test: {len(overlap_train_test)}")
print(f"Overlap between validate and test: {len(overlap_validate_test)}")

# The splits are separate only if EVERY pairwise overlap is empty. The
# three-way intersection is not a valid check: it is empty whenever a shared
# merchant is missing from just one split, so it would report True even if
# two splits had merchants in common.
splits_are_disjoint = not (
    overlap_train_validate or overlap_train_test or overlap_validate_test
)
print(f"\nSets are completely separate (no overlap): {splits_are_disjoint}")


Train merchants: 327
Validate merchants: 362
Test merchants: 300

Overlap between train and validate: 0
Overlap between train and test: 0
Overlap between validate and test: 0

Sets are completely separate (no overlap): True


In [13]:
# Select, independently for each merchant ID list, every MCC 5411 transaction
# of the listed merchants. These frames come straight from df_5411 and so
# still contain the 'year' column used in the summaries below.
df_train_all, df_validate_all, df_test_all = (
    df_5411.loc[df_5411['merchant_id'].isin(merchant_ids)].copy()
    for merchant_ids in (train_merchant_ids, validate_merchant_ids, test_merchant_ids)
)

# Per split: transaction count, merchant count, and transactions per year.
for heading, split_frame in (
    ("Train merchants (5411 only)", df_train_all),
    ("\nValidate merchants (5411 only)", df_validate_all),
    ("\nTest merchants (5411 only)", df_test_all),
):
    print(f"{heading}: {len(split_frame)} transactions, {split_frame['merchant_id'].nunique()} unique merchants")
    print(f"  Year distribution: {split_frame['year'].value_counts().sort_index().to_dict()}")


Train merchants (5411 only): 708 transactions, 327 unique merchants
  Year distribution: {2017: 708}

Validate merchants (5411 only): 801 transactions, 362 unique merchants
  Year distribution: {2018: 801}

Test merchants (5411 only): 653 transactions, 300 unique merchants
  Year distribution: {2019: 653}


In [14]:
# Preview the first five rows of the train merchants' transactions.
df_train_all.head(n=5)

Unnamed: 0,transaction_id,date,amount,use_chip,merchant_id,mcc,errors,card_brand,card_type,mcc_description,year
9352406,18882165,2017-01-01 07:24:00,0.24,Chip Transaction,13456,5411,,Discover,Credit,"Grocery Stores, Supermarkets",2017
9352456,18882220,2017-01-01 07:34:00,0.18,Chip Transaction,13456,5411,,Discover,Credit,"Grocery Stores, Supermarkets",2017
9364365,18896821,2017-01-04 09:05:00,9.33,Chip Transaction,53657,5411,,Mastercard,Debit,"Grocery Stores, Supermarkets",2017
9371915,18906066,2017-01-06 07:40:00,5.96,Chip Transaction,83089,5411,,Visa,Debit,"Grocery Stores, Supermarkets",2017
9378085,18913687,2017-01-07 16:08:00,95.900002,Chip Transaction,7547,5411,,Mastercard,Debit,"Grocery Stores, Supermarkets",2017


In [15]:
# Plain temporal split: assign MCC 5411 transactions to a split purely by
# calendar year, so a merchant may show up in more than one split.
txn_year = df_5411['year']
df_train_temporal = df_5411.loc[txn_year.eq(2017)].copy()     # train: 2017
df_validate_temporal = df_5411.loc[txn_year.eq(2018)].copy()  # validate: 2018
df_test_temporal = df_5411.loc[txn_year.eq(2019)].copy()      # test: 2019

print("Temporal Split (by year):")
for split_label, split_frame in (
    ("Train (2017)", df_train_temporal),
    ("Validate (2018)", df_validate_temporal),
    ("Test (2019)", df_test_temporal),
):
    print(f"{split_label}: {len(split_frame)} transactions, {split_frame['merchant_id'].nunique()} unique merchants")

temporal_total = len(df_train_temporal) + len(df_validate_temporal) + len(df_test_temporal)
print(f"\nTotal: {temporal_total} transactions")


Temporal Split (by year):
Train (2017): 167323 transactions, 2848 unique merchants
Validate (2018): 166883 transactions, 2828 unique merchants
Test (2019): 138806 transactions, 2626 unique merchants

Total: 473012 transactions


In [16]:
# Measure how many merchants the temporal splits have in common. Overlap is
# expected here because the temporal split is by year, not by merchant.
train_merchants_temporal = set(df_train_temporal['merchant_id'].unique())
validate_merchants_temporal = set(df_validate_temporal['merchant_id'].unique())
test_merchants_temporal = set(df_test_temporal['merchant_id'].unique())

shared_train_validate = train_merchants_temporal.intersection(validate_merchants_temporal)
shared_train_test = train_merchants_temporal.intersection(test_merchants_temporal)
shared_validate_test = validate_merchants_temporal.intersection(test_merchants_temporal)
shared_by_all = shared_train_validate.intersection(test_merchants_temporal)

print("Merchant overlap analysis (temporal split):")
print(f"Merchants in train & validate: {len(shared_train_validate)}")
print(f"Merchants in train & test: {len(shared_train_test)}")
print(f"Merchants in validate & test: {len(shared_validate_test)}")
print(f"Merchants in all three sets: {len(shared_by_all)}")


Merchant overlap analysis (temporal split):
Merchants in train & validate: 1695
Merchants in train & test: 1595
Merchants in validate & test: 1619
Merchants in all three sets: 1372


In [17]:
# Drop the helper 'year' column before saving.
# errors='ignore' keeps the cell re-runnable: once the column has been
# removed, a second execution no longer raises KeyError.
df_train_temporal = df_train_temporal.drop(columns=['year'], errors='ignore')
df_validate_temporal = df_validate_temporal.drop(columns=['year'], errors='ignore')
df_test_temporal = df_test_temporal.drop(columns=['year'], errors='ignore')

# Create the output directory for temporal splits (no error if it exists).
output_dir_temporal = '5411_temporal_splits'
os.makedirs(output_dir_temporal, exist_ok=True)

# Save the temporal splits to CSV, without the DataFrame index.
df_train_temporal.to_csv(f'{output_dir_temporal}/5411_train_temporal.csv', index=False)
df_validate_temporal.to_csv(f'{output_dir_temporal}/5411_validate_temporal.csv', index=False)
df_test_temporal.to_csv(f'{output_dir_temporal}/5411_test_temporal.csv', index=False)

print(f"Temporal datasets saved to {output_dir_temporal}/ directory")
print(f"- 5411_train_temporal.csv: {len(df_train_temporal)} rows")
print(f"- 5411_validate_temporal.csv: {len(df_validate_temporal)} rows")
print(f"- 5411_test_temporal.csv: {len(df_test_temporal)} rows")


Temporal datasets saved to 5411_temporal_splits/ directory
- 5411_train_temporal.csv: 167323 rows
- 5411_validate_temporal.csv: 166883 rows
- 5411_test_temporal.csv: 138806 rows


In [None]:
# Save the isolated-year splits (df_*_all: merchants seen in exactly one
# year) to their own directory. A dedicated variable is used so that
# `output_dir_temporal`, which names the temporal-split directory, is not
# silently overwritten.
# Note: these frames still carry the helper 'year' column.
output_dir_iso = '5411_iso_splits'
os.makedirs(output_dir_iso, exist_ok=True)

# One filename template drives both the write and the report, so the
# messages always name the files that were actually written.
iso_frames = {'train': df_train_all, 'validate': df_validate_all, 'test': df_test_all}
iso_filenames = {split_name: f'5411_{split_name}_iso.csv' for split_name in iso_frames}

for split_name, split_frame in iso_frames.items():
    split_frame.to_csv(f'{output_dir_iso}/{iso_filenames[split_name]}', index=False)

print(f"isolated year datasets saved to {output_dir_iso}/ directory")
for split_name, split_frame in iso_frames.items():
    print(f"- {iso_filenames[split_name]}: {len(split_frame)} rows")