In [1]:
import polars as pl
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from typing import List, Optional, Dict

data_dir = '../data/jp_morgan'

Matplotlib is building the font cache; this may take a moment.


In [3]:
df = pl.read_parquet(f'{data_dir}/transaction_metrics_final_aml.parquet')

In [9]:
df['Time_step'].max()

'2027-12-04 13:20:00'

In [5]:
def count_velocity(
        df: pd.DataFrame, 
        pattern: str = "volume_{day}d_{metric}",
        days: List[int] = [7, 14, 30]
        ) -> pd.DataFrame:
    
    for d in days:
        avg_col = pattern.format(day=d, metric='avg')
        count_col = pattern.format(day=d, metric='count')
        velocity_col = f'stat_{d}d_velocity'
        df = df.with_columns((df[avg_col] * df[count_col] / d).alias(velocity_col))

    return df

test_data = count_velocity(df)


In [4]:
import polars as pl
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from dataclasses import dataclass
from typing import Optional, Dict, List, Tuple

@dataclass
class ColumnConfig:
    """Configuration for column names in the dataset"""
    date_col: str
    transaction_type_col: str
    transaction_method_col: str
    amount_col: str
    party_col: str
    counterparty_col: str
    account_col: str
    label_col: str  # The column indicating fraud/non-fraud
    positive_label: any  # Value indicating legitimate transaction in label column
    
    @classmethod
    def default(cls) -> 'ColumnConfig':
        """Default column configuration"""
        return cls(
            date_col='date',
            transaction_type_col='transaction_type',
            transaction_method_col='transaction_method',
            amount_col='amount',
            party_col='party',
            counterparty_col='cparty',
            account_col='account_identifier',
            label_col='is_good',
            positive_label=True
        )

class FraudEDA:
    """Fraud Detection Exploratory Data Analysis"""
    
    def __init__(self, column_config: Optional[ColumnConfig] = None):
        self.config = column_config or ColumnConfig.default()
    
    def load_and_clean_data(self, df: pl.DataFrame) -> pl.DataFrame:
        """
        Initial data loading and cleaning using Polars
        """
        # Convert date column to datetime and extract temporal features
        df = df.with_columns([
            pl.col(self.config.date_col).str.to_datetime().alias('date_parsed'),
            pl.col(self.config.date_col).str.to_datetime().dt.hour().alias('hour'),
            pl.col(self.config.date_col).str.to_datetime().dt.weekday().alias('day_of_week'),
            pl.col(self.config.date_col).str.to_datetime().dt.month().alias('month')
        ])
        
        return df
    
    def basic_statistics(self, df: pl.DataFrame) -> Dict[str, pl.DataFrame]:
        """
        Calculate basic statistics for numerical columns
        """
        print("\n=== Basic Statistics ===")
        
        # Numerical summary for amount
        amount_stats = df.select([
            pl.col(self.config.amount_col).mean().alias('mean'),
            pl.col(self.config.amount_col).median().alias('median'),
            pl.col(self.config.amount_col).std().alias('std'),
            pl.col(self.config.amount_col).min().alias('min'),
            pl.col(self.config.amount_col).max().alias('max'),
            pl.col(self.config.amount_col).quantile(0.25).alias('25%'),
            pl.col(self.config.amount_col).quantile(0.75).alias('75%')
        ])
        print("\nAmount Statistics:")
        print(amount_stats)
        
        # Missing values
        null_counts = df.null_count()
        print("\nMissing Values:")
        print(null_counts)
        
        # Class distribution
        class_dist = (df
            .select(pl.col(self.config.label_col))
            .groupby(self.config.label_col)
            .count()
            .with_columns(
                (pl.col('count') / pl.col('count').sum()).alias('percentage')
            )
        )
        print("\nClass Distribution:")
        print(class_dist)
        
        return {
            'amount_stats': amount_stats,
            'null_counts': null_counts,
            'class_dist': class_dist
        }
    
    def temporal_analysis(self, df: pl.DataFrame) -> Dict[str, pl.DataFrame]:
        """
        Analyze temporal patterns
        """
        print("\n=== Temporal Patterns ===")
        
        # Transaction volume by hour
        hourly_stats = (df
            .groupby(['hour', self.config.label_col])
            .count()
            .pivot(
                values='count',
                index='hour',
                columns=self.config.label_col
            )
            .sort('hour')
        )
        print("\nHourly Transaction Patterns:")
        print(hourly_stats)
        
        # Transaction volume by day of week
        daily_stats = (df
            .groupby(['day_of_week', self.config.label_col])
            .count()
            .pivot(
                values='count',
                index='day_of_week',
                columns=self.config.label_col
            )
            .sort('day_of_week')
        )
        print("\nDaily Transaction Patterns:")
        print(daily_stats)
        
        return {
            'hourly_stats': hourly_stats,
            'daily_stats': daily_stats
        }
    
    def amount_distribution_analysis(self, df: pl.DataFrame) -> Dict[str, pl.DataFrame]:
        """
        Analyze transaction amount patterns
        """
        print("\n=== Amount Analysis ===")
        
        # Amount statistics by transaction type
        amount_by_type = (df
            .groupby(self.config.transaction_type_col)
            .agg([
                pl.col(self.config.amount_col).mean().alias('mean'),
                pl.col(self.config.amount_col).median().alias('median'),
                pl.col(self.config.amount_col).std().alias('std')
            ])
        )
        print("\nAmount Statistics by Transaction Type:")
        print(amount_by_type)
        
        # Calculate amount percentiles for good vs fraudulent transactions
        percentiles = [0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99]
        amount_percentiles = (df
            .groupby(self.config.label_col)
            .agg([
                pl.col(self.config.amount_col).quantile(p).alias(f'p{int(p*100)}')
                for p in percentiles
            ])
        )
        print("\nAmount Percentiles by Class:")
        print(amount_percentiles)
        
        return {
            'amount_by_type': amount_by_type,
            'amount_percentiles': amount_percentiles
        }
    
    def behavioral_analysis(self, df: pl.DataFrame) -> Dict[str, pl.DataFrame]:
        """
        Analyze behavioral patterns
        """
        print("\n=== Behavioral Analysis ===")
        
        # Transaction method preferences
        method_stats = (df
            .groupby([self.config.transaction_method_col, self.config.label_col])
            .count()
            .pivot(
                values='count',
                index=self.config.transaction_method_col,
                columns=self.config.label_col
            )
        )
        print("\nTransaction Method Distribution:")
        print(method_stats)
        
        # Account activity patterns
        account_activity = (df
            .groupby(self.config.account_col)
            .agg([
                pl.col(self.config.amount_col).count().alias('transaction_count'),
                pl.col(self.config.amount_col).sum().alias('total_amount'),
                pl.col(self.config.amount_col).mean().alias('avg_amount'),
                pl.col(self.config.amount_col).std().alias('std_amount'),
                pl.col(self.config.label_col).mean().alias('good_ratio')
            ])
        )
        print("\nAccount Activity Patterns:")
        print(account_activity.head())
        
        return {
            'method_stats': method_stats,
            'account_activity': account_activity
        }
    
    def party_analysis(self, df: pl.DataFrame) -> Dict[str, pl.DataFrame]:
        """
        Analyze party/counterparty patterns
        """
        print("\n=== Party Analysis ===")
        
        # Party transaction patterns
        party_stats = (df
            .groupby(self.config.party_col)
            .agg([
                pl.col(self.config.amount_col).count().alias('transaction_count'),
                pl.col(self.config.amount_col).sum().alias('total_amount'),
                pl.col(self.config.amount_col).mean().alias('avg_amount'),
                pl.col(self.config.label_col).mean().alias('good_ratio')
            ])
            .sort('transaction_count', reverse=True)
        )
        print("\nParty Transaction Patterns:")
        print(party_stats.head())
        
        # Analyze party-counterparty relationships
        party_cparty_freq = (df
            .groupby([self.config.party_col, self.config.counterparty_col])
            .agg([
                pl.col(self.config.amount_col).count().alias('transaction_count')
            ])
            .sort('transaction_count', reverse=True)
        )
        print("\nTop Party-Counterparty Relationships:")
        print(party_cparty_freq.head())
        
        return {
            'party_stats': party_stats,
            'party_cparty_freq': party_cparty_freq
        }
    
    def velocity_checks(self, df: pl.DataFrame) -> pl.DataFrame:
        """
        Perform velocity checks
        """
        print("\n=== Velocity Checks ===")
        
        # Add hour window
        df = df.with_columns([
            pl.col('date_parsed').dt.truncate('1h').alias('hour_window')
        ])
        
        # Transaction velocity per account (1-hour window)
        velocity_1h = (df
            .groupby([self.config.account_col, 'hour_window'])
            .agg([
                pl.col(self.config.amount_col).count().alias('transaction_count'),
                pl.col(self.config.amount_col).sum().alias('total_amount'),
                pl.col(self.config.label_col).mean().alias('good_ratio')
            ])
        )
        
        # Calculate velocity statistics
        velocity_stats = velocity_1h.describe()
        print("\nHourly Velocity Statistics:")
        print(velocity_stats)
        
        return velocity_1h
    
    def generate_visualizations(self, df: pl.DataFrame) -> None:
        """
        Generate key visualizations
        """
        plt.style.use('seaborn')
        
        # Convert to pandas for visualization (as seaborn works better with pandas)
        df_pd = df.to_pandas()
        
        # Amount distribution by class
        plt.figure(figsize=(10, 6))
        sns.boxplot(x=self.config.label_col, y=self.config.amount_col, data=df_pd)
        plt.title('Transaction Amount Distribution by Class')
        plt.yscale('log')
        
        # Temporal heatmap
        temporal_heatmap = (df
            .groupby(['day_of_week', 'hour'])
            .count()
            .pivot(
                values='count',
                index='day_of_week',
                columns='hour'
            )
            .to_pandas()
        )
        
        plt.figure(figsize=(12, 6))
        sns.heatmap(temporal_heatmap, cmap='YlOrRd')
        plt.title('Transaction Volume Heatmap')
    
    def add_risk_features(self, df: pl.DataFrame) -> pl.DataFrame:
        """
        Add fraud detection specific features
        """
        return df.with_columns([
            # Calculate amount z-score per transaction type
            (pl.col(self.config.amount_col) - pl.col(self.config.amount_col).mean()) / 
            pl.col(self.config.amount_col).std()
                .over(self.config.transaction_type_col)
                .alias('amount_zscore'),
            
            # Calculate transaction velocity (number of transactions in last hour)
            pl.col('date_parsed')
                .rolling(period='1h')
                .over(self.config.account_col)
                .alias('tx_velocity_1h'),
            
            # Flag structured transactions (just below common reporting thresholds)
            (pl.col(self.config.amount_col).is_between(9000, 10000) | 
             pl.col(self.config.amount_col).is_between(4500, 5000))
                .alias('is_structured')
        ])
    
    def run_full_eda(self, df: pl.DataFrame) -> Tuple[pl.DataFrame, Dict[str, pl.DataFrame]]:
        """
        Run complete EDA pipeline
        """
        # Clean and prepare data
        df = self.load_and_clean_data(df)
        
        # Add risk features
        df = self.add_risk_features(df)
        
        # Run all analyses
        results = {}
        results.update(self.basic_statistics(df))
        results.update(self.temporal_analysis(df))
        results.update(self.amount_distribution_analysis(df))
        results.update(self.behavioral_analysis(df))
        results.update(self.party_analysis(df))
        
        # Generate visualizations
        self.generate_visualizations(df)
        
        # Additional fraud-specific metrics
        fraud_rate = (1 - df.select(pl.col(self.config.label_col).mean()).item()) * 100
        print("\n=== Fraud Detection Metrics ===")
        print(f"Fraud Rate: {fraud_rate:.2f}%")
        
        return df, results

# Example usage
def main():
    # Define custom column names
    custom_columns = ColumnConfig(
        date_col='Time_step',
        transaction_type_col='std_txn_type',
        transaction_method_col='std_txn_method',
        amount_col='USD_amount',
        party_col='party_Id',
        counterparty_col='cparty_Id',
        account_col='party_Account',
        label_col='Label',
        positive_label='GOOD'
    )
    
    # Initialize the EDA class with custom column configuration
    eda = FraudEDA(column_config=custom_columns)
    
    # Load your data    
    # Run the full EDA
    analyzed_df, results = eda.run_full_eda(df)
    return analyzed_df, results

if __name__ == "__main__":
    main()

TypeError: Expr.rolling() missing 1 required positional argument: 'index_column'

In [8]:
test_data = test_data.with_columns((test_data['cparty_Id'].str.contains('JPMC')).alias('cparty_internal'))

In [4]:
test_data

NameError: name 'test_data' is not defined

In [14]:
test_data = test_data.sort(by=['Time_step'])

In [3]:
df.get_column('Transaction_Type').unique()

Transaction_Type
str
"""PAYMENT"""
"""CASH-DEPOSIT"""
"""WIRE"""
"""CRYPTO-TRANSFER"""
"""WITHDRAWL"""


In [4]:
txn_mapping = {
  "CASH-DEPOSIT": {
    "std_txn_type": "DEPOSIT",
    "std_txn_method": "CASH"
  },
  "WITHDRAWL": {
    "std_txn_type": "WITHDRAWAL",
    "std_txn_method": "CASH"
  },
  "WIRE": {
    "std_txn_type": "TRANSFER",
    "std_txn_method": "ELECTRONIC"
  },
  "PAYMENT": {
    "std_txn_type": "PAYMENT",
    "std_txn_method": "ELECTRONIC"
  },
  "CRYPTO-TRANSFER": {
    "std_txn_type": "TRANSFER",
    "std_txn_method": "ELECTRONIC"
  }
}

In [7]:
def std_map_transactions(df: pl.DataFrame, txn_mapping: dict) -> pl.DataFrame:
    # Pre-compute the mappings
    type_mapping = {k: v['std_txn_type'] for k, v in txn_mapping.items()}
    method_mapping = {k: v['std_txn_method'] for k, v in txn_mapping.items()}
    
    # First check for unmapped values
    unique_txn_types = df.get_column('Transaction_Type').unique().to_list()  # Note the column name change
    unmapped_txns = [txn for txn in unique_txn_types if txn not in type_mapping]
    
    if unmapped_txns:
        raise ValueError(
            f"Found {len(unmapped_txns)} unmapped transaction types:\n"
            f"{unmapped_txns}\n"
            f"Please update the mapping dictionary with these values."
        )
    
    # If we get here, all values are mapped, so proceed with mapping
    return df.with_columns([
        pl.col('Transaction_Type').replace(type_mapping).alias('std_txn_type'),
        pl.col('Transaction_Type').replace(method_mapping).alias('std_txn_method')
    ])


In [8]:
df = map_transactions(df, txn_mapping)