## Polars Eager analysis for banking dataset

Dataset:
https://www.kaggle.com/datasets/ismetsemedov/transactions?resource=download

In [1]:
# Import libraries
import polars as pl
import time
import tracemalloc

In [2]:
# Create function to measure and log results
# One dict per (library, step) pair; filled in by measure_and_log().
benchmark_results = []


def measure_and_log(step_name, library_name, operation_func, *args, **kwargs):
    """Run ``operation_func`` once, recording wall time and peak traced memory.

    The measurement is stored in the module-level ``benchmark_results`` list
    as a dict with the keys ``"Library"``, ``"Step"``, ``"Time (s)"`` and
    ``"Peak Memory (MB)"``. Re-running the same (library, step) pair replaces
    the earlier entry instead of adding a duplicate, so a notebook cell can be
    re-executed safely.

    Args:
        step_name: Label of the benchmarked step.
        library_name: Label of the library being benchmarked.
        operation_func: Callable to execute and measure.
        *args: Positional arguments forwarded to ``operation_func``.
        **kwargs: Keyword arguments forwarded to ``operation_func``.

    Returns:
        Whatever ``operation_func`` returns.

    Raises:
        Any exception raised by ``operation_func``; in that case no entry is
        recorded and tracing is still switched off.

    NOTE(review): ``tracemalloc`` only traces allocations made through
    Python's memory allocators; memory a library allocates natively is
    presumably not reflected in the peak value -- confirm before comparing
    libraries on this column.
    """
    tracemalloc.start()
    try:
        start = time.perf_counter()
        result = operation_func(*args, **kwargs)
        elapsed = time.perf_counter() - start
        _, peak = tracemalloc.get_traced_memory()
    finally:
        # Stop tracing even when the operation fails; otherwise tracing would
        # stay enabled and add overhead to every later measurement.
        tracemalloc.stop()

    new_entry = {
        "Library": library_name,
        "Step": step_name,
        "Time (s)": round(elapsed, 4),
        "Peak Memory (MB)": round(peak / 10**6, 4),  # bytes -> decimal MB
    }

    # Drop any earlier entry for the same library + step. The list is updated
    # in place so every existing reference to it sees the change.
    benchmark_results[:] = [
        entry for entry in benchmark_results
        if not (entry["Library"] == library_name and entry["Step"] == step_name)
    ]
    benchmark_results.append(new_entry)

    return result

In [3]:
# Path of the transactions CSV that load_csv() reads.
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
transactions_csv = '/Users/adolfomytr/Documents/Data Science/Polars/synthetic_fraud_data.csv'

#### Step 1: Load the dataset

In [4]:
def load_csv():
    """Eagerly read the transactions CSV, letting Polars try to parse date columns."""
    frame = pl.read_csv(transactions_csv, try_parse_dates=True)
    return frame

# Time the load and keep the resulting frame for the later steps.
df = measure_and_log(
    step_name="Load CSV",
    library_name="Polars Eager",
    operation_func=load_csv,
)

#### Step 2: Exploratory Data Analysis

In [5]:
# Summary statistics via DataFrame.describe()
summary = measure_and_log(
    "Describe numeric columns",
    "Polars Eager",
    df.describe,
)

# How often each merchant_type occurs
merchant_type_counts = measure_and_log(
    "Merchant type value counts",
    "Polars Eager",
    lambda: df.select("merchant_type").to_series().value_counts(),
)

# Null count for every column
missing_values = measure_and_log(
    "Missing values per column",
    "Polars Eager",
    df.null_count,
)

# Earliest timestamp in the data
min_date = measure_and_log(
    "Min txn_date",
    "Polars Eager",
    lambda: df.select(pl.col("timestamp").min()),
)

# Latest timestamp in the data
max_date = measure_and_log(
    "Max txn_date",
    "Polars Eager",
    lambda: df.select(pl.col("timestamp").max()),
)

# Number of distinct customer_id values
unique_customers = measure_and_log(
    "Unique customer_id count",
    "Polars Eager",
    lambda: df.select("customer_id").n_unique(),
)

#### Step 3: Transaction Volume Analysis

In [6]:
# Daily transaction count.
# The timestamp is truncated to the start of its day first; grouping on the
# raw timestamp would count rows per distinct timestamp value instead of per
# day. This mirrors the monthly step below.
daily_txns = measure_and_log(
    "Daily transaction count",
    "Polars Eager",
    lambda: df.with_columns(
        pl.col("timestamp").dt.truncate("1d").alias("day")
    ).group_by("day").len(),
)

# Monthly transaction count (truncate timestamp to the start of its month)
monthly_txns = measure_and_log(
    "Monthly transaction count",
    "Polars Eager",
    lambda: df.with_columns(
        pl.col("timestamp").dt.truncate("1mo").alias("month")
    ).group_by("month").len(),
)

#### Step 4: Transaction Amount Aggregation

In [7]:
# Mean and summed transaction amount for each merchant_type
avg_total_by_type = measure_and_log(
    "Avg/Total txn by merch type",
    "Polars Eager",
    lambda: df.group_by("merchant_type").agg(
        avg_amount=pl.col("amount").mean(),
        total_amount=pl.col("amount").sum(),
    ),
)

# The 10 customers with the largest summed transaction amount
top_customers = measure_and_log(
    "Top 10 customers by volume",
    "Polars Eager",
    lambda: (
        df.group_by("customer_id")
        .agg(total_amount=pl.col("amount").sum())
        .sort("total_amount", descending=True)
        .head(10)
    ),
)

#### Step 5: Fraud Analysis

In [8]:
# Number of rows for each is_fraud value
fraud_vs_nonfraud = measure_and_log(
    "Fraud vs Non-Fraud Count",
    "Polars Eager",
    lambda: df.select("is_fraud").to_series().value_counts(),
)

# Mean of is_fraud (the fraud rate) within each merchant_type
fraud_rate_by_type = measure_and_log(
    "Fraud rate by merchant_type",
    "Polars Eager",
    lambda: df.group_by("merchant_type").agg(
        fraud_rate=pl.col("is_fraud").mean(),
    ),
)

# describe() output for the amount column, restricted to rows flagged as fraud
fraud_amount_distribution = measure_and_log(
    "Fraud amount stats",
    "Polars Eager",
    lambda: (
        df.filter(pl.col("is_fraud") == 1)
        .select("amount")
        .describe()
    ),
)

#### Step 6: Export benchmark results

In [9]:
# Build the results frame once, write it to CSV, then show it as the cell output.
results_path = "/Users/adolfomytr/Documents/Data Science/Polars/pandas-vs-polars/benchmark_results/benchmark_polars_eager.csv"
results_df = pl.DataFrame(benchmark_results)
results_df.write_csv(results_path)
results_df

Library,Step,Time (s),Peak Memory (MB)
str,str,f64,f64
"""Polars Eager""","""Load CSV""",11.0512,0.5682
"""Polars Eager""","""Describe numeric columns""",1.371,0.292
"""Polars Eager""","""Merchant type value counts""",0.2783,0.0121
"""Polars Eager""","""Missing values per column""",0.0002,0.001
"""Polars Eager""","""Min txn_date""",0.0213,0.002
…,…,…,…
"""Polars Eager""","""Avg/Total txn by merch type""",0.3571,0.0038
"""Polars Eager""","""Top 10 customers by volume""",7.7584,0.0029
"""Polars Eager""","""Fraud vs Non-Fraud Count""",0.1283,0.0027
"""Polars Eager""","""Fraud rate by merchant_type""",0.1447,0.0028
