## Polars Lazy analysis for banking dataset

Dataset:
https://www.kaggle.com/datasets/ismetsemedov/transactions?resource=download

In [1]:
#Import libraries
import polars as pl
import time
import tracemalloc

In [2]:
# Benchmark helper: run one operation, measure it, and log the measurements.
# Results that are not Polars frames (Series, scalars, ...) are logged with
# no object size.
benchmark_results = []


def _estimated_size_mb(result):
    """Return the estimated size of a Polars frame in MB, or None.

    Only ``pl.LazyFrame`` and ``pl.DataFrame`` results are sized; any other
    result yields ``None``. The estimate is best-effort: if polars cannot be
    imported or the size cannot be computed, ``None`` is returned instead of
    raising, so a failed estimate never discards a completed measurement.
    """
    try:
        import polars as pl
    except ImportError:
        return None

    try:
        if isinstance(result, pl.LazyFrame):
            # NOTE: this executes the lazy query again purely to size its
            # output; it happens outside the timed region.
            return round(result.collect().estimated_size() / 10**6, 4)
        if isinstance(result, pl.DataFrame):
            return round(result.estimated_size() / 10**6, 4)
    except Exception:
        # Deliberately best-effort (see docstring).
        return None
    return None


def measure_and_log(step_name, library_name, operation_func, *args, **kwargs):
    """Run ``operation_func`` once and record its time and memory figures.

    Args:
        step_name: Label stored under the "Step" key of the log entry.
        library_name: Label stored under the "Library" key of the log entry.
        operation_func: Callable to benchmark.
        *args: Positional arguments forwarded to ``operation_func``.
        **kwargs: Keyword arguments forwarded to ``operation_func``.

    Returns:
        Whatever ``operation_func`` returned.

    Raises:
        Any exception raised by ``operation_func`` propagates unchanged; in
        that case nothing is logged and memory tracing is still stopped.

    Side effects:
        Appends one entry to the module-level ``benchmark_results`` list,
        first removing any earlier entry with the same library and step so
        that re-running a cell does not create duplicates. The list is
        updated in place, so existing references to it stay valid.
    """
    import time
    import tracemalloc

    # tracemalloc only tracks allocations made through Python's memory
    # allocators, so memory allocated natively by a library may not be
    # reflected in the peak figure.
    tracemalloc.start()
    try:
        start = time.perf_counter()
        result = operation_func(*args, **kwargs)
        end = time.perf_counter()
        _, peak = tracemalloc.get_traced_memory()
    finally:
        # Always stop tracing, even when the operation fails; otherwise the
        # next measurement would start from an already-running trace.
        tracemalloc.stop()

    new_entry = {
        "Library": library_name,
        "Step": step_name,
        "Time (s)": round(end - start, 4),
        "Peak Memory (MB)": round(peak / 10**6, 4),
        "Total Object Memory (MB)": _estimated_size_mb(result),
    }

    # Replace (rather than duplicate) a previous run of the same step.
    benchmark_results[:] = [
        entry for entry in benchmark_results
        if not (entry["Library"] == library_name and entry["Step"] == step_name)
    ]
    benchmark_results.append(new_entry)

    return result


In [3]:
# Machine-specific absolute path of the transactions CSV loaded in Step 1.
transactions_csv = "/Users/adolfomytr/Documents/Data Science/Polars/synthetic_fraud_data.csv"

#### Step 1: Load the dataset

In [4]:
def load_lazy():
    """Read the transactions CSV and return it wrapped as a LazyFrame.

    NOTE(review): ``pl.read_csv`` reads the file eagerly, so the data is
    fully loaded before ``.lazy()`` is applied. ``pl.scan_csv`` would defer
    the read, but that would change what every later step measures.
    """
    eager_frame = pl.read_csv(transactions_csv, try_parse_dates=True)
    return eager_frame.lazy()


# Every later step queries this frame.
lf = measure_and_log("Load CSV", "Polars Lazy", load_lazy)

#### Step 2: Exploratory Data Analysis

In [5]:
# describe() is called on the collected (eager) frame.
summary = measure_and_log(
    "Describe numeric columns",
    "Polars Lazy",
    lambda: lf.collect().describe(),
)

# Occurrences of each merchant_type value.
merchant_type_counts = measure_and_log(
    "Merchant type value counts",
    "Polars Lazy",
    lambda: lf.select(pl.col("merchant_type").value_counts()).collect(),
)

# Null count for every column.
missing_values = measure_and_log(
    "Missing values per column",
    "Polars Lazy",
    lambda: lf.null_count().collect(),
)

# Earliest and latest value of the "timestamp" column.
min_date = measure_and_log(
    "Min txn_date",
    "Polars Lazy",
    lambda: lf.select(pl.col("timestamp").min()).collect(),
)

max_date = measure_and_log(
    "Max txn_date",
    "Polars Lazy",
    lambda: lf.select(pl.col("timestamp").max()).collect(),
)

# Number of distinct customer_id values: de-duplicate, then count the rows.
unique_customers = measure_and_log(
    "Unique customer_id count",
    "Polars Lazy",
    lambda: lf.select("customer_id").unique().select(pl.len()).collect(),
)

#### Step 3: Transaction Volume Analysis

In [6]:
def _txn_count_by_period(every, label):
    """Count transactions per time bucket.

    ``timestamp`` is truncated to ``every`` (e.g. "1d", "1mo"), stored in a
    column named ``label``, and the rows of each bucket are counted into
    ``txn_count``.
    """
    bucket = pl.col("timestamp").dt.truncate(every).alias(label)
    counts = pl.len().alias("txn_count")
    return lf.with_columns(bucket).group_by(label).agg(counts).collect()


daily_txns = measure_and_log(
    "Daily transaction count", "Polars Lazy", _txn_count_by_period, "1d", "day"
)

monthly_txns = measure_and_log(
    "Monthly transaction count", "Polars Lazy", _txn_count_by_period, "1mo", "month"
)

#### Step 4: Transaction Amount Aggregation

In [7]:
def _amount_stats_by_merchant_type():
    """Mean and total transaction amount for each merchant_type."""
    amount = pl.col("amount")
    return (
        lf.group_by("merchant_type")
        .agg(
            amount.mean().alias("avg_amount"),
            amount.sum().alias("total_amount"),
        )
        .collect()
    )


def _top_customers_by_amount():
    """The ten customers with the largest summed transaction amount."""
    totals = lf.group_by("customer_id").agg(
        pl.col("amount").sum().alias("total_amount")
    )
    ranked = totals.sort("total_amount", descending=True)
    return ranked.limit(10).collect()


avg_total_by_type = measure_and_log(
    "Avg/Total txn by merch type", "Polars Lazy", _amount_stats_by_merchant_type
)

top_customers = measure_and_log(
    "Top 10 customers by volume", "Polars Lazy", _top_customers_by_amount
)

#### Step 5: Fraud Analysis

In [8]:
def _fraud_label_counts():
    """Number of rows for each distinct is_fraud value."""
    row_count = pl.len().alias("count")
    return lf.group_by("is_fraud").agg(row_count).collect()


def _fraud_rate_by_merchant_type():
    """Mean of is_fraud within each merchant_type."""
    rate = pl.col("is_fraud").mean().alias("fraud_rate")
    return lf.group_by("merchant_type").agg(rate).collect()


def _fraud_amount_stats():
    """describe() of the amount column restricted to fraud rows.

    NOTE(review): the filter compares is_fraud with the integer 1, which
    presumes a 0/1 encoding - confirm against the CSV schema.
    """
    # The filtered rows are collected with all columns before "amount" is
    # selected, mirroring the order of operations being benchmarked.
    fraud_rows = lf.filter(pl.col("is_fraud") == 1).collect()
    return fraud_rows.select("amount").describe()


fraud_vs_nonfraud = measure_and_log(
    "Fraud vs Non-Fraud Count", "Polars Lazy", _fraud_label_counts
)

fraud_rate_by_type = measure_and_log(
    "Fraud rate by merchant_type", "Polars Lazy", _fraud_rate_by_merchant_type
)

fraud_amount_distribution = measure_and_log(
    "Fraud amount stats", "Polars Lazy", _fraud_amount_stats
)

#### Step 6: Export benchmark results

In [9]:
# Turn the logged measurements into a frame, persist it, then display it as
# the cell output.
results_df = pl.DataFrame(benchmark_results)
results_df.write_csv("/Users/adolfomytr/Documents/Data Science/Polars/pandas-vs-polars/benchmark_results/benchmark_polars_lazy.csv")
results_df

Library,Step,Time (s),Peak Memory (MB),Total Object Memory (MB)
str,str,f64,f64,f64
"""Polars Lazy""","""Load CSV""",10.7296,0.569,2471.3581
"""Polars Lazy""","""Describe numeric columns""",1.4702,0.2874,0.0015
"""Polars Lazy""","""Merchant type value counts""",0.3087,0.0119,0.0002
"""Polars Lazy""","""Missing values per column""",0.0011,0.0013,0.0001
"""Polars Lazy""","""Min txn_date""",0.0216,0.0017,0.0
…,…,…,…,…
"""Polars Lazy""","""Avg/Total txn by merch type""",0.1389,0.0031,0.0004
"""Polars Lazy""","""Top 10 customers by volume""",0.163,0.0035,0.0002
"""Polars Lazy""","""Fraud vs Non-Fraud Count""",0.0187,0.0027,0.0
"""Polars Lazy""","""Fraud rate by merchant_type""",0.0513,0.0023,0.0003
