## Polars Lazy analysis for banking datset

Dataset:
https://www.kaggle.com/datasets/ismetsemedov/transactions?resource=download

In [26]:
#Import libraries
import polars as pl
import time
import tracemalloc

In [27]:
# Create function to measure and log results
benchmark_results = []

def measure_and_log(step_name, library_name, operation_func, *args, **kwargs):
    tracemalloc.start()
    start = time.perf_counter()

    result = operation_func(*args, **kwargs)

    end = time.perf_counter()
    _, peak = tracemalloc.get_traced_memory()
    tracemalloc.stop()

    new_entry = {
        "Library": library_name,
        "Step": step_name,
        "Time (s)": round(end - start, 4),
        "Peak Memory (MB)": round(peak / 10**6, 4)
    }

    global benchmark_results
    benchmark_results = [
        entry for entry in benchmark_results
        if not (entry["Library"] == library_name and entry["Step"] == step_name)
    ]

    benchmark_results.append(new_entry)

    return result

In [28]:
transactions_csv = '/Users/adolfomytr/Documents/Data Science/Polars/synthetic_fraud_data.csv'

#### Step 1: Load the dataset

In [29]:
def load_lazy():
    return pl.read_csv(transactions_csv, try_parse_dates=True).lazy()

lf = measure_and_log("Load CSV", "Polars Lazy", load_lazy)

#### Step 2: Exploratory Data Analysis

In [30]:
summary = measure_and_log(
    "Describe numeric columns", "Polars Lazy",
    lambda: lf.collect().describe()
)

merchant_type_counts = measure_and_log(
    "Merchant type value counts", "Polars Lazy",
    lambda: lf.select(pl.col("merchant_type").value_counts()).collect()
)

missing_values = measure_and_log(
    "Missing values per column", "Polars Lazy",
    lambda: lf.null_count().collect()
)

min_date = measure_and_log(
    "Min txn_date", "Polars Lazy",
    lambda: lf.select(pl.col("timestamp").min()).collect()
)

max_date = measure_and_log(
    "Max txn_date", "Polars Lazy",
    lambda: lf.select(pl.col("timestamp").max()).collect()
)

unique_customers = measure_and_log(
    "Unique customer_id count", "Polars Lazy",
    lambda: lf.select("customer_id").unique().select(pl.len()).collect()
)

#### Step 3: Transaction Volume Analysis

In [31]:
daily_txns = measure_and_log("Daily transaction count", "Polars Lazy", lambda: 
    lf.with_columns(
        pl.col("timestamp").dt.truncate("1d").alias("day")
    ).group_by("day").agg(
        pl.len().alias("txn_count")
    ).collect()
)

monthly_txns = measure_and_log("Monthly transaction count", "Polars Lazy", lambda: 
    lf.with_columns(
        pl.col("timestamp").dt.truncate("1mo").alias("month")
    ).group_by("month").agg(
        pl.len().alias("txn_count")
    ).collect()
)

#### Step 4: Transaction Amount Aggregation

In [32]:
avg_total_by_type = measure_and_log("Avg/Total txn by type", "Polars Lazy", lambda: 
    lf.group_by("merchant_type").agg([
        pl.col("amount").mean().alias("avg_amount"),
        pl.col("amount").sum().alias("total_amount")
    ]).collect()
)

top_customers = measure_and_log("Top 10 customers by volume", "Polars Lazy", lambda: 
    lf.group_by("customer_id").agg(
        pl.col("amount").sum().alias("total_amount")
    ).sort("total_amount", descending=True).limit(10).collect()
)

#### Step 5: Fraud Analysis

In [33]:
fraud_vs_nonfraud = measure_and_log("Fraud vs Non-Fraud Count", "Polars Lazy", lambda:
    lf.group_by("is_fraud").agg(pl.len().alias("count")).collect()
)

fraud_rate_by_type = measure_and_log("Fraud rate by merchant_type", "Polars Lazy", lambda: 
    lf.group_by("merchant_type").agg(
        pl.col("is_fraud").mean().alias("fraud_rate")
    ).collect()
)

fraud_amount_distribution = measure_and_log("Fraud amount stats", "Polars Lazy", lambda:
    lf.filter(pl.col("is_fraud") == 1).collect().select("amount").describe()
)

#### Step 6: Export benchmark results

In [34]:
pl.DataFrame(benchmark_results).write_csv("/Users/adolfomytr/Documents/Data Science/Polars/pandas-vs-polars/benchmark_results/benchmark_polars_lazy.csv")
pl.DataFrame(benchmark_results)

Library,Step,Time (s),Peak Memory (MB)
str,str,f64,f64
"""Polars Lazy""","""Load CSV""",11.0726,0.0035
"""Polars Lazy""","""Describe numeric columns""",1.4202,0.1031
"""Polars Lazy""","""Merchant type value counts""",0.616,0.0013
"""Polars Lazy""","""Missing values per column""",0.0012,0.0013
"""Polars Lazy""","""Min txn_date""",0.0202,0.0013
…,…,…,…
"""Polars Lazy""","""Avg/Total txn by type""",0.1313,0.0023
"""Polars Lazy""","""Top 10 customers by volume""",0.2028,0.0018
"""Polars Lazy""","""Fraud vs Non-Fraud Count""",0.0394,0.0022
"""Polars Lazy""","""Fraud rate by merchant_type""",0.1691,0.0018
