## Polars Eager analysis for banking datset

Dataset:
https://www.kaggle.com/datasets/ismetsemedov/transactions?resource=download

In [22]:
# Import libraries
import polars as pl
import time
import tracemalloc

In [23]:
# Create function to measure and log results
benchmark_results = []

def measure_and_log(step_name, library_name, operation_func, *args, **kwargs):
    tracemalloc.start()
    start = time.perf_counter()

    result = operation_func(*args, **kwargs)

    end = time.perf_counter()
    _, peak = tracemalloc.get_traced_memory()
    tracemalloc.stop()

    # Build the new entry
    new_entry = {
        "Library": library_name,
        "Step": step_name,
        "Time (s)": round(end - start, 4),
        "Peak Memory (MB)": round(peak / 10**6, 4)
    }

    # Remove existing entry for same library + step
    global benchmark_results
    benchmark_results = [
        entry for entry in benchmark_results
        if not (entry["Library"] == library_name and entry["Step"] == step_name)
    ]

    # Add the new one
    benchmark_results.append(new_entry)

    return result

In [24]:
transactions_csv = '/Users/adolfomytr/Documents/Data Science/Polars/synthetic_fraud_data.csv'

#### Step 1: Load the dataset

In [25]:
def load_csv():
    return pl.read_csv(transactions_csv, try_parse_dates=True)

df = measure_and_log("Load CSV", "Polars Eager", load_csv)

#### Step 2: Exploratory Data Analysis

In [26]:
# Summary statistics for numeric columns
summary = measure_and_log(
    "Describe numeric columns", "Polars Eager",
    lambda: df.describe()
)

# Value counts for merchant_type
merchant_type_counts = measure_and_log(
    "Merchang type value counts", "Polars Eager",
    lambda: df.select(pl.col("merchant_type")).to_series().value_counts()
)

# Missing values per column
missing_values = measure_and_log(
    "Missing values per column", "Polars Eager",
    lambda: df.null_count()
)

# Minimum timestamp
min_date = measure_and_log(
    "Min txn_date", "Polars Eager",
    lambda: df.select(pl.col("timestamp").min())
)

# Maximum timestamp
max_date = measure_and_log(
    "Max txn_date", "Polars Eager",
    lambda: df.select(pl.col("timestamp").max())
)

# Unique customer_id count
unique_customers = measure_and_log(
    "Unique customer_id count", "Polars Eager",
    lambda: df.select(pl.col("customer_id")).n_unique()
)

#### Step 3: Transaction Volume Analysis

In [27]:
# Daily transaction count
daily_txns = measure_and_log("Daily transaction count", "Polars Eager", lambda: 
    df.group_by("timestamp").count()
)

# Monthly transaction count (convert timestamp to month)
monthly_txns = measure_and_log("Monthly transaction count", "Polars Eager", lambda: 
    df.with_columns(
        pl.col("timestamp").dt.truncate("1mo").alias("month")
    ).group_by("month").count()
)

  df.group_by("timestamp").count()
  df.with_columns(


In [28]:
benchmark_results = pl.DataFrame(benchmark_results)
benchmark_results

Library,Step,Time (s),Peak Memory (MB)
str,str,f64,f64
"""Polars Eager""","""Load CSV""",10.2407,0.0035
"""Polars Eager""","""Describe numeric columns""",1.3188,0.1034
"""Polars Eager""","""Merchang type value counts""",0.5702,0.0016
"""Polars Eager""","""Missing values per column""",0.0,0.0001
"""Polars Eager""","""Min txn_date""",0.0221,0.0016
"""Polars Eager""","""Max txn_date""",0.0022,0.0016
"""Polars Eager""","""Unique customer_id count""",0.6007,0.0022
"""Polars Eager""","""Daily transaction count""",6.739,0.0058
"""Polars Eager""","""Monthly transaction count""",0.1157,0.0035
