## Pandas analysis for banking dataset

Dataset:
https://www.kaggle.com/datasets/ismetsemedov/transactions?resource=download

In [29]:
#Import libraries
import os
import time
import tracemalloc

import pandas as pd

In [30]:
#Create function to measure and log results
benchmark_results = []

def measure_and_log(step_name, library_name, operation_func, *args, **kwargs):
    """Run ``operation_func`` once and record its wall time and peak traced memory.

    The measurement is stored in the module-level ``benchmark_results`` list as a
    dict with the keys ``"Library"``, ``"Step"``, ``"Time (s)"`` and
    ``"Peak Memory (MB)"``. An existing entry with the same library and step is
    replaced, so re-running a cell does not create duplicate rows.

    Args:
        step_name: Label of the benchmarked step.
        library_name: Label of the library being benchmarked.
        operation_func: Callable to execute and measure.
        *args: Positional arguments forwarded to ``operation_func``.
        **kwargs: Keyword arguments forwarded to ``operation_func``.

    Returns:
        Whatever ``operation_func`` returns.

    Raises:
        Any exception raised by ``operation_func``. Nothing is logged in that
        case, and memory tracing is still switched off.

    Note:
        ``tracemalloc`` is active while the operation runs, so the recorded time
        includes whatever overhead the tracing adds.
    """
    tracemalloc.start()
    try:
        start = time.perf_counter()
        result = operation_func(*args, **kwargs)
        elapsed = time.perf_counter() - start
        _, peak = tracemalloc.get_traced_memory()
    finally:
        # Stop tracing even when the operation fails; otherwise tracing would
        # stay enabled for every later cell in the same kernel session.
        tracemalloc.stop()

    new_entry = {
        "Library": library_name,
        "Step": step_name,
        "Time (s)": round(elapsed, 4),
        "Peak Memory (MB)": round(peak / 10**6, 4)  # bytes -> decimal megabytes
    }

    # Drop any earlier entry for the same library + step. The list is updated in
    # place (slice assignment) so other references to it stay valid.
    benchmark_results[:] = [
        entry for entry in benchmark_results
        if not (entry["Library"] == library_name and entry["Step"] == step_name)
    ]
    benchmark_results.append(new_entry)

    return result

In [31]:
# Path of the transactions CSV. Set the TRANSACTIONS_CSV environment variable to
# point at a local copy of the dataset; the original absolute path is kept as the
# fallback so the notebook behaves as before when the variable is not set.
transactions_csv = os.environ.get(
    "TRANSACTIONS_CSV",
    '/Users/adolfomytr/Documents/Data Science/Polars/synthetic_fraud_data.csv',
)

#### Step 1: Load the dataset

In [32]:
def load_csv():
    """Read the transactions CSV into a DataFrame, parsing ``timestamp`` as datetimes."""
    transactions = pd.read_csv(transactions_csv, parse_dates=["timestamp"])
    return transactions


# Load through the benchmark wrapper so that reading the file is logged as a step.
df = measure_and_log("Load CSV", "Pandas", operation_func=load_csv)

#### Step 2: Exploratory Data Analysis

In [33]:
# Every step is passed as a zero-argument callable so that the whole pandas
# expression is evaluated inside the measured region of measure_and_log.
summary = measure_and_log(
    "Describe numeric columns", "Pandas",
    operation_func=lambda: df.describe(),
)

# NOTE(review): "Merchang" looks like a typo for "Merchant". The label ends up in
# the exported benchmark table, so confirm nothing matches on it before renaming.
merchant_type_counts = measure_and_log(
    "Merchang type value counts", "Pandas",
    operation_func=lambda: df["merchant_type"].value_counts(),
)

missing_values = measure_and_log(
    "Missing values per column", "Pandas",
    operation_func=lambda: df.isnull().sum(),
)

# Earliest and latest values of the "timestamp" column.
min_date = measure_and_log(
    "Min txn_date", "Pandas",
    operation_func=lambda: df["timestamp"].min(),
)
max_date = measure_and_log(
    "Max txn_date", "Pandas",
    operation_func=lambda: df["timestamp"].max(),
)

unique_customers = measure_and_log(
    "Unique customer_id count", "Pandas",
    operation_func=lambda: df["customer_id"].nunique(),
)


#### Step 3: Transaction Volume Analysis

In [34]:
def _txn_count_per_period(freq):
    """Count rows of ``df`` per calendar period of ``timestamp`` (e.g. "D" or "M")."""
    return df.groupby(df["timestamp"].dt.to_period(freq)).size()


# The period frequency is forwarded to the helper by measure_and_log, so the
# period conversion and the group-by both run inside the measured region.
daily_txns = measure_and_log("Daily transaction count", "Pandas", _txn_count_per_period, "D")

monthly_txns = measure_and_log("Monthly transaction count", "Pandas", _txn_count_per_period, "M")



#### Step 4: Transaction Amount Aggregation

In [35]:
def _amount_mean_and_sum_by_merchant_type():
    """Mean and total ``amount`` for each ``merchant_type``."""
    return df.groupby("merchant_type")["amount"].agg(["mean", "sum"])


def _top_ten_customers_by_amount():
    """The ten ``customer_id`` values with the largest summed ``amount``."""
    return (
        df.groupby("customer_id")["amount"]
        .sum()
        .sort_values(ascending=False)
        .head(10)
    )


avg_total_by_merch_type = measure_and_log(
    "Avg/Total txn by merch type", "Pandas", _amount_mean_and_sum_by_merchant_type
)

top_customers = measure_and_log(
    "Top 10 customers by volume", "Pandas", _top_ten_customers_by_amount
)

#### Step 5: Fraud Analysis

In [36]:
def _fraud_label_counts():
    """Number of rows for each value of ``is_fraud``."""
    return df["is_fraud"].value_counts()


def _fraud_rate_by_merchant_type():
    """Mean of ``is_fraud`` within each ``merchant_type``."""
    return df.groupby("merchant_type")["is_fraud"].mean()


def _fraud_amount_stats():
    """Summary statistics of ``amount`` for rows where ``is_fraud`` equals 1."""
    return df[df["is_fraud"] == 1]["amount"].describe()


fraud_vs_nonfraud = measure_and_log("Fraud vs Non-Fraud Count", "Pandas", _fraud_label_counts)

fraud_rate_by_type = measure_and_log(
    "Fraud rate by merchant_type", "Pandas", _fraud_rate_by_merchant_type
)

fraud_amount_distribution = measure_and_log(
    "Fraud amount stats", "Pandas", _fraud_amount_stats
)

#### Step 6: Export benchmark results

In [37]:
# Build the results table once and reuse it for both the export and the display.
benchmark_df = pd.DataFrame(benchmark_results)

# The output location can be overridden with the BENCHMARK_PANDAS_CSV environment
# variable; the original absolute path is kept as the fallback.
benchmark_df.to_csv(
    os.environ.get(
        "BENCHMARK_PANDAS_CSV",
        "/Users/adolfomytr/Documents/Data Science/Polars/benchmark_pandas.csv",
    ),
    index=False,
)

# Last expression of the cell: rendered as the cell output.
benchmark_df

Unnamed: 0,Library,Step,Time (s),Peak Memory (MB)
0,Pandas,Load CSV,171.9839,6036.7279
1,Pandas,Describe numeric columns,1.2556,366.7304
2,Pandas,Merchang type value counts,0.7293,14.9697
3,Pandas,Missing values per column,7.892,538.8379
4,Pandas,Min txn_date,0.0559,67.421
5,Pandas,Max txn_date,0.0132,14.9684
6,Pandas,Unique customer_id count,0.3876,59.9374
7,Pandas,Daily transaction count,0.3228,187.1018
8,Pandas,Monthly transaction count,0.2407,187.0986
9,Pandas,Avg/Total txn by merch type,0.4115,119.7487
