In [11]:
import boto3
import pandas as pd
import pyarrow.parquet as pq
import io

In [12]:
def read_profiles(bucket: str, source: str) -> pd.DataFrame:
    """
    Reads all non-empty Parquet profiling files for a given source from S3.

    Every object under ``profiling/source=<source>/`` with a size greater than
    zero is downloaded, parsed as Parquet and appended to the result.

    Args:
        bucket (str): S3 bucket name
        source (str): Source name (e.g., "payments")

    Returns:
        pd.DataFrame: Concatenated DataFrame of all profiling files, with a
            fresh 0..n-1 index. An empty DataFrame is returned (and a message
            printed) when no non-empty object exists under the prefix.
    """
    prefix = f"profiling/source={source}/"
    s3 = boto3.client("s3")

    # A single list_objects_v2 call returns at most 1000 keys, so walk every
    # page; otherwise large prefixes would be silently truncated.
    paginator = s3.get_paginator("list_objects_v2")
    objects = [
        obj
        for page in paginator.paginate(Bucket=bucket, Prefix=prefix)
        # Pages without matches carry no "Contents" key at all.
        for obj in page.get("Contents", [])
        # Filter out empty files
        if obj["Size"] > 0
    ]

    if not objects:
        print(f"No non-empty profiling files found for source: {source}")
        return pd.DataFrame()

    dfs = []
    for obj in objects:
        obj_data = s3.get_object(Bucket=bucket, Key=obj["Key"])
        # Buffer the whole body in memory so pyarrow gets a seekable file object.
        table = pq.read_table(io.BytesIO(obj_data["Body"].read()))
        dfs.append(table.to_pandas())

    return pd.concat(dfs, ignore_index=True)

In [13]:
sources = ["payments",  "merchant", "device", "customer_behavior", "fraud_timing"]
bucket = "danske-bank-project-metadata"

# Distinct profiled column names per source (only sources that returned data).
source_columns = {}
# Per-source profiling frames, combined into df_profiles below.
all_profiles = []

# NOTE: the loop variable keeps the name `df` because a later cell displays it.
for source in sources:
    df = read_profiles(bucket, source)
    if not df.empty:
        source_columns[source] = df["column_name"].unique()
        all_profiles.append(df)

# pd.concat raises ValueError on an empty list, so fall back to an empty frame
# when no source returned any profiling data.
df_profiles = (
    pd.concat(all_profiles, ignore_index=True) if all_profiles else pd.DataFrame()
)

# Quick overview
for source, cols in source_columns.items():
    print(f"\nSource: {source}")
    print(cols)



Source: payments
<ArrowStringArray>
[              'tx_id',          'event_time',      'sender_account',
    'receiver_account',              'amount',     'payment_channel',
            'location',       'source_system', 'ingestion_timestamp']
Length: 9, dtype: str

Source: merchant
<ArrowStringArray>
[                 'ID',            'Category',                'Type',
              'Amount',            'Location',       'source_system',
 'ingestion_timestamp']
Length: 7, dtype: str

Source: device
<ArrowStringArray>
['device', 'network', 'transaction_id', 'source_system',
 'ingestion_timestamp']
Length: 5, dtype: str

Source: customer_behavior
<ArrowStringArray>
[              'sender_account',          'avg_amount_last_24h',
            'tx_count_last_24h',           'max_velocity_score',
          'mean_velocity_score',           'min_velocity_score',
        'avg_geo_anomaly_score', 'avg_spending_deviation_score',
                'source_system',          'ingestion_timestamp']

In [14]:
# `df` is the loop variable left over from the previous cell, i.e. the profiling
# frame returned for the last entry in `sources` -- not the combined df_profiles.
df

Unnamed: 0,source,column_name,metric,metric_value,data_type,ingestion_timestamp,profiling_timestamp
0,fraud_timing,transaction_id,row_count,5000000.0,string,2026-01-21 22:09:49.480295,2026-01-21 22:09:49.480295
1,fraud_timing,transaction_id,null_count,0.0,string,2026-01-21 22:09:49.480295,2026-01-21 22:09:49.480295
2,fraud_timing,transaction_id,null_pct,0.0,string,2026-01-21 22:09:49.480295,2026-01-21 22:09:49.480295
3,fraud_timing,transaction_id,distinct_count,5000000.0,string,2026-01-21 22:09:49.480295,2026-01-21 22:09:49.480295
4,fraud_timing,timestamp,row_count,5000000.0,timestamp,2026-01-21 22:09:49.480295,2026-01-21 22:09:49.480295
5,fraud_timing,timestamp,null_count,3.0,timestamp,2026-01-21 22:09:49.480295,2026-01-21 22:09:49.480295
6,fraud_timing,timestamp,null_pct,6e-05,timestamp,2026-01-21 22:09:49.480295,2026-01-21 22:09:49.480295
7,fraud_timing,timestamp,distinct_count,4999594.0,timestamp,2026-01-21 22:09:49.480295,2026-01-21 22:09:49.480295
8,fraud_timing,location,row_count,5000000.0,string,2026-01-21 22:09:49.480295,2026-01-21 22:09:49.480295
9,fraud_timing,location,null_count,50470.0,string,2026-01-21 22:09:49.480295,2026-01-21 22:09:49.480295
