## Load DataFrame

In [None]:
import pandas as pd
from pathlib import Path

# Location of the parquet file to load; edit this to point at a different file.
pq_path = Path('audittest.pq')

if pq_path.exists():
    try:
        df = pd.read_parquet(pq_path)
    except Exception as exc:
        # Report the failure and carry on; `df` is not assigned by this cell
        # when the read fails.
        print('Error reading parquet:', exc)
    else:
        print(f'Loaded {len(df)} rows from {pq_path}')
        # Preview the first few rows.
        print(df.head())
else:
    print(f'Parquet file not found: {pq_path}')

## General DataFrame Statistics of Interest

In [None]:
# Number of distinct "parent" values (nunique skips missing values by default).
unique_parents_count = df['parent'].nunique()
print(f'Unique parent values: {unique_parents_count}')

# Number of distinct rel_path values among rows whose suffix is ".4gi".
gi_count = df.loc[df['suffix'] == '.4gi', 'rel_path'].nunique()
print(f'Unique .4gi files: {gi_count}')

# Extremes of the "mtime" column.
oldest_mtime = df['mtime'].min()
newest_mtime = df['mtime'].max()
print(f'Newest mtime: {newest_mtime}')
print(f'Oldest mtime: {oldest_mtime}')

# The five rows with the largest "size_bytes", reduced to the two columns shown.
largest_sizes = df[['rel_path', 'size_bytes']].nlargest(5, 'size_bytes')
print('5 largest files by size_bytes:')
print(largest_sizes)

## Show which `.ext`, `.org`, and `.4gl` files have an **mtime** greater than the **mtime** of the `.4gi` file in the same parent.

In [None]:
# Rows for the .ext/.org/.4gl files, and separately the .4gi rows they are
# compared against.
ext_org_4gl = df[df['suffix'].isin(['.ext', '.org', '.4gl'])]
gi_files = df[df['suffix'] == '.4gi']

# Mapping parent -> .4gi mtime.
# NOTE: if a parent has more than one .4gi row, the last row wins (dict
# semantics of to_dict on a non-unique index).
gi_mtimes = gi_files.set_index('parent')['mtime'].to_dict()

# Look up the .4gi mtime for every candidate row in one vectorised step.
# Parents without a .4gi entry map to a missing value.
gi_mtime_for_row = ext_org_4gl['parent'].map(gi_mtimes)

# Only rows whose parent actually has a .4gi mtime can be "newer"; dropping
# the others first avoids comparing real mtimes against missing values.
has_gi = gi_mtime_for_row.notna()
with_gi = ext_org_4gl[has_gi]

# Both operands come from the same rows in the same order, so the comparison
# is element-wise. Unlike a row-wise DataFrame.apply, this still yields a
# proper (empty) boolean mask when there are no candidate rows.
is_newer = with_gi['mtime'] > gi_mtime_for_row[has_gi]

result = with_gi[is_newer].copy()
result['4gi_mtime'] = result['parent'].map(gi_mtimes)
print(result[["rel_path", "mtime", "4gi_mtime"]])