## Load DataFrame

In [None]:
import pandas as pd
from pathlib import Path

# Location of the parquet file to load; edit the name/path as needed
pq_path = Path('audittest.pq')

if pq_path.exists():
    try:
        df = pd.read_parquet(pq_path)
    except Exception as read_error:
        # Report the failure instead of raising; `df` stays undefined
        print('Error reading parquet:', read_error)
    else:
        print(f'Loaded {len(df)} rows from {pq_path}')
        # Preview: first rows of the loaded frame
        print(df.head())
else:
    print(f'Parquet file not found: {pq_path}')

## General DataFrame Statistics of Interest

In [None]:
# Number of distinct "parent" values in the frame
parent_column = df['parent']
unique_parents_count = parent_column.nunique()
print(f'Unique parent values: {unique_parents_count}')

# Number of distinct "rel_path" values among rows whose suffix is ".4gi"
is_gi_row = df['suffix'] == '.4gi'
gi_count = df.loc[is_gi_row, 'rel_path'].nunique()
print(f'Unique .4gi files: {gi_count}')

# Extremes of the "mtime" column
mtime_column = df['mtime']
newest_mtime, oldest_mtime = mtime_column.max(), mtime_column.min()
print(f'Newest mtime: {newest_mtime}')
print(f'Oldest mtime: {oldest_mtime}')

# Top five rows by "size_bytes"
size_columns = ['module', 'rel_path', 'size_bytes']
largest_sizes = df[size_columns].nlargest(5, 'size_bytes')
print('5 largest files by size_bytes:')
print(largest_sizes)

# Top five rows by "num_lines"
loc_columns = ['module', 'rel_path', 'num_lines']
largest_loc = df[loc_columns].nlargest(5, 'num_lines')
print('5 largest counts of num_lines:')
print(largest_loc)

## Deeper Dive into the Largest _lines-of-code_ Values

In [None]:
# Deeper dive into the largest "num_lines" values.
# `display` is imported here so this cell does not rely on an import that
# only happens in a later cell.
from IPython.display import display
import matplotlib.pyplot as plt
from matplotlib.ticker import AutoMinorLocator

# Only .4gl files
lines_of_interest = df[df['suffix'] == ".4gl"]
# "official" app-directories (parent ends with '.4gc'), not _bkup, not sccs, etc.
lines_of_interest = lines_of_interest[lines_of_interest['parent'].str.endswith('.4gc')]
# null-cases: keep only rows with a positive line count
lines_of_interest = lines_of_interest[lines_of_interest['num_lines'] > 0]
print("Count is:", len(lines_of_interest))

# Left: histogram over the full num_lines range.
# Right: same data restricted to 0..2000 lines for detail.
fig, (f1, f2) = plt.subplots(1, 2)
f1.hist(lines_of_interest['num_lines'], bins=30)
f1.yaxis.set_minor_locator(AutoMinorLocator())
f1.xaxis.set_label_text("num_lines")
f1.set_title("Frequency vs. Lines")
f2.hist(lines_of_interest['num_lines'], bins=30, range=(0,2000))
f2.yaxis.set_minor_locator(AutoMinorLocator())
f2.xaxis.set_minor_locator(AutoMinorLocator())
f2.set_title("Freq. vs. Lines (Detail)")
f2.xaxis.set_label_text("num_lines")
# Label each subplot explicitly: plt.ylabel() only affects the current axes
# (the last subplot created), which left the first histogram unlabeled.
for axes in (f1, f2):
    axes.set_ylabel('Frequency')
# Keep the second subplot's y-label from overlapping the first subplot
fig.tight_layout()

plt.show()

# Full table, largest files first
display(lines_of_interest.sort_values('num_lines', ascending=False)[['module', 'rel_path', 'num_lines']])

## `.ext`, `.org`, or `.4gl` Files Whose **mtime** Is Newer Than the **mtime** of the Respective `.4gi` File

In [None]:
from IPython.display import display
from datetime import datetime, timedelta
import isodate
# Tolerance added to the .4gi mtime before a file counts as "newer"
compare_debounce = timedelta(seconds=10)

# Split the frame by suffix: candidate files (.ext, .org, .4gl) vs. .4gi files
source_suffixes = ['.ext', '.org', '.4gl']
is_source_row = df['suffix'].isin(source_suffixes)
is_gi_row = df['suffix'] == '.4gi'
ext_org_4gl = df[is_source_row]
gi_files = df[is_gi_row]

# Lookup table: parent -> mtime of the .4gi row under that parent
# NOTE(review): a dict holds one value per key, so a parent with several
# .4gi rows keeps only one of their mtimes -- confirm that is acceptable.
gi_by_parent = gi_files.set_index('parent')
gi_mtimes = gi_by_parent['mtime'].to_dict()

# Compare mtime for each file to its parent's .4gi mtime
def is_newer_than_gi(row):
    """Tell whether a row's mtime is later than its parent's .4gi mtime.

    The .4gi mtime is looked up in ``gi_mtimes`` by ``row['parent']``.
    Both mtimes are converted with ``str()`` and parsed with
    ``datetime.fromisoformat``. Returns True only when the row's mtime
    exceeds the .4gi mtime by more than ``compare_debounce``; returns
    False when the parent has no entry in ``gi_mtimes``.
    """
    reference = gi_mtimes.get(row['parent'])
    if reference is None:
        # No .4gi mtime recorded for this parent: nothing to compare against
        return False
    built_at = datetime.fromisoformat(str(reference))
    modified_at = datetime.fromisoformat(str(row['mtime']))
    return modified_at > built_at + compare_debounce

def difference(row, in_iso=False):
    """Return how far a row's mtime is from its parent's .4gi mtime.

    The .4gi mtime is looked up in ``gi_mtimes`` by ``row['parent']``; both
    mtimes are converted with ``str()`` and parsed with
    ``datetime.fromisoformat``. The difference is row mtime minus .4gi
    mtime, so it is positive when the row is the later of the two.

    Args:
        row: mapping-like record providing ``'parent'`` and ``'mtime'``.
        in_iso: when true, format the difference with
            ``isodate.duration_isoformat``; otherwise return it as seconds.

    Returns:
        The difference in the requested form, or None when the parent has
        no entry in ``gi_mtimes``.
    """
    reference = gi_mtimes.get(row['parent'])
    if reference is None:
        return None
    built_at = datetime.fromisoformat(str(reference))
    modified_at = datetime.fromisoformat(str(row['mtime']))
    delta = modified_at - built_at
    return isodate.duration_isoformat(delta) if in_iso else delta.total_seconds()

# Keep only the candidate rows flagged as newer than their parent's .4gi
newer_mask = ext_org_4gl.apply(is_newer_than_gi, axis='columns')
with_gi_mtimes = ext_org_4gl[newer_mask].copy()
with_gi_mtimes['4gi_mtime'] = with_gi_mtimes['parent'].map(gi_mtimes)

# Add the mtime difference twice: as seconds and as an isodate-formatted duration
result = with_gi_mtimes.copy()
result['secs_diff'] = with_gi_mtimes.apply(difference, axis='columns')
result['iso_diff'] = with_gi_mtimes.apply(difference, axis='columns', in_iso=True)

# Show the report columns, largest difference first
report_columns = ["module", "rel_path", "mtime", "4gi_mtime", "secs_diff", "iso_diff"]
display(result[report_columns].sort_values('secs_diff', ascending=False))

# Histogram of the differences, limited to the 0..8001 second window
plt.hist(result['secs_diff'], bins=40, range=(0, 8001))
major_ticks = range(0, 8001, 1000)
minor_ticks = range(0, 8001, 200)
plt.xticks(major_ticks)
plt.xticks(minor_ticks, minor=True)
plt.xlabel('secs_diff')
plt.ylabel('Frequency')

plt.show()


## All Apps With Changed Source Files That Have Not Been Re-Built

In [None]:
# Distinct "parent" values among the flagged rows, followed by how many there are
changed_parents = result["parent"].unique()
print(changed_parents)
print("Count is:", len(changed_parents))