# 01 — Hash and Anchor (Local Simulation)

This notebook teaches hashing + anchoring without running a blockchain node. It uses a **local JSON registry** to simulate on-chain storage.

## Hashing Policy (v1)
- **Algorithm**: SHA-256
- **Row Columns**: all columns in the dataset (explicitly listed in code)
- **Column Order**: sorted ascending (or specified list)
- **Separator**: `|` (with delimiter escaping)
- **Null Handling**: `None`/`NaN` → empty string
- **Floats**: normalized to 6-decimal precision (trailing zeros trimmed)
- **Whitespace**: `strip + collapse internal` for strings
- **Dataset Ordering**: by a stable business key if present (e.g., `shipment_id`), else by `row_hash`
- **Manifest**: JSON written next to the dataset with the exact settings and dataset hash

> Change policy → bump the version string (e.g., `hash-policy-2`) and re-derive hashes.


In [1]:
# Robust imports: find Portfolio root and load the local anchor helpers
from pathlib import Path
import sys

def find_portfolio_root(start: Path) -> Path:
    start = start.resolve()
    for p in [start, *start.parents]:
        if (p / "Blockchain/common/utils/hash_anchor.py").exists():
            return p
    raise RuntimeError(f"Couldn't find Portfolio root from: {start}")

PORTFOLIO_ROOT = find_portfolio_root(Path.cwd())
UTILS_DIR = (PORTFOLIO_ROOT / "Blockchain/common/utils").resolve()
if str(UTILS_DIR) not in sys.path:
    sys.path.insert(0, str(UTILS_DIR))

from hash_anchor import sha256_bytes, anchor_hash, verify_hash
print("Loaded hash_anchor from:", UTILS_DIR)

Loaded hash_anchor from: C:\Users\beall\OneDrive\Documents\Portfolio\Blockchain\common\utils


In [2]:
# Create a tiny demo CSV (idempotent) and read it
import pandas as pd
demo_csv = PORTFOLIO_ROOT / 'Blockchain/common/notebooks/demo_shipments.csv'
if not demo_csv.exists():
    demo_csv.write_text('shipment_id,gtin,case_id,temp_c\nS1,000123,CASE001,4.1\nS2,000123,CASE002,9.9\n', encoding='utf-8')
df = pd.read_csv(demo_csv)
df

Unnamed: 0,shipment_id,gtin,case_id,temp_c
0,S1,123,CASE001,4.1
1,S2,123,CASE002,9.9


In [3]:
# Hardened row hashing with a documented policy
import hashlib, math

SEP = '|'            # record in manifest
FLOAT_FMT = "{:.6f}" # record in manifest

def norm_val(v):
    if v is None or (isinstance(v, float) and math.isnan(v)):
        return ''
    if isinstance(v, str):
        return ' '.join(v.strip().split())
    if isinstance(v, float):
        s = FLOAT_FMT.format(v)
        return s.rstrip('0').rstrip('.') if '.' in s else s
    return str(v)

def row_digest(row, columns=None):
    cols = sorted(columns or row.index)
    parts = [norm_val(row[c]) for c in cols]
    parts = [p.replace(SEP, f"\\{SEP}") for p in parts]
    payload = SEP.join(parts).encode('utf-8')
    return hashlib.sha256(payload).hexdigest()

# Choose columns: here we use all columns; you can pass a subset
cols = sorted(df.columns)
df['row_hash'] = df.apply(lambda r: row_digest(r, columns=cols), axis=1)
df

Unnamed: 0,shipment_id,gtin,case_id,temp_c,row_hash
0,S1,123,CASE001,4.1,a8e361e42d77633e02042d7fc9a41f7ee082a9c9264af3...
1,S2,123,CASE002,9.9,79cae2578b17dda1f71b3d5c03bdc75d4813768d1b5304...


In [4]:
# Build a deterministic dataset-level hash
if 'shipment_id' in df.columns:
    row_hashes = df.sort_values('shipment_id')['row_hash']
else:
    row_hashes = df['row_hash'].sort_values().reset_index(drop=True)

dataset_payload = '\n'.join(row_hashes).encode('utf-8')
dataset_hash = hashlib.sha256(dataset_payload).hexdigest()
print("dataset_hash:", dataset_hash)

dataset_hash: 16a69b607cb86a202c15fd16c1465292076e07dabc2b63f5070ed4e0dbb80fce


In [5]:
# Anchor the dataset hash with a RELATIVE ref
rel = demo_csv.relative_to(PORTFOLIO_ROOT).as_posix()
ref = f"local://{rel}#v=hash-policy-1"
try:
    anchor_hash(dataset_hash, ref)
    print("Anchored dataset hash (relative ref)")
except ValueError:
    print("Dataset hash already anchored; update registry ref manually if desired.")
print("verify:", verify_hash(dataset_hash, ref))


Dataset hash already anchored; update registry ref manually if desired.
verify: True


In [6]:
# Write a manifest documenting the exact policy and dataset-level hash
import json
manifest = {
    "source": demo_csv.as_posix(),
    "hash_algorithm": "sha256",
    "row_policy": {
        "columns": cols,
        "separator": SEP,
        "float_precision": 6,
        "null_as": "",
        "whitespace_norm": "strip+collapse",
        "delimiter_escape": True
    },
    "ordering": "by 'shipment_id' ascending" if 'shipment_id' in df.columns else "by row_hash ascending",
    "dataset_hash": dataset_hash,
}
(PORTFOLIO_ROOT / 'Blockchain/common/notebooks/demo_shipments.manifest.json').write_text(
    json.dumps(manifest, indent=2), encoding='utf-8'
)
print("Wrote manifest →", (PORTFOLIO_ROOT / 'Blockchain/common/notebooks/demo_shipments.manifest.json'))

Wrote manifest → C:\Users\beall\OneDrive\Documents\Portfolio\Blockchain\common\notebooks\demo_shipments.manifest.json


In [9]:
# Verify via local API from *this* notebooks folder
import json, requests
from pathlib import Path

print("cwd:", Path.cwd())  # just to confirm

# Manifest is in the same folder as the notebook
manifest = json.loads(Path("demo_shipments.manifest.json").read_text(encoding="utf-8"))
hex_hash = manifest["dataset_hash"]

# The ref must match what you anchored (we used this earlier)
ref = "local://Blockchain/common/notebooks/demo_shipments.csv#v=hash-policy-1"

# Health check
h = requests.get("http://127.0.0.1:8000/healthz", timeout=5)
h.raise_for_status()
print("healthz:", h.json())

# Verify (requests handles the '#' in params for you)
r = requests.get("http://127.0.0.1:8000/verify", params={"hash": hex_hash, "ref": ref}, timeout=5)
r.raise_for_status()
r.json()  # expect {'ok': True, ...}


cwd: c:\Users\beall\OneDrive\Documents\Portfolio


FileNotFoundError: [Errno 2] No such file or directory: 'demo_shipments.manifest.json'

In [10]:
# 1) Is the API up?
import requests
print("health:", requests.get("http://127.0.0.1:8000/healthz", timeout=5).json())

# 2) Did we load a real hash?
print("hash:", hex_hash, "len:", len(hex_hash))

# 3) Does existence-only verification succeed?
print("verify(existence):", requests.get("http://127.0.0.1:8000/verify",
                                         params={"hash": hex_hash}, timeout=5).json())


health: {'status': 'ok', 'registry_exists': True}
hash: 16a69b607cb86a202c15fd16c1465292076e07dabc2b63f5070ed4e0dbb80fce len: 64
verify(existence): {'ok': True, 'hash': '16a69b607cb86a202c15fd16c1465292076e07dabc2b63f5070ed4e0dbb80fce', 'ref': None, 'reason': None}


In [None]:
# Auto-verify (repo-root aware): prefer relative ref; fall back to stored ref
import os, json, requests
from pathlib import Path

def find_portfolio_root(start: Path) -> Path:
    start = start.resolve()
    for p in [start, *start.parents]:
        if (p / "Blockchain/common/utils/hash_anchor.py").exists():
            return p
    raise RuntimeError(f"Couldn't find Portfolio root from: {start}")

PORTFOLIO_ROOT = find_portfolio_root(Path.cwd())

MANIFEST = PORTFOLIO_ROOT / "Blockchain/common/notebooks/demo_shipments.manifest.json"
REGISTRY = PORTFOLIO_ROOT / "Blockchain/common/utils/_local_anchor_registry.json"
BASE = os.getenv("VERIFY_API_BASE", "http://127.0.0.1:8000")

# Load hash from manifest
m = json.loads(MANIFEST.read_text(encoding="utf-8"))
hex_hash = m["dataset_hash"]

# Preferred (relative) ref that we anchored with earlier
preferred_ref = "local://Blockchain/common/notebooks/demo_shipments.csv#v=hash-policy-1"

def verify(hash_hex, ref=None):
    params = {"hash": hash_hex}
    if ref is not None:
        params["ref"] = ref  # requests will URL-encode the '#'
    r = requests.get(f"{BASE}/verify", params=params, timeout=5)
    r.raise_for_status()
    return r.json()

print("healthz:", requests.get(f"{BASE}/healthz", timeout=5).json())

res = verify(hex_hash, preferred_ref)
if res.get("ok"):
    print("✅ Verified with preferred ref")
    res
else:
    print("ℹ️ Preferred ref failed:", res.get("reason"))
    if REGISTRY.exists():
        reg = json.loads(REGISTRY.read_text(encoding="utf-8"))
        stored = reg.get(hex_hash)
        if stored and stored.get("ref"):
            stored_ref = stored["ref"]
            res2 = verify(hex_hash, stored_ref)
            if res2.get("ok"):
                print("✅ Verified with STORED ref from registry")
                print("stored_ref:", stored_ref)
                res2
            else:
                print("❌ Stored ref also failed:", res2.get("reason"))
                res2
        else:
            print("❌ Hash not present in local registry or no ref stored.")
            verify(hex_hash)  # existence-only
    else:
        print("❌ Local registry not found at:", REGISTRY)
        verify(hex_hash)      # existence-only


FileNotFoundError: [Errno 2] No such file or directory: 'c:\\Users\\beall\\OneDrive\\Documents\\Portfolio\\demo_shipments.manifest.json'

## Next Steps
1. Replace the local registry with `web3.py` + `ProofOfProvenance.sol` on a testnet (e.g., Sepolia).
2. Add a tiny FastAPI endpoint `/verify?hash=...` that consults the registry.
3. Join `row_hash` or `dataset_hash` back into KPIs / optimization models to demonstrate tamper-evidence in your logistics projects.
