babelomics · dlopez-bioinfo · May 7, 2026 · Mar 26, 2026 · May 6, 2026 · May 6, 2026
diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml
@@ -8,11 +8,14 @@ on:
       - 'mkdocs.yml'
       - 'src/**'
   workflow_dispatch:
+    inputs:
+      bootstrap:
+        description: 'Wipe gh-pages before deploying (one-time when migrating from mkdocs gh-deploy to mike)'
+        type: boolean
+        default: false
 
 permissions:
   contents: write
-  pages: write
-  id-token: write
 
 concurrency:
   group: pages
@@ -21,11 +24,10 @@ concurrency:
 jobs:
   deploy:
     runs-on: ubuntu-latest
-    environment:
-      name: github-pages
-      url: ${{ steps.deployment.outputs.page_url }}
     steps:
       - uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
 
       - uses: actions/setup-python@v5
         with:
@@ -34,5 +36,17 @@ jobs:
       - name: Install docs dependencies
         run: pip install -e ".[docs]"
 
-      - name: Build and deploy docs
-        run: mkdocs gh-deploy --force
+      - name: Configure git for mike
+        run: |
+          git config --global user.name "github-actions[bot]"
+          git config --global user.email "github-actions[bot]@users.noreply.github.com"
+
+      - name: Fetch gh-pages
+        run: git fetch origin gh-pages --depth=1 || true
+
+      - name: Bootstrap gh-pages (wipe old flat-deploy content)
+        if: ${{ inputs.bootstrap == true }}
+        run: mike delete --all --push --allow-empty
+
+      - name: Deploy dev docs with mike
+        run: mike deploy --push --update-aliases dev
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
@@ -126,3 +126,40 @@ jobs:
           tags: |
             ghcr.io/${{ github.repository }}:${{ steps.version.outputs.version }}
             ghcr.io/${{ github.repository }}:latest
+
+  # ── 6. Versioned docs → gh-pages via mike (final releases only) ──────
+  docs:
+    needs: test
+    # Substring match on the tag name: any tag containing "rc" skips this job.
+    if: ${{ !contains(github.ref_name, 'rc') }}
+    runs-on: ubuntu-latest
+    permissions:
+      # Required because the mike commands below run with --push.
+      contents: write
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+
+      - uses: actions/setup-python@v5
+        with:
+          python-version: "3.11"
+
+      - name: Install docs dependencies
+        run: pip install -e ".[docs]"
+
+      - name: Configure git for mike
+        run: |
+          git config --global user.name "github-actions[bot]"
+          git config --global user.email "github-actions[bot]@users.noreply.github.com"
+
+      - name: Fetch gh-pages
+        # `|| true` keeps the job going when the fetch fails
+        # (e.g. the gh-pages branch does not exist yet).
+        run: git fetch origin gh-pages --depth=1 || true
+
+      - name: Extract version
+        id: version
+        # Strip the leading "v" from the tag name (v0.3.0 -> 0.3.0).
+        run: echo "version=${GITHUB_REF_NAME#v}" >> "$GITHUB_OUTPUT"
+
+      - name: Deploy versioned docs and update latest alias
+        # The version is derived from the tag name, so hand it to the shell
+        # through the environment instead of interpolating it into the script
+        # text; a tag containing shell metacharacters then stays inert.
+        env:
+          DOCS_VERSION: ${{ steps.version.outputs.version }}
+        run: mike deploy --push --update-aliases "$DOCS_VERSION" latest
+
+      - name: Set default to latest
+        run: mike set-default --push latest
diff --git a/.gitignore b/.gitignore
@@ -60,3 +60,4 @@ src/afquery/_version.py
 *.parquet
 db/*
 site
+CONTEXT.md
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
@@ -0,0 +1,35 @@
+# Contributing
+
+## Documentation deployment
+
+The published site at <https://dlopez-bioinfo.github.io/afquery/> is built with [MkDocs](https://www.mkdocs.org/) (Material theme) and versioned with [mike](https://github.com/jimporter/mike). Source content lives under `docs/` and is configured by `mkdocs.yml`.
+
+### CI workflows
+
+| Workflow | Trigger | What it does |
+|---|---|---|
+| `.github/workflows/docs.yml` | push to `master` (paths: `docs/**`, `mkdocs.yml`, `src/**`) and manual dispatch | `mike deploy --push --update-aliases dev` — publishes the working master as the `dev` version |
+| `.github/workflows/release.yml` (job `docs`) | push of a `v*` tag (non-`rc`) | `mike deploy --push --update-aliases <version> latest` followed by `mike set-default --push latest` — publishes the tagged version, points `latest` at it, and makes `latest` the site root |
+
+### Cutting a release
+
+Tag the commit with a [PEP 440](https://peps.python.org/pep-0440/)-compatible version prefixed with `v` (e.g. `v0.3.0`) and push the tag. The `release.yml` workflow handles PyPI, the GitHub release, the Docker image, and the versioned docs deploy. Pre-release tags (anything containing `rc`) skip Docker and docs publishing.
+
+### Local preview
+
+```bash
+pip install -e ".[docs]"
+
+# Plain build — no versioning, fastest iteration on content
+mkdocs serve
+
+# Versioned layout — only after running mike deploy locally at least once
+mike deploy 0.0.0-test dev      # writes to local gh-pages branch (no push)
+mike serve                      # serves the gh-pages branch with version selector
+```
+
+To discard local mike state: `git branch -D gh-pages`.
+
+### Migrating from a flat `mkdocs gh-deploy` (one-time)
+
+When `gh-pages` still contains content from the previous flat deploy, the first `mike deploy` will leave the old root-level files in place. To wipe `gh-pages` clean before deploying, manually trigger the *Deploy Documentation* workflow with `bootstrap: true`. This runs `mike delete --all --push --allow-empty` before deploying, leaving only the versioned layout (`versions.json` + version subdirectories + redirector at root).
diff --git a/docs/advanced/coverage-evidence.md b/docs/advanced/coverage-evidence.md
@@ -0,0 +1,192 @@
+# Coverage Evidence
+
+Standard variant-only VCFs do not record hom-ref calls. When a sample has no
+entry at a position, AFQuery has to decide whether the sample is genuinely
+homozygous reference or simply was not sequenced there. For **fully-covered
+techs** — those registered without a BED capture file in the manifest, so
+every position is assumed to be sequenced — the answer is unambiguous. For
+**partially-covered techs** (whole-exome kits, gene panels), the BED proves a
+position was *targeted* by the assay, not that *this* sample was sequenced
+deeply enough to call a confident hom-ref.
+
+`N_NO_COVERAGE` lets you label that uncertain subset instead of forcing it
+into `N_HOM_REF`. The flags below decide *which* samples land there.
+
+---
+
+## What `N_NO_COVERAGE` represents
+
+`N_NO_COVERAGE` counts eligible samples whose hom-ref status is not trusted
+under the active criteria. The genotype invariant becomes:
+
+```
+N_HET + N_HOM_ALT + N_HOM_REF + N_FAIL + N_NO_COVERAGE = n_eligible
+```
+
+Samples in `N_NO_COVERAGE` remain in `eligible` and contribute to `AN` (just
+like `N_FAIL`), so AC/AN/AF stay conservative — the field never inflates
+allele frequencies. Two rules always hold:
+
+- **Carriers are never reclassified.** A sample with a `het`, `hom`, or
+  `fail` call at the position stays in its category. `N_NO_COVERAGE` only
+  draws from non-carriers.
+- **Fully-covered samples are never gated.** Every coverage flag is a
+  *per-tech* decision evaluated only on partially-covered techs. Samples on
+  fully-covered techs are always treated as hom-ref when they have no
+  carrier call.
+
+---
+
+## Cohort-evidence gates at query time
+
+These flags use only the carriers already present in your cohort to decide
+whether each partially-covered tech has enough evidence to trust hom-ref at a
+position. They run at query time, so no database rebuild is needed.
+
+| Flag | Effect |
+|------|--------|
+| `--min-pass K`     | A partially-covered tech must have ≥K PASS carriers (`het ∪ hom`) at the position. If it falls short, all of its non-carrier samples move from `N_HOM_REF` to `N_NO_COVERAGE`. |
+| `--min-observed K` | Same shape, but counts every recorded carrier (`het ∪ hom ∪ fail`). Useful when a non-PASS call still proves the position was sequenced. |
+
+When both flags are >0, both must hold (AND). The default `0` disables the
+gate.
+
+!!! tip
+    If your VCFs do not carry `FORMAT/DP` or `FORMAT/GQ`, these are the
+    flags you want. They are the cheapest option and apply to any database.
+
+### Worked example
+
+The numbers below are illustrative; concrete values depend on your cohort.
+
+Default query — every BED-covered non-carrier counts as hom-ref:
+
+```bash
+afquery query --db ./db/ --locus chr1:925952
+```
+
+```
+chr1:925952 G>A  AC=142  AN=2742  AF=0.0518  n_eligible=1371  N_HET=138  N_HOM_ALT=2  N_HOM_REF=1231  N_FAIL=0  N_NO_COVERAGE=0
+```
+
+Now require at least one PASS carrier per partially-covered tech:
+
+```bash
+afquery query --db ./db/ --locus chr1:925952 --min-pass 1
+```
+
+```
+chr1:925952 G>A  AC=142  AN=2742  AF=0.0518  n_eligible=1371  N_HET=138  N_HOM_ALT=2  N_HOM_REF=1108  N_FAIL=0  N_NO_COVERAGE=123
+```
+
+Samples on partially-covered techs that did not contribute a single PASS
+carrier at this position have moved out of `N_HOM_REF` and into
+`N_NO_COVERAGE`. `AC`, `AN`, and `AF` are unchanged: the samples are still
+eligible, they just no longer count as confident hom-refs.
+
+---
+
+## Quality-aware filtering at database creation
+
+If your VCFs carry `FORMAT/DP`, `FORMAT/GQ`, or you trust the `QUAL` column,
+you can demand that carriers meet quality thresholds before they count as
+evidence for hom-ref. These flags apply when you create the database, so the
+coverage decision is baked in.
+
+| Flag (`create-db`) | Effect |
+|--------------------|--------|
+| `--min-dp D`     | Minimum `FORMAT/DP` per carrier. |
+| `--min-gq G`     | Minimum `FORMAT/GQ` per carrier. |
+| `--min-qual Q`   | Minimum VCF `QUAL` per carrier. |
+| `--min-covered K`| Per partially-covered tech, the position is "trusted" only if at least K of its carriers pass the quality thresholds. Non-carriers of failing positions are recorded as `N_NO_COVERAGE`. |
+
+A carrier counts as quality-passing only if **all** active thresholds hold
+(unset thresholds are simply ignored). At least one of these flags must be
+non-zero to enable quality-aware coverage filtering — without that, queries
+fall back to the cohort-evidence gates above.
+
+```bash
+afquery create-db \
+  --manifest samples.tsv \
+  --output-dir ./db/ \
+  --genome-build GRCh38 \
+  --bed-dir ./beds/ \
+  --min-dp 30 --min-gq 20 --min-covered 1
+```
+
+!!! note
+    The chosen thresholds are recorded with the database and re-applied
+    automatically when you grow it via `update-db --add-samples`. You do
+    not re-pass them on each update.
+
+Enabling quality-aware filtering requires creating (or re-creating) the
+database; existing databases without quality data must be rebuilt.
+
+### Tightening at query time — `--min-quality-evidence`
+
+Once a database has been built with at least one of `--min-dp`,
+`--min-gq`, `--min-qual`, or `--min-covered`, you can tighten the gate at
+query time without rebuilding:
+
+```bash
+afquery query --db ./db/ --locus chr1:925952 --min-quality-evidence 5
+```
+
+`--min-quality-evidence K` requires each partially-covered tech to have ≥K
+quality-passing carriers at the position. Non-carriers of failing techs
+(other than those already filtered at build time) move to `N_NO_COVERAGE`.
+
+Running the flag against a database that was not built with quality data
+exits with a clear error:
+
+```
+This database was not built with coverage quality data.
+Re-create with --min-dp / --min-gq to use --min-quality-evidence.
+```
+
+---
+
+## Choosing thresholds
+
+Three concrete profiles, ordered from cheapest to strictest:
+
+- **Pure-genotype cohorts** (no `FORMAT/DP` / `FORMAT/GQ` / reliable `QUAL`):
+  use `--min-pass 1` at query time, or `--min-observed 1` if you want
+  failed calls to also count as evidence the position was sequenced.
+  No rebuild is needed. This profile is conservative: positions where your
+  cohort happens to have zero PASS calls flip to `N_NO_COVERAGE`.
+
+- **Cohorts with `FORMAT/DP` and `FORMAT/GQ`**:
+  build with `--min-dp 20 --min-gq 20 --min-covered 1`. Carriers with low
+  confidence stop validating positions, and the decision is stored in the
+  database — every query benefits without further flags.
+
+- **High-stakes clinical interpretation**:
+  layer `--min-quality-evidence 3` (or higher) on top of a quality-aware
+  database to demand multiple independent quality-passing carriers per tech
+  before trusting hom-ref.
+
+---
+
+## How the filters combine
+
+`N_NO_COVERAGE` is the union of:
+
+1. samples whose tech failed the build-time `--min-covered` gate;
+2. samples whose tech failed `--min-pass` / `--min-observed` at query time;
+3. samples whose tech failed `--min-quality-evidence`.
+
+Carriers are never included; the same sample is never counted twice.
+
+---
+
+## Next Steps
+
+- [Understanding Output](../getting-started/understanding-output.md) —
+  field definitions for `N_HOM_REF`, `N_FAIL`, and `N_NO_COVERAGE`
+- [FILTER=PASS Tracking](filter-pass-tracking.md) —
+  the related `N_FAIL` field for failed-quality carrier calls
+- [Technology Integration](../use-cases/technology-integration.md) —
+  mixing whole-genome, whole-exome, and panel data in one cohort
+- [Debugging Results](debugging-results.md) —
+  diagnosing unexpected `N_NO_COVERAGE` or AN values
diff --git a/docs/faq.md b/docs/faq.md
@@ -174,6 +174,19 @@ Use `afquery info --db ./db/` to list all registered codes before running querie
 
 ---
 
+## Is heteroplasmy taken into account when calculating allele frequency in mitochondrial variants? 
+
+Not explicitly. Proper quantification of heteroplasmy requires a specialized approach, similar to somatic variant analysis (e.g., in cancer), where a genomic position may contain multiple subpopulations of variants with different allele fractions. This type of modeling is not part of standard germline variant calling.
+
+In tools such as GATK operating in germline mode, genotypes are assigned based on the ploidy defined for the region, without representing a continuous spectrum of allele frequencies:
+
+- If the region is treated as haploid (as is typical for mitochondrial DNA), the caller reports the majority allele. If the signal is ambiguous, the position may be marked as uncertain.
+- If modeled as diploid, the caller fits genotypes into discrete states (e.g., 0/1 or 1/1). Allele fractions near 50% are typically classified as heterozygous.
+
+As a result, intermediate heteroplasmy levels (such as 20%) are not explicitly represented. Instead, they are forced into one of these discrete genotype states or lost as uncertainty.
+
+Therefore, this limitation arises from the variant calling step. AFQuery operates on already discretized genotypes according to ploidy and does not model heteroplasmy as a continuous variable.
+
+---
+
 ## Common Pitfalls
 
 ### What if AN is very low?

diff --git a/docs/getting-started/preprocessing.md b/docs/getting-started/preprocessing.md
@@ -33,6 +33,7 @@ Male samples should have haploid genotype calls at chrX non-PAR regions (GT=`1`,
 - **Non-PASS genotypes**: Masking them as missing ensures that low-quality calls do not inflate AC. AFQuery tracks these as N_FAIL.
 - **Homozygous reference calls**: Removing ref/ref genotypes reduces file size and speeds ingestion; they contribute AC=0 and are not needed.
 - **INFO fields**: Stripping INFO reduces file size and speeds ingestion. Additionally, malformed or non-standard INFO fields produced by some variant callers can break downstream parsing; stripping them pre-emptively prevents these errors.
+- **FORMAT fields**: Only `GT`, `DP`, and `GQ` are preserved. `GT` is the genotype required by AFQuery for all queries. `DP` and `GQ` are read by the `--min-dp` and `--min-gq` flags of `afquery create-db` (see [Coverage Evidence](../advanced/coverage-evidence.md); `--min-qual` uses the VCF `QUAL` column instead). VCFs without `DP`/`GQ` are still valid; their carriers simply contribute no quality evidence to the cohort. All other FORMAT fields (`PL`, `AD`, etc.) are dropped to reduce file size.
 
 ---
 

diff --git a/docs/getting-started/understanding-output.md b/docs/getting-started/understanding-output.md
@@ -16,6 +16,7 @@ This page explains what each field in AFQuery output means and how to interpret
 | **N_HOM_REF** | int | Number of eligible samples homozygous reference (GT=0/0 or GT=0) |
 | **n_eligible** | int | Number of samples in the eligible set (after sex/phenotype/tech filters) |
 | **N_FAIL** | int | Number of eligible samples with a non-ref allele called but FILTER≠PASS at this position. These samples are counted *only* in N_FAIL — not in N_HET, N_HOM_ALT, or N_HOM_REF. |
+| **N_NO_COVERAGE** | int | Number of eligible samples whose tech lacks coverage evidence at this position. These samples are not counted in `N_HOM_REF`, but they remain in `n_eligible` and still contribute to `AN` (like `N_FAIL`), so AC/AN/AF are unchanged. Always `0` unless a coverage-evidence filter is active. See [Coverage Evidence](../advanced/coverage-evidence.md). |
 
 
 ---
@@ -29,7 +30,7 @@ afquery query --db ./db/ --locus chr1:925952 --ref G --alt A
 ```
 
 ```
-chr1:925952 G>A  AC=3  AN=120  AF=0.0250  n_eligible=60  N_HET=1  N_HOM_ALT=1  N_HOM_REF=57  N_FAIL=0
+chr1:925952 G>A  AC=3  AN=120  AF=0.0250  n_eligible=60  N_HET=1  N_HOM_ALT=1  N_HOM_REF=57  N_FAIL=0  N_NO_COVERAGE=0
 ```
 
 ### TSV
@@ -39,8 +40,8 @@ afquery query --db ./db/ --locus chr1:925952 --ref G --alt A --format tsv
 ```
 
 ```
-chrom	pos	ref	alt	AC	AN	AF	n_eligible	N_HET	N_HOM_ALT	N_HOM_REF	N_FAIL
-chr1	925952	G	A	3	120	0.025000	60	1	1	57	0
+chrom	pos	ref	alt	AC	AN	AF	n_eligible	N_HET	N_HOM_ALT	N_HOM_REF	N_FAIL	N_NO_COVERAGE
+chr1	925952	G	A	3	120	0.025000	60	1	1	57	0	0
 ```
 
 ### JSON
@@ -62,7 +63,8 @@ afquery query --db ./db/ --locus chr1:925952 --ref G --alt A --format json
   "N_HET": 1,
   "N_HOM_ALT": 1,
   "N_HOM_REF": 57,
-  "N_FAIL": 0
+  "N_FAIL": 0,
+  "N_NO_COVERAGE": 0
 }
 ```
 
@@ -126,6 +128,7 @@ When using `afquery annotate`, the following INFO fields are added to each varia
 | `AFQUERY_N_HOM_ALT` | A (per ALT) | Homozygous alt sample count per ALT allele |
 | `AFQUERY_N_HOM_REF` | A (per ALT) | Homozygous ref sample count per ALT allele |
 | `AFQUERY_N_FAIL` | 1 (per site) | Fail sample count — shared across all ALT alleles |
+| `AFQUERY_N_NO_COVERAGE` | A (per ALT) | Eligible samples whose tech lacks coverage evidence at this position. Always `0` unless a coverage-evidence filter is active. See [Coverage Evidence](../advanced/coverage-evidence.md). |
 
 !!! note "Multi-allelic sites"
     Number=A fields have one value per ALT allele (comma-separated for multi-allelic sites). Number=1 fields are shared across all ALT alleles at the same position.
-Original file line number
+Diff line change
@@ Expand Up / @@ -60,3 +60,4 @@ src/afquery/_version.py @@
     *.parquet
     db/*
     site
+    CONTEXT.md