diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index fcc7994..5fec24b 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -8,11 +8,14 @@ on: - 'mkdocs.yml' - 'src/**' workflow_dispatch: + inputs: + bootstrap: + description: 'Wipe gh-pages before deploying (one-time when migrating from mkdocs gh-deploy to mike)' + type: boolean + default: false permissions: contents: write - pages: write - id-token: write concurrency: group: pages @@ -21,11 +24,10 @@ concurrency: jobs: deploy: runs-on: ubuntu-latest - environment: - name: github-pages - url: ${{ steps.deployment.outputs.page_url }} steps: - uses: actions/checkout@v4 + with: + fetch-depth: 0 - uses: actions/setup-python@v5 with: @@ -34,5 +36,17 @@ jobs: - name: Install docs dependencies run: pip install -e ".[docs]" - - name: Build and deploy docs - run: mkdocs gh-deploy --force + - name: Configure git for mike + run: | + git config --global user.name "github-actions[bot]" + git config --global user.email "github-actions[bot]@users.noreply.github.com" + + - name: Fetch gh-pages + run: git fetch origin gh-pages --depth=1 || true + + - name: Bootstrap gh-pages (wipe old flat-deploy content) + if: ${{ inputs.bootstrap == true }} + run: mike delete --all --push --allow-empty + + - name: Deploy dev docs with mike + run: mike deploy --push --update-aliases dev diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 554e52c..3736545 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -126,3 +126,40 @@ jobs: tags: | ghcr.io/${{ github.repository }}:${{ steps.version.outputs.version }} ghcr.io/${{ github.repository }}:latest + + # ── 6. 
Versioned docs → gh-pages via mike (final releases only) ────── + docs: + needs: test + if: ${{ !contains(github.ref_name, 'rc') }} + runs-on: ubuntu-latest + permissions: + contents: write + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - uses: actions/setup-python@v5 + with: + python-version: "3.11" + + - name: Install docs dependencies + run: pip install -e ".[docs]" + + - name: Configure git for mike + run: | + git config --global user.name "github-actions[bot]" + git config --global user.email "github-actions[bot]@users.noreply.github.com" + + - name: Fetch gh-pages + run: git fetch origin gh-pages --depth=1 || true + + - name: Extract version + id: version + run: echo "version=${GITHUB_REF_NAME#v}" >> $GITHUB_OUTPUT + + - name: Deploy versioned docs and update latest alias + run: mike deploy --push --update-aliases ${{ steps.version.outputs.version }} latest + + - name: Set default to latest + run: mike set-default --push latest diff --git a/.gitignore b/.gitignore index 988d3de..0d74ea6 100644 --- a/.gitignore +++ b/.gitignore @@ -60,3 +60,4 @@ src/afquery/_version.py *.parquet db/* site +CONTEXT.md diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000..d4ce12c --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,35 @@ +# Contributing + +## Documentation deployment + +The published site is built with [MkDocs](https://www.mkdocs.org/) (Material theme) and versioned with [mike](https://github.com/jimporter/mike). Source content lives under `docs/` and is configured by `mkdocs.yml`. 
+ +### CI workflows + +| Workflow | Trigger | What it does | +|---|---|---| +| `.github/workflows/docs.yml` | push to `master` (paths: `docs/**`, `mkdocs.yml`, `src/**`) and manual dispatch | `mike deploy --push --update-aliases dev` — publishes the working master under the `dev` alias | +| `.github/workflows/release.yml` (job `docs`) | push of a `v*` tag (non-`rc`) | `mike deploy --push --update-aliases X.Y.Z latest` (where `X.Y.Z` is the tag name without its leading `v`) followed by `mike set-default --push latest` — publishes the tagged version, points `latest` at it, and makes `latest` the site root | + +### Cutting a release + +Tag the commit with a [PEP 440](https://peps.python.org/pep-0440/)-compatible version prefixed with `v` (e.g. `v0.3.0`) and push the tag. The `release.yml` workflow handles PyPI, the GitHub release, the Docker image, and the versioned docs deploy. Pre-release tags (anything containing `rc`) skip Docker and docs publishing. + +### Local preview + +```bash +pip install -e ".[docs]" + +# Plain build — no versioning, fastest iteration on content +mkdocs serve + +# Versioned layout — only after running mike deploy locally at least once +mike deploy 0.0.0-test dev # writes to local gh-pages branch (no push) +mike serve # serves the gh-pages branch with version selector +``` + +To discard local mike state: `git branch -D gh-pages`. + +### Migrating from a flat `mkdocs gh-deploy` (one-time) + +When `gh-pages` still contains content from the previous flat deploy, the first `mike deploy` will leave the old root-level files in place. To wipe `gh-pages` clean before deploying, manually trigger the *Deploy Documentation* workflow with `bootstrap: true`. This runs `mike delete --all --push --allow-empty` before deploying, leaving only the versioned layout (`versions.json` + version subdirectories + redirector at root). 
diff --git a/docs/advanced/coverage-evidence.md b/docs/advanced/coverage-evidence.md new file mode 100644 index 0000000..364c1f2 --- /dev/null +++ b/docs/advanced/coverage-evidence.md @@ -0,0 +1,192 @@ +# Coverage Evidence + +Standard variant-only VCFs do not record hom-ref calls. When a sample has no +entry at a position, AFQuery has to decide whether the sample is genuinely +homozygous reference or simply was not sequenced there. For **fully-covered +techs** — those registered without a BED capture file in the manifest, so +every position is assumed to be sequenced — the answer is unambiguous. For +**partially-covered techs** (whole-exome kits, gene panels), the BED proves a +position was *targeted* by the assay, not that *this* sample was sequenced +deeply enough to call a confident hom-ref. + +`N_NO_COVERAGE` lets you label that uncertain subset instead of forcing it +into `N_HOM_REF`. The flags below decide *which* samples land there. + +--- + +## What `N_NO_COVERAGE` represents + +`N_NO_COVERAGE` counts eligible samples whose hom-ref status is not trusted +under the active criteria. The genotype invariant becomes: + +``` +N_HET + N_HOM_ALT + N_HOM_REF + N_FAIL + N_NO_COVERAGE = n_eligible +``` + +Samples in `N_NO_COVERAGE` remain in `eligible` and contribute to `AN` (just +like `N_FAIL`), so AC/AN/AF stay conservative — the field never inflates +allele frequencies. Two rules always hold: + +- **Carriers are never reclassified.** A sample with a `het`, `hom`, or + `fail` call at the position stays in its category. `N_NO_COVERAGE` only + draws from non-carriers. +- **Fully-covered samples are never gated.** Every coverage flag is a + *per-tech* decision evaluated only on partially-covered techs. Samples on + fully-covered techs are always treated as hom-ref when they have no + carrier call. 
+ +--- + +## Cohort-evidence gates at query time + +These flags use only the carriers already present in your cohort to decide +whether each partially-covered tech has enough evidence to trust hom-ref at a +position. They run at query time, so no database rebuild is needed. + +| Flag | Effect | +|------|--------| +| `--min-pass K` | A partially-covered tech must have ≥K PASS carriers (`het ∪ hom`) at the position. If it falls short, all of its non-carrier samples move from `N_HOM_REF` to `N_NO_COVERAGE`. | +| `--min-observed K` | Same shape, but counts every recorded carrier (`het ∪ hom ∪ fail`). Useful when a non-PASS call still proves the position was sequenced. | + +When both flags are >0, both must hold (AND). The default `0` disables the +gate. + +!!! tip + If your VCFs do not carry `FORMAT/DP` or `FORMAT/GQ`, these are the + flags you want. They are the cheapest option and apply to any database. + +### Worked example + +The numbers below are illustrative; concrete values depend on your cohort. + +Default query — every BED-covered non-carrier counts as hom-ref: + +```bash +afquery query --db ./db/ --locus chr1:925952 +``` + +``` +chr1:925952 G>A AC=142 AN=2742 AF=0.0518 n_eligible=1371 N_HET=138 N_HOM_ALT=2 N_HOM_REF=1231 N_FAIL=0 N_NO_COVERAGE=0 +``` + +Now require at least one PASS carrier per partially-covered tech: + +```bash +afquery query --db ./db/ --locus chr1:925952 --min-pass 1 +``` + +``` +chr1:925952 G>A AC=142 AN=2742 AF=0.0518 n_eligible=1371 N_HET=138 N_HOM_ALT=2 N_HOM_REF=1108 N_FAIL=0 N_NO_COVERAGE=123 +``` + +Samples on partially-covered techs that did not contribute a single PASS +carrier at this position have moved out of `N_HOM_REF` and into +`N_NO_COVERAGE`. `AC`, `AN`, and `AF` are unchanged: the samples are still +eligible, they just no longer count as confident hom-refs. 
+ +--- + +## Quality-aware filtering at database creation + +If your VCFs carry `FORMAT/DP`, `FORMAT/GQ`, or you trust the `QUAL` column, +you can demand that carriers meet quality thresholds before they count as +evidence for hom-ref. These flags apply when you create the database, so the +coverage decision is baked in. + +| Flag (`create-db`) | Effect | +|--------------------|--------| +| `--min-dp D` | Minimum `FORMAT/DP` per carrier. | +| `--min-gq G` | Minimum `FORMAT/GQ` per carrier. | +| `--min-qual Q` | Minimum VCF `QUAL` per carrier. | +| `--min-covered K`| Per partially-covered tech, the position is "trusted" only if at least K of its carriers pass the quality thresholds. Non-carriers of failing positions are recorded as `N_NO_COVERAGE`. | + +A carrier counts as quality-passing only if **all** active thresholds hold +(unset thresholds are simply ignored). At least one of these flags must be +non-zero to enable quality-aware coverage filtering — without that, queries +fall back to the cohort-evidence gates above. + +```bash +afquery create-db \ + --manifest samples.tsv \ + --output-dir ./db/ \ + --genome-build GRCh38 \ + --bed-dir ./beds/ \ + --min-dp 30 --min-gq 20 --min-covered 1 +``` + +!!! note + The chosen thresholds are recorded with the database and re-applied + automatically when you grow it via `update-db --add-samples`. You do + not re-pass them on each update. + +Enabling quality-aware filtering requires creating (or re-creating) the +database; existing databases without quality data must be rebuilt. + +### Tightening at query time — `--min-quality-evidence` + +Once a database has been built with at least one of `--min-dp`, +`--min-gq`, `--min-qual`, or `--min-covered`, you can tighten the gate at +query time without rebuilding: + +```bash +afquery query --db ./db/ --locus chr1:925952 --min-quality-evidence 5 +``` + +`--min-quality-evidence K` requires each partially-covered tech to have ≥K +quality-passing carriers at the position. 
Non-carriers of failing techs +(other than those already filtered at build time) move to `N_NO_COVERAGE`. + +Running the flag against a database that was not built with quality data +exits with a clear error: + +``` +This database was not built with coverage quality data. +Re-create with --min-dp / --min-gq to use --min-quality-evidence. +``` + +--- + +## Choosing thresholds + +Three concrete profiles, ordered from cheapest to strictest: + +- **Pure-genotype cohorts** (no `FORMAT/DP` / `FORMAT/GQ` / reliable `QUAL`) + Use `--min-pass 1` at query time. Or `--min-observed 1` if you want + failed calls to also count as evidence the position was sequenced. + No rebuild needed; conservative — positions where your cohort happens to + have zero PASS calls flip to `N_NO_COVERAGE`. + +- **Cohorts with `FORMAT/DP` and `FORMAT/GQ`** + Build with `--min-dp 20 --min-gq 20 --min-covered 1`. Carriers with low + confidence stop validating positions, and the decision is stored in the + database — every query benefits without further flags. + +- **High-stakes clinical interpretation** + Layer `--min-quality-evidence 3` (or higher) on top of a quality-aware + database to demand multiple independent quality-passing carriers per tech + before trusting hom-ref. + +--- + +## How the filters combine + +`N_NO_COVERAGE` is the union of: + +1. samples whose tech failed the build-time `--min-covered` gate; +2. samples whose tech failed `--min-pass` / `--min-observed` at query time; +3. samples whose tech failed `--min-quality-evidence`. + +Carriers are never included; the same sample is never counted twice. 
+ +--- + +## Next Steps + +- [Understanding Output](../getting-started/understanding-output.md) — + field definitions for `N_HOM_REF`, `N_FAIL`, and `N_NO_COVERAGE` +- [FILTER=PASS Tracking](filter-pass-tracking.md) — + the related `N_FAIL` field for failed-quality carrier calls +- [Technology Integration](../use-cases/technology-integration.md) — + mixing whole-genome, whole-exome, and panel data in one cohort +- [Debugging Results](debugging-results.md) — + diagnosing unexpected `N_NO_COVERAGE` or AN values diff --git a/docs/faq.md b/docs/faq.md index bca7372..33fd8ed 100644 --- a/docs/faq.md +++ b/docs/faq.md @@ -174,6 +174,19 @@ Use `afquery info --db ./db/` to list all registered codes before running querie --- +## Is heteroplasmy taken into account when calculating allele frequency in mitochondrial variants? + +Not explicitly. Proper quantification of heteroplasmy requires a specialized approach, similar to somatic variant analysis (e.g., in cancer), where a genomic position may contain multiple subpopulations of variants with different allele fractions. This type of modeling is not part of standard germline variant calling. + +In tools such as GATK operating in germline mode, genotypes are assigned based on the ploidy defined for the region, without representing a continuous spectrum of allele frequencies: + +- If the region is treated as haploid (as is typical for mitochondrial DNA), the caller reports the majority allele. If the signal is ambiguous, the position may be marked as uncertain. +- If modeled as diploid, the caller fits genotypes into discrete states (e.g., 0/1 or 1/1). Allele fractions near 50% are typically classified as heterozygous. + +As a result, intermediate heteroplasmy levels (such as 20%) are not explicitly represented. Instead, they are forced into one of these discrete genotype states or lost as uncertainty. + +Therefore, this limitation arises from the variant calling step. 
The application operates on already discretized genotypes according to ploidy and does not model heteroplasmy as a continuous variable. + ## Common Pitfalls ### What if AN is very low? diff --git a/docs/getting-started/preprocessing.md b/docs/getting-started/preprocessing.md index cdf6c04..080036f 100644 --- a/docs/getting-started/preprocessing.md +++ b/docs/getting-started/preprocessing.md @@ -33,6 +33,7 @@ Male samples should have haploid genotype calls at chrX non-PAR regions (GT=`1`, - **Non-PASS genotypes**: Masking them as missing ensures that low-quality calls do not inflate AC. AFQuery tracks these as N_FAIL. - **Homozygous reference calls**: Removing ref/ref genotypes reduces file size and speeds ingestion; they contribute AC=0 and are not needed - **INFO fields**: Stripping INFO reduces file size and speeds ingestion. Additionally, malformed or non-standard INFO fields produced by some variant callers can break downstream parsing; stripping them pre-emptively prevents these errors. +- **FORMAT fields**: Only `GT`, `DP`, and `GQ` are preserved. `GT` is the genotype required by AFQuery for all queries. `DP` and `GQ` are read by the [coverage-evidence](../advanced/coverage-evidence.md) quality flags (`afquery create-db --min-dp / --min-gq / --min-qual / --min-covered`). VCFs without `DP`/`GQ` are still valid; their carriers simply contribute no quality evidence to the cohort. All other FORMAT fields (`PL`, `AD`, etc.) are dropped to reduce file size. 
--- diff --git a/docs/getting-started/understanding-output.md b/docs/getting-started/understanding-output.md index edc8d23..ecf6c7f 100644 --- a/docs/getting-started/understanding-output.md +++ b/docs/getting-started/understanding-output.md @@ -16,6 +16,7 @@ This page explains what each field in AFQuery output means and how to interpret | **N_HOM_REF** | int | Number of eligible samples homozygous reference (GT=0/0 or GT=0) | | **n_eligible** | int | Number of samples in the eligible set (after sex/phenotype/tech filters) | | **N_FAIL** | int | Number of eligible samples with a non-ref allele called but FILTER≠PASS at this position. These samples are counted *only* in N_FAIL — not in N_HET, N_HOM_ALT, or N_HOM_REF. | +| **N_NO_COVERAGE** | int | Number of eligible samples whose tech lacks coverage evidence at this position. Excluded from `N_HOM_REF` to keep AC/AN conservative. Always `0` unless a coverage-evidence filter is active. See [Coverage Evidence](../advanced/coverage-evidence.md). 
| --- @@ -29,7 +30,7 @@ afquery query --db ./db/ --locus chr1:925952 --ref G --alt A ``` ``` -chr1:925952 G>A AC=3 AN=120 AF=0.0250 n_eligible=60 N_HET=1 N_HOM_ALT=1 N_HOM_REF=57 N_FAIL=0 +chr1:925952 G>A AC=3 AN=120 AF=0.0250 n_eligible=60 N_HET=1 N_HOM_ALT=1 N_HOM_REF=57 N_FAIL=0 N_NO_COVERAGE=0 ``` ### TSV @@ -39,8 +40,8 @@ afquery query --db ./db/ --locus chr1:925952 --ref G --alt A --format tsv ``` ``` -chrom pos ref alt AC AN AF n_eligible N_HET N_HOM_ALT N_HOM_REF N_FAIL -chr1 925952 G A 3 120 0.025000 60 1 1 57 0 +chrom pos ref alt AC AN AF n_eligible N_HET N_HOM_ALT N_HOM_REF N_FAIL N_NO_COVERAGE +chr1 925952 G A 3 120 0.025000 60 1 1 57 0 0 ``` ### JSON @@ -62,7 +63,8 @@ afquery query --db ./db/ --locus chr1:925952 --ref G --alt A --format json "N_HET": 1, "N_HOM_ALT": 1, "N_HOM_REF": 57, - "N_FAIL": 0 + "N_FAIL": 0, + "N_NO_COVERAGE": 0 } ``` @@ -126,6 +128,7 @@ When using `afquery annotate`, the following INFO fields are added to each varia | `AFQUERY_N_HOM_ALT` | A (per ALT) | Homozygous alt sample count per ALT allele | | `AFQUERY_N_HOM_REF` | A (per ALT) | Homozygous ref sample count per ALT allele | | `AFQUERY_N_FAIL` | 1 (per site) | Fail sample count — shared across all ALT alleles | +| `AFQUERY_N_NO_COVERAGE` | A (per ALT) | Eligible samples whose tech lacks coverage evidence at this position. Always `0` unless a coverage-evidence filter is active. See [Coverage Evidence](../advanced/coverage-evidence.md). | !!! note "Multi-allelic sites" Number=A fields have one value per ALT allele (comma-separated for multi-allelic sites). Number=1 fields are shared across all ALT alleles at the same position. 
diff --git a/docs/guides/annotate-vcf.md b/docs/guides/annotate-vcf.md index 0b617d6..758916f 100644 --- a/docs/guides/annotate-vcf.md +++ b/docs/guides/annotate-vcf.md @@ -27,6 +27,7 @@ afquery annotate \ | `AFQUERY_N_HOM_ALT` | Integer | A (per ALT) | Homozygous alt sample count | | `AFQUERY_N_HOM_REF` | Integer | A (per ALT) | Homozygous ref sample count | | `AFQUERY_N_FAIL` | Integer | 1 (per site) | Samples with FILTER≠PASS and alt allele called. Mutually exclusive with N_HET/N_HOM_ALT/N_HOM_REF. | +| `AFQUERY_N_NO_COVERAGE` | Integer | A (per ALT) | Eligible samples whose tech lacks coverage evidence at this position. Excluded from `N_HOM_REF` to keep AC/AN conservative. Always `0` unless a coverage-evidence filter is active. See [Coverage Evidence](../advanced/coverage-evidence.md). | !!! note "Multi-allelic sites" Number=A fields have one value per ALT allele (comma-separated for multi-allelic sites). Number=1 fields are shared across all ALT alleles at the same position. diff --git a/docs/guides/create-database.md b/docs/guides/create-database.md index 46112a2..9e2c3c4 100644 --- a/docs/guides/create-database.md +++ b/docs/guides/create-database.md @@ -101,6 +101,45 @@ See [FILTER=PASS Tracking](../advanced/filter-pass-tracking.md) for details. --- +## Coverage-Evidence Filters + +Four optional flags enable per-sample, quality-aware tracking of which positions +each partially-covered technology (WES, panels) actually covered. They are +fully opt-in. + +| Flag | Default | Effect | +|------|---------|--------| +| `--min-dp D` | 0 | Minimum `FORMAT/DP` for a carrier to count as quality evidence. | +| `--min-gq G` | 0 | Minimum `FORMAT/GQ` for a carrier to count as quality evidence. | +| `--min-qual Q` | 0.0 | Minimum VCF `QUAL` field for a carrier to count as quality evidence. | +| `--min-covered K`| 0 | Per partially-covered tech, the position is "trusted" only if at least K of its carriers pass the quality thresholds. 
Non-carriers of failing positions are recorded as `N_NO_COVERAGE`. | + +When any of these flags is non-zero AFQuery reads `FORMAT/DP`, `FORMAT/GQ`, +and `QUAL` from each variant call during ingest. Use the bundled +`resources/normalize_vcf.sh` (which preserves these FORMAT fields) or ensure +your own preprocessing keeps them. + +Example: + +```bash +afquery create-db \ + --manifest samples.tsv \ + --output-dir ./db/ \ + --genome-build GRCh38 \ + --bed-dir ./beds/ \ + --min-dp 30 --min-gq 20 --min-covered 1 +``` + +Thresholds are fixed at creation time. `update-db --add-samples` reuses them +and re-applies them to every position whose partially-covered tech receives +new samples (see [Update Database](update-database.md)). + +See [Coverage Evidence](../advanced/coverage-evidence.md) for when to reach +for each flag, how `N_NO_COVERAGE` is computed, and the query-time companion +flag `--min-quality-evidence`. + +--- + ## Validating the Result After creation, run: diff --git a/docs/guides/dump-export.md b/docs/guides/dump-export.md index 25e594a..c8ddf10 100644 --- a/docs/guides/dump-export.md +++ b/docs/guides/dump-export.md @@ -65,9 +65,12 @@ All three disaggregation modes work on the same principle: add stratified column Base columns (always present): ``` -chrom,pos,ref,alt,AC,AN,AF,N_HET,N_HOM_ALT,N_HOM_REF,N_FAIL +chrom,pos,ref,alt,AC,AN,AF,N_HET,N_HOM_ALT,N_HOM_REF,N_FAIL,N_NO_COVERAGE ``` +`N_NO_COVERAGE` is always emitted but is `0` unless a coverage-evidence +filter is active (see [Coverage Evidence](../advanced/coverage-evidence.md)). 
+ === "--by-sex" Add separate columns for male and female: @@ -78,7 +81,7 @@ chrom,pos,ref,alt,AC,AN,AF,N_HET,N_HOM_ALT,N_HOM_REF,N_FAIL Output columns: ``` - chrom,pos,ref,alt,AC,AN,AF,N_HET,N_HOM_ALT,N_HOM_REF,N_FAIL,AC_male,AN_male,AF_male,N_HET_male,N_HOM_ALT_male,N_HOM_REF_male,N_FAIL_male,AC_female,AN_female,AF_female,N_HET_female,N_HOM_ALT_female,N_HOM_REF_female,N_FAIL_female + chrom,pos,ref,alt,AC,AN,AF,N_HET,N_HOM_ALT,N_HOM_REF,N_FAIL,N_NO_COVERAGE,AC_male,AN_male,AF_male,N_HET_male,N_HOM_ALT_male,N_HOM_REF_male,N_FAIL_male,N_NO_COVERAGE_male,AC_female,AN_female,AF_female,N_HET_female,N_HOM_ALT_female,N_HOM_REF_female,N_FAIL_female,N_NO_COVERAGE_female ``` === "--by-tech" @@ -89,7 +92,7 @@ chrom,pos,ref,alt,AC,AN,AF,N_HET,N_HOM_ALT,N_HOM_REF,N_FAIL afquery dump --db ./db/ --by-tech --output by_tech.csv ``` - Output columns include `AC_wgs`, `AN_wgs`, `AF_wgs`, `N_HET_wgs`, `N_HOM_ALT_wgs`, `N_HOM_REF_wgs`, `N_FAIL_wgs`, `AC_wes_v1`, `AN_wes_v1`, etc. (one group of seven columns per registered technology). + Output columns include `AC_wgs`, `AN_wgs`, `AF_wgs`, `N_HET_wgs`, `N_HOM_ALT_wgs`, `N_HOM_REF_wgs`, `N_FAIL_wgs`, `N_NO_COVERAGE_wgs`, `AC_wes_v1`, `AN_wes_v1`, etc. (one group of eight columns per registered technology). === "--by-phenotype" @@ -102,7 +105,7 @@ chrom,pos,ref,alt,AC,AN,AF,N_HET,N_HOM_ALT,N_HOM_REF,N_FAIL --output by_phenotype.csv ``` - Output includes `AC_E11.9`, `AN_E11.9`, `AF_E11.9`, `N_HET_E11.9`, `N_HOM_ALT_E11.9`, `N_HOM_REF_E11.9`, `N_FAIL_E11.9`, `AC_I10`, etc. + Output includes `AC_E11.9`, `AN_E11.9`, `AF_E11.9`, `N_HET_E11.9`, `N_HOM_ALT_E11.9`, `N_HOM_REF_E11.9`, `N_FAIL_E11.9`, `N_NO_COVERAGE_E11.9`, `AC_I10`, etc. === "--all-groups" diff --git a/docs/guides/query.md b/docs/guides/query.md index 8a87021..e0958be 100644 --- a/docs/guides/query.md +++ b/docs/guides/query.md @@ -97,7 +97,7 @@ Batch queries support variants across multiple chromosomes in a single file. 
Human-readable, one block per variant: ``` -chr1:925952 G>A AC=142 AN=2742 AF=0.0518 n_eligible=1371 N_HET=138 N_HOM_ALT=2 N_HOM_REF=1231 N_FAIL=0 +chr1:925952 G>A AC=142 AN=2742 AF=0.0518 n_eligible=1371 N_HET=138 N_HOM_ALT=2 N_HOM_REF=1231 N_FAIL=0 N_NO_COVERAGE=0 ``` ### tsv @@ -109,8 +109,8 @@ afquery query --db ./db/ --region chr1:900000-1000000 --format tsv ``` ``` -chrom pos ref alt AC AN AF n_eligible N_HET N_HOM_ALT N_HOM_REF N_FAIL -chr1 925952 G A 142 2742 0.051782 1371 138 2 1231 0 +chrom pos ref alt AC AN AF n_eligible N_HET N_HOM_ALT N_HOM_REF N_FAIL N_NO_COVERAGE +chr1 925952 G A 142 2742 0.051782 1371 138 2 1231 0 0 ``` @@ -136,13 +136,47 @@ afquery query --db ./db/ --locus chr1:925952 --format json "N_HET": 138, "N_HOM_ALT": 2, "N_HOM_REF": 1231, - "N_FAIL": 0 + "N_FAIL": 0, + "N_NO_COVERAGE": 0 } ] ``` --- +## Coverage-Evidence Filters (no_coverage) + +By default AFQuery counts every BED-covered sample without a variant call as +hom-ref. With standard variant-only VCFs that assumption can be wrong: a missing +position may simply mean the sample was not sequenced deeply enough at that locus. +Three optional flags let you trade hom-ref aggressiveness for confidence. Samples +that fall below a threshold are reported in **N_NO_COVERAGE** instead of N_HOM_REF +(they remain in `eligible` and `AN`, like `N_FAIL`). + +| Flag | Meaning | +|------|---------| +| `--min-pass K` | A partially-covered tech is valid for hom-ref at a position only if it has ≥K PASS carriers (het\|hom). Otherwise its non-carrier samples move to `N_NO_COVERAGE`. | +| `--min-observed K` | Same as `--min-pass`, but counts any VCF entry (`het\|hom\|fail`). Useful when you want to include calls that failed FILTER as evidence the position was sequenced. | +| `--min-quality-evidence K` | Requires ≥K quality-passing carriers per partially-covered tech. Requires a database built with `--min-dp`, `--min-gq`, `--min-qual`, or `--min-covered`. 
| + +`--min-pass` and `--min-observed` combine with AND (both must hold). Both +default to `0`, which disables the gate. + +```bash +afquery query --db ./db/ --locus chr1:925952 --min-pass 1 +afquery query --db ./db/ --region chr1:900000-1000000 --min-observed 2 --min-pass 1 +``` + +The genotype invariant becomes: +`N_HET + N_HOM_ALT + N_HOM_REF + N_FAIL + N_NO_COVERAGE = n_eligible`. + +Fully-covered samples (those whose tech was registered without a BED) are +never affected. Carrier samples (het/hom/fail) are never moved to +`N_NO_COVERAGE`. See [Coverage Evidence](../advanced/coverage-evidence.md) +for when to reach for each flag. + +--- + ## Sample Filtering All query modes support the same filter options: diff --git a/docs/guides/update-database.md b/docs/guides/update-database.md index 7961a88..7df1dfb 100644 --- a/docs/guides/update-database.md +++ b/docs/guides/update-database.md @@ -56,6 +56,28 @@ afquery update-db \ --bed-dir ./beds/ ``` +### Coverage-evidence handling + +If the database was created with `--min-dp` / `--min-gq` / `--min-qual` / +`--min-covered`, the existing thresholds are read from the database and +re-applied to all samples — old and new. There is no `update-db` flag to +override them; thresholds are fixed at creation time so that quality +decisions are comparable across batches. + +When new carriers push a partially-covered tech above the `--min-covered` +threshold at positions that were previously below it, those positions are +re-evaluated and their non-carrier samples once again count as `N_HOM_REF` +instead of `N_NO_COVERAGE`. The recomputation runs only for chromosomes +touched by the new samples; existing rows on other chromosomes are not +rewritten. + +VCFs added via `update-db` should preserve `FORMAT/DP` and `FORMAT/GQ` (the +bundled `resources/normalize_vcf.sh` does so by default). Samples without +those fields are still merged correctly but contribute no quality evidence. 
+ +See [Coverage Evidence](../advanced/coverage-evidence.md) for the full flag +reference and when to use each one. + --- ## Remove Samples diff --git a/docs/guides/variant-info.md b/docs/guides/variant-info.md index c4cde67..dc6fa49 100644 --- a/docs/guides/variant-info.md +++ b/docs/guides/variant-info.md @@ -133,6 +133,7 @@ When `--ref` and `--alt` are specified, the `variant` block contains the actual | `het` | Heterozygous carrier, FILTER=PASS | | `hom` | Homozygous alt carrier, FILTER=PASS | | `alt` | Non-ref carrier with FILTER≠PASS (ploidy unknown) | +| `no_coverage` | Sample's tech lacks coverage evidence at this position; not a carrier. Only appears when a coverage-evidence filter (`--min-pass`, `--min-observed`, `--min-quality-evidence`, or build-time `--min-covered`) is active. The FILTER column is empty (text/tsv) or `null` (JSON) — `PASS`/`FAIL` does not apply because there is no call. See [Coverage Evidence](../advanced/coverage-evidence.md). | --- diff --git a/docs/reference/cli.md b/docs/reference/cli.md index 9ef24ba..9cb83f0 100644 --- a/docs/reference/cli.md +++ b/docs/reference/cli.md @@ -24,6 +24,10 @@ afquery create-db [OPTIONS] | `--bed-dir` | TEXT | None | Directory containing BED files for WES technologies | | `--force` | flag | False | Delete any partial results and restart from scratch | | `--db-version` | TEXT | `1.0` | Version label for this database | +| `--min-dp` | INTEGER | `0` | Minimum `FORMAT/DP` for a carrier to count as quality evidence (0 = disabled) | +| `--min-gq` | INTEGER | `0` | Minimum `FORMAT/GQ` for a carrier to count as quality evidence (0 = disabled) | +| `--min-qual` | FLOAT | `0.0` | Minimum `QUAL` for a carrier to count as quality evidence (0 = disabled) | +| `--min-covered` | INTEGER | `0` | Minimum quality-passing carriers per partially-covered tech for hom-ref to be assumed (0 = disabled) | | `-v, --verbose` | flag | False | Verbose output with per-item progress | --- @@ -50,6 +54,9 @@ Exactly one of 
`--locus`, `--region`, or `--from-file` must be provided. | `--ref` | TEXT | None | Filter to specific reference allele (only for `--locus`) | | `--alt` | TEXT | None | Filter to specific alternate allele (only for `--locus`) | | `--format` | `text`\|`json`\|`tsv` | `text` | Output format | +| `--min-pass` | INTEGER | `0` | Min PASS carriers (`het\|hom`) per partially-covered tech for hom-ref to be assumed. Non-carriers move to `N_NO_COVERAGE` if a tech falls below the threshold. (0 = disabled) | +| `--min-observed` | INTEGER | `0` | Min any-VCF entries (`het\|hom\|fail`) per partially-covered tech for hom-ref to be assumed. (0 = disabled) | +| `--min-quality-evidence` | INTEGER | `0` | Min quality-passing carriers per partially-covered tech. Requires a database built with `--min-dp`, `--min-gq`, `--min-qual`, or `--min-covered`. (0 = disabled) | | `--no-warn` | flag | False | Suppress warnings for unknown phenotypes, technologies, and chromosomes | --- @@ -72,8 +79,15 @@ afquery annotate [OPTIONS] | `--tech` | TEXT | None | Technology filter. Repeatable; comma-separated or multiple flags. Use `^` prefix to exclude. | | `--threads` | INTEGER | all CPUs | Number of worker threads for parallel annotation | | `-v, --verbose` | flag | False | Verbose output with per-item progress | +| `--min-pass` | INTEGER | `0` | Min PASS carriers per partially-covered tech for hom-ref to be assumed (0 = disabled) | +| `--min-observed` | INTEGER | `0` | Min any-VCF entries per partially-covered tech (0 = disabled) | +| `--min-quality-evidence` | INTEGER | `0` | Min quality-passing carriers per partially-covered tech. Requires a database built with `--min-dp`, `--min-gq`, `--min-qual`, or `--min-covered`. 
(0 = disabled) | | `--no-warn` | flag | False | Suppress warnings for unknown phenotypes, technologies, and chromosomes | +The annotated VCF gains `AFQUERY_AC`, `AFQUERY_AN`, `AFQUERY_AF`, +`AFQUERY_N_HET`, `AFQUERY_N_HOM_ALT`, `AFQUERY_N_HOM_REF`, `AFQUERY_N_FAIL`, +and `AFQUERY_N_NO_COVERAGE` INFO fields. + --- ## dump @@ -101,6 +115,12 @@ afquery dump [OPTIONS] | `--threads` | INTEGER | all CPUs | Number of worker threads for parallel export | | `--all-variants` | flag | False | Include variants with AC=0 (covered but not observed). WARNING: may produce very large output. | | `-v, --verbose` | flag | False | Verbose output with per-item progress | +| `--min-pass` | INTEGER | `0` | Min PASS carriers per partially-covered tech for hom-ref to be assumed (0 = disabled) | +| `--min-observed` | INTEGER | `0` | Min any-VCF entries per partially-covered tech (0 = disabled) | +| `--min-quality-evidence` | INTEGER | `0` | Min quality-passing carriers per partially-covered tech. Requires a database built with `--min-dp`, `--min-gq`, `--min-qual`, or `--min-covered`. (0 = disabled) | + +CSV output adds an `N_NO_COVERAGE` column (and per-group variants +`N_NO_COVERAGE_