[search] Improve clnsig analysis (#486)

* Improve clinvarVcf.CLNSIG analysis so that we can autocomplete `pathogen` without matching `conflicting_classifications_of_pathogenicity`. This is accomplished by skipping terms that start with "conflicting" and applying an edge n-gram transform to all other values. This gives a good balance between accuracy and sensitivity. With this, `clinvarVcf.CLNSIG:pathogenic` matches `likely_pathogenic` and `pathogenic`, even when the terms appear in a comma-separated list. However, `conflicting_classifications_of_pathogenicity` will not match if it is the only term (`conflicting_classifications_of_pathogenicity,pathogenic` would match, however). Live on bystro-dev. Test queries: 1. `other` matches anything with `other` as a complete term (meaning `conflicting_classifications_of_pathogenicity,other` matches) 2. `pathogenic` https://bystro-dev.emory.edu/results?_id=6637a7caa0e17a1660ba743b&search&q=clinvarvcf.clnsig:pathogenic&size=10&from=0 3. `likely_pathogenic` https://bystro-dev.emory.edu/results?_id=6637a7caa0e17a1660ba743b&search&q=clinvarvcf.clnsig:likely_pathogenic&size=10&from=0 4. `(likely pathogenic)` https://bystro-dev.emory.edu/results?_id=6637a7caa0e17a1660ba743b&search&q=clinvarvcf.clnsig:(likely%20pathogenic)&size=10&from=0 5. `pa`, `pat`, `path`, `pathogen`, etc. all match the same set, because only "pathogenic" starts with these characters: https://bystro-dev.emory.edu/results?_id=6637a7caa0e17a1660ba743b&search&q=clinvarvcf.clnsig:pathog&size=10&from=0 6. `like`, `likel`, `likely`, etc. all match terms starting with `likely`, such as `likely_pathogenic` and `likely_benign`: https://bystro-dev.emory.edu/results?_id=6637a7caa0e17a1660ba743b&search&q=clinvarvcf.clnsig:likely&size=10&from=0
bystrogenomics · May 7, 2024 · f9b080e · f9b080e
1 parent 9ea590d
commit f9b080e
Show file tree

Hide file tree

Showing 3 changed files with 77 additions and 45 deletions.
diff --git a/config/hg19.mapping.yml b/config/hg19.mapping.yml
@@ -28,9 +28,24 @@ index_settings:
           - uppercase
           - asciifolding
     filter:
-      exclude_pathogenic:
+      clnsig_conditional_underscore_split:  # underscore split, skipped for tokens starting with 'conflicting'
+        type: condition
+        filter: ["clnsig_underscore_split_filter"]
+        script:
+          source: "token.getTerm().toString().startsWith('conflicting') == false"
+      clnsig_conditional_filter:  # edge n-grams, skipped for tokens starting with 'conflicting'
+        type: condition
+        filter: ["clnsig_edge_ngram_filter"]
+        script:
+          source: "token.getTerm().toString().startsWith('conflicting') == false"
+      clnsig_edge_ngram_filter:  # prefix grams so partial terms can match (autocomplete)
+        type: edge_ngram
+        min_gram: 1
+        max_gram: 20
+      clnsig_underscore_split_filter:  # captures each run of non-underscore characters
         type: pattern_capture
-        patterns: ["conflicting_interpretations_of_pathogenicity"]
+        patterns: ["([^_]+)"]
+        preserve_original: false
       catenate_filter:
         type: word_delimiter
         catenate_words: true
@@ -95,6 +110,9 @@ index_settings:
           - digit
           - punctuation
     tokenizer:
+      comma_tokenizer:  # splits the field value at each comma
+        type: pattern
+        pattern: ","
       hgvs_tokenizer:
         type: pattern
       edge_ngram_tokenizer:
@@ -106,6 +124,12 @@ index_settings:
           - digit
           - punctuation
     analyzer:
+      clnsig_analyzer:  # index-time analyzer for the CLNSIG text field
+        tokenizer: comma_tokenizer
+        filter:
+          - lowercase
+          - clnsig_conditional_underscore_split
+          - clnsig_conditional_filter
       hgvs_analyzer:
         type: custom
         tokenizer: hgvs_tokenizer
@@ -129,14 +153,6 @@ index_settings:
           - catenate_filter_split
           - english_stemmer
           - autocomplete_filter
-      autocomplete_english_split_clinsig:
-        type: custom
-        tokenizer: whitespace
-        filter:
-          - lowercase
-          - exclude_pathogenic
-          - asciifolding
-          - catenate_filter_split
       autocomplete_english_graph:
         type: custom
         tokenizer: keyword
@@ -450,7 +466,7 @@ mappings:
           normalizer: lowercase_normalizer
         CLNSIG:
           type: text
-          analyzer: autocomplete_english_split_clinsig
+          analyzer: clnsig_analyzer
           search_analyzer: search_english_simple
           fields:
             exact:

diff --git a/config/hg19.yml b/config/hg19.yml
@@ -1,7 +1,7 @@
 ---
 assembly: hg19
 build_author: alexkotlar
-build_date: 2024-03-07T13:01:00
+build_date: 2024-05-02T01:08:00
 chromosomes:
   - chr1
   - chr2
@@ -28,7 +28,7 @@ chromosomes:
   - chrM
   - chrX
   - chrY
-database_dir: /mnt/ssd2/annotator/hg19_v9
+database_dir: /mnt/annotator/hg19_v9
 fileProcessors:
   snp:
     args: --emptyField NA --minGq .95
@@ -62,7 +62,7 @@ tracks:
     - caddIndel
   tracks:
     - build_author: alexkotlar
-      build_date: 2024-03-07T13:01:00
+      build_date: 2024-05-02T01:08:00
       local_files:
         - chr1.fa.gz
         - chr2.fa.gz
@@ -122,9 +122,9 @@ tracks:
               - chrY.fa.gz
           completed: 2023-11-09T20:16:00
           name: fetch
-      version: 35
+      version: 36
     - build_author: alexkotlar
-      build_date: 2024-03-07T13:01:00
+      build_date: 2024-05-02T01:08:00
       build_field_transformations:
         description: split [;]
         ensemblID: split [;]
@@ -203,9 +203,9 @@ tracks:
                   r.chrom = %chromosomes%
           completed: 2024-05-02T01:01:00
           name: fetch
-      version: 4
+      version: 5
     - build_author: alexkotlar
-      build_date: 2024-03-07T13:01:00
+      build_date: 2024-05-02T01:08:00
       build_row_filters:
         AS_FilterStatus: == PASS
       features:
@@ -272,9 +272,9 @@ tracks:
         - /mnt/files1/bystro_annotator/raw_files/hg19/gnomad2/vcf/exomes/gnomad.exomes.r2.1.1.sites.Y.vcf.bgz
       name: gnomad.exomes
       type: vcf
-      version: 4
+      version: 5
     - build_author: alexkotlar
-      build_date: 2024-03-07T13:01:00
+      build_date: 2024-05-02T01:08:00
       local_files:
         - whole_genome_SNVs.tsv.chr1.organized-by-chr.txt.sorted.txt.gz
         - whole_genome_SNVs.tsv.chr10.organized-by-chr.txt.sorted.txt.gz
@@ -303,9 +303,9 @@ tracks:
       name: cadd
       sorted: 1
       type: cadd
-      version: 6
+      version: 7
     - build_author: alexkotlar
-      build_date: 2024-03-07T13:01:00
+      build_date: 2024-05-02T01:08:00
       dist: true
       features:
         - name2
@@ -315,9 +315,9 @@ tracks:
       ref: refSeq
       to: txEnd
       type: nearest
-      version: 4
+      version: 5
     - build_author: alexkotlar
-      build_date: 2024-03-07T13:01:00
+      build_date: 2024-05-02T01:08:00
       dist: true
       features:
         - name2
@@ -326,9 +326,9 @@ tracks:
       name: nearestTss.refSeq
       ref: refSeq
       type: nearest
-      version: 4
+      version: 5
     - build_author: alexkotlar
-      build_date: 2024-03-07T13:01:00
+      build_date: 2024-05-02T01:08:00
       features:
         - alt
         - id
@@ -368,9 +368,9 @@ tracks:
         - /mnt/files1/bystro_annotator/raw_files/hg19/gnomad2/vcf/genomes/gnomad.genomes.r2.1.1.sites.*.vcf.bgz
       name: gnomad.genomes
       type: vcf
-      version: 4
+      version: 5
     - build_author: alexkotlar
-      build_date: 2024-03-07T13:01:00
+      build_date: 2024-05-02T01:08:00
       features:
         - id
         - alt
@@ -427,9 +427,9 @@ tracks:
       utils:
         - completed: 2023-11-09T20:19:00
           name: DbSnp2FormatInfo
-      version: 4
+      version: 5
     - build_author: alexkotlar
-      build_date: 2024-03-07T13:01:00
+      build_date: 2024-05-02T01:08:00
       build_field_transformations:
         CLNDISDB: split [|]
         CLNDN: split [|]
@@ -463,17 +463,17 @@ tracks:
               - https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh37/clinvar.vcf.gz
           completed: 2024-05-02T01:02:00
           name: fetch
-      version: 4
+      version: 5
     - build_author: alexkotlar
-      build_date: 2024-03-07T13:01:00
+      build_date: 2024-05-02T01:08:00
       features:
         - alt
         - PHRED: number
       local_files:
         - /mnt/files1/bystro_annotator/raw_files/hg19/caddIndel/Indels.vcf.gz
       name: caddIndel
       type: vcf
-      version: 4
+      version: 5
     - based: 1
       build_field_transformations:
         chrom: chr .
@@ -519,4 +519,4 @@ tracks:
               - ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/variant_summary.txt.gz
           completed: 2024-03-07T12:51:00
           name: fetch
-version: 4
+version: 5
diff --git a/config/hg38.mapping.yml b/config/hg38.mapping.yml
@@ -28,9 +28,24 @@ index_settings:
           - uppercase
           - asciifolding
     filter:
-      exclude_pathogenic:
+      clnsig_conditional_underscore_split:  # underscore split, skipped for tokens starting with 'conflicting'
+        type: condition
+        filter: ["clnsig_underscore_split_filter"]
+        script:
+          source: "token.getTerm().toString().startsWith('conflicting') == false"
+      clnsig_conditional_filter:  # edge n-grams, skipped for tokens starting with 'conflicting'
+        type: condition
+        filter: ["clnsig_edge_ngram_filter"]
+        script:
+          source: "token.getTerm().toString().startsWith('conflicting') == false"
+      clnsig_edge_ngram_filter:  # prefix grams so partial terms can match (autocomplete)
+        type: edge_ngram
+        min_gram: 1
+        max_gram: 20
+      clnsig_underscore_split_filter:  # captures each run of non-underscore characters
         type: pattern_capture
-        patterns: ["conflicting_interpretations_of_pathogenicity"]
+        patterns: ["([^_]+)"]
+        preserve_original: false
       catenate_filter:
         type: word_delimiter
         catenate_words: true
@@ -95,6 +110,9 @@ index_settings:
           - digit
           - punctuation
     tokenizer:
+      comma_tokenizer:  # splits the field value at each comma
+        type: pattern
+        pattern: ","
       hgvs_tokenizer:
         type: pattern
       edge_ngram_tokenizer:
@@ -106,6 +124,12 @@ index_settings:
           - digit
           - punctuation
     analyzer:
+      clnsig_analyzer:  # index-time analyzer for the CLNSIG text field
+        tokenizer: comma_tokenizer
+        filter:
+          - lowercase
+          - clnsig_conditional_underscore_split
+          - clnsig_conditional_filter
       hgvs_analyzer:
         type: custom
         tokenizer: hgvs_tokenizer
@@ -129,14 +153,6 @@ index_settings:
           - catenate_filter_split
           - english_stemmer
           - autocomplete_filter
-      autocomplete_english_split_clinsig:
-        type: custom
-        tokenizer: whitespace
-        filter:
-          - lowercase
-          - exclude_pathogenic
-          - asciifolding
-          - catenate_filter_split
       autocomplete_english_graph:
         type: custom
         tokenizer: keyword
@@ -450,7 +466,7 @@ mappings:
           normalizer: lowercase_normalizer
         CLNSIG:
           type: text
-          analyzer: autocomplete_english_split_clinsig
+          analyzer: clnsig_analyzer
           search_analyzer: search_english_simple
           fields:
             exact: