In [55]:
import sys

sys.path.append('..')

In [45]:
from filter_clinvar_xml import filter_xml, pprint
from cmat.clinvar_xml_io import *
from cmat.clinvar_xml_io.xml_parsing import find_elements
from collections import Counter, defaultdict

First get all the somatic/oncogenic records that were dropped from the most recent submission.

In [3]:
# Full paths redacted
def read_rcv_accessions(path):
    """Return the stripped, non-blank lines of the text file at ``path``.

    The file is opened in a ``with`` block so the handle is closed as soon as
    the lines are read, rather than being left for the garbage collector.
    """
    with open(path) as rcv_file:
        # Blank lines (e.g. a trailing newline at end of file) are skipped so an
        # empty string never ends up among the accessions.
        return [line.strip() for line in rcv_file if line.strip()]


invalid_evidence_rcvs = read_rcv_accessions('batch-2024-12/logs/invalid_evidence_rcvs.txt')
multiple_class_rcvs = read_rcv_accessions('batch-2024-12/logs/multiple_classification_rcvs.txt')

In [7]:
# Distinct accessions appearing in either log (an RCV listed in both counts once)
all_rcvs = set(invalid_evidence_rcvs).union(multiple_class_rcvs)

In [8]:
# Number of distinct RCV accessions across the two log files
len(all_rcvs)

505

In [6]:
# Source XML handed to filter_xml (presumably the full ClinVar release - confirm)
input_xml = 'clinvar.xml.gz'
# Destination for the filtered subset; also the file later loaded as the dataset
somatic_xml = 'somatic.xml.gz'

In [9]:
def is_dropped_rcv(record):
    """Return True when the record's accession is one of the RCVs in ``all_rcvs``."""
    return record.accession in all_rcvs


# NOTE(review): max_num presumably lets filter_xml stop once every wanted RCV
# has been written - confirm against filter_clinvar_xml.
filter_xml(
    input_xml=input_xml,
    output_xml=somatic_xml,
    filter_fct=is_dropped_rcv,
    max_num=len(all_rcvs),
)

INFO:filter_clinvar_xml:Records written: 505


First issue is **multiplicity**.

Currently:
* `clinicalSignificances` is a list of strings but these are parsed from a single description - e.g. [RCV000002127](https://www.ncbi.nlm.nih.gov/clinvar/RCV000002127/), "Pathogenic/Likely pathogenic" becomes `pathogenic` and `likely pathogenic`
* `confidence` is a single string, in ClinVar this is "review status" and is associated with a single description

New version:
* ClinVar can contain multiple clinical classifications, each containing one (or more) description(s) and a review status.
    
Example: [RCV000443639](https://www.ncbi.nlm.nih.gov/clinvar/RCV000443639/)
```
<Classifications>
    <GermlineClassification>
        <ReviewStatus>no assertion criteria provided</ReviewStatus>
        <Description DateLastEvaluated="2015-07-14" SubmissionCount="1">Likely pathogenic</Description>
    </GermlineClassification>  
    <OncogenicityClassification>
        <ReviewStatus>criteria provided, single submitter</ReviewStatus>
        <Description DateLastEvaluated="2024-07-31" SubmissionCount="1">Oncogenic</Description>
    </OncogenicityClassification>
</Classifications>
```

Sole example of multiple somatic classifications: [RCV000426735](https://www.ncbi.nlm.nih.gov/clinvar/RCV000426735/) - note that, annoyingly, descriptions and review status are not 1:1 (here two `Description` elements share a single `ReviewStatus`)
```
<Classifications>
  <GermlineClassification>
    <ReviewStatus>criteria provided, single submitter</ReviewStatus>
    <Description DateLastEvaluated="2023-08-23" SubmissionCount="2">Uncertain significance</Description>
  </GermlineClassification>
  <SomaticClinicalImpact>
    <ReviewStatus>no assertion criteria provided</ReviewStatus>
    <Description DateLastEvaluated="2024-01-24" ClinicalImpactAssertionType="diagnostic" ClinicalImpactClinicalSignificance="supports diagnosis" SubmissionCount="1">Tier I - Strong</Description>
    <Description DateLastEvaluated="2024-01-24" ClinicalImpactAssertionType="prognostic" ClinicalImpactClinicalSignificance="poor outcome" SubmissionCount="1">Tier I - Strong</Description>
  </SomaticClinicalImpact>
</Classifications>
```

Second issue is **new values and fields**.
* New categorisation of clinical classification: Germline, Somatic, Oncogenicity
* New terms appear in the "description" field which we currently report in `clinicalSignificances`
* New fields for somatic clinical impact only - assertion type and (what they call) clinical significance

See [here](https://github.com/EBIvariation/CMAT/issues/396#issuecomment-1898804129) for lists of values.

In [11]:
# Wrap the filtered XML file in a ClinVarDataset so its records can be iterated over
dataset = ClinVarDataset(somatic_xml)

In [46]:
# Tallies of the lower-cased Description text - e.g. "Tier I - Strong" or "Likely oncogenic"
somatic_terms = Counter()
oncogenic_terms = Counter()

# Attributes read only from SomaticClinicalImpact descriptions ('' when the attribute is absent)
somatic_impact_assertion_types = Counter()
somatic_impact_clin_sigs = Counter()

# Key: sorted tuple holding one classification tag per Description found on a record,
# e.g. (somatic, somatic) or (germline, oncogenic); value: accessions of the RCVs with that shape.
# nb. everything *not* in this list of 505 is just (germline,)
rcv_classifications = defaultdict(list)

for record in dataset:
    tags_seen = []
    for classification in record.clinical_classifications:
        tag = classification.class_xml.tag
        description_elems = find_elements(classification.class_xml, './Description')

        if tag not in ('GermlineClassification', 'SomaticClinicalImpact', 'OncogenicityClassification'):
            print("unknown classification type:", tag)
            continue

        for description in description_elems:
            # One entry per Description, so multiples within a classification show up in the key
            tags_seen.append(tag)

            # Assume we support germline terms, so only their multiplicity is recorded
            if tag == 'GermlineClassification':
                continue

            term = description.text.lower()
            if tag == 'OncogenicityClassification':
                oncogenic_terms[term] += 1
                continue

            # Remaining case is SomaticClinicalImpact, which carries two extra attributes
            somatic_terms[term] += 1

            assertion_type = description.attrib.get('ClinicalImpactAssertionType', '').lower()
            somatic_impact_assertion_types[assertion_type] += 1

            clinical_significance = description.attrib.get('ClinicalImpactClinicalSignificance', '').lower()
            somatic_impact_clin_sigs[clinical_significance] += 1

    rcv_classifications[tuple(sorted(tags_seen))].append(record.accession)

In [47]:
# Reduce each accession list to its length: classification combination -> number of RCVs
rcv_class_counts = {
    class_combo: len(accessions)
    for class_combo, accessions in rcv_classifications.items()
}

In [48]:
# Number of RCVs for each combination of classification types
rcv_class_counts

{('SomaticClinicalImpact',): 18,
 ('OncogenicityClassification',): 442,
 ('OncogenicityClassification', 'SomaticClinicalImpact'): 1,
 ('GermlineClassification', 'OncogenicityClassification'): 40,
 ('GermlineClassification', 'SomaticClinicalImpact'): 3,
 ('GermlineClassification',
  'SomaticClinicalImpact',
  'SomaticClinicalImpact'): 1}

In [41]:
# Counts of lower-cased Description terms under SomaticClinicalImpact
somatic_terms

Counter({'tier i - strong': 6,
         'tier iii - unknown': 9,
         'tier iv - benign/likely benign': 6,
         'tier ii - potential': 3})

In [42]:
# Counts of lower-cased Description terms under OncogenicityClassification
oncogenic_terms

Counter({'likely oncogenic': 305,
         'uncertain significance': 12,
         'oncogenic': 166})

In [43]:
# Counts of ClinicalImpactAssertionType values; '' means the attribute was not present
somatic_impact_assertion_types

Counter({'prognostic': 4, '': 15, 'therapeutic': 2, 'diagnostic': 3})

In [44]:
# Counts of ClinicalImpactClinicalSignificance values; '' means the attribute was not present
somatic_impact_clin_sigs

Counter({'poor outcome': 4,
         '': 15,
         'sensitivity/response': 2,
         'supports diagnosis': 3})

In [49]:
# Accessions of records with one germline and two somatic clinical impact descriptions
multi_somatic_key = ('GermlineClassification', 'SomaticClinicalImpact', 'SomaticClinicalImpact')
rcv_classifications[multi_somatic_key]

['RCV000426735']

In [52]:
# Fraction of the tallied RCVs whose only classification is oncogenicity, derived
# from the counts rather than hard-coded so it stays correct if the input changes
rcv_class_counts[('OncogenicityClassification',)] / sum(rcv_class_counts.values())

0.8752475247524752

Summary:
* All values and all fields are being used to varying degrees
* Most data involves oncogenic classification, so no assertion types etc.
* A fully future-proof implementation would support everything here, but simply including the oncogenic classification terms in the `clinicalSignificances` enum would cover 87.5% of the missing data (the 442 of 505 dropped records that have only an oncogenicity classification). On the other hand, if we're not future-proofing, what's the point?