In [1]:
import sys

sys.path.append('..')

In [2]:
from filter_clinvar_xml import pprint, iterate_cvs_from_xml, filter_xml

In [3]:
import gzip
import xml.etree.ElementTree as ElementTree
import pandas as pd
import os
from random import random
from collections import Counter

In [4]:
from cmat.clinvar_xml_io import ClinVarDataset
from cmat.clinvar_xml_io.xml_parsing import *

In [5]:
%matplotlib inline
import matplotlib.pyplot as plt

In [6]:
pd.set_option('display.max_rows', 500)

In [7]:
# Directory holding the downloaded ClinVar XML files. Set the CLINVAR_WORK_DIR environment
# variable to run the notebook on another machine; the original location is the default.
work_dir = os.environ.get('CLINVAR_WORK_DIR', '/home/april/projects/opentargets/new_clinvar_xml')

# Downloaded latest which at the time was the Feb 2024 release:
# https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/RCV_release/ClinVarRCVRelease_2024-02.xml.gz
clinvar_xml = os.path.join(work_dir, 'ClinVarRCVRelease_00-latest.xml.gz')
# Target file for the random sample of records written by the (commented-out) filter_xml call.
sample_xml = os.path.join(work_dir, 'sample-2.0.xml.gz')

Counts to look at:
* How many have what subset of `Classification` options (germline, somatic, oncogenicity)
    * are there any unexpected combinations, e.g. germline + onco?
* Occurrence of all possible germline/somatic/oncogenicity terms (some overlap with [this](https://github.com/EBIvariation/CMAT/tree/master/data-exploration/clinvar-variant-types))
    * see [here](https://github.com/EBIvariation/CMAT/issues/396#issuecomment-1898804129)
* For cases with multiple classifications, what do the dates and review statuses look like? Do they tend to be the same?

In [9]:
# Take a random sample of records to test on - rerun only if needed
def f(x):
    """Randomly select roughly one record in a thousand.

    The argument is ignored: every call draws a fresh random number and
    returns True with probability 0.001, independently of ``x``.
    """
    sampling_rate = 0.001
    draw = random()
    return draw < sampling_rate

# filter_xml(clinvar_xml, sample_xml, filter_fct=f, max_num=3000)

INFO:filter_clinvar_xml:Records written: 3000


In [46]:
# Swap in the line below to iterate over the sample file instead of the full release:
# dataset = ClinVarDataset(sample_xml)
dataset = ClinVarDataset(clinvar_xml)

In [47]:
def get_classifications(record):
    """Extract every classification attached to a single ClinVar record.

    :param record: object exposing the record's XML element as ``record.rcv``
    :return: list with one dict per child element of ``./Classifications``, holding the
             element tag (``type``) and the text of its ``ReviewStatus`` (``review_status``)
             and ``Description`` (``description``) children; empty if there are none
    """
    # Read from the record that was passed in. The previous version referenced a global `r`,
    # which a list comprehension's loop variable does not provide in Python 3.
    return [
        {
            'type': elt.tag,
            'review_status': find_mandatory_unique_element(elt, './ReviewStatus').text,
            'description': find_mandatory_unique_element(elt, './Description').text
        }
        for elt in find_elements(record.rcv, './Classifications/*')
    ]

In [48]:
# One entry per record in the dataset: the list of classification dicts returned by
# get_classifications. The whole result is held in memory as a list.
classifications = [get_classifications(r) for r in dataset]

In [49]:
len(classifications)

3223688

In [53]:
# Occurrences of each classification type across all records.
class_type_counts = Counter(
    entry['type']
    for record_classifications in classifications
    for entry in record_classifications
)
# Number of records that carry more than one classification.
multiple_class_count = sum(
    len(record_classifications) > 1 for record_classifications in classifications
)

In [54]:
class_type_counts

Counter({'GermlineClassification': 3223688})

In [55]:
multiple_class_count

0

Outcome: in this first release at least, the data is the same even though the structure has changed.

I searched through the VCV XML: the string `SomaticClinicalImpact` is at least present there, but it always seems to have 0 submissions and no meaningful value.

I've emailed ClinVar to ask whether this is intended and whether these values will only be used in new submissions.

In [None]:
# Accumulators for the VCV pass. With the collection code below commented out, nothing is
# appended to vcv_classifications, and vcv_count only counts VariationArchive elements.
vcv_classifications = []
vcv_count = 0

# Stream the gzipped VCV release and print the first few VariationArchive elements so their
# structure can be inspected. `pprint` is the helper imported from filter_clinvar_xml.
with gzip.open(os.path.join(work_dir, 'ClinVarVCVRelease_00-latest.xml.gz'), 'rt') as fh:
    # iterparse yields only "end" events by default, so each element is complete when seen.
    for event, elem in ElementTree.iterparse(fh):
        if elem.tag == 'VariationArchive':
            pprint(elem)
            vcv_count += 1
        # NOTE(review): the condition is `> 10`, so the loop stops after the 11th
        # VariationArchive has been printed - confirm whether 10 was intended.
        if vcv_count > 10:
            break
# Disabled alternative loop body: collects type / review status / description for each
# classification element, counting every parsed element in vcv_count.
# NOTE(review): its bare `except:` silently skips any element that fails to parse.
#         if elem.tag in {'SomaticClinicalImpact', 'OncogenicityClassification', 'GermlineClassification'}:
#             try:
#                 vcv_classifications.append({
#                     'type': elem.tag,
#                     'review_status': find_mandatory_unique_element(elem, './ReviewStatus').text,
#                     'description': find_mandatory_unique_element(elem, './Description').text
#                 })
#             except:
#                 continue
#             finally:
#                 elem.clear()
#         vcv_count += 1
#         if len(vcv_classifications) > 100 or vcv_count > 500000:
#             break

In [21]:
vcv_classifications

[{'type': 'GermlineClassification',
  'review_status': 'criteria provided, single submitter',
  'description': 'Pathogenic'},
 {'type': 'GermlineClassification',
  'review_status': 'criteria provided, single submitter',
  'description': 'Pathogenic'},
 {'type': 'GermlineClassification',
  'review_status': 'no assertion criteria provided',
  'description': 'Pathogenic'},
 {'type': 'GermlineClassification',
  'review_status': 'no assertion criteria provided',
  'description': 'Pathogenic'},
 {'type': 'GermlineClassification',
  'review_status': 'no assertion criteria provided',
  'description': 'Pathogenic'},
 {'type': 'GermlineClassification',
  'review_status': 'no assertion criteria provided',
  'description': 'Pathogenic'},
 {'type': 'GermlineClassification',
  'review_status': 'no assertion criteria provided',
  'description': 'Pathogenic'},
 {'type': 'GermlineClassification',
  'review_status': 'no assertion criteria provided',
  'description': 'Pathogenic'},
 {'type': 'GermlineCla

In [19]:
vcv_count

500001