# Atlas of Genetics and Cytogenetics in Oncology and Haematology

In [1]:
import requests
from bs4 import BeautifulSoup
import re
from typing import NamedTuple
from collections import Counter
from urllib.parse import urljoin
import pickle
import json

## Pull tables

In [2]:
chromosomes = [
    'X', 'Y'
] + list(range(1,23))
url = 'http://atlasgeneticsoncology.org/Indexbychrom/idxa_X.html'

In [3]:
# load responses
with open('data/atlas_responses.pkl', 'rb') as f:
    chr_resp_objects = pickle.load(f)

## Tokenize text

In [4]:
band_chars = r'[pq][0-9.]+'
chr_chars = r'[XY0-9]+'

iscn_translocation_re = r't' +\
    f'\((?P<chr1>{chr_chars});(?P<chr2>{chr_chars})\)' +\
    f'\((?P<band1>{band_chars});(?P<band2>{band_chars})\)'

gene_fusion_re = r'(?P<five_p_gene>\w+|\?)/(?P<three_p_gene>\w+|\?)'

band_only_re = f'\\((?P<bo_chr>{chr_chars})(?P<bo_band>{band_chars})\\)'

tissue_context_re = r'\(((Bone)|(Kidney)|(Soft Tissue(s)?)|(Lung)|(Breast)|(Prostate))( tumors)?\)'
tissue_context_re2 = r'(solely )?in [\w\s]+'

iscn_region_dup_re = r'dup' +\
    f'\((?P<dup_chr>{chr_chars})\)' +\
    f'\((?P<dup_region_start>{band_chars})-?(?P<dup_region_stop>{band_chars})\)'

iscn_region_del_re = r'del' +\
    f'\((?P<del_chr>{chr_chars})\)' +\
    f'\((?P<del_region_start>{band_chars})-?(?P<del_region_stop>{band_chars})\)'

chr_amp_re = r'\+' + chr_chars

token_specification = [
    ('TRANSLOCATION', iscn_translocation_re),
    ('GENE_FUSION', gene_fusion_re),
    ('REGION_DUP', iscn_region_dup_re),
    ('REGION_DEL', iscn_region_del_re),
    ('CHR_AMP', chr_amp_re),
    ('SKIP', r'[ \t]+'),
    ('BAND_ONLY', band_only_re),
    ('TISSUE_CONTEXT', tissue_context_re),
    ('TISSUE_CONTEXT_2', tissue_context_re2),
    ('FAIL', r'.')
]

class Token(NamedTuple):
    type: str
    value: str
    groupdict: dict

def tokenize(string):
    tokens = list()
    tok_regex = '|'.join('(?P<%s>%s)' % pair for pair in token_specification)
    
    for mo in re.finditer(tok_regex, string, flags=re.IGNORECASE):
        kind = mo.lastgroup
        value = mo.group()
        groupdict = {k:v for k, v in mo.groupdict().items() if v and k != kind}
        if kind == 'SKIP':
            continue
        elif kind == 'FAIL':
            raise RuntimeError(f'{value!r} unexpected.')
        tokens.append(Token(kind, value, groupdict))
    return tokens

test_str = 'del(X)(p22p22) P2RY8/CRLF2'
m = tokenize(test_str)
re.match(iscn_region_del_re, 'del(X)(p22p22)')
# re.compile(iscn_region_del_re).pattern
# tokenize('t(X;1)(p22;p34) PTPRF/RPS6KA3')

<re.Match object; span=(0, 14), match='del(X)(p22p22)'>

In [5]:
m

[Token(type='REGION_DEL', value='del(X)(p22p22)', groupdict={'del_chr': 'X', 'del_region_start': 'p22', 'del_region_stop': 'p22'}),
 Token(type='GENE_FUSION', value='P2RY8/CRLF2', groupdict={'five_p_gene': 'P2RY8', 'three_p_gene': 'CRLF2'})]

In [6]:
expected_headers = ['Annotated Leukemias', 'Other Leukemias', 'Annotated Tumors', 'Other Tumors']

In [7]:
c = Counter()
matches = list()
observed_texts = set()
failures = set()

for chromosome, r in chr_resp_objects.items():
    s = BeautifulSoup(r.content, 'html.parser')
    tables = s.find_all('table', class_='sortable')
    for table in tables[:4]:
        header = table.th.text.strip()
        assert header in expected_headers
        for data in table.find_all('td'):
            href = data.a.attrs['href']
            text = data.a.text
            t = ';'.join([href,text])
            if t in observed_texts:
                continue
            else:
                observed_texts.add(t)
            try:
                tokens = tokenize(text)
                k = ':'.join([header, 'match'])
                matches.append({
                    'biomarker': text,
                    'tokens': tokens,
                    'url': urljoin(url, href),
                    'annot_type': header,
                })
            except RuntimeError as e:
                k = ':'.join([header, 'fail'])
                failures.add(text)
            c[k] += 1
match = len(matches)
failure = len(failures)
total = len(observed_texts)
                
print(f'Succeeded on {match} of {total} records ({match / total})).')

Succeeded on 10845 of 11349 records (0.9555908009516257)).


In [8]:
total = sum(c.values())
fails = sum([v for k, v in c.items() if k.endswith('fail')])
successes = total - fails
for k in sorted(c):
    print(f'{k}: {c[k]}')

Annotated Leukemias:fail: 220
Annotated Leukemias:match: 486
Annotated Tumors:fail: 107
Annotated Tumors:match: 547
Other Leukemias:fail: 69
Other Leukemias:match: 756
Other Tumors:fail: 108
Other Tumors:match: 9056


## Construct locations and variants

### First, from Atlas dataset

In [9]:
from ga4gh.vr import models
from ga4gh.core import ga4gh_identify, ga4gh_serialize
import hashlib
import sys
import csv
from copy import copy

In [10]:
def vrs_hash(obj):
    digest_size = 24
    
    blob = ga4gh_serialize(obj)
    digest = hashlib.sha512(blob).digest()
    tdigest_int = int.from_bytes(digest[:digest_size], byteorder=sys.byteorder)
    return(tdigest_int)

hash_models = [
    models.ChromosomeLocation,
    models.GeneLocation,
    models.Allele,
    models.LocationJunction,
    models.RelativeAbundance,
    models.VariationSet
]

def construct_VRS_chromosome_location(chromosome, start, end):
    interval = models.NamedInterval(start=start, end=end)
    location = models.ChromosomeLocation(
        species_id="taxonomy:9606",
        interval=interval,
        chr=chromosome
    )
    return location

for m in hash_models:
    m.__hash__ = vrs_hash

In [11]:
HGNC_PRIMARY = dict()
HGNC_ALIAS = dict()
HGNC_CHR_LOC = dict()

cband_re = re.compile(f'(?P<chr>{chr_chars})(?P<band>{band_chars})')

with open('data/hgnc_table.txt', 'r') as f:
    reader = csv.DictReader(f, delimiter='\t')
    for record in reader:
        v = record['HGNC ID'].lower()
        k = record['Approved symbol']
        HGNC_PRIMARY[k] = v
        
        previous_symbols = record['Previous symbols'].split(', ')
        alias_symbols = record['Alias symbols'].split(', ')
        for symbol in previous_symbols:
            HGNC_ALIAS[symbol.strip("'")] = v
        for symbol in alias_symbols:
            HGNC_ALIAS[symbol.strip("'")] = v
        
        m = cband_re.match(record['Chromosome'])
        if m is None:
            continue
        d = m.groupdict()
        HGNC_CHR_LOC[v] = construct_VRS_chromosome_location(
            d['chr'], d['band'], d['band']
        )


In [12]:
def construct_VRS_gene_location(gene_symbol):
    if gene_symbol == '?':
        return None
    gene_id = HGNC_PRIMARY.get(gene_symbol, None)
    if gene_id is None:
        gene_id = HGNC_ALIAS.get(gene_symbol, None)
    if gene_id is None:
        gene_id=f"unidentified.symbol:{gene_symbol}"
    location = models.GeneLocation(
        gene_id=gene_id,
        interval=models.UndefinedInterval()
    )
    return location

def fusion_to_gene_locations(fusion_token):
    try:
        locations = [
            construct_VRS_gene_location( fusion_token.groupdict['five_p_gene'] ),
            construct_VRS_gene_location( fusion_token.groupdict['three_p_gene'] ),
        ]
    except KeyError:
        return []
    return locations
    
def del_to_region_location(del_token):
    l = construct_VRS_chromosome_location(
        chromosome=del_token.groupdict['del_chr'],
        start=del_token.groupdict['del_region_start'],
        end=del_token.groupdict['del_region_stop']
    )
    return l

def chr_amp_to_region_location(chr_amp_token):
    v = chr_amp_token.value
    interval = models.NamedInterval(start='pter', end='qter')
    location = models.ChromosomeLocation(
        species_id="taxonomy:9606",
        interval=interval,
        chr=v[1:]
    )
    return location
    
def translocation_to_region_locations(trx_token):
    locations = [(
        construct_VRS_chromosome_location(
            chromosome=trx_token.groupdict['chr1'],
            start='pter',
            end=trx_token.groupdict['band1']),
        construct_VRS_chromosome_location(
            chromosome=trx_token.groupdict['chr2'],
            start=trx_token.groupdict['band2'],
            end='qter')
        ),(
        construct_VRS_chromosome_location(
            chromosome=trx_token.groupdict['chr2'],
            start='pter',
            end=trx_token.groupdict['band2']),
        construct_VRS_chromosome_location(
            chromosome=trx_token.groupdict['chr1'],
            start=trx_token.groupdict['band1'],
            end='qter')
        )
    ]
    return locations

def dup_to_region_location(dup_token):
    location = construct_VRS_chromosome_location(
        chromosome=dup_token.groupdict['dup_chr'],
        start=dup_token.groupdict['dup_region_start'],
        end=dup_token.groupdict['dup_region_stop']
    )
    return location

def index_variant(variant, location, record, variant_index, record_index):
    if location is None:
        return
    l = variant_index.get(location, list())
    l.append(variant)
    variant_index[location] = l
    
    l = record_index.get(variant, list())
    l.append(record)
    record_index[variant] = l

In [13]:
atlas_variants = dict()
atlas_records = dict()

for match in matches:
    for token in match['tokens']:
        t = token.type
        if t == 'GENE_FUSION':
            genes = fusion_to_gene_locations(token)
            if not genes:
                continue
            variant = models.LocationJunction(
                left=genes[0],
                right=genes[1]
            )
            index_variant(variant, genes[0], match, atlas_variants, atlas_records)
            index_variant(variant, genes[1], match, atlas_variants, atlas_records)
        elif t == 'REGION_DEL':
            location = ( del_to_region_location(token) )
            state = models.ChromosomeState(
                molecularRegion=None
            )
            # Molecular Variant
            variant = models.Allele(
                location=location,
                state=state
            )
            index_variant(variant, location, match, atlas_variants, atlas_records)
            # Systemic Variant
            state = models.AmbiguousState()
            molecular_variation = models.Allele(
                location=location,
                state=state
            )
            variant = models.RelativeAbundance(
                molecularVariation=molecular_variation,
                relativeQuantity='less_than'
            )
            index_variant(variant, location, match, atlas_variants, atlas_records)
        elif t == 'REGION_DUP':
            location = ( dup_to_region_location(token) )
            # Note: dups (insertions at the original chromosome location) are awkward due to the use of regions as locations.
            #   e.g. how to differentiate insertion of Xq11-q44 at Xq11 with and without replacement of Xq11?
            #   per ISCN 2016 convention (Section 9.2.9) insertions occur AT a band, no further specificity.
            state = models.ChromosomeState(
                molecularRegion=location
            )
            insertion_location=construct_VRS_chromosome_location(location.chr, location.interval.end, location.interval.end)
            # Molecular Variant
            variant = models.Allele(
                location=insertion_location,
                state=state
            )
            index_variant(variant, location, match, atlas_variants, atlas_records)
            # Systemic Variant
            state = models.AmbiguousState()
            molecular_variation = models.Allele(
                location=location,
                state=state
            )
            variant = models.RelativeAbundance(
                molecularVariation=molecular_variation,
                relativeQuantity='greater_than'
            )
            index_variant(variant, location, match, atlas_variants, atlas_records)
        elif t == 'CHR_AMP':
            location = chr_amp_to_region_location(token)
            state=models.AmbiguousState()
            molecular_variation = models.Allele(
                location=location,
                state=state
            )
            variant = models.RelativeAbundance(
                molecularVariation=molecular_variation,
                relativeQuantity='greater_than'
            )
            index_variant(variant, location, match, atlas_variants, atlas_records)
        elif t == 'TRANSLOCATION':
            der1_regions, der2_regions = translocation_to_region_locations(token)
            # First derived chromosome
            der1 = models.LocationJunction(
                left=der1_regions[0],
                right=der1_regions[1]
            )
            # Second derived chromosome
            der2 = models.LocationJunction(
                left=der2_regions[0],
                right=der2_regions[1]
            )
            # Using a VariationSet in lieu of an explicit class for a Molecular Profile
            try:
                assert hash(der1) != hash(der2)
            except AssertionError:
                # Example value that triggers this is t(X;X)(p22;p22). Not clear what is meant, skip.
                continue
            variant = models.VariationSet(
                members=[der1, der2]
            )
            index_variant(variant, location, match, atlas_variants, atlas_records)
        elif 'CONTEXT' in t:
            continue # ignore context for now
        elif t == 'BAND_ONLY':
            continue # ignore band only as a variant
        else:
            raise NotImplementedError(f'Token type {t} not handled.')

In [14]:
len(atlas_variants)

8488

In [15]:
len(atlas_records)

14213

## Second, from Project GENIE

v8.0 public, downloaded from SAGE: 10.7303/syn22228642

### Fusions

In [16]:
genie_fusion_samples = dict()
genie_sample_fusions = dict()
fusion_re = re.compile(r'(\w+)-(\w+) fusion')
failed = set()
evaluated = 0
skipped = 0
with open('data/genie_8_0_public/data_fusions.txt') as fusion_file:
    fusion_reader = csv.DictReader(fusion_file, delimiter='\t')
    for record in fusion_reader:
        evaluated += 1
        m = fusion_re.match(record['Fusion'])
        if m:
            left, right = map(construct_VRS_gene_location, m.groups())
            if not (left or right):
                skipped += 1
                continue
            variant = models.LocationJunction(
                left=left,
                right=right
            )
            s = genie_fusion_samples.get(variant, set())
            s.add(record['Tumor_Sample_Barcode'])
            genie_fusion_samples[variant] = s
            s = genie_sample_fusions.get(record['Tumor_Sample_Barcode'], set())
            s.add(variant)
            genie_sample_fusions[record['Tumor_Sample_Barcode']] = s
        else:
            skipped += 1

In [17]:
print(f'Intergenic, 2-partner fusions: {evaluated - skipped} ({(evaluated - skipped)/evaluated*100:.1f}% of records)')
print(f'Average fusions per sample: {sum(map(len, genie_sample_fusions.values())) / len(genie_sample_fusions):.2f}')

Intergenic, 2-partner fusions: 13686 (50.0% of records)
Average fusions per sample: 1.15


In [18]:
matched_fusions = dict()
for fusion in genie_fusion_samples:
    matches = set()
    if fusion.left:
        for variant in atlas_variants.get(fusion.left, []):
            if variant.type!=fusion.type:
                continue
            if variant.left != fusion.left:
                continue
            if variant.right is None or variant.right == fusion.right:
                matches.add(variant)
    if fusion.right:
        for variant in atlas_variants.get(fusion.right, []):
            if variant.type!=fusion.type:
                continue
            if variant.right != fusion.right:
                continue
            if variant.left is None or variant.left == fusion.left:
                matches.add(variant)
    if matches:
        matched_fusions[fusion] = matches

In [19]:
print(f'{len(matched_fusions)} of {len(genie_fusion_samples)} ({len(matched_fusions) / len(genie_fusion_samples) * 100:.1f}%) fusions with matches to Atlas.')
samples = len(genie_sample_fusions)
matched_samples = set()
for fusion in matched_fusions:
    matched_samples.update(genie_fusion_samples[fusion])
print(f'{len(matched_samples)} of {samples} ({len(matched_samples) / samples * 100:.1f}%) samples with fusion matches to Atlas.')

245 of 4241 (5.8%) fusions with matches to Atlas.
1186 of 5876 (20.2%) samples with fusion matches to Atlas.


In [20]:
from datetime import date

genie_license = {
    'resource': 'AACR Project GENIE',
    'data_use_conditions_url': 'https://www.aacr.org/wp-content/uploads/2020/02/20200127_GENIE_Data_Guide_7.pdf'
}

atlas_license = {
    'resource': 'Atlas of Genetics and Cytogenetics in Oncology and Haematology',
    'cc_license': 'CC BY-NC-ND 2.0 FR',
    'cc_license_url': 'https://creativecommons.org/licenses/by-nc-nd/2.0/fr/deed.en',
    'data_use_conditions_url': 'http://atlasgeneticsoncology.org/BackpageAbout.html#COPYRIGHT'
}

genie_version = 'v8.0_public'
atlas_version = date(day=14, month=7, year=2020).isoformat()
vrs_version = '1.x.prototype'
vrs_warning = 'This dataset structures variants using an unofficial, prototype VRS build'

_meta = {
    'warning': vrs_warning,
    'data_licenses': {
        'genie': genie_license,
        'atlas': atlas_license
    },
    'versions': {
        'vrs_version': vrs_version,
        'genie_version': genie_version,
        'atlas_version': atlas_version
    },
}

import logging
logger = logging.getLogger()
logger.setLevel(logging.INFO)

def uniqify_and_clean(records):
    #  Somewhere I'm double-registering records to variants.
    #    I expect that this is caused by records appearing on two pages corresponding
    #    to the two constituent chromsomal regions of each translocation.
    #    This is a hack to uniqify records.
    #    TODO: Fix double-registering bug
    logging.debug(f'records: {records}')
    out = list()
    observed_record_hashes = set()
    for record in records:
        h = hash(json.dumps(record))
        if h in observed_record_hashes:
            continue
        else:
            observed_record_hashes.add(h)
            if 'tokens' in record:
                del record['tokens']
            out.append(record)
    return out

records = list()
for fusion in matched_fusions:
    annotations = list()
    for matched_fusion in matched_fusions[fusion]:
        annotations.extend(atlas_records[matched_fusion])
    annotations = uniqify_and_clean(annotations)
    record = {
        'genie_fusion': ga4gh_identify(fusion),
        'genie_samples': list(genie_fusion_samples[fusion]),
        'atlas_fusion': ga4gh_identify(matched_fusion),
        'atlas_annotations': annotations
    }
    records.append(record)
genie_fusions = {
    '_meta': _meta,
    'records': records,
    'total': len(records)
}

with open('out/genie_fusions.json', 'w') as f:
    json.dump(genie_fusions, f)

### Questions
Which fusions are most frequently seen?

### Copy Number

For each gene-sample combination, a copy number level is specified:
- "-2" is a deep loss, possibly a homozygous deletion
- "-1" is a single-copy loss (heterozygous deletion)
- "0" is diploid
- "1" indicates a low-level gain
- "2" is a high-level amplification.

In [21]:
from collections import defaultdict

genie_cna_samples = defaultdict(set)
genie_sample_cnas = defaultdict(set)
evaluated_genes = 0
skipped_genes = set()
with open('data/genie_8_0_public/data_CNA.txt') as cna_file:
    cna_reader = csv.DictReader(cna_file, delimiter='\t')
    for gene_cna in cna_reader:
        evaluated_genes += 1
        gene_symbol = gene_cna.pop('Hugo_Symbol')
        gene_location = construct_VRS_gene_location(gene_symbol)
        if not str(gene_location.gene_id).startswith('hgnc'):
            skipped_genes.add(gene_symbol)
            continue
        molecular_variation = models.Allele(
            location=gene_location,
            state=models.AmbiguousState()
        )
        gt_variant = models.RelativeAbundance(
            molecularVariation=molecular_variation,
            relativeQuantity='greater_than'
        )
#         eq_variant = models.RelativeAbundance(
#             molecularVariation=molecular_variation,
#             relativeQuantity='equal_to'
#         )
        lt_variant = models.RelativeAbundance(
            molecularVariation=molecular_variation,
            relativeQuantity='less_than'
        )
        for sample, cna in gene_cna.items():
            if cna == 'NA':
                continue
            cna = float(cna)
            if cna == 0:
                continue # Only evaluate CN gain / loss
#                 genie_cna_samples[eq_variant].add(sample)
#                 genie_sample_cnas[sample].add(eq_variant)
            elif cna > 0:
                genie_cna_samples[gt_variant].add(sample)
                genie_sample_cnas[sample].add(gt_variant)
            elif cna < 0:
                genie_cna_samples[lt_variant].add(sample)
                genie_sample_cnas[sample].add(gt_variant)

genie_cna_samples = dict(genie_cna_samples)
genie_sample_cnas = dict(genie_sample_cnas)

In [22]:
valid_genes = evaluated_genes - len(skipped_genes)
total_samples = len(cna_reader.fieldnames) - 1
cna_samples = len(genie_sample_cnas)

print(f'Valid genes: {valid_genes} ({valid_genes/evaluated_genes*100:.1f}% of all genes)')
print(f'Samples with gene CNAs: {cna_samples} ({cna_samples/total_samples*100:.1f}% of {total_samples} samples)')
print(f'Average gene CNAs per sample: {sum(map(len, genie_sample_cnas.values())) / len(genie_sample_cnas):.2f}')

Valid genes: 930 (100.0% of all genes)
Samples with gene CNAs: 40812 (59.2% of 68996 samples)
Average gene CNAs per sample: 29.44


In [23]:
# Translate to focal CNA

gene_to_focal_cna = dict()

for gene_cna, s in genie_cna_samples.items():
    gene_id = str(gene_cna.molecularVariation.location.gene_id)
    try:
        focal_location = HGNC_CHR_LOC[gene_id]
    except KeyError:
        gene_to_focal_cna[gene_cna] = None
        continue
    molecular_variation = models.Allele(
        location=focal_location,
        state=models.AmbiguousState()
    )
    focal_cna = models.RelativeAbundance(
        molecularVariation=molecular_variation,
        relativeQuantity=gene_cna.relativeQuantity
    )
    gene_to_focal_cna[gene_cna] = focal_cna

In [24]:
# Search

band_re = re.compile(r'[pq](ter|\d\d(.\d+)?)')

def band_comp(l_input, r_input, mode):
    # default mode, le
    if mode == 'le':
        left = str(l_input)
        right = str(r_input)
    elif mode == 'ge':
        right = str(l_input)
        left = str(r_input)
    else:
        raise ValueError
    assert band_re.fullmatch(left) and band_re.fullmatch(right)
    if left == 'pter' or right == 'qter':
        return True
    elif left == 'qter' or right == 'pter':
        return False
    comp_len = min(len(left), len(right))
    left = left[:comp_len]
    right = right[:comp_len]
    if left == right:
        return True
    if left[0] == 'p':
        if right[0] == 'q':
            return True
        else:
            right_temp = right
            right = left
            left = right_temp
    for i in range(comp_len):
        if left[i] > right[i]:
            return False
        elif left[i] < right[i]:
            return True
    return True

assert band_comp('p11', 'q11', 'le') is True
assert band_comp('p11', 'q11', 'ge') is False
assert band_comp('p31', 'p11', 'le') is True
assert band_comp('p11', 'p31', 'le') is False
assert band_comp('p31', 'p11', 'ge') is False
assert band_comp('p11', 'p31', 'ge') is True
assert band_comp('p11.1', 'p11', 'le') is True # Subbands always compare True to parent bands
assert band_comp('p11.1', 'p11', 'ge') is True
assert band_comp('p11', 'p11.1', 'le') is True
assert band_comp('p11.1', 'p11.1', 'ge') is True

matched_cnas = dict()
for gene_cna in genie_cna_samples:
    focal_cna = gene_to_focal_cna[gene_cna]
    if focal_cna is None:
        continue
    
    # Brute force search – slow, but quick to implement.
    #   For this size search space, completes in reasonable time.
    matches = set()
    for atlas_variant in atlas_records:
        if atlas_variant.type != 'RelativeAbundance':
            continue
        if focal_cna.relativeQuantity != atlas_variant.relativeQuantity:
            continue
        genie_loc = focal_cna.molecularVariation.location
        atlas_loc = atlas_variant.molecularVariation.location
        # ignore multi-band amplifications
        if atlas_loc.interval.start[:3] !=  atlas_loc.interval.end[:3]:
            continue
        if genie_loc.chr != atlas_loc.chr:
            continue
        if not band_comp(genie_loc.interval.start, atlas_loc.interval.start, 'ge'):
            continue
        if not band_comp(genie_loc.interval.end, atlas_loc.interval.end, 'le'):
            continue
        matches.add(atlas_variant)
    if matches:
        matched_cnas[gene_cna] = matches

In [25]:
print(f'{len(matched_cnas)} of {len(genie_cna_samples)} ({len(matched_cnas) / len(genie_cna_samples) * 100:.1f}%) gene CNAs with matches to Atlas.')
samples = len(genie_sample_cnas)
matched_samples = set()
for cna in matched_cnas:
    matched_samples.update(genie_cna_samples[cna])
print(f'{len(matched_samples)} of {samples} ({len(matched_samples) / samples * 100:.1f}%) samples with gene CNA matches to Atlas.')

385 of 1756 (21.9%) gene CNAs with matches to Atlas.
26219 of 40812 (64.2%) samples with gene CNA matches to Atlas.


In [26]:
for cna in matched_cnas:
    annotations = list()
    for matched_cna in matched_cnas[cna]:
        annotations.extend(atlas_records[matched_cna])
    annotations = uniqify_and_clean(annotations)
    record = {
        'genie_cna': ga4gh_identify(cna),
        'genie_samples': list(genie_cna_samples[cna]),
        'atlas_fusion': ga4gh_identify(matched_cna),
        'atlas_annotations': annotations
    }
    records.append(record)
genie_cnas = {
    '_meta': _meta,
    'records': records,
    'total': len(records)
}

with open('out/genie_cnas.json', 'w') as f:
    json.dump(genie_cnas, f)

In [31]:
variant_records = dict()
for k, v in atlas_records.items():
    variant_records[ga4gh_identify(k)] = uniqify_and_clean(v)

In [32]:
_meta = {
    'warning': vrs_warning,
    'data_licenses': {
        'atlas': atlas_license
    },
    'versions': {
        'vrs_version': vrs_version,
        'atlas_version': atlas_version
    },
}

atlas_variant_records = {
    '_meta': _meta,
    'atlas_variant_annotations': variant_records,
    'total_variants': len(variant_records)
}

with open('out/atlas_variant_annotations.json', 'w') as f:
    json.dump(atlas_variant_records, f)

In [33]:
vrs_objects = list(set(genie_cna_samples) | set(atlas_records) | set(genie_fusion_samples))
errors = set()
for variation in vrs_objects:
    if variation._id is None:
        try:
            variation._id = ga4gh_identify(variation)
        except KeyError as e:
            errors.add(str(e))

In [34]:
from operator import attrgetter
sorted_objects = list()
for variation in sorted(vrs_objects, key=lambda x: str(x._id)):
    sorted_objects.append(variation.for_json())
with open('out/vrs_records.json', 'w') as f:
    json.dump(sorted_objects, f)

### Questions
Which CNAs are most frequently seen?