In [1]:
import os
import sys

import pandas as pd
import requests

from eva_cttv_pipeline.clinvar_xml_utils.clinvar_xml_utils.clinvar_xml_utils import *
from eva_cttv_pipeline.clinvar_identifier_parsing import *
from consequence_prediction.repeat_expansion_variants.pipeline import *

sys.path.append('../')
from filter_clinvar_xml import filter_xml

In [2]:
# Root directory holding the ClinVar release and the files derived from it.
# Set the OPENTARGETS_PROJECT_ROOT environment variable to run the notebook on
# another machine; the original location remains the default.
PROJECT_ROOT = os.environ.get('OPENTARGETS_PROJECT_ROOT', '/home/april/projects/opentargets')
# Full ClinVar release used as input to the microsatellite filter.
clinvar_path = os.path.join(PROJECT_ROOT, 'ClinVarFullRelease_00-latest.xml.gz')
# Filtered XML containing only the microsatellite records.
microsat_xml = os.path.join(PROJECT_ROOT, 'repeat-expansion', 'microsatellites.xml.gz')

In [3]:
# Filter XML for microsatellites
def microsat(x):
    """Filter predicate selecting records whose measure is a microsatellite.

    :param x: record exposing a ``measure`` attribute
    :return: the falsy ``measure`` itself when the record has none, otherwise
        whether ``measure.variant_type`` equals ``'Microsatellite'``
    """
    measure = x.measure
    if not measure:
        # Hand back the falsy value unchanged, as a short-circuiting `and` would.
        return measure
    return measure.variant_type == 'Microsatellite'


# Run the full ClinVar release through the `microsat` predicate and write the
# result to `microsat_xml`; the later cells load that file rather than the
# full release.
filter_xml(
    input_xml=clinvar_path,
    output_xml=microsat_xml,
    filter_fct=microsat,
)

INFO:filter_clinvar_xml:Records written: 18551


In [3]:
# only microsatellite variants
dataset = ClinVarDataset(microsat_xml)

## Part 1: HGVS parsing

Confirming HGVS coverage (similar to the complex event investigation). Note that the Python `hgvs` module can't parse *anything* in this dataset, so it is omitted from this investigation.

In [4]:
def try_to_parse_hgvs(hgvs_list):
    """Report whether at least one identifier in ``hgvs_list`` can be parsed.

    An identifier counts as parseable when ``parse_variant_identifier`` returns
    a result containing at least one truthy field.

    :param hgvs_list: iterable of HGVS expressions or variant names
    :return: True if any identifier is parseable, False otherwise (including
        for an empty iterable)
    """
    one_parseable = False
    # Every identifier is still passed to the parser (no early exit), exactly
    # as before, so any side effects of parsing are unchanged.
    for hgvs in hgvs_list:
        try:
            if any(parse_variant_identifier(hgvs)):
                one_parseable = True
        except Exception:
            # Best effort: a failed parse (e.g. a None result, which any()
            # cannot iterate) just means "not parseable". Catching Exception
            # rather than using a bare except lets KeyboardInterrupt and
            # SystemExit propagate, so a long run can still be interrupted.
            pass
    return one_parseable

In [None]:
# Tallies over all microsatellite records.
hgvs_count = 0  # records with at least one HGVS expression
toplevel_refseq_hgvs_count = 0  # ... of which have a top-level RefSeq HGVS
parseable_hgvs_count = 0  # records with at least one parseable HGVS
parseable_name_count = 0  # records with at least one parseable name
parseable_hgvs_only = 0  # parseable via HGVS but not via any name
parseable_name_only = 0  # parseable via a name but not via any HGVS

# Records whose HGVS expressions all fail to parse are listed in this report.
# The context manager guarantees the file is flushed and closed even if a
# record raises part-way through the loop.
with open(os.path.join(PROJECT_ROOT, 'repeat-expansion', 'unparseable-hgvs.txt'), 'w+') as hgvs_output_file:
    for record in dataset:
        m = record.measure
        can_parse_hgvs = False
        can_parse_name = False

        if m.hgvs:
            hgvs_count += 1
            if m.toplevel_refseq_hgvs:
                toplevel_refseq_hgvs_count += 1

            # hgvs parseability
            if try_to_parse_hgvs(m.hgvs):
                parseable_hgvs_count += 1
                can_parse_hgvs = True
            else:
                # One block per record: accession, then every HGVS it carries.
                hgvs_output_file.write(f'\n{record.accession}\n')
                for hgvs in m.hgvs:
                    hgvs_output_file.write(hgvs + '\n')

        # variant name parseability
        if try_to_parse_hgvs(m.all_names):
            parseable_name_count += 1
            can_parse_name = True

        # exclusive counts
        if can_parse_hgvs and not can_parse_name:
            parseable_hgvs_only += 1
        if can_parse_name and not can_parse_hgvs:
            parseable_name_only += 1

In [6]:
# Collect the tallies from the parsing loop into one labelled summary.
# The two "Only ..." entries count records that were parseable through exactly
# one of the two sources (HGVS expressions vs. variant names).
counts = {
    'Any HGVS': hgvs_count,
    'Top level refseq HGVS': toplevel_refseq_hgvs_count,
    'HGVS parseable': parseable_hgvs_count,
    'Name parseable': parseable_name_count,
    'Only HGVS parseable': parseable_hgvs_only,
    'Only name parseable': parseable_name_only,
}

In [7]:
# total is 18551
counts

{'Any HGVS': 18538,
 'Top level refseq HGVS': 18456,
 'HGVS parseable': 18533,
 'Name parseable': 18469,
 'Only HGVS parseable': 81,
 'Only name parseable': 17}

### Thoughts

* We should probably try parsing everything we can
* Order of preference: top level refseq > any other HGVS > any variant name
* Should support LRG (assuming Ensembl accepts this)
    
Unparseable from logs (parsing name only, falls back on hgvs when name missing):
```
NC_000004.12:g.(41745972_41746031)ins(15_27)
NC_000004.12:g.(41745972_41746031)ins(15_27)
NC_000004.12:g.(41745972_41746031)ins(15_27)
NG_031977.1:g.(5321_5338)ins(360_?)
NG_031977.1:g.(5321_5338)ins(360_?)
NG_054747.1:g.(19392_19426)TTTTA[(7_?)]TTTCA[(n)]
NR_002717.2(ATXN8OS):n.1103CTG[(107_127)]
NR_002717.2(ATXN8OS):n.1103CTG[(107_127)]
NR_002717.2(ATXN8OS):n.1103CTG[(107_127)]
NR_002717.2(ATXN8OS):n.1103CTG[(107_127)]
NR_002717.2(ATXN8OS):n.1103CTG[(15_40)]
NR_002717.2(ATXN8OS):n.1103CTG[(15_40)]
NR_002717.2(ATXN8OS):n.1103CTG[(15_40)]
NR_003051.3(RMRP):n.-10_-9insCTCTGTGAAGCCTCTGTGAAGC
NR_120611.1:n.192CCG[(35_?)]
fragile site, folic acid type, rare, fra(12)(q13.1)
```

Unparseable output from above counts (parsing any hgvs):
```
RCV000008537
LRG_863t1:c.589_591CAG(36_38)
LRG_863p1:p.Gln197_Gln208delinsGlnGlnGlnGlnGlnGlnGlnGlnGlnGlnGlnGlnGlnGlnGlnGlnGlnGlnGlnGlnGlnGlnGlnGlnGlnGlnGlnGlnGlnGlnGlnGlnGlnGlnGlnGln

RCV000761550
LRG_762t1:c.-128GGC(55_200)

RCV000761551
LRG_762t1:c.-128GGC(55_200)

RCV000853558
NR_120611.1:n.192CCG[(35_?)]

RCV000856572
NG_054747.1:g.(19392_19426)TTTTA[(7_?)]TTTCA[(n)]
```


## Part 2: Consequence prediction

Specifically issues like these:
* Missing genes: UGT1A, ATXN8, various LOC*
* Genes mapping to non-standard chromosomes in Ensembl

In [8]:
# Lift pandas' row-count and column-width truncation so the tables below are
# rendered in full.
# NOTE(review): these options are global and stay in effect for every later cell.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

In [None]:
variants, s = load_clinvar_data(microsat_xml)

In [10]:
len(variants)

1504

In [11]:
s

Counter({'short_expansion': 3514,
         'deletion': 14298,
         'repeat_expansion': 641,
         'no_complete_coords': 98})

In [None]:
variants = annotate_ensembl_gene_info(variants)

In [13]:
def incomplete(row):
    """Flag a variant row that lacks part of the required Ensembl annotation.

    A row is complete only when its Ensembl gene ID, Ensembl gene name and
    repeat type are all non-null and its Ensembl chromosome name is one of
    ``STANDARD_CHROMOSOME_NAMES``.

    :param row: mapping-like row (e.g. a DataFrame row) holding those columns
    :return: True if the row is incomplete, False otherwise
    """
    # Checked in this fixed order; the first missing value settles the answer.
    for column in ('EnsemblGeneID', 'EnsemblGeneName', 'RepeatType'):
        if not pd.notnull(row[column]):
            return True
    return row['EnsemblChromosomeName'] not in STANDARD_CHROMOSOME_NAMES


def required_cols(variants):
    """Return the default annotation columns of ``variants``.

    Thin wrapper: delegates to ``display_cols`` without any additional columns.
    """
    return display_cols(variants)


def display_cols(variants, add_cols=None):
    """Select the gene-annotation columns of ``variants`` for display.

    :param variants: DataFrame containing the annotation columns
    :param add_cols: optional list of extra column names, placed before the
        annotation columns in the result
    :return: ``variants`` restricted to the selected columns
    """
    selected = ['GeneSymbol', 'EnsemblGeneID', 'EnsemblGeneName', 'RepeatType', 'EnsemblChromosomeName']
    if add_cols:
        selected = add_cols + selected
    return variants[selected]


def gene_symbol_like(variants, s):
    """Return the rows of ``variants`` whose gene symbol contains ``s``.

    Rows without a gene symbol are discarded before matching.

    :param variants: DataFrame with a ``GeneSymbol`` column
    :param s: pattern handed to ``Series.str.contains``
    :return: the matching rows, with their original index
    """
    has_symbol = variants['GeneSymbol'].notna()
    named = variants[has_symbol]
    symbol_matches = named['GeneSymbol'].str.contains(s)
    return named[symbol_matches]


def nonstandard_chr_name(variants):
    """Return the rows mapped to a chromosome outside ``STANDARD_CHROMOSOME_NAMES``.

    Rows with no Ensembl chromosome name at all are excluded.

    :param variants: DataFrame with an ``EnsemblChromosomeName`` column
    :return: the rows whose chromosome name is present but non-standard
    """
    located = variants[variants['EnsemblChromosomeName'].notna()]
    on_standard_chr = located['EnsemblChromosomeName'].isin(STANDARD_CHROMOSOME_NAMES)
    return located[~on_standard_chr]

In [14]:
# all incomplete variants
incomplete_variants = variants[variants.apply(incomplete, axis=1)]

In [15]:
print(len(variants))
print(len(incomplete_variants))

1544
63


#### Overview:

18,551 microsatellites ><sup>(1)</sup> 739 repeat expansion candidates > 1544 annotated variants ><sup>(2)</sup> 1481 complete

* (1) is dropping deletions + short expansions which is expected
    * I assume we'll eventually also care about microsatellites that aren't expansions but that's maybe a discussion for another day
* (2) is a combination of parsing issues not extracting variant type, and consequence prediction issues described in this section

### Missing genes

In [16]:
# Incomplete variants for each gene-symbol family under investigation,
# stacked in the order the patterns are listed.
missing_gene_patterns = ('UGT1A', 'ATXN8', 'LOC')
missing_genes = pd.concat(
    [gene_symbol_like(incomplete_variants, pattern) for pattern in missing_gene_patterns]
)

display_cols(missing_genes, add_cols=['Name'])

Unnamed: 0,Name,GeneSymbol,EnsemblGeneID,EnsemblGeneName,RepeatType,EnsemblChromosomeName
4,NC_000002.11:g.234668881_234668882TA[5][6][7][8],UGT1A,,,short_tandem_repeat_expansion,
17,NC_000002.11:g.234668881_234668882TA[5][6][7][8],UGT1A,,,short_tandem_repeat_expansion,
30,NC_000002.11:g.234668881_234668882TA[5][6][7][8],UGT1A,,,short_tandem_repeat_expansion,
1531,NR_002717.2(ATXN8OS):n.1103CTG[(107_127)],ATXN8OS,ENSG00000230223,ATXN8OS,,13
1532,NR_002717.2(ATXN8OS):n.1103CTG[(107_127)],ATXN8,,,,
1533,NR_002717.2(ATXN8OS):n.1103CTG[(107_127)],ATXN8OS,ENSG00000230223,ATXN8OS,,13
1534,NR_002717.2(ATXN8OS):n.1103CTG[(107_127)],ATXN8,,,,
1535,NR_002717.2(ATXN8OS):n.1103CTG[(15_40)],ATXN8,,,,
1537,NR_002717.2(ATXN8OS):n.1103CTG[(15_40)],ATXN8OS,ENSG00000230223,ATXN8OS,,13
35,NC_000004.11:g.3076606GCA[(40_?)],LOC109461479,,,trinucleotide_repeat_expansion,


In [17]:
# Check ATXN8 - ATXN8OS always present
# Take every RCV that has an ATXN8 row, then list all gene rows of those RCVs.
atxn8_rcvs = variants.loc[variants['GeneSymbol'] == 'ATXN8', 'RCVaccession']
atxn8_records = variants[variants['RCVaccession'].isin(atxn8_rcvs)]
display_cols(atxn8_records.sort_values(['RCVaccession', 'GeneSymbol']), add_cols=['RCVaccession'])

Unnamed: 0,RCVaccession,GeneSymbol,EnsemblGeneID,EnsemblGeneName,RepeatType,EnsemblChromosomeName
1532,RCV000000215,ATXN8,,,,
1533,RCV000000215,ATXN8OS,ENSG00000230223,ATXN8OS,,13.0
1534,RCV000006519,ATXN8,,,,
1531,RCV000006519,ATXN8OS,ENSG00000230223,ATXN8OS,,13.0
1535,RCV001260914,ATXN8,,,,
1537,RCV001260914,ATXN8OS,ENSG00000230223,ATXN8OS,,13.0
1536,RCV001260914,LOC109461478,,,,


In [18]:
# Check UGT1A - specific UGT1A* always present
# Take every RCV that has a bare UGT1A row, then list all gene rows of those RCVs.
ugt1a_rcvs = variants.loc[variants['GeneSymbol'] == 'UGT1A', 'RCVaccession']
ugt1a_records = variants[variants['RCVaccession'].isin(ugt1a_rcvs)]
display_cols(ugt1a_records.sort_values(['RCVaccession', 'GeneSymbol']), add_cols=['RCVaccession'])

Unnamed: 0,RCVaccession,GeneSymbol,EnsemblGeneID,EnsemblGeneName,RepeatType,EnsemblChromosomeName
17,RCV000211183,UGT1A,,,short_tandem_repeat_expansion,
21,RCV000211183,UGT1A1,ENSG00000241635,UGT1A1,short_tandem_repeat_expansion,2.0
16,RCV000211183,UGT1A10,ENSG00000242515,UGT1A10,short_tandem_repeat_expansion,2.0
15,RCV000211183,UGT1A3,ENSG00000288702,UGT1A3,short_tandem_repeat_expansion,2.0
14,RCV000211183,UGT1A4,ENSG00000244474,UGT1A4,short_tandem_repeat_expansion,2.0
13,RCV000211183,UGT1A5,ENSG00000288705,UGT1A5,short_tandem_repeat_expansion,2.0
11,RCV000211183,UGT1A6,ENSG00000167165,UGT1A6,short_tandem_repeat_expansion,2.0
18,RCV000211183,UGT1A7,ENSG00000244122,UGT1A7,short_tandem_repeat_expansion,2.0
10,RCV000211183,UGT1A8,ENSG00000242366,UGT1A8,short_tandem_repeat_expansion,2.0
9,RCV000211183,UGT1A9,ENSG00000241119,UGT1A9,short_tandem_repeat_expansion,2.0


In [19]:
# Check LOC* - chr/start/stop always present
# na=False makes rows without a gene symbol count as non-matches; without it a
# missing symbol puts NaN into the mask and the boolean indexing fails.
loc_vars = incomplete_variants[incomplete_variants['GeneSymbol'].str.contains('LOC', na=False)]
loc_rcvs = loc_vars['RCVaccession'].tolist()
# Set lookup keeps the membership test cheap inside the loop over the whole dataset.
loc_rcv_set = set(loc_rcvs)

for r in dataset:
    if r.accession in loc_rcv_set:
        m = r.measure
        print(f"{m.preferred_gene_symbols}: {m.chr}, {m.sequence_location_helper('start')}, {m.sequence_location_helper('stop')}")

['LOC109461479', 'HTT']: 4, 3074935, 3074936
['CSTB', 'LOC109029533']: 21, 43776468, 43776479
['C9orf72', 'LOC109504728']: 9, 27573523, 27573524
['LOC109504725', 'AR']: X, 67545318, 67545320
['DM1-AS', 'DMPK', 'LOC109461477', 'LOC107075317']: 19, 45770205, 45770264
['ATXN8', 'LOC109461478', 'ATXN8OS']: 13, 70139384, 70139386
['LOC108663996', 'TBP']: 6, 170561908, 170561908
['LOC108663996', 'TBP']: 6, 170561908, 170561908
['LOC108510657', 'FXN']: 9, 69037287, 69037304
['LOC108510657', 'FXN']: 9, 69037287, 69037304


In [20]:
# Another approach to LOCs using NCBI esearch
def esearch(s, timeout=30):
    """Search NCBI Gene for ``s`` via E-utilities and fetch summaries of the hits.

    :param s: search term; it is wrapped in double quotes before being sent
    :param timeout: seconds to wait for each HTTP request before giving up
    :return: ``(id_list, summaries)``: the ids reported by esearch and the
        decoded esummary JSON for them; ``([], {})`` when the search has no
        hits; ``(None, None)`` when the search response body is empty
    :raises requests.HTTPError: if either request returns an error status
    """
    eutils_url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/'
    esearch_url = eutils_url + 'esearch.fcgi'
    esummary_url = eutils_url + 'esummary.fcgi'

    payload = {'db': 'Gene', 'term': f'"{s}"', 'retmode': 'JSON'}
    # A timeout stops a stalled connection from hanging the notebook forever;
    # raise_for_status surfaces HTTP errors instead of decoding an error page.
    search_response = requests.get(esearch_url, params=payload, timeout=timeout)
    search_response.raise_for_status()
    data = search_response.json()
    if not data:
        return None, None

    # Tolerate a response without the expected keys: treat it as "no hits".
    result_id_list = (data.get('esearchresult') or {}).get('idlist') or []
    if not result_id_list:
        # Nothing to summarise; skip the second request.
        return result_id_list, {}

    payload = {'db': 'Gene', 'id': ','.join(result_id_list), 'retmode': 'JSON'}
    summary_response = requests.get(esummary_url, params=payload, timeout=timeout)
    summary_response.raise_for_status()
    summary_list = summary_response.json()
    return result_id_list, summary_list

In [21]:
# Gene symbols of the incomplete LOC* variants (duplicates kept, one lookup per row).
loc_variants = gene_symbol_like(incomplete_variants, 'LOC')
locs = loc_variants['GeneSymbol'].tolist()

for loc_symbol in locs:
    print('=====')
    ids, summary_list = esearch(loc_symbol)
    # esearch returns (None, None) for an empty response; treat that as "no hits".
    for i in ids or []:
        gene_summary = summary_list['result'][i]
        loc_name = gene_summary['name']
        print(loc_name)
        # Link by Gene id: identical for LOC<id> names, and still correct
        # should a hit carry a name that does not start with 'LOC'.
        print(f'https://www.ncbi.nlm.nih.gov/gene/{i}')
        print(gene_summary['genomicinfo'])

=====
LOC109461479
https://www.ncbi.nlm.nih.gov/gene/109461479
[{'chrloc': '4', 'chraccver': 'NC_000004.12', 'chrstart': 3074876, 'chrstop': 3074939, 'exoncount': ''}]
=====
LOC107075317
https://www.ncbi.nlm.nih.gov/gene/107075317
[{'chrloc': '19', 'chraccver': 'NC_000019.10', 'chrstart': 45766792, 'chrstop': 45772994, 'exoncount': ''}]
=====
LOC109461477
https://www.ncbi.nlm.nih.gov/gene/109461477
[{'chrloc': '19', 'chraccver': 'NC_000019.10', 'chrstart': 45770204, 'chrstop': 45770265, 'exoncount': ''}]
=====
LOC108663996
https://www.ncbi.nlm.nih.gov/gene/108663996
[{'chrloc': '6', 'chraccver': 'NC_000006.12', 'chrstart': 170561906, 'chrstop': 170562020, 'exoncount': ''}]
=====
LOC108663996
https://www.ncbi.nlm.nih.gov/gene/108663996
[{'chrloc': '6', 'chraccver': 'NC_000006.12', 'chrstart': 170561906, 'chrstop': 170562020, 'exoncount': ''}]
=====
LOC108510657
https://www.ncbi.nlm.nih.gov/gene/108510657
[{'chrloc': '9', 'chraccver': 'NC_000009.12', 'chrstart': 69037275, 'chrstop': 6903

### Non-standard chromosomes

In [22]:
nonstandard_chr = nonstandard_chr_name(incomplete_variants)

display_cols(nonstandard_chr)

Unnamed: 0,GeneSymbol,EnsemblGeneID,EnsemblGeneName,RepeatType,EnsemblChromosomeName
168,TGM1,ENSG00000285348,TGM1,trinucleotide_repeat_expansion,CHR_HG1_PATCH
192,CACNA1C,ENSG00000285479,CACNA1C,trinucleotide_repeat_expansion,CHR_HG1815_PATCH
194,CACNA1C,ENSG00000285479,CACNA1C,trinucleotide_repeat_expansion,CHR_HG1815_PATCH
234,CDKN1C,ENSG00000273707,CDKN1C,trinucleotide_repeat_expansion,CHR_HSCHR11_1_CTG7
236,CDKN1C,ENSG00000273707,CDKN1C,trinucleotide_repeat_expansion,CHR_HSCHR11_1_CTG7
238,CDKN1C,ENSG00000273707,CDKN1C,trinucleotide_repeat_expansion,CHR_HSCHR11_1_CTG7
240,CDKN1C,ENSG00000273707,CDKN1C,trinucleotide_repeat_expansion,CHR_HSCHR11_1_CTG7
242,CDKN1C,ENSG00000273707,CDKN1C,trinucleotide_repeat_expansion,CHR_HSCHR11_1_CTG7
244,CDKN1C,ENSG00000273707,CDKN1C,trinucleotide_repeat_expansion,CHR_HSCHR11_1_CTG7
246,CDKN1C,ENSG00000273707,CDKN1C,trinucleotide_repeat_expansion,CHR_HSCHR11_1_CTG7


In [23]:
# Check non-standard chromosome names - also have standard chromosome annotated
nonstandard_chr_names = nonstandard_chr['EnsemblChromosomeName'].tolist()
on_nonstandard_chr = variants['EnsemblChromosomeName'].isin(nonstandard_chr_names)
nonstandard_rcvs = variants.loc[on_nonstandard_chr, 'RCVaccession'].tolist()

# For every (RCV, gene) pair touched by a non-standard chromosome, list all
# chromosome names annotated for it.
chr_names_per_gene = (
    variants[variants['RCVaccession'].isin(nonstandard_rcvs)]
    .groupby(['RCVaccession', 'EnsemblGeneName'])
    .agg({'EnsemblChromosomeName': lambda names: names.tolist()})
    .sort_values(['RCVaccession'])
)
chr_names_per_gene

Unnamed: 0_level_0,Unnamed: 1_level_0,EnsemblChromosomeName
RCVaccession,EnsemblGeneName,Unnamed: 2_level_1
RCV000311296,CACNA1C,"[CHR_HG1815_PATCH, 12]"
RCV000394635,CACNA1C,"[CHR_HG1815_PATCH, 12]"
RCV000537679,CDKN1C,"[CHR_HSCHR11_1_CTG7, 11]"
RCV000556973,CDKN1C,"[CHR_HSCHR11_1_CTG7, 11]"
RCV000589405,SURF1,"[CHR_HG2030_PATCH, 9]"
RCV000628599,CDKN1C,"[CHR_HSCHR11_1_CTG7, 11]"
RCV000671331,TGM1,"[CHR_HG1_PATCH, 14]"
RCV000690567,NIPA1,"[CHR_HG2365_PATCH, 15]"
RCV000704146,MYH11,"[CHR_HSCHR16_1_CTG1, 16]"
RCV000704146,NDE1,"[CHR_HSCHR16_1_CTG1, 16]"


## Part 3: Correctness

* Mixed repeats
* Classification of microsatellite events without complete coordinates - are they all indeed expansions?
* Spot-check some weird cases

In [24]:
# transcript_id, coordinate_span, repeat_unit_length, is_protein_hgvs

parse_variant_identifier('NM_004409.4:c.*224_*283CTG[(173_283)]CCG[1]CTG[8]CCG[2]CTG[2]CCG[1]CTG[4]CCG[1]CTG[30]')
# 224-283 bases after last coding region, CTG repeated 173-283 times, CCG once, etc.
# what exactly is the coordinate range telling you?

('NM_004409', 60, 3, False)

In [25]:
parse_variant_identifier('NC_000008.10:g.119379055_119379157TGAAA[100_?]TAAAA[40_?]')
# TGAAA repeated 100+ times, TAAAA repeated 40+ times?
# again what is the point of the coordinate range?

('NC_000008', 103, 5, False)

In [26]:
parse_variant_identifier('NM_000548.3(TSC2):c.5068+27_5069-47dup34')
# 27 from one end of intron @ 5068, up to 47 from other end - 34 bases duplicated
# how are you supposed to know how long the intron is??? 34 + 27 + 47 + 1?

('NM_000548', -73, None, False)

In [27]:
parse_variant_identifier('NM_001243246.1(P3H1):c.2049_2067CGAGCGGGTGAGAGCAGCT[3] (p.Trp696delinsSerSerGlyTer)')
# 19 bases within coding region repeated 3 times

('NM_001243246', 19, 19, False)

In [28]:
len('CGAGCGGGTGAGAGCAGCT') == 2067 - 2049 + 1

True

In [29]:
parse_variant_identifier('NM_000368.4(TSC1):c.914-88_914-58T(27_30)')
# 88 from one end of intron up to 58 from same end, T repeated 27-30 times
# coordinate span seems to be max length of the variant

('NM_000368', 31, 1, False)

In [30]:
parse_variant_identifier('NM_001256054.2(C9orf72):c.-45+163_-45+180GGGGCC(2_25)')
# 45 bases before first coding region - 163-180 bases after this?, GGGGCC repeated 2-25 times

('NM_001256054', 18, 6, False)

### Thoughts

* Meaning of coordinate span not always clear
* Mixed repeats
    * the repeat unit is arguably ill-defined for mixed repeats; we currently use the first one
    * might need to get all repeat units in the future

### Repeats lacking explicit coordinates

In [31]:
# Measures flagged as repeat expansion variants that lack complete coordinates.
incomplete_repeats = [
    r.measure
    for r in dataset
    if r.measure and not r.measure.has_complete_coordinates and r.measure.is_repeat_expansion_variant
]

In [32]:
len(incomplete_repeats)

98

In [33]:
# pretty print xml
def pprint(x):
    """Print the XML element ``x`` serialised as a unicode string.

    No re-indentation is applied: the text is emitted exactly as
    ``ElementTree.tostring`` produces it.
    """
    xml_text = ElementTree.tostring(x, encoding='unicode')
    print(xml_text)

In [47]:
# Buckets for the repeats that lack explicit coordinates.
insertions = []
deletions = []
repeat_number_multiple = []  # span is some multiple of repeat unit length
non_repeat_multiple = []  # span is not a multiple of repeat unit length
missing_seq_loc = []
multiple_seq_loc = []
unparseable = []
for m in incomplete_repeats:
    # Anything whose name/HGVS mentions 'del' is set aside before coordinates are read.
    if 'del' in m.get_variant_name_or_hgvs():
        deletions.append(m)
        continue

    # Exactly one GRCh38 sequence location is needed to measure the span.
    seq_locs = find_elements(m.measure_xml, './SequenceLocation[@Assembly="GRCh38"]')
    n_seq_locs = len(seq_locs)
    if n_seq_locs != 1:
        (missing_seq_loc if n_seq_locs < 1 else multiple_seq_loc).append(m)
        continue

    sl = seq_locs[0]
    start, stop = (int(sl.attrib.get(bound)) for bound in ('start', 'stop'))
    loc_span = stop - start + 1
    repeat_unit_len = m.hgvs_properties.repeat_unit_length

    if not repeat_unit_len:
        # No repeat unit length available, so the span cannot be compared to it.
        unparseable.append(m)
    elif start == stop:
        insertions.append(m)
    elif loc_span % repeat_unit_len == 0:
        repeat_number_multiple.append((m, loc_span, repeat_unit_len))
    else:
        non_repeat_multiple.append((m, loc_span, repeat_unit_len))

In [48]:
print('Insertions:', len(insertions))
print('Deletions:', len(deletions))
print('Multiple of repeat unit length:', len(repeat_number_multiple))
print('Not a repeat multiple:', len(non_repeat_multiple))
print('Multiple locations:', len(multiple_seq_loc))
print('Missing location:', len(missing_seq_loc))

# This one is actually a non-coding HGVS that we should otherwise be able to parse:
# NR_002717.2(ATXN8OS):n.1103CTG[(15_40)]
# start is 70139384, stop is 70139386, span == unit == 3
print('Unparseable:', len(unparseable))

Insertions: 8
Deletions: 3
Multiple of repeat unit length: 31
Not a repeat multiple: 0
Multiple locations: 2
Missing location: 53
Unparseable: 1


In [49]:
# One block per variant: name, span, repeat unit length, then a blank separator line.
for m, span, unit in repeat_number_multiple:
    print(
        m.get_variant_name_or_hgvs(),
        f'span: {span}',
        f'repeat unit length: {unit}',
        '',
        sep='\n',
    )

NM_003159.2(CDKL5):c.145+4_145+5AT(11_13)
span: 2
repeat unit length: 2

NM_003159.2(CDKL5):c.145+4_145+5AT(11_13)
span: 2
repeat unit length: 2

NM_003159.2(CDKL5):c.145+4_145+5AT[15]
span: 2
repeat unit length: 2

NM_003159.2(CDKL5):c.145+4_145+5AT[13]
span: 2
repeat unit length: 2

NM_006392.4(NOP56):c.3+71GGCCTG[(650_?)]
span: 6
repeat unit length: 6

NM_000583.4(GC):c.1035-67TAAA[(6_10)]
span: 4
repeat unit length: 4

NM_021619.3(PRDM12):c.1041CGC[(15_?)]
span: 3
repeat unit length: 3

NC_000007.14:g.55020560AC[9][14][15][16][17][18][19][20][21][22][23]
span: 2
repeat unit length: 2

NM_020151.3(STARD7):c.291-1572_291-1518ATTTT[376]ATTTC[274]
span: 3250
repeat unit length: 5

NC_000002.11:g.234668881_234668882TA[5][6][7][8]
span: 2
repeat unit length: 2

NC_000002.11:g.234668881_234668882TA[5][6][7][8]
span: 2
repeat unit length: 2

NC_000002.11:g.234668881_234668882TA[5][6][7][8]
span: 2
repeat unit length: 2

NM_000100.3(CSTB):c.-210CCCCGCCCCGCG[(2_3)]
span: 12
repeat unit lengt