In [1]:
from itertools import combinations
import microhapdb
# Report which MicroHapDB release is loaded in this session.
print('MicroHapDB version', microhapdb.__version__)

MicroHapDB version 0.3


In [2]:
def panel_alpha():
    '''Baseline panel: the single best ALFRED locus on each chromosome.

    No design constraints beyond the data source are applied. The loci table
    is restricted to ALFRED records, ranked by AvgAe (the effective number of
    alleles averaged across populations) from highest to lowest, and only the
    first -- i.e. highest-ranked -- record seen for each chromosome is kept.

    Returns the IDs of the retained loci as a sorted list.
    '''
    alfred_only = microhapdb.loci.query('Source == "ALFRED"')
    ranked = alfred_only.sort_values('AvgAe', ascending=False)
    # After ranking, the first row for a chromosome is its highest-AvgAe locus.
    best_per_chrom = ranked.drop_duplicates('Chrom')
    return sorted(best_per_chrom.ID)

In [3]:
# Build the alpha panel, translate its MicroHapDB IDs to their "mh..." names
# via the ID map, save the mapping as a TSV file, and display it.
markerids = panel_alpha()
markers = (
    microhapdb.idmap
    .loc[lambda xrefs: xrefs.mhdbID.isin(markerids) & xrefs.XRef.str.startswith('mh')]
    .drop(columns=['Table'])
)
markers.to_csv('alpha-panel.tsv', sep='\t', index=False)
markers

Unnamed: 0,XRef,mhdbID
124,mh01KK-117,MHDBL000013
132,mh10KK-163,MHDBL000017
156,mh11KK-180,MHDBL000030
191,mh12CP-008,MHDBL000048
215,mh13KK-218,MHDBL000060
226,mh14CP-003,MHDBL000066
241,mh15CP001,MHDBL000074
255,mh16KK-049,MHDBL000081
274,mh17CP-001,MHDBL000092
299,mh18CP-005,MHDBL000105


In [4]:
def panel_beta():
    '''First attempt at optimizing panel design.
    
    Slightly less unsophisticated approach to panel selection than panel_alpha,
    but still ignoring some important considerations. This method focuses on a
    few simple filters and simple operations.
    - discard any microhap not present in ALFRED
    - discard any microhap with an average Ae of less than 2.0
    - discard any microhap that spans more than 250 bp
    - for each chromosome, grab the 3 microhaps with the highest combined
      average Ae such that no 2 microhaps occur within 25 Mb; if this criterion
      is too strict, reduce the distance and then the number of desired
      microhaps from 3 to 2 until a compatible set is selected
    - combine microhaps from all chromosomes and select the top 50 by AvgAe
    '''
    loci = microhapdb.loci.copy()
    loci['Length'] = loci['End'] - loci['Start']
    locusids = set()
    for chromid in loci.Chrom.unique():
        chromloci = loci[
            (loci.Source == 'ALFRED') &
            (loci.Chrom == chromid) &
            (loci.AvgAe > 2.0) &
            (loci.Length <= 250)
        ]

        def trycombos(n=3, dist=25e6):
            opt_ae, opt_loci = None, None
            for testlocusids in combinations(chromloci.ID, n):
                testloci = loci[loci.ID.isin(testlocusids)]
                for coord1, coord2 in combinations(testloci.Start, 2):
                    if abs(coord1 - coord2) < dist:
                        break
                else:
                    ae = sum(testloci.AvgAe) / len(testloci.AvgAe)
                    if opt_ae is None or ae > opt_ae:
                        opt_ae = ae
                        opt_loci = testlocusids
            return opt_loci
        params = (
            (3, 25e6), (3, 20e6), (2, 25e6), (2, 20e6),
            (3, 15e6), (2, 15e6),
            (3, 10e6), (2, 10e6), (2, 7.5e6),
        )
        for n, dist in params:
            testloci = trycombos(n=n, dist=dist)
            if testloci is not None:
                break
        assert testloci is not None, chromid
        locusids.update(testloci)
    panel = loci[loci.ID.isin(locusids)].sort_values('AvgAe').head(50)
    return list(panel.ID)

In [5]:
# Build the beta panel, translate its MicroHapDB IDs to their "mh..." names
# via the ID map, save the mapping as a TSV file, and display it.
markerids = panel_beta()
markers = (
    microhapdb.idmap
    .loc[lambda xrefs: xrefs.mhdbID.isin(markerids) & xrefs.XRef.str.startswith('mh')]
    .drop(columns=['Table'])
)
markers.to_csv('beta-panel.tsv', sep='\t', index=False)
markers

Unnamed: 0,XRef,mhdbID
112,mh01KK-205,MHDBL000007
116,mh01CP-016,MHDBL000009
124,mh01KK-117,MHDBL000013
136,mh10CP-003,MHDBL000019
138,mh10KK-170,MHDBL000020
151,mh10KK-101,MHDBL000027
156,mh11KK-180,MHDBL000030
167,mh11KK-037,MHDBL000036
171,mh11KK-191,MHDBL000038
191,mh12CP-008,MHDBL000048
