# Extract mutations in VGSC

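This notebook extracts data on all mutations found in the voltage-gated sodium channel (VGSC) gene, AGAP004707, and annotates the effect of each mutation on every available transcript model.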

## Setup

In [1]:
%run setup.ipynb

In [2]:
# Download the AgamP4.4 base-features gene annotations (GFF3) from VectorBase
# into ../../data. --no-clobber makes wget skip the download when the target
# file is already present, so re-running this cell does not re-fetch it.
# NOTE(review): an interrupted download presumably leaves a partial file that
# --no-clobber will not replace; delete it by hand before re-running.
!wget \
    --no-clobber \
    -O ../../data/Anopheles-gambiae-PEST_BASEFEATURES_AgamP4.4.gff3.gz \
    https://www.vectorbase.org/download/anopheles-gambiae-pestbasefeaturesagamp44gff3gz


File `../../data/Anopheles-gambiae-PEST_BASEFEATURES_AgamP4.4.gff3.gz' already there; not retrieving.


In [3]:
# Download the Davies et al. (2007) VGSC gene models (GFF3) into ../../data.
# --no-clobber makes wget skip the download when the file is already present.
# NOTE(review): this URL is plain http; switch to https if the host serves it.
!wget \
    --no-clobber \
    -O ../../data/davies_vgsc_model_20170125.gff3 \
    http://alimanfoo.github.io/assets/davies_vgsc_model_20170125.gff3


File `../../data/davies_vgsc_model_20170125.gff3' already there; not retrieving.


In [4]:
# Read the VectorBase AgamP4.4 annotations into a dataframe, extracting the
# ID and Parent attributes as extra columns.
geneset_agamp44 = allel.gff3_to_dataframe(
    '../../data/Anopheles-gambiae-PEST_BASEFEATURES_AgamP4.4.gff3.gz',
    attributes=['ID', 'Parent'],
)
geneset_agamp44.head()

Unnamed: 0,seqid,source,type,start,end,score,strand,phase,ID,Parent
0,2L,VectorBase,chromosome,1,49364325,-1,.,-1,2L,b'.'
1,2L,VectorBase,gene,157348,186936,-1,-,-1,AGAP004677,b'.'
2,2L,VectorBase,mRNA,157348,181305,-1,-,-1,AGAP004677-RA,AGAP004677
3,2L,VectorBase,three_prime_UTR,157348,157495,-1,-,-1,b'.',AGAP004677-RA
4,2L,VectorBase,exon,157348,157623,-1,-,-1,b'.',AGAP004677-RA


In [5]:
# Subset to VGSC using the query string supplied by region_vgsc; .copy() so
# that the edit below cannot touch geneset_agamp44.
geneset_agamp44_vgsc = geneset_agamp44.query(region_vgsc.query_str).copy()
# Blank out the CDS IDs as they are not informative. Assign through .loc
# instead of writing into Series.values: that only works while .values is a
# writable view of the column, and it raises under pandas Copy-on-Write.
geneset_agamp44_vgsc.loc[geneset_agamp44_vgsc['type'] == 'CDS', 'ID'] = ''
geneset_agamp44_vgsc['type'].value_counts()

exon    93
CDS     93
mRNA     3
gene     1
Name: type, dtype: int64

In [6]:
# Read the Davies et al. (2007) gene models into a dataframe, extracting the
# ID and Parent attributes as extra columns.
geneset_davies = allel.gff3_to_dataframe(
    '../../data/davies_vgsc_model_20170125.gff3',
    attributes=['ID', 'Parent'],
)
geneset_davies.head()

Unnamed: 0,seqid,source,type,start,end,score,strand,phase,ID,Parent
0,2L,Davies et al. (2007),mRNA,2358158,2431617,-1,+,-1,Davies-C1N2,AGAP004707
1,2L,Davies et al. (2007),mRNA,2358158,2431617,-1,+,-1,Davies-C3N2,AGAP004707
2,2L,Davies et al. (2007),mRNA,2358158,2431617,-1,+,-1,Davies-C5N2,AGAP004707
3,2L,Davies et al. (2007),mRNA,2358158,2431617,-1,+,-1,Davies-C7N2,AGAP004707
4,2L,Davies et al. (2007),mRNA,2358158,2431617,-1,+,-1,Davies-C8N2,AGAP004707


In [7]:
# Stack the VectorBase VGSC features on top of the Davies models. Row labels
# are carried over from the two inputs, so index values are not unique.
geneset_vgsc_combined = pandas.concat([geneset_agamp44_vgsc, geneset_davies])
# show the transcript (mRNA) records from both sources
geneset_vgsc_combined[geneset_vgsc_combined['type'] == 'mRNA']

Unnamed: 0,seqid,source,type,start,end,score,strand,phase,ID,Parent
666,2L,VectorBase,mRNA,2358158,2431617,-1,+,-1,AGAP004707-RA,AGAP004707
729,2L,VectorBase,mRNA,2358158,2431617,-1,+,-1,AGAP004707-RB,AGAP004707
792,2L,VectorBase,mRNA,2358158,2431617,-1,+,-1,AGAP004707-RC,AGAP004707
0,2L,Davies et al. (2007),mRNA,2358158,2431617,-1,+,-1,Davies-C1N2,AGAP004707
1,2L,Davies et al. (2007),mRNA,2358158,2431617,-1,+,-1,Davies-C3N2,AGAP004707
2,2L,Davies et al. (2007),mRNA,2358158,2431617,-1,+,-1,Davies-C5N2,AGAP004707
3,2L,Davies et al. (2007),mRNA,2358158,2431617,-1,+,-1,Davies-C7N2,AGAP004707
4,2L,Davies et al. (2007),mRNA,2358158,2431617,-1,+,-1,Davies-C8N2,AGAP004707
5,2L,Davies et al. (2007),mRNA,2358158,2431617,-1,+,-1,Davies-C10N2,AGAP004707
6,2L,Davies et al. (2007),mRNA,2358158,2431617,-1,+,-1,Davies-C11N2,AGAP004707


In [8]:
# Set up a variant effect annotator for chromosome arm 2L. It is given the
# reference genome sequence plus both annotation files downloaded above
# (VectorBase AgamP4.4 and the Davies et al. gene models), so effects can be
# requested for transcripts from either source.
# NOTE(review): the FASTA is the AgamP3 assembly while the VectorBase
# annotations are AgamP4.4; presumably 2L coordinates agree between the two,
# confirm.
annotator = veff.Annotator(
    fasta_path='../../ngs.sanger.ac.uk/production/ag1000g/phase1/AR3/genome/Anopheles-gambiae-PEST_CHROMOSOMES_AgamP3.fa', 
    gff3_path=['../../data/Anopheles-gambiae-PEST_BASEFEATURES_AgamP4.4.gff3.gz',
               '../../data/davies_vgsc_model_20170125.gff3'],
    seqid='2L'
)

In [9]:
# Collect the feature ID of every child of the VGSC gene (AGAP004707) known
# to the annotator; these are the transcripts from both annotation sources.
transcript_ids = [
    child.feature_id
    for child in annotator.get_children('AGAP004707')
]
transcript_ids

['AGAP004707-RA',
 'AGAP004707-RB',
 'AGAP004707-RC',
 'Davies-C1N2',
 'Davies-C3N2',
 'Davies-C5N2',
 'Davies-C7N2',
 'Davies-C8N2',
 'Davies-C10N2',
 'Davies-C11N2',
 'Davies-C1N9',
 'Davies-C8N9',
 'Davies-C1N9ck']

In [10]:
# Tabulate the Davies exons: keep the CDS features, drop the columns that are
# not needed, merge rows that share the same (start, end) coordinates, and
# prefix the remaining columns so they are recognisable after a join.
tbl_davies_exons = (
    etl
    .fromdataframe(geneset_davies)
    .eq('type', 'CDS')
    .cutout('Parent', 'source', 'type', 'score', 'strand', 'phase')
    .merge(key=('start', 'end'))
    .rename({
        'seqid': 'exon_seqid',
        'ID': 'exon',
        'start': 'exon_start',
        'end': 'exon_end',
    })
    .movefield('exon_seqid', 0)
)
tbl_davies_exons.displayall()

0|exon_seqid,1|exon_start,2|exon_end,3|exon
2L,2358158,2358304,1
2L,2359640,2359672,2j
2L,2361989,2362144,3
2L,2381065,2381270,4
2L,2382270,2382398,5
2L,2385694,2385785,6
2L,2390129,2390341,7
2L,2390425,2390485,8
2L,2390594,2390738,9
2L,2391156,2391320,10


## Extract table of variants

In [11]:
# The phase 2 AR1 variant callset, as exposed by the phase2_ar1 object from
# setup.ipynb. Displayed to show which groups and arrays it contains.
callset = phase2_ar1.callset
callset

Group(/, 8)
  arrays: 1; samples
  groups: 7; 2L, 2R, 3L, 3R, UNKN, X, Y_unplaced
  store: DirectoryStore

In [12]:
# show where the callset store lives on disk
callset.store.path

'/home/aliman/src/github/alimanfoo/agam-vgsc-report/ngs.sanger.ac.uk/production/ag1000g/phase2/AR1/variation/main/zarr2/ag1000g.phase2.ar1'

In [13]:
# what fields are available?
# (lists the member names of the 2L/variants group)
print(', '.join(callset['2L/variants']))

ABHet, ABHom, AC, AF, ALT, AN, Accessible, BaseQRankSum, CHROM, Coverage, CoverageMQ0, DP, DS, Dels, FILTER_BaseQRankSum, FILTER_FS, FILTER_HRun, FILTER_HighCoverage, FILTER_HighMQ0, FILTER_LowCoverage, FILTER_LowMQ, FILTER_LowQual, FILTER_NoCoverage, FILTER_PASS, FILTER_QD, FILTER_ReadPosRankSum, FILTER_RefN, FILTER_RepeatDUST, FS, HRun, HW, HaplotypeScore, HighCoverage, HighMQ0, InbreedingCoeff, LowCoverage, LowMQ, LowPairing, MLEAC, MLEAF, MQ, MQ0, MQRankSum, NDA, NoCoverage, OND, POS, QD, QUAL, REF, RPA, RU, ReadPosRankSum, RefMasked, RefN, RepeatDUST, RepeatMasker, RepeatTRF, STR, VariantType, is_snp, num_alleles, svlen


In [14]:
# SnpEff annotations accompanying the phase 2 AR1 callset.
# NOTE(review): the attribute name suggests these were generated against the
# AgamP4.2 gene set - confirm.
callset_snpeff = phase2_ar1.callset_snpeff_agamp42
callset_snpeff

{'2L': <HDF5 group "/2L" (2 members)>,
 '2R': <HDF5 group "/2R" (2 members)>,
 '3L': <HDF5 group "/3L" (2 members)>,
 '3R': <HDF5 group "/3R" (2 members)>,
 'X': <HDF5 group "/X" (2 members)>}

In [16]:
# what SNPEFF fields are available?
# (variants/ANN has a structured dtype; list the names of its fields)
print(', '.join(callset_snpeff['2L']['variants/ANN'].dtype.names))

Allele, Annotation, Annotation_Impact, Gene_Name, Gene_ID, Feature_Type, Feature_ID, Transcript_BioType, Rank, HGVS_c, HGVS_p, cDNA_pos, cDNA_length, CDS_pos, CDS_length, AA_pos, AA_length, Distance


In [18]:
# Sample metadata table for phase 2 AR1; the 'population' column is used
# further down to group samples into populations.
samples = phase2_ar1.df_samples
samples.head()

Unnamed: 0_level_0,src_code,population,country,region,contributor,contact,year,m_s,sex,n_sequences,mean_coverage
ox_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
AA0040-C,Twifo_Praso__E2,GHcol,Ghana,Twifo_Praso,David Weetman,,2012.0,M,F,95033368,30.99
AA0041-C,Twifo_Praso__H3,GHcol,Ghana,Twifo_Praso,David Weetman,,2012.0,M,F,95843804,31.7
AA0042-C,Takoradi_C7,GHcol,Ghana,Takoradi,David Weetman,,2012.0,M,F,107420666,35.65
AA0043-C,Takoradi_H8,GHcol,Ghana,Takoradi,David Weetman,,2012.0,M,F,95993752,29.46
AA0044-C,Takoradi_D10,GHcol,Ghana,Takoradi,David Weetman,,2012.0,M,F,103044262,33.67


In [19]:
def tabulate_variants(callset, snpeff, seqid, start, end, pop_ids, subpops):
    """Build a table of variants for a given callset and genome region.

    The table has one row per alternate allele: a site with ``num_alleles``
    alleles yields ``num_alleles - 1`` rows, distinguished by the ``ALTIX``
    column (0-based index of the alternate allele).

    Parameters
    ----------
    callset : mapping
        Variant callset. ``callset[seqid]['variants']`` (per-variant fields)
        and ``callset[seqid]['calldata/genotype']`` are read.
    snpeff : mapping
        SnpEff annotations. ``snpeff[seqid]['variants']['ANN']`` is read.
    seqid : str
        Chromosome / contig to read from.
    start, end : int
        Region bounds, passed to ``SortedIndex.locate_range`` on ``POS``.
    pop_ids : sequence of str
        Populations to report allele frequencies for, in column order. Every
        ID must be a key of `subpops`.
    subpops : dict
        Mapping from population ID to sample indices, passed to
        ``GenotypeArray.count_alleles_subpops``.

    Returns
    -------
    petl table
        Columns: the per-variant fields (with ``ALTIX`` inserted after
        ``AC``), the SnpEff fields prefixed with ``SNPEFF_``, one ``AF_<pop>``
        frequency column per population, and ``check_allele``, which is True
        when the SnpEff allele is missing or equal to ``ALT``.

    """

    variants = callset[seqid]['variants']
    pos = allel.SortedIndex(variants['POS'])
    loc = pos.locate_range(start, end)
    # Read the annotation records for the region once; slicing the dataset
    # inside the per-field loop below would repeat the same read per field.
    ann = snpeff[seqid]['variants']['ANN'][loc]
    genotype = allel.GenotypeArray(callset[seqid]['calldata/genotype'][loc])
    # max_allele=3: count the reference plus up to three alternate alleles
    acs = genotype.count_alleles_subpops(max_allele=3, subpops=subpops)

    # fields that are split out (ALT, AC) or repeated for each alternate allele
    allele_fields = ['CHROM', 'POS', 'num_alleles', 'REF', 'ALT', 'AC']
    # per-site fields, copied unchanged onto every allele row
    site_fields = [
        'FILTER_PASS',
        'NoCoverage',
        'LowCoverage',
        'HighCoverage',
        'LowMQ',
        'HighMQ0',
        'RepeatDUST',
        'RepeatMasker',
        'RepeatTRF',
        'FS',
        'HRun',
        'QD',
        'ReadPosRankSum',
    ]
    variants_fields = allele_fields + site_fields
    ann_fields = ['Allele', 'Annotation', 'HGVS_c', 'HGVS_p', 'Feature_ID', 'CDS_pos']
    pop_fields = list(pop_ids)
    # byte-string columns to decode to str
    text_fields = ['CHROM', 'REF', 'ALT', 'Allele', 'Annotation', 'HGVS_c', 'HGVS_p', 'Feature_ID']

    # extract columns
    cols = (
        [variants[f][loc] for f in variants_fields] +
        [ann[f] for f in ann_fields] +
        [acs[p].to_frequencies() for p in pop_fields]
    )

    def split_alleles(row):
        """Yield one output row per alternate allele of the input row."""
        for i in range(row['num_alleles'] - 1):
            # break down alleles
            out = [
                row['CHROM'],
                row['POS'],
                row['num_alleles'],
                row['REF'],
                row['ALT'][i],
                row['AC'][i],
                i,
            ]
            # add in remaining variant annotations
            out += [row[f] for f in site_fields]
            # SNPEFF annotation only applies to first allele
            if i == 0:
                out += [row[f] for f in ann_fields]
            else:
                out += [None] * len(ann_fields)
            # add in population allele frequencies (index 0 is the reference)
            out += [row[p][i + 1] for p in pop_fields]
            yield out

    def decode_ascii(v):
        """Decode bytes to str; pass None (and anything else) through.

        Handling None explicitly avoids relying on convert() silently
        swallowing the TypeError that str(None, 'ascii') raises.
        """
        return str(v, 'ascii') if isinstance(v, bytes) else v

    tbl = (
        etl
        .fromcolumns(cols, header=variants_fields + ann_fields + pop_fields)
        .rowmapmany(split_alleles,
                    header=allele_fields + ['ALTIX'] + site_fields + ann_fields + pop_fields,
                    failonerror=True)
        .convert(text_fields, decode_ascii)
        .rename({f: 'SNPEFF_' + f for f in ann_fields})
        .rename({p: 'AF_%s' % p for p in pop_fields})
        .addfield('check_allele', lambda row: row['SNPEFF_Allele'] is None or row['SNPEFF_Allele'] == row['ALT'])
    )

    return tbl

In [20]:
# Population identifiers; their order here is the order of the AF_<pop>
# columns produced by tabulate_variants.
pop_ids = phase2_ar1.pop_ids
pop_ids

('AOcol',
 'BFcol',
 'GHcol',
 'CIcol',
 'GNcol',
 'GW',
 'GM',
 'GNgam',
 'BFgam',
 'GHgam',
 'CMgam',
 'UGgam',
 'GAgam',
 'GQgam',
 'FRgam',
 'KE')

In [25]:
# Sample IDs as stored in the callset, converted to a list of Python str.
# NOTE(review): presumably in the same order as the genotype columns - confirm.
callset_samples = callset['samples'][:].astype('U').tolist()
callset_samples[:5]

['AA0040-C', 'AA0041-C', 'AA0042-C', 'AA0043-C', 'AA0044-C']

In [26]:
# For each sample in the metadata table, find its position in the callset.
# Positions are looked up in a dict built once, rather than calling
# list.index() per sample (a linear scan for every lookup). setdefault keeps
# the first position should an ID ever repeat, which is what list.index()
# returns. A sample missing from the callset raises KeyError.
callset_sample_position = {}
for position, sample_id in enumerate(callset_samples):
    callset_sample_position.setdefault(sample_id, position)
sample_indices = np.array([callset_sample_position[s] for s in samples.index.values])
sample_indices

array([   0,    1,    2, ..., 1139, 1140, 1141])

In [28]:
# map each population ID to the callset indices of its samples
subpops = {
    pop: sample_indices[(samples['population'] == pop).values].tolist()
    for pop in pop_ids
}

In [29]:
# build a table of variants in the VGSC region from the phase 2 AR1 callset,
# with SnpEff annotations and per-population allele frequencies
tbl_variants = tabulate_variants(callset=callset, snpeff=callset_snpeff, 
                                 seqid=region_vgsc.seqid, start=region_vgsc.start, end=region_vgsc.end, 
                                 pop_ids=pop_ids, subpops=subpops)
tbl_variants

0|CHROM,1|POS,2|num_alleles,3|REF,4|ALT,5|AC,6|ALTIX,7|FILTER_PASS,8|NoCoverage,9|LowCoverage,10|HighCoverage,11|LowMQ,12|HighMQ0,13|RepeatDUST,14|RepeatMasker,15|RepeatTRF,16|FS,17|HRun,18|QD,19|ReadPosRankSum,20|SNPEFF_Allele,21|SNPEFF_Annotation,22|SNPEFF_HGVS_c,23|SNPEFF_HGVS_p,24|SNPEFF_Feature_ID,25|SNPEFF_CDS_pos,26|AF_AOcol,27|AF_BFcol,28|AF_GHcol,29|AF_CIcol,30|AF_GNcol,31|AF_GW,32|AF_GM,33|AF_GNgam,34|AF_BFgam,35|AF_GHgam,36|AF_CMgam,37|AF_UGgam,38|AF_GAgam,39|AF_GQgam,40|AF_FRgam,41|AF_KE,42|check_allele
2L,2358254,2,G,A,1,0,True,0,0,15,0,0,False,False,False,11.836,1,17.297,-0.022003,A,missense_variant,n.97G>A,p.Asp33Asn,AGAP004707-RA,97,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0016835016835,0.0,0.0,0.0,0.0,0.0,True
2L,2358309,2,A,G,1,0,True,0,0,20,0,0,False,False,False,2.2656,0,16.391,-2.0918,G,splice_region_variant&intron_varia,n.147+5A>G,.,AGAP004707-RA,-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0016835016835,0.0,0.0,0.0,0.0,0.0,True
2L,2358316,2,T,G,81,0,True,0,0,20,0,0,False,False,False,2.4043,0,16.109,1.2041,G,intron_variant,n.147+12T>G,.,AGAP004707-RA,-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.136363636364,0.0,0.0,0.0,0.0,0.0,True
2L,2358328,2,T,C,8,0,True,0,0,18,0,0,False,False,False,3.373,0,14.758,-0.94482,C,intron_variant,n.147+24T>C,.,AGAP004707-RA,-1,0.0,0.00666666666667,0.0,0.0,0.0,0.0164835164835,0.0307692307692,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True
2L,2358353,2,C,T,1,0,True,0,2,19,0,0,False,False,False,7.0078,0,9.7891,1.3066,T,intron_variant,n.147+49C>T,.,AGAP004707-RA,-1,0.0,0.0,0.0,0.0,0.0,0.00549450549451,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True


## Annotate effects for all transcripts

In [30]:
# effect names reported for variants inside coding sequence
cds_effects = ['NON_SYNONYMOUS_CODING', 'SYNONYMOUS_CODING']
# effect names reported for variants inside introns
intron_effects = ['INTRONIC', 'SPLICE_CORE', 'SPLICE_REGION']
# only these effects are kept when annotating variants
selected_effects = [*cds_effects, *intron_effects]

In [31]:
def lpop(l, default=None):
    """Return the first item of a sequence, or `default` if it is empty.

    Despite the name, nothing is removed: `l` is left unmodified.
    """
    try:
        first = l[0]
    except IndexError:
        first = default
    return first


In [32]:
def transcript_effect(transcript_id):
    """Make a row function that summarises a variant's effect on one transcript.

    The returned function reads the row's ``VEFF`` list, takes the first
    effect whose ``transcript_id`` matches, and returns:

    - ``(effect, aa_change)`` for an effect in ``cds_effects``;
    - ``(effect, intron_cds_5prime, intron_5prime_dist, intron_cds_3prime,
      intron_3prime_dist)`` for an effect in ``intron_effects``;
    - ``None`` when there is no matching effect or it is of another kind.
    """
    def effect_for_row(row):
        matching = (eff for eff in row.VEFF if eff.transcript_id == transcript_id)
        e = next(matching, None)
        if not e:
            return None
        if e.effect in cds_effects:
            return (e.effect, e.aa_change)
        if e.effect in intron_effects:
            return (e.effect, e.intron_cds_5prime, e.intron_5prime_dist, e.intron_cds_3prime, e.intron_3prime_dist)
        return None
    return effect_for_row


In [33]:
# Annotate every variant with the Davies exon it falls in and with its effect
# on each VGSC transcript.
tbl_variants_veff = (
    tbl_variants
    # join in Davies exon information
    .intervalleftjoin(
        # don't include shorter exon alternatives
        tbl_davies_exons.select('exon', lambda v: v[-1] != '-'),
        lkey='CHROM', rkey='exon_seqid', lstart='POS', rstart='exon_start', lstop='POS', rstop='exon_end', include_stop=True)
    .cutout('exon_seqid')
    # temporary column holding the selected effects across all transcripts
    .addfield('VEFF', lambda row: [e for e in annotator.get_effects(chrom=row.CHROM, pos=row.POS, ref=row.REF, alt=row.ALT)
                                   if e.effect in selected_effects])
)

# Add one effect column per transcript. Looping over transcript_ids, rather
# than indexing a fixed number of entries, keeps this correct if the set of
# gene models changes.
for transcript_id in transcript_ids:
    tbl_variants_veff = tbl_variants_veff.addfield(transcript_id, transcript_effect(transcript_id))

tbl_variants_eff = (
    tbl_variants_veff
    # the per-transcript columns now carry everything needed from VEFF
    .cutout('VEFF')
    # normalise missing-value markers to None
    .replaceall('.', None)
    .replaceall('', None)
    .cache()
)

In [34]:
tbl_variants_eff.display(20)

0|CHROM,1|POS,2|num_alleles,3|REF,4|ALT,5|AC,6|ALTIX,7|FILTER_PASS,8|NoCoverage,9|LowCoverage,10|HighCoverage,11|LowMQ,12|HighMQ0,13|RepeatDUST,14|RepeatMasker,15|RepeatTRF,16|FS,17|HRun,18|QD,19|ReadPosRankSum,20|SNPEFF_Allele,21|SNPEFF_Annotation,22|SNPEFF_HGVS_c,23|SNPEFF_HGVS_p,24|SNPEFF_Feature_ID,25|SNPEFF_CDS_pos,26|AF_AOcol,27|AF_BFcol,28|AF_GHcol,29|AF_CIcol,30|AF_GNcol,31|AF_GW,32|AF_GM,33|AF_GNgam,34|AF_BFgam,35|AF_GHgam,36|AF_CMgam,37|AF_UGgam,38|AF_GAgam,39|AF_GQgam,40|AF_FRgam,41|AF_KE,42|check_allele,43|exon_start,44|exon_end,45|exon,46|AGAP004707-RA,47|AGAP004707-RB,48|AGAP004707-RC,49|Davies-C1N2,50|Davies-C3N2,51|Davies-C5N2,52|Davies-C7N2,53|Davies-C8N2,54|Davies-C10N2,55|Davies-C11N2,56|Davies-C1N9,57|Davies-C8N9,58|Davies-C1N9ck
2L,2358254,2,G,A,1,0,True,0,0,15,0,0,False,False,False,11.836,1,17.297,-0.022003,A,missense_variant,n.97G>A,p.Asp33Asn,AGAP004707-RA,97,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0016835016835,0.0,0.0,0.0,0.0,0.0,True,2358158.0,2358304.0,1.0,"('NON_SYNONYMOUS_CODING', 'D33N')","('NON_SYNONYMOUS_CODING', 'D33N')","('NON_SYNONYMOUS_CODING', 'D33N')","('NON_SYNONYMOUS_CODING', 'D33N')","('NON_SYNONYMOUS_CODING', 'D33N')","('NON_SYNONYMOUS_CODING', 'D33N')","('NON_SYNONYMOUS_CODING', 'D33N')","('NON_SYNONYMOUS_CODING', 'D33N')","('NON_SYNONYMOUS_CODING', 'D33N')","('NON_SYNONYMOUS_CODING', 'D33N')","('NON_SYNONYMOUS_CODING', 'D33N')","('NON_SYNONYMOUS_CODING', 'D33N')","('NON_SYNONYMOUS_CODING', 'D33N')"
2L,2358309,2,A,G,1,0,True,0,0,20,0,0,False,False,False,2.2656,0,16.391,-2.0918,G,splice_region_variant&intron_varia,n.147+5A>G,,AGAP004707-RA,-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0016835016835,0.0,0.0,0.0,0.0,0.0,True,,,,"('SPLICE_REGION', 'AGAP004707-PA', 5, 'AGAP004707-PA', -3698)","('SPLICE_REGION', 'AGAP004707-PB', 5, 'AGAP004707-PB', -3698)","('SPLICE_REGION', 'AGAP004707-PC', 5, 'AGAP004707-PC', -3698)","('SPLICE_REGION', '1', 5, '3', -3680)","('SPLICE_REGION', '1', 5, '3', -3680)","('SPLICE_REGION', '1', 5, '3', -3680)","('SPLICE_REGION', '1', 5, '3', -3680)","('SPLICE_REGION', '1', 5, '2j', -1331)","('SPLICE_REGION', '1', 5, '3', -3680)","('SPLICE_REGION', '1', 5, '3', -3680)","('SPLICE_REGION', '1', 5, '3', -3680)","('SPLICE_REGION', '1', 5, '2j', -1331)","('SPLICE_REGION', '1', 5, '3', -3680)"
2L,2358316,2,T,G,81,0,True,0,0,20,0,0,False,False,False,2.4043,0,16.109,1.2041,G,intron_variant,n.147+12T>G,,AGAP004707-RA,-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.136363636364,0.0,0.0,0.0,0.0,0.0,True,,,,"('INTRONIC', 'AGAP004707-PA', 12, 'AGAP004707-PA', -3691)","('INTRONIC', 'AGAP004707-PB', 12, 'AGAP004707-PB', -3691)","('INTRONIC', 'AGAP004707-PC', 12, 'AGAP004707-PC', -3691)","('INTRONIC', '1', 12, '3', -3673)","('INTRONIC', '1', 12, '3', -3673)","('INTRONIC', '1', 12, '3', -3673)","('INTRONIC', '1', 12, '3', -3673)","('INTRONIC', '1', 12, '2j', -1324)","('INTRONIC', '1', 12, '3', -3673)","('INTRONIC', '1', 12, '3', -3673)","('INTRONIC', '1', 12, '3', -3673)","('INTRONIC', '1', 12, '2j', -1324)","('INTRONIC', '1', 12, '3', -3673)"
2L,2358328,2,T,C,8,0,True,0,0,18,0,0,False,False,False,3.373,0,14.758,-0.94482,C,intron_variant,n.147+24T>C,,AGAP004707-RA,-1,0.0,0.00666666666667,0.0,0.0,0.0,0.0164835164835,0.0307692307692,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True,,,,"('INTRONIC', 'AGAP004707-PA', 24, 'AGAP004707-PA', -3679)","('INTRONIC', 'AGAP004707-PB', 24, 'AGAP004707-PB', -3679)","('INTRONIC', 'AGAP004707-PC', 24, 'AGAP004707-PC', -3679)","('INTRONIC', '1', 24, '3', -3661)","('INTRONIC', '1', 24, '3', -3661)","('INTRONIC', '1', 24, '3', -3661)","('INTRONIC', '1', 24, '3', -3661)","('INTRONIC', '1', 24, '2j', -1312)","('INTRONIC', '1', 24, '3', -3661)","('INTRONIC', '1', 24, '3', -3661)","('INTRONIC', '1', 24, '3', -3661)","('INTRONIC', '1', 24, '2j', -1312)","('INTRONIC', '1', 24, '3', -3661)"
2L,2358353,2,C,T,1,0,True,0,2,19,0,0,False,False,False,7.0078,0,9.7891,1.3066,T,intron_variant,n.147+49C>T,,AGAP004707-RA,-1,0.0,0.0,0.0,0.0,0.0,0.00549450549451,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True,,,,"('INTRONIC', 'AGAP004707-PA', 49, 'AGAP004707-PA', -3654)","('INTRONIC', 'AGAP004707-PB', 49, 'AGAP004707-PB', -3654)","('INTRONIC', 'AGAP004707-PC', 49, 'AGAP004707-PC', -3654)","('INTRONIC', '1', 49, '3', -3636)","('INTRONIC', '1', 49, '3', -3636)","('INTRONIC', '1', 49, '3', -3636)","('INTRONIC', '1', 49, '3', -3636)","('INTRONIC', '1', 49, '2j', -1287)","('INTRONIC', '1', 49, '3', -3636)","('INTRONIC', '1', 49, '3', -3636)","('INTRONIC', '1', 49, '3', -3636)","('INTRONIC', '1', 49, '2j', -1287)","('INTRONIC', '1', 49, '3', -3636)"
2L,2358395,2,A,G,1,0,False,0,5,22,0,0,False,False,False,1.2139,0,14.031,0.10602,G,intron_variant,n.147+91A>G,,AGAP004707-RA,-1,0.0,0.0,0.0,0.0,0.0,0.00549450549451,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True,,,,"('INTRONIC', 'AGAP004707-PA', 91, 'AGAP004707-PA', -3612)","('INTRONIC', 'AGAP004707-PB', 91, 'AGAP004707-PB', -3612)","('INTRONIC', 'AGAP004707-PC', 91, 'AGAP004707-PC', -3612)","('INTRONIC', '1', 91, '3', -3594)","('INTRONIC', '1', 91, '3', -3594)","('INTRONIC', '1', 91, '3', -3594)","('INTRONIC', '1', 91, '3', -3594)","('INTRONIC', '1', 91, '2j', -1245)","('INTRONIC', '1', 91, '3', -3594)","('INTRONIC', '1', 91, '3', -3594)","('INTRONIC', '1', 91, '3', -3594)","('INTRONIC', '1', 91, '2j', -1245)","('INTRONIC', '1', 91, '3', -3594)"
2L,2358405,2,T,A,1,0,True,0,7,20,0,0,False,False,False,8.6016,1,10.82,0.59619,A,intron_variant,n.147+101T>A,,AGAP004707-RA,-1,0.0,0.0,0.0,0.0,0.0,0.00549450549451,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True,,,,"('INTRONIC', 'AGAP004707-PA', 101, 'AGAP004707-PA', -3602)","('INTRONIC', 'AGAP004707-PB', 101, 'AGAP004707-PB', -3602)","('INTRONIC', 'AGAP004707-PC', 101, 'AGAP004707-PC', -3602)","('INTRONIC', '1', 101, '3', -3584)","('INTRONIC', '1', 101, '3', -3584)","('INTRONIC', '1', 101, '3', -3584)","('INTRONIC', '1', 101, '3', -3584)","('INTRONIC', '1', 101, '2j', -1235)","('INTRONIC', '1', 101, '3', -3584)","('INTRONIC', '1', 101, '3', -3584)","('INTRONIC', '1', 101, '3', -3584)","('INTRONIC', '1', 101, '2j', -1235)","('INTRONIC', '1', 101, '3', -3584)"
2L,2358407,2,G,C,1,0,True,0,7,19,0,0,False,False,False,1.1348,0,11.289,-0.49805,C,intron_variant,n.147+103G>C,,AGAP004707-RA,-1,0.0,0.0,0.0,0.0,0.0,0.00549450549451,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True,,,,"('INTRONIC', 'AGAP004707-PA', 103, 'AGAP004707-PA', -3600)","('INTRONIC', 'AGAP004707-PB', 103, 'AGAP004707-PB', -3600)","('INTRONIC', 'AGAP004707-PC', 103, 'AGAP004707-PC', -3600)","('INTRONIC', '1', 103, '3', -3582)","('INTRONIC', '1', 103, '3', -3582)","('INTRONIC', '1', 103, '3', -3582)","('INTRONIC', '1', 103, '3', -3582)","('INTRONIC', '1', 103, '2j', -1233)","('INTRONIC', '1', 103, '3', -3582)","('INTRONIC', '1', 103, '3', -3582)","('INTRONIC', '1', 103, '3', -3582)","('INTRONIC', '1', 103, '2j', -1233)","('INTRONIC', '1', 103, '3', -3582)"
2L,2358441,2,A,T,100,0,False,0,10,23,0,0,False,False,False,2.9668,1,21.812,1.0693,T,intron_variant,n.147+137A>T,,AGAP004707-RA,-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.013468013468,0.0,0.652173913043,0.111111111111,0.0,0.0,True,,,,"('INTRONIC', 'AGAP004707-PA', 137, 'AGAP004707-PA', -3566)","('INTRONIC', 'AGAP004707-PB', 137, 'AGAP004707-PB', -3566)","('INTRONIC', 'AGAP004707-PC', 137, 'AGAP004707-PC', -3566)","('INTRONIC', '1', 137, '3', -3548)","('INTRONIC', '1', 137, '3', -3548)","('INTRONIC', '1', 137, '3', -3548)","('INTRONIC', '1', 137, '3', -3548)","('INTRONIC', '1', 137, '2j', -1199)","('INTRONIC', '1', 137, '3', -3548)","('INTRONIC', '1', 137, '3', -3548)","('INTRONIC', '1', 137, '3', -3548)","('INTRONIC', '1', 137, '2j', -1199)","('INTRONIC', '1', 137, '3', -3548)"
2L,2358463,2,G,T,5,0,False,0,8,22,0,0,False,False,False,26.516,0,15.102,0.14001,T,intron_variant,n.147+159G>T,,AGAP004707-RA,-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0520833333333,True,,,,"('INTRONIC', 'AGAP004707-PA', 159, 'AGAP004707-PA', -3544)","('INTRONIC', 'AGAP004707-PB', 159, 'AGAP004707-PB', -3544)","('INTRONIC', 'AGAP004707-PC', 159, 'AGAP004707-PC', -3544)","('INTRONIC', '1', 159, '3', -3526)","('INTRONIC', '1', 159, '3', -3526)","('INTRONIC', '1', 159, '3', -3526)","('INTRONIC', '1', 159, '3', -3526)","('INTRONIC', '1', 159, '2j', -1177)","('INTRONIC', '1', 159, '3', -3526)","('INTRONIC', '1', 159, '3', -3526)","('INTRONIC', '1', 159, '3', -3526)","('INTRONIC', '1', 159, '2j', -1177)","('INTRONIC', '1', 159, '3', -3526)"


## Inspect missense variants

In [36]:
def simplify_missense_effect(v):
    """Reduce an effect tuple to its amino-acid change if missense, else ''."""
    is_missense = bool(v) and v[0] == 'NON_SYNONYMOUS_CODING'
    return v[1] if is_missense else ''

    
def _flag_if(predicate, colour='red'):
    """Make a cell styler that returns a background colour when `predicate` holds."""
    css = 'background-color: %s' % colour
    return lambda v: css if predicate(v) else ''


# Per-column cell stylers: highlight values on the wrong side of a threshold
# in red, and multi-allelic sites in orange.
td_styles = {
    'FILTER_PASS': _flag_if(lambda v: not v),
    'NoCoverage': _flag_if(lambda v: v > 1),
    'LowCoverage': _flag_if(lambda v: v > 76),
    'HighCoverage': _flag_if(lambda v: v > 15),
    'LowMQ': _flag_if(lambda v: v > 76),
    'HighMQ0': _flag_if(lambda v: v > 1),
    'RepeatDUST': _flag_if(lambda v: v),
    'FS': _flag_if(lambda v: v > 60),
    'QD': _flag_if(lambda v: v < 5),
    'ReadPosRankSum': _flag_if(lambda v: v < -8),
    'HRun': _flag_if(lambda v: v > 4),
    'num_alleles': _flag_if(lambda v: v > 2, colour='orange'),
}


def tr_style(row):
    """Shade the row green, with opacity AC/100 capped at 1."""
    alpha = min(1, row['AC'] / 100)
    return 'background-color:rgba(0, 255, 0, %.3f)' % alpha


def _is_missense_in_any_transcript(row):
    """True if the variant is NON_SYNONYMOUS_CODING in at least one transcript."""
    for transcript_id in transcript_ids:
        effect = row[transcript_id]
        if effect and effect[0] == 'NON_SYNONYMOUS_CODING':
            return True
    return False


# Keep variants that are missense in at least one transcript, then reduce each
# transcript column to just the amino-acid change.
tbl_variants_missense = (
    tbl_variants_eff
    .select(_is_missense_in_any_transcript)
    .convert(transcript_ids, simplify_missense_effect)
)
tbl_variants_missense.displayall(td_styles=td_styles, tr_style=tr_style)

0|CHROM,1|POS,2|num_alleles,3|REF,4|ALT,5|AC,6|ALTIX,7|FILTER_PASS,8|NoCoverage,9|LowCoverage,10|HighCoverage,11|LowMQ,12|HighMQ0,13|RepeatDUST,14|RepeatMasker,15|RepeatTRF,16|FS,17|HRun,18|QD,19|ReadPosRankSum,20|SNPEFF_Allele,21|SNPEFF_Annotation,22|SNPEFF_HGVS_c,23|SNPEFF_HGVS_p,24|SNPEFF_Feature_ID,25|SNPEFF_CDS_pos,26|AF_AOcol,27|AF_BFcol,28|AF_GHcol,29|AF_CIcol,30|AF_GNcol,31|AF_GW,32|AF_GM,33|AF_GNgam,34|AF_BFgam,35|AF_GHgam,36|AF_CMgam,37|AF_UGgam,38|AF_GAgam,39|AF_GQgam,40|AF_FRgam,41|AF_KE,42|check_allele,43|exon_start,44|exon_end,45|exon,46|AGAP004707-RA,47|AGAP004707-RB,48|AGAP004707-RC,49|Davies-C1N2,50|Davies-C3N2,51|Davies-C5N2,52|Davies-C7N2,53|Davies-C8N2,54|Davies-C10N2,55|Davies-C11N2,56|Davies-C1N9,57|Davies-C8N9,58|Davies-C1N9ck
2L,2358254,2,G,A,1,0,True,0,0,15,0,0,False,False,False,11.836,1,17.297,-0.022003,A,missense_variant,n.97G>A,p.Asp33Asn,AGAP004707-RA,97.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0016835016835,0.0,0.0,0.0,0.0,0.0,True,2358158,2358304,1,D33N,D33N,D33N,D33N,D33N,D33N,D33N,D33N,D33N,D33N,D33N,D33N,D33N
2L,2359670,2,G,A,7,0,False,1,271,1,1,0,False,False,False,5.7812,6,14.133,-0.20105,A,intron_variant,n.147+1366G>,,AGAP004707-RA,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.010101010101,0.0,0.0,0.0,0.0,0.0104166666667,True,2359640,2359672,2j,,,,,,,,E60K,,,,E60K,
2L,2362002,2,A,T,3,0,True,0,1,3,0,0,False,False,False,6.375,0,13.594,-0.22095,T,splice_region_variant&intron_varia,n.148-5A>T,,AGAP004707-RA,-1.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True,2361989,2362144,3,,,,D54V,D54V,D54V,D54V,D65V,D54V,D54V,D54V,D65V,D54V
2L,2362019,2,G,T,3,0,True,0,0,6,0,0,False,False,False,7.2539,0,14.891,-0.30298,T,missense_variant,n.160G>T,p.Gly54Cys,AGAP004707-RA,160.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True,2361989,2362144,3,G54C,G54C,G54C,G60C,G60C,G60C,G60C,G71C,G60C,G60C,G60C,G71C,G60C
2L,2362023,2,C,T,1,0,True,0,1,4,0,0,False,False,False,0.0,0,13.398,-2.0684,T,missense_variant,n.164C>T,p.Pro55Leu,AGAP004707-RA,164.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0054347826087,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True,2361989,2362144,3,P55L,P55L,P55L,P61L,P61L,P61L,P61L,P72L,P61L,P61L,P61L,P72L,P61L
2L,2390168,2,A,G,2,0,True,0,2,17,0,0,False,False,False,0.0,1,15.008,-0.057007,G,missense_variant,n.752A>G,p.Lys251Arg,AGAP004707-RA,752.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0144927536232,0.0,0.0,0.0,True,2390129,2390341,7,K251R,K251R,K251R,K257R,K214R,K257R,K257R,K268R,K257R,K257R,K257R,K268R,K257R
2L,2390177,2,G,A,215,0,True,0,4,13,0,0,False,False,False,0.479,1,19.5,1.877,A,missense_variant,n.761G>A,p.Arg254Lys,AGAP004707-RA,761.0,0.0,0.0,0.00909090909091,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.313131313131,0.0,0.202898550725,0.0,0.0,0.0,True,2390129,2390341,7,R254K,R254K,R254K,R260K,R217K,R260K,R260K,R271K,R260K,R260K,R260K,R271K,R260K
2L,2390305,2,A,T,1,0,True,0,1,18,0,0,False,False,False,15.07,0,10.156,1.5254,T,missense_variant,n.889A>T,p.Thr297Ser,AGAP004707-RA,889.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00446428571429,0.0,0.0,0.0,0.0,True,2390129,2390341,7,T297S,T297S,T297S,T303S,T260S,T303S,T303S,T314S,T303S,T303S,T303S,T314S,T303S
2L,2390311,2,G,A,1,0,True,0,1,15,0,0,False,False,False,0.92188,3,12.844,-0.74414,A,missense_variant,n.895G>A,p.Glu299Lys,AGAP004707-RA,895.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0016835016835,0.0,0.0,0.0,0.0,0.0,True,2390129,2390341,7,E299K,E299K,E299K,E305K,E262K,E305K,E305K,E316K,E305K,E305K,E305K,E316K,E305K
2L,2390448,2,G,A,6,0,True,0,0,18,0,0,False,False,False,1.9453,0,16.109,-0.95801,A,missense_variant,n.949G>A,p.Gly317Ser,AGAP004707-RA,949.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.010101010101,0.0,0.0,0.0,0.0,0.0,True,2390425,2390485,8,G317S,G317S,G317S,G323S,G280S,G323S,G323S,G334S,G323S,G323S,G323S,G334S,G323S


## Inspect splice site variants

In [38]:
def simplify_intron_effect(v):
    """Reduce a splice effect tuple to the nearer flanking exon and its distance.

    Returns ``(exon, distance)`` for the 5' side when it is strictly closer,
    otherwise for the 3' side (so ties go to the 3' side). Anything that is
    not a SPLICE_REGION / SPLICE_CORE effect gives ''.
    """
    if not v or v[0] not in ('SPLICE_REGION', 'SPLICE_CORE'):
        return ''
    nearer_5prime = math.fabs(v[2]) < math.fabs(v[4])
    return (v[1], v[2]) if nearer_5prime else (v[3], v[4])

    
# Per-column cell stylers: each maps a cell value to a CSS string, giving a
# red background when the value is on the wrong side of its threshold and an
# orange background for sites with more than two alleles.
# NOTE(review): this is identical to the td_styles defined in the missense
# section above; the earlier definition could be reused instead of repeated.
td_styles = {
    'FILTER_PASS': lambda v: 'background-color: red' if not v else '',
    'NoCoverage': lambda v: 'background-color: red' if v > 1 else '',
    'LowCoverage': lambda v: 'background-color: red' if v > 76 else '',
    'HighCoverage': lambda v: 'background-color: red' if v > 15 else '',
    'LowMQ': lambda v: 'background-color: red' if v > 76 else '',
    'HighMQ0': lambda v: 'background-color: red' if v > 1 else '',
    'RepeatDUST': lambda v: 'background-color: red' if v else '',
    'FS': lambda v: 'background-color: red' if v > 60 else '',
    'QD': lambda v: 'background-color: red' if v < 5 else '',
    'ReadPosRankSum': lambda v: 'background-color: red' if v < -8 else '',
    'HRun': lambda v: 'background-color: red' if v > 4 else '',
    'num_alleles': lambda v: 'background-color: orange' if v > 2 else '',
}


def tr_style(row):
    """Return a row style shading the row green by alternate allele count.

    The opacity is ``row['AC'] / 100`` capped at 1, formatted to three
    decimal places.
    """
    scaled = row['AC'] / 100
    # Cap the opacity at 1 once the count reaches 100.
    alpha = scaled if scaled < 1 else 1
    return 'background-color:rgba(0, 255, 0, %.3f)' % alpha


def _has_splice_effect(row):
    """True if any transcript column of `row` holds a splice effect."""
    for transcript_id in transcript_ids:
        effect = row[transcript_id]
        if effect and effect[0] in ['SPLICE_REGION', 'SPLICE_CORE']:
            return True
    return False


# Keep only variants with a splice effect in at least one transcript, then
# collapse each transcript's effect to its nearest (name, distance) pair.
tbl_variants_splice = tbl_variants_eff.select(_has_splice_effect)
tbl_variants_splice = tbl_variants_splice.convert(transcript_ids, simplify_intron_effect)
tbl_variants_splice.displayall(td_styles=td_styles, tr_style=tr_style)

0|CHROM,1|POS,2|num_alleles,3|REF,4|ALT,5|AC,6|ALTIX,7|FILTER_PASS,8|NoCoverage,9|LowCoverage,10|HighCoverage,11|LowMQ,12|HighMQ0,13|RepeatDUST,14|RepeatMasker,15|RepeatTRF,16|FS,17|HRun,18|QD,19|ReadPosRankSum,20|SNPEFF_Allele,21|SNPEFF_Annotation,22|SNPEFF_HGVS_c,23|SNPEFF_HGVS_p,24|SNPEFF_Feature_ID,25|SNPEFF_CDS_pos,26|AF_AOcol,27|AF_BFcol,28|AF_GHcol,29|AF_CIcol,30|AF_GNcol,31|AF_GW,32|AF_GM,33|AF_GNgam,34|AF_BFgam,35|AF_GHgam,36|AF_CMgam,37|AF_UGgam,38|AF_GAgam,39|AF_GQgam,40|AF_FRgam,41|AF_KE,42|check_allele,43|exon_start,44|exon_end,45|exon,46|AGAP004707-RA,47|AGAP004707-RB,48|AGAP004707-RC,49|Davies-C1N2,50|Davies-C3N2,51|Davies-C5N2,52|Davies-C7N2,53|Davies-C8N2,54|Davies-C10N2,55|Davies-C11N2,56|Davies-C1N9,57|Davies-C8N9,58|Davies-C1N9ck
2L,2358309,2,A,G,1,0,True,0,0,20,0,0,False,False,False,2.2656,0,16.391,-2.0918,G,splice_region_variant&intron_varia,n.147+5A>G,,AGAP004707-RA,-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0016835016835,0.0,0.0,0.0,0.0,0.0,True,,,,"('AGAP004707-PA', 5)","('AGAP004707-PB', 5)","('AGAP004707-PC', 5)","('1', 5)","('1', 5)","('1', 5)","('1', 5)","('1', 5)","('1', 5)","('1', 5)","('1', 5)","('1', 5)","('1', 5)"
2L,2362002,2,A,T,3,0,True,0,1,3,0,0,False,False,False,6.375,0,13.594,-0.22095,T,splice_region_variant&intron_varia,n.148-5A>T,,AGAP004707-RA,-1,0.0,0.02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True,2361989.0,2362144.0,3.0,"('AGAP004707-PA', -5)","('AGAP004707-PB', -5)","('AGAP004707-PC', -5)",,,,,,,,,,
2L,2362003,2,C,T,3,0,True,0,1,4,0,0,False,False,False,1.7217,0,14.508,0.27905,T,splice_region_variant&intron_varia,n.148-4C>T,,AGAP004707-RA,-1,0.0,0.02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True,2361989.0,2362144.0,3.0,"('AGAP004707-PA', -4)","('AGAP004707-PB', -4)","('AGAP004707-PC', -4)",,,,,,,,,,
2L,2382263,2,A,G,180,0,True,0,50,2,0,0,False,False,False,11.797,0,25.219,-3.1875,G,splice_region_variant&intron_varia,n.492-7A>G,,AGAP004707-RA,-1,0.00641025641026,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.473214285714,0.0,0.0,0.0,0.760416666667,True,,,,"('AGAP004707-PA', -7)","('AGAP004707-PB', -7)","('AGAP004707-PC', -7)","('5', -7)",,"('5', -7)","('5', -7)","('5', -7)","('5', -7)","('5', -7)","('5', -7)","('5', -7)","('5', -7)"
2L,2390125,2,A,C,1,0,True,0,5,15,0,0,False,False,False,2.4473,1,15.18,0.54785,C,splice_region_variant&intron_varia,n.713-4A>C,,AGAP004707-RA,-1,0.0,0.0,0.0,0.00704225352113,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True,,,,"('AGAP004707-PA', -4)","('AGAP004707-PB', -4)","('AGAP004707-PC', -4)","('7', -4)","('7', -4)","('7', -4)","('7', -4)","('7', -4)","('7', -4)","('7', -4)","('7', -4)","('7', -4)","('7', -4)"
2L,2390126,2,C,T,2,0,True,0,5,15,0,0,False,False,False,5.6133,0,14.273,-1.0088,T,splice_region_variant&intron_varia,n.713-3C>T,,AGAP004707-RA,-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.003367003367,0.0,0.0,0.0,0.0,0.0,True,,,,"('AGAP004707-PA', -3)","('AGAP004707-PB', -3)","('AGAP004707-PC', -3)","('7', -3)","('7', -3)","('7', -3)","('7', -3)","('7', -3)","('7', -3)","('7', -3)","('7', -3)","('7', -3)","('7', -3)"
2L,2400176,2,A,G,1,0,True,0,1,13,0,0,False,False,False,0.0,0,22.172,0.75098,G,splice_region_variant&intron_varia,n.1572+3A>G,,AGAP004707-RA,-1,0.0,0.0,0.0,0.0,0.0,0.00549450549451,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True,,,,"('AGAP004707-PA', 3)","('AGAP004707-PB', 3)","('AGAP004707-PC', 3)","('11i+', 3)","('11i+', 3)","('11i+', 3)","('11i+', 3)","('11i+', 3)","('11i+', 3)","('11i+', 3)","('11i+', 3)","('11i+', 3)","('11i+', 3)"
2L,2407888,2,T,C,9,0,True,0,1,14,0,0,False,False,False,2.3145,0,16.672,0.3811,C,splice_region_variant&intron_varia,n.2017-6T>C,,AGAP004707-RA,-1,0.0,0.0,0.0,0.0,0.0,0.0494505494505,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True,,,,"('AGAP004707-PA', -6)","('AGAP004707-PB', -6)","('AGAP004707-PC', -6)","('16', -6)","('16', -6)","('16', -6)","('16', -6)","('16', -6)","('16', -6)","('16', -6)","('16', -6)","('16', -6)","('16', -6)"
2L,2408000,2,A,G,1,0,True,0,8,9,0,0,False,False,False,3.0684,0,11.133,0.79395,G,splice_region_variant&intron_varia,n.2116+7A>G,,AGAP004707-RA,-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0555555555556,0.0,0.0,True,,,,"('AGAP004707-PA', 7)","('AGAP004707-PB', 7)","('AGAP004707-PC', 7)","('16', 7)","('16', 7)","('16', 7)","('16', 7)","('16', 7)","('16', 7)","('16', 7)","('16', 7)","('16', 7)","('16', 7)"
2L,2417362,2,A,G,838,0,False,5,1067,0,0,0,False,False,False,43.969,1,29.844,1.5029,G,splice_region_variant&intron_varia,n.2637+4A>G,,AGAP004707-RA,-1,0.818181818182,0.846666666667,0.827272727273,0.915492957746,0.875,0.0555555555556,0.0230769230769,1.0,1.0,1.0,0.0929054054054,0.0,0.0,0.166666666667,0.0,0.0,True,,,,"('AGAP004707-PA', 4)","('AGAP004707-PB', 4)","('AGAP004707-PC', 4)","('19', 4)","('19', 4)","('19', 4)","('19', 4)","('19', 4)","('19', 4)","('19', 4)","('19', 4)","('19', 4)","('19', 4)"


## Write out variants to file

In [39]:
def _join_effect_fields(v):
    """Flatten an effect value into a single colon-separated string."""
    return ':'.join(str(field) for field in v)


# The pickle is tapped off before the conversion, so it keeps the effect
# values as they are, while the CSV gets the flattened strings.
tbl_variants_pickled = tbl_variants_eff.teepickle('../../data/phase2/tbl_variants.pkl')
tbl_variants_flat = (
    tbl_variants_pickled
    .convert(transcript_ids, _join_effect_fields)
    .replaceall(None, 'NA')
)
tbl_variants_flat.tocsv('../../data/phase2/tbl_variants.csv')

In [40]:
# check OK: read the pickled table back from disk and display it
etl.frompickle('../../data/phase2/tbl_variants.pkl')

0|CHROM,1|POS,2|num_alleles,3|REF,4|ALT,5|AC,6|ALTIX,7|FILTER_PASS,8|NoCoverage,9|LowCoverage,10|HighCoverage,11|LowMQ,12|HighMQ0,13|RepeatDUST,14|RepeatMasker,15|RepeatTRF,16|FS,17|HRun,18|QD,19|ReadPosRankSum,20|SNPEFF_Allele,21|SNPEFF_Annotation,22|SNPEFF_HGVS_c,23|SNPEFF_HGVS_p,24|SNPEFF_Feature_ID,25|SNPEFF_CDS_pos,26|AF_AOcol,27|AF_BFcol,28|AF_GHcol,29|AF_CIcol,30|AF_GNcol,31|AF_GW,32|AF_GM,33|AF_GNgam,34|AF_BFgam,35|AF_GHgam,36|AF_CMgam,37|AF_UGgam,38|AF_GAgam,39|AF_GQgam,40|AF_FRgam,41|AF_KE,42|check_allele,43|exon_start,44|exon_end,45|exon,46|AGAP004707-RA,47|AGAP004707-RB,48|AGAP004707-RC,49|Davies-C1N2,50|Davies-C3N2,51|Davies-C5N2,52|Davies-C7N2,53|Davies-C8N2,54|Davies-C10N2,55|Davies-C11N2,56|Davies-C1N9,57|Davies-C8N9,58|Davies-C1N9ck
2L,2358254,2,G,A,1,0,True,0,0,15,0,0,False,False,False,11.836,1,17.297,-0.022003,A,missense_variant,n.97G>A,p.Asp33Asn,AGAP004707-RA,97,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0016835016835,0.0,0.0,0.0,0.0,0.0,True,2358158.0,2358304.0,1.0,"('NON_SYNONYMOUS_CODING', 'D33N')","('NON_SYNONYMOUS_CODING', 'D33N')","('NON_SYNONYMOUS_CODING', 'D33N')","('NON_SYNONYMOUS_CODING', 'D33N')","('NON_SYNONYMOUS_CODING', 'D33N')","('NON_SYNONYMOUS_CODING', 'D33N')","('NON_SYNONYMOUS_CODING', 'D33N')","('NON_SYNONYMOUS_CODING', 'D33N')","('NON_SYNONYMOUS_CODING', 'D33N')","('NON_SYNONYMOUS_CODING', 'D33N')","('NON_SYNONYMOUS_CODING', 'D33N')","('NON_SYNONYMOUS_CODING', 'D33N')","('NON_SYNONYMOUS_CODING', 'D33N')"
2L,2358309,2,A,G,1,0,True,0,0,20,0,0,False,False,False,2.2656,0,16.391,-2.0918,G,splice_region_variant&intron_varia,n.147+5A>G,,AGAP004707-RA,-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0016835016835,0.0,0.0,0.0,0.0,0.0,True,,,,"('SPLICE_REGION', 'AGAP004707-PA', 5, 'AGAP004707-PA', -3698)","('SPLICE_REGION', 'AGAP004707-PB', 5, 'AGAP004707-PB', -3698)","('SPLICE_REGION', 'AGAP004707-PC', 5, 'AGAP004707-PC', -3698)","('SPLICE_REGION', '1', 5, '3', -3680)","('SPLICE_REGION', '1', 5, '3', -3680)","('SPLICE_REGION', '1', 5, '3', -3680)","('SPLICE_REGION', '1', 5, '3', -3680)","('SPLICE_REGION', '1', 5, '2j', -1331)","('SPLICE_REGION', '1', 5, '3', -3680)","('SPLICE_REGION', '1', 5, '3', -3680)","('SPLICE_REGION', '1', 5, '3', -3680)","('SPLICE_REGION', '1', 5, '2j', -1331)","('SPLICE_REGION', '1', 5, '3', -3680)"
2L,2358316,2,T,G,81,0,True,0,0,20,0,0,False,False,False,2.4043,0,16.109,1.2041,G,intron_variant,n.147+12T>G,,AGAP004707-RA,-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.136363636364,0.0,0.0,0.0,0.0,0.0,True,,,,"('INTRONIC', 'AGAP004707-PA', 12, 'AGAP004707-PA', -3691)","('INTRONIC', 'AGAP004707-PB', 12, 'AGAP004707-PB', -3691)","('INTRONIC', 'AGAP004707-PC', 12, 'AGAP004707-PC', -3691)","('INTRONIC', '1', 12, '3', -3673)","('INTRONIC', '1', 12, '3', -3673)","('INTRONIC', '1', 12, '3', -3673)","('INTRONIC', '1', 12, '3', -3673)","('INTRONIC', '1', 12, '2j', -1324)","('INTRONIC', '1', 12, '3', -3673)","('INTRONIC', '1', 12, '3', -3673)","('INTRONIC', '1', 12, '3', -3673)","('INTRONIC', '1', 12, '2j', -1324)","('INTRONIC', '1', 12, '3', -3673)"
2L,2358328,2,T,C,8,0,True,0,0,18,0,0,False,False,False,3.373,0,14.758,-0.94482,C,intron_variant,n.147+24T>C,,AGAP004707-RA,-1,0.0,0.00666666666667,0.0,0.0,0.0,0.0164835164835,0.0307692307692,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True,,,,"('INTRONIC', 'AGAP004707-PA', 24, 'AGAP004707-PA', -3679)","('INTRONIC', 'AGAP004707-PB', 24, 'AGAP004707-PB', -3679)","('INTRONIC', 'AGAP004707-PC', 24, 'AGAP004707-PC', -3679)","('INTRONIC', '1', 24, '3', -3661)","('INTRONIC', '1', 24, '3', -3661)","('INTRONIC', '1', 24, '3', -3661)","('INTRONIC', '1', 24, '3', -3661)","('INTRONIC', '1', 24, '2j', -1312)","('INTRONIC', '1', 24, '3', -3661)","('INTRONIC', '1', 24, '3', -3661)","('INTRONIC', '1', 24, '3', -3661)","('INTRONIC', '1', 24, '2j', -1312)","('INTRONIC', '1', 24, '3', -3661)"
2L,2358353,2,C,T,1,0,True,0,2,19,0,0,False,False,False,7.0078,0,9.7891,1.3066,T,intron_variant,n.147+49C>T,,AGAP004707-RA,-1,0.0,0.0,0.0,0.0,0.0,0.00549450549451,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True,,,,"('INTRONIC', 'AGAP004707-PA', 49, 'AGAP004707-PA', -3654)","('INTRONIC', 'AGAP004707-PB', 49, 'AGAP004707-PB', -3654)","('INTRONIC', 'AGAP004707-PC', 49, 'AGAP004707-PC', -3654)","('INTRONIC', '1', 49, '3', -3636)","('INTRONIC', '1', 49, '3', -3636)","('INTRONIC', '1', 49, '3', -3636)","('INTRONIC', '1', 49, '3', -3636)","('INTRONIC', '1', 49, '2j', -1287)","('INTRONIC', '1', 49, '3', -3636)","('INTRONIC', '1', 49, '3', -3636)","('INTRONIC', '1', 49, '3', -3636)","('INTRONIC', '1', 49, '2j', -1287)","('INTRONIC', '1', 49, '3', -3636)"


In [41]:
# check OK: read the CSV export back from disk and display it
etl.fromcsv('../../data/phase2/tbl_variants.csv')

0|CHROM,1|POS,2|num_alleles,3|REF,4|ALT,5|AC,6|ALTIX,7|FILTER_PASS,8|NoCoverage,9|LowCoverage,10|HighCoverage,11|LowMQ,12|HighMQ0,13|RepeatDUST,14|RepeatMasker,15|RepeatTRF,16|FS,17|HRun,18|QD,19|ReadPosRankSum,20|SNPEFF_Allele,21|SNPEFF_Annotation,22|SNPEFF_HGVS_c,23|SNPEFF_HGVS_p,24|SNPEFF_Feature_ID,25|SNPEFF_CDS_pos,26|AF_AOcol,27|AF_BFcol,28|AF_GHcol,29|AF_CIcol,30|AF_GNcol,31|AF_GW,32|AF_GM,33|AF_GNgam,34|AF_BFgam,35|AF_GHgam,36|AF_CMgam,37|AF_UGgam,38|AF_GAgam,39|AF_GQgam,40|AF_FRgam,41|AF_KE,42|check_allele,43|exon_start,44|exon_end,45|exon,46|AGAP004707-RA,47|AGAP004707-RB,48|AGAP004707-RC,49|Davies-C1N2,50|Davies-C3N2,51|Davies-C5N2,52|Davies-C7N2,53|Davies-C8N2,54|Davies-C10N2,55|Davies-C11N2,56|Davies-C1N9,57|Davies-C8N9,58|Davies-C1N9ck
2L,2358254,2,G,A,1,0,True,0,0,15,0,0,False,False,False,11.836,1,17.297,-0.022003,A,missense_variant,n.97G>A,p.Asp33Asn,AGAP004707-RA,97,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0016835016835,0.0,0.0,0.0,0.0,0.0,True,2358158.0,2358304.0,1.0,NON_SYNONYMOUS_CODING:D33N,NON_SYNONYMOUS_CODING:D33N,NON_SYNONYMOUS_CODING:D33N,NON_SYNONYMOUS_CODING:D33N,NON_SYNONYMOUS_CODING:D33N,NON_SYNONYMOUS_CODING:D33N,NON_SYNONYMOUS_CODING:D33N,NON_SYNONYMOUS_CODING:D33N,NON_SYNONYMOUS_CODING:D33N,NON_SYNONYMOUS_CODING:D33N,NON_SYNONYMOUS_CODING:D33N,NON_SYNONYMOUS_CODING:D33N,NON_SYNONYMOUS_CODING:D33N
2L,2358309,2,A,G,1,0,True,0,0,20,0,0,False,False,False,2.2656,0,16.391,-2.0918,G,splice_region_variant&intron_varia,n.147+5A>G,,AGAP004707-RA,-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0016835016835,0.0,0.0,0.0,0.0,0.0,True,,,,SPLICE_REGION:AGAP004707-PA:5:AGAP004707-PA:-3698,SPLICE_REGION:AGAP004707-PB:5:AGAP004707-PB:-3698,SPLICE_REGION:AGAP004707-PC:5:AGAP004707-PC:-3698,SPLICE_REGION:1:5:3:-3680,SPLICE_REGION:1:5:3:-3680,SPLICE_REGION:1:5:3:-3680,SPLICE_REGION:1:5:3:-3680,SPLICE_REGION:1:5:2j:-1331,SPLICE_REGION:1:5:3:-3680,SPLICE_REGION:1:5:3:-3680,SPLICE_REGION:1:5:3:-3680,SPLICE_REGION:1:5:2j:-1331,SPLICE_REGION:1:5:3:-3680
2L,2358316,2,T,G,81,0,True,0,0,20,0,0,False,False,False,2.4043,0,16.109,1.2041,G,intron_variant,n.147+12T>G,,AGAP004707-RA,-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.136363636364,0.0,0.0,0.0,0.0,0.0,True,,,,INTRONIC:AGAP004707-PA:12:AGAP004707-PA:-3691,INTRONIC:AGAP004707-PB:12:AGAP004707-PB:-3691,INTRONIC:AGAP004707-PC:12:AGAP004707-PC:-3691,INTRONIC:1:12:3:-3673,INTRONIC:1:12:3:-3673,INTRONIC:1:12:3:-3673,INTRONIC:1:12:3:-3673,INTRONIC:1:12:2j:-1324,INTRONIC:1:12:3:-3673,INTRONIC:1:12:3:-3673,INTRONIC:1:12:3:-3673,INTRONIC:1:12:2j:-1324,INTRONIC:1:12:3:-3673
2L,2358328,2,T,C,8,0,True,0,0,18,0,0,False,False,False,3.373,0,14.758,-0.94482,C,intron_variant,n.147+24T>C,,AGAP004707-RA,-1,0.0,0.00666666666667,0.0,0.0,0.0,0.0164835164835,0.0307692307692,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True,,,,INTRONIC:AGAP004707-PA:24:AGAP004707-PA:-3679,INTRONIC:AGAP004707-PB:24:AGAP004707-PB:-3679,INTRONIC:AGAP004707-PC:24:AGAP004707-PC:-3679,INTRONIC:1:24:3:-3661,INTRONIC:1:24:3:-3661,INTRONIC:1:24:3:-3661,INTRONIC:1:24:3:-3661,INTRONIC:1:24:2j:-1312,INTRONIC:1:24:3:-3661,INTRONIC:1:24:3:-3661,INTRONIC:1:24:3:-3661,INTRONIC:1:24:2j:-1312,INTRONIC:1:24:3:-3661
2L,2358353,2,C,T,1,0,True,0,2,19,0,0,False,False,False,7.0078,0,9.7891,1.3066,T,intron_variant,n.147+49C>T,,AGAP004707-RA,-1,0.0,0.0,0.0,0.0,0.0,0.00549450549451,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True,,,,INTRONIC:AGAP004707-PA:49:AGAP004707-PA:-3654,INTRONIC:AGAP004707-PB:49:AGAP004707-PB:-3654,INTRONIC:AGAP004707-PC:49:AGAP004707-PC:-3654,INTRONIC:1:49:3:-3636,INTRONIC:1:49:3:-3636,INTRONIC:1:49:3:-3636,INTRONIC:1:49:3:-3636,INTRONIC:1:49:2j:-1287,INTRONIC:1:49:3:-3636,INTRONIC:1:49:3:-3636,INTRONIC:1:49:3:-3636,INTRONIC:1:49:2j:-1287,INTRONIC:1:49:3:-3636
