In [3]:
import numpy as np
import pandas as pd
import json
import statsmodels.stats.multitest as mt
import seaborn as sns
import matplotlib.pyplot as plt

In [4]:
# Load variants. The CSV is read in two pieces (the first 1,000,000 data rows,
# then everything after them) and the pieces are concatenated.
variants_csv_path = '../data/gnomad_v2.1.1_gpcr_variants_unfiltered.csv'
all_variants_begun = pd.read_csv(
    variants_csv_path,
    index_col=0,
    low_memory=False,
    nrows=1000000,
)
all_variants_continued = pd.read_csv(
    variants_csv_path,
    index_col=0,
    low_memory=False,
    skiprows=1000001,  # the header line plus the 1,000,000 rows already read above
    names=all_variants_begun.columns,  # header was skipped, so reuse the column labels
)
all_variants = pd.concat([all_variants_begun, all_variants_continued])
all_variants

Unnamed: 0,HGNC symbol,HGNC name,Grch37 symbol,chromosome,genome_pos,variant_id,reference_bases,alternate_bases,allele_number,allele_count,...,vep_ensembl_transcript,vep_ensembl_protein,vep_protein_pos,vep_amino_acids,vep_distance_to_transcript,vep_swissprot_match,vep_SIFT,vep_PolyPhen,Uniprot name,Ensembl transcript
0,ACKR1,atypical chemokine receptor 1 (Duffy blood group),DARC,1,159174698,['rs751332400'],G,A,251396,4,...,ENST00000368121,ENSP00000357103,,,502.0,Q16570,,,ACKR1_HUMAN,ENST00000368122
1,ACKR1,atypical chemokine receptor 1 (Duffy blood group),DARC,1,159174698,['rs751332400'],G,A,251396,4,...,ENST00000368122,ENSP00000357104,,,,Q16570,,,ACKR1_HUMAN,ENST00000368122
2,ACKR1,atypical chemokine receptor 1 (Duffy blood group),DARC,1,159174698,['rs751332400'],G,A,251396,4,...,ENST00000435307,ENSP00000398406,,,10.0,,,,ACKR1_HUMAN,ENST00000368122
3,ACKR1,atypical chemokine receptor 1 (Duffy blood group),DARC,1,159174698,['rs751332400'],G,A,251396,4,...,ENST00000537147,ENSP00000441985,,,,Q16570,,,ACKR1_HUMAN,ENST00000368122
4,ACKR1,atypical chemokine receptor 1 (Duffy blood group),DARC,1,159174703,['rs1800846'],C,G,251424,2,...,ENST00000368121,ENSP00000357103,,,497.0,Q16570,,,ACKR1_HUMAN,ENST00000368122
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1048402,XCR1,X-C motif chemokine receptor 1,XCR1,3,46063479,['rs759318862'],T,C,209512,2,...,ENST00000395946,ENSP00000379277,,,,,,,XCR1_HUMAN,ENST00000309285
1048403,XCR1,X-C motif chemokine receptor 1,XCR1,3,46063479,['rs759318862'],T,C,209512,2,...,ENST00000542109,ENSP00000438119,,,,P46094,,,XCR1_HUMAN,ENST00000309285
1048404,XCR1,X-C motif chemokine receptor 1,XCR1,3,46063483,['rs767355373'],G,T,204936,3,...,ENST00000309285,ENSP00000310405,,,,P46094,,,XCR1_HUMAN,ENST00000309285
1048405,XCR1,X-C motif chemokine receptor 1,XCR1,3,46063483,['rs767355373'],G,T,204936,3,...,ENST00000395946,ENSP00000379277,,,,,,,XCR1_HUMAN,ENST00000309285


In [51]:
# Load the Ensembl transcripts that produce the Uniprot canonical isoform
canonical_transcripts_uniprot = pd.read_csv(
    '../../receptors/Uniprot_GPCR_canonical_transcripts.csv'
)
canonical_transcripts_uniprot

Unnamed: 0,HGNC symbol,Uniprot name,Canonical ensembl transcript
0,ACKR1,ACKR1_HUMAN,ENST00000368122
1,ACKR2,ACKR2_HUMAN,ENST00000422265
2,ACKR3,ACKR3_HUMAN,ENST00000272928
3,ACKR4,ACKR4_HUMAN,ENST00000249887
4,ADCYAP1R1,PACR_HUMAN,ENST00000304166
...,...,...,...
388,TSHR,TSHR_HUMAN,ENST00000298171
389,UTS2R,UR2R_HUMAN,ENST00000313135
390,VIPR1,VIPR1_HUMAN,ENST00000325123
391,VIPR2,VIPR2_HUMAN,ENST00000262178


In [52]:
# Keep only the variant annotations that affect the canonical transcript.
# 'Uniprot name' and 'Ensembl transcript' are dropped before the merge; the
# canonical-transcript table is then joined on 'HGNC symbol' and rows are kept
# where the VEP transcript equals the canonical one.
all_variants_filtered = (
    all_variants
    .drop(columns=['Uniprot name', 'Ensembl transcript'])
    .merge(canonical_transcripts_uniprot, on='HGNC symbol')
    .loc[lambda merged: merged['vep_ensembl_transcript'] == merged['Canonical ensembl transcript']]
    .drop(columns='Canonical ensembl transcript')
)
all_variants_filtered

Unnamed: 0,HGNC symbol,HGNC name,Grch37 symbol,chromosome,genome_pos,variant_id,reference_bases,alternate_bases,allele_number,allele_count,...,vep_ensembl_gene,vep_ensembl_transcript,vep_ensembl_protein,vep_protein_pos,vep_amino_acids,vep_distance_to_transcript,vep_swissprot_match,vep_SIFT,vep_PolyPhen,Uniprot name
1,ACKR1,atypical chemokine receptor 1 (Duffy blood group),DARC,1,159174698,['rs751332400'],G,A,251396,4,...,ENSG00000213088,ENST00000368122,ENSP00000357104,,,,Q16570,,,ACKR1_HUMAN
5,ACKR1,atypical chemokine receptor 1 (Duffy blood group),DARC,1,159174703,['rs1800846'],C,G,251424,2,...,ENSG00000213088,ENST00000368122,ENSP00000357104,,,,Q16570,,,ACKR1_HUMAN
9,ACKR1,atypical chemokine receptor 1 (Duffy blood group),DARC,1,159174706,['rs774009195'],C,T,251410,2,...,ENSG00000213088,ENST00000368122,ENSP00000357104,,,,Q16570,,,ACKR1_HUMAN
13,ACKR1,atypical chemokine receptor 1 (Duffy blood group),DARC,1,159174707,['rs745541007'],C,T,251440,17,...,ENSG00000213088,ENST00000368122,ENSP00000357104,,,,Q16570,,,ACKR1_HUMAN
17,ACKR1,atypical chemokine receptor 1 (Duffy blood group),DARC,1,159174707,['rs745541007'],C,G,251440,1,...,ENSG00000213088,ENST00000368122,ENSP00000357104,,,,Q16570,,,ACKR1_HUMAN
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1048392,XCR1,X-C motif chemokine receptor 1,XCR1,3,46063458,['rs568812959'],C,T,228398,8,...,ENSG00000173578,ENST00000309285,ENSP00000310405,,,,P46094,,,XCR1_HUMAN
1048395,XCR1,X-C motif chemokine receptor 1,XCR1,3,46063459,['rs766170275'],G,A,227994,2,...,ENSG00000173578,ENST00000309285,ENSP00000310405,,,,P46094,,,XCR1_HUMAN
1048398,XCR1,X-C motif chemokine receptor 1,XCR1,3,46063476,['rs751498207'],T,A,211160,1,...,ENSG00000173578,ENST00000309285,ENSP00000310405,,,,P46094,,,XCR1_HUMAN
1048401,XCR1,X-C motif chemokine receptor 1,XCR1,3,46063479,['rs759318862'],T,C,209512,2,...,ENSG00000173578,ENST00000309285,ENSP00000310405,,,,P46094,,,XCR1_HUMAN


In [54]:
# Missense variants on the canonical transcript: VEP impact 'MODERATE' with a
# consequence containing 'missense'. The result is written to CSV and displayed.
moderate_variants = all_variants_filtered[all_variants_filtered['vep_impact'] == 'MODERATE']
# .copy() after the last filter gives an independent frame, so the column
# assignments below do not write into a slice (SettingWithCopyWarning).
moderate_variants = moderate_variants[moderate_variants['vep_consequence'].str.contains('missense')].copy()

# vep_amino_acids has the form 'reference/alternate' (e.g. 'N/S'). The .str
# accessor is vectorised and yields NaN for a missing value instead of raising.
amino_acid_parts = moderate_variants['vep_amino_acids'].str.split('/')
moderate_variants['reference_amino_acid'] = amino_acid_parts.str[0]
moderate_variants['alternate_amino_acid'] = amino_acid_parts.str[1]
moderate_variants['vep_protein_pos'] = moderate_variants['vep_protein_pos'].astype(int)

# Source column -> output column, listed in output order. Selecting and renaming
# by name (rather than overwriting .columns positionally) means a change in the
# upstream column order cannot silently mislabel the data; a missing column
# raises a KeyError instead.
missense_columns = {
    'HGNC symbol': 'HGNC symbol',
    'HGNC name': 'HGNC name',
    'vep_ensembl_gene': 'ensembl_gene',
    'vep_ensembl_transcript': 'ensembl_transcript',
    'vep_ensembl_protein': 'ensembl_protein',
    'vep_swissprot_match': 'swissprot_match',
    'variant_id': 'variant_id',
    'vep_protein_pos': 'protein_pos',
    'reference_amino_acid': 'reference_amino_acid',
    'alternate_amino_acid': 'alternate_amino_acid',
    'vep_consequence': 'consequence',
    'vep_SIFT': 'SIFT',
    'vep_PolyPhen': 'PolyPhen',
    'allele_number': 'allele_number',
    'allele_count': 'allele_count',
    'num_alternate_homozygous': 'num_alternate_homozygous',
}
moderate_variants = (
    moderate_variants[list(missense_columns)]
    .rename(columns=missense_columns)
    .reset_index(drop=True)  # fresh 0..n-1 index for the saved table
)
moderate_variants.to_csv('../data/gnomad_v2.1.1_gpcr_variants_missense.csv')
moderate_variants

Unnamed: 0,HGNC symbol,HGNC name,ensembl_gene,ensembl_transcript,ensembl_protein,swissprot_match,variant_id,protein_pos,reference_amino_acid,alternate_amino_acid,consequence,SIFT,PolyPhen,allele_number,allele_count,num_alternate_homozygous
0,ACKR1,atypical chemokine receptor 1 (Duffy blood group),ENSG00000213088,ENST00000368122,ENSP00000357104,Q16570,['rs745464625'],3,N,S,missense_variant,tolerated(0.06),probably_damaging(0.987),251294,1,0
1,ACKR1,atypical chemokine receptor 1 (Duffy blood group),ENSG00000213088,ENST00000368122,ENSP00000357104,Q16570,['rs1297356885'],5,L,V,missense_variant,deleterious(0.01),probably_damaging(0.99),251250,1,0
2,ACKR1,atypical chemokine receptor 1 (Duffy blood group),ENSG00000213088,ENST00000368122,ENSP00000357104,Q16570,['rs763887810'],8,A,V,missense_variant&splice_region_variant,tolerated(1),benign(0.001),244322,5,0
3,ACKR1,atypical chemokine receptor 1 (Duffy blood group),ENSG00000213088,ENST00000368122,ENSP00000357104,Q16570,['rs369005597'],9,E,K,missense_variant,tolerated(0.8),benign(0.121),246306,1,0
4,ACKR1,atypical chemokine receptor 1 (Duffy blood group),ENSG00000213088,ENST00000368122,ENSP00000357104,Q16570,['rs531431646'],16,N,S,missense_variant,tolerated(0.17),possibly_damaging(0.689),250308,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
102923,XCR1,X-C motif chemokine receptor 1,ENSG00000173578,ENST00000309285,ENSP00000310405,P46094,['rs757344022'],12,F,L,missense_variant,tolerated(0.65),benign(0.009),250566,2,0
102924,XCR1,X-C motif chemokine receptor 1,ENSG00000173578,ENST00000309285,ENSP00000310405,P46094,['rs778906589'],12,F,I,missense_variant,tolerated(0.39),benign(0.002),250508,1,0
102925,XCR1,X-C motif chemokine receptor 1,ENSG00000173578,ENST00000309285,ENSP00000310405,P46094,['rs778906589'],12,F,V,missense_variant,tolerated(0.5),benign(0.003),250508,13,0
102926,XCR1,X-C motif chemokine receptor 1,ENSG00000173578,ENST00000309285,ENSP00000310405,P46094,['rs1299692557'],2,E,D,missense_variant,tolerated(0.08),possibly_damaging(0.468),246062,3,0


In [55]:
# pLoF variants on the canonical transcript: VEP impact 'HIGH'.
# The result is written to CSV and displayed.
high_variants = all_variants_filtered[all_variants_filtered['vep_impact'] == 'HIGH'].copy()

# vep_protein_pos is a single position ('3') or a range ('59-60'); missing for
# e.g. splice variants. Taking the parts with .str[i] always yields both
# columns (NaN where a part is absent), whereas assigning an expand=True split
# to two columns raises unless the data happens to produce exactly two parts.
protein_pos_parts = high_variants['vep_protein_pos'].str.split('-', n=1)
high_variants['protein_pos_start'] = protein_pos_parts.str[0]
high_variants['protein_pos_end'] = protein_pos_parts.str[1]

# vep_amino_acids has the form 'reference/alternate' (e.g. 'DS/DX').
amino_acid_parts = high_variants['vep_amino_acids'].str.split('/', n=1)
high_variants['reference_amino_acid'] = amino_acid_parts.str[0]
high_variants['alternate_amino_acid'] = amino_acid_parts.str[1]

# Source column -> output column, listed in output order. Selecting and renaming
# by name (rather than overwriting .columns positionally) means a change in the
# upstream column order cannot silently mislabel the data; a missing column
# raises a KeyError instead.
plof_columns = {
    'HGNC symbol': 'HGNC symbol',
    'HGNC name': 'HGNC name',
    'vep_ensembl_gene': 'ensembl_gene',
    'vep_ensembl_transcript': 'ensembl_transcript',
    'vep_ensembl_protein': 'ensembl_protein',
    'vep_swissprot_match': 'swissprot_match',
    'variant_id': 'variant_id',
    'protein_pos_start': 'protein_pos_start',
    'protein_pos_end': 'protein_pos_end',
    'reference_amino_acid': 'reference_amino_acid',
    'alternate_amino_acid': 'alternate_amino_acid',
    'vep_consequence': 'consequence',
    'allele_number': 'allele_number',
    'allele_count': 'allele_count',
    'num_alternate_homozygous': 'num_alternate_homozygous',
}
high_variants = (
    high_variants[list(plof_columns)]
    .rename(columns=plof_columns)
    .reset_index(drop=True)  # fresh 0..n-1 index for the saved table
)
high_variants.to_csv('../data/gnomad_v2.1.1_gpcr_variants_plof.csv')
high_variants

Unnamed: 0,HGNC symbol,HGNC name,ensembl_gene,ensembl_transcript,ensembl_protein,swissprot_match,variant_id,protein_pos_start,protein_pos_end,reference_amino_acid,alternate_amino_acid,consequence,allele_number,allele_count,num_alternate_homozygous
0,ACKR1,atypical chemokine receptor 1 (Duffy blood group),ENSG00000213088,ENST00000368122,ENSP00000357104,Q16570,['rs756754694'],1,,M,V,start_lost,251376,2,0
1,ACKR1,atypical chemokine receptor 1 (Duffy blood group),ENSG00000213088,ENST00000368122,ENSP00000357104,Q16570,['rs768744460'],3,,N,X,frameshift_variant,251342,1,0
2,ACKR1,atypical chemokine receptor 1 (Duffy blood group),ENSG00000213088,ENST00000368122,ENSP00000357104,Q16570,['rs748490259'],,,,,splice_donor_variant,251042,2,0
3,ACKR1,atypical chemokine receptor 1 (Duffy blood group),ENSG00000213088,ENST00000368122,ENSP00000357104,Q16570,['rs3027017'],59,60,DS,DX,frameshift_variant,251218,1,0
4,ACKR1,atypical chemokine receptor 1 (Duffy blood group),ENSG00000213088,ENST00000368122,ENSP00000357104,Q16570,['rs1319167975'],88,,F,X,frameshift_variant,250802,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9025,XCR1,X-C motif chemokine receptor 1,ENSG00000173578,ENST00000309285,ENSP00000310405,P46094,['rs761112616'],154,,W,X,frameshift_variant,249262,1,0
9026,XCR1,X-C motif chemokine receptor 1,ENSG00000173578,ENST00000309285,ENSP00000310405,P46094,['rs144371089'],145,146,CR,X,frameshift_variant,246972,2,0
9027,XCR1,X-C motif chemokine receptor 1,ENSG00000173578,ENST00000309285,ENSP00000310405,P46094,['rs369074574'],135,,L,X,frameshift_variant,243148,2,0
9028,XCR1,X-C motif chemokine receptor 1,ENSG00000173578,ENST00000309285,ENSP00000310405,P46094,['rs781473412'],93,,W,*,stop_gained,251324,3,0


In [56]:
# Synonymous variants on the canonical transcript: VEP impact 'LOW' with a
# consequence containing 'synonymous'. The result is written to CSV and displayed.
synonymous_variants = all_variants_filtered[all_variants_filtered['vep_impact'] == 'LOW']
# .copy() after the last filter gives an independent frame, so the assignment
# below does not write into a slice (SettingWithCopyWarning).
synonymous_variants = synonymous_variants[synonymous_variants['vep_consequence'].str.contains('synonymous')].copy()
synonymous_variants['vep_protein_pos'] = synonymous_variants['vep_protein_pos'].astype(int)

# Source column -> output column, listed in output order. Selecting and renaming
# by name (rather than overwriting .columns positionally) means a change in the
# upstream column order cannot silently mislabel the data; a missing column
# raises a KeyError instead.
synonymous_columns = {
    'HGNC symbol': 'HGNC symbol',
    'HGNC name': 'HGNC name',
    'vep_ensembl_gene': 'ensembl_gene',
    'vep_ensembl_transcript': 'ensembl_transcript',
    'vep_ensembl_protein': 'ensembl_protein',
    'vep_swissprot_match': 'swissprot_match',
    'variant_id': 'variant_id',
    'vep_protein_pos': 'protein_pos',
    'vep_amino_acids': 'amino_acid',
    'vep_consequence': 'consequence',
    'allele_number': 'allele_number',
    'allele_count': 'allele_count',
    'num_alternate_homozygous': 'num_alternate_homozygous',
}
synonymous_variants = (
    synonymous_variants[list(synonymous_columns)]
    .rename(columns=synonymous_columns)
    .reset_index(drop=True)  # fresh 0..n-1 index for the saved table
)
synonymous_variants.to_csv('../data/gnomad_v2.1.1_gpcr_variants_synonymous.csv')
synonymous_variants

Unnamed: 0,HGNC symbol,HGNC name,ensembl_gene,ensembl_transcript,ensembl_protein,swissprot_match,variant_id,protein_pos,amino_acid,consequence,allele_number,allele_count,num_alternate_homozygous
0,ACKR1,atypical chemokine receptor 1 (Duffy blood group),ENSG00000213088,ENST00000368122,ENSP00000357104,Q16570,['rs778295334'],2,G,synonymous_variant,251346,1,0
1,ACKR1,atypical chemokine receptor 1 (Duffy blood group),ENSG00000213088,ENST00000368122,ENSP00000357104,Q16570,['rs771832928'],3,N,synonymous_variant,251276,2,0
2,ACKR1,atypical chemokine receptor 1 (Duffy blood group),ENSG00000213088,ENST00000368122,ENSP00000357104,Q16570,['rs775346470'],6,H,synonymous_variant,251172,1,0
3,ACKR1,atypical chemokine receptor 1 (Duffy blood group),ENSG00000213088,ENST00000368122,ENSP00000357104,Q16570,['rs376855885'],8,A,splice_region_variant&synonymous_variant,245942,1,0
4,ACKR1,atypical chemokine receptor 1 (Duffy blood group),ENSG00000213088,ENST00000368122,ENSP00000357104,Q16570,['rs376855885'],8,A,splice_region_variant&synonymous_variant,245942,4,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
52507,XCR1,X-C motif chemokine receptor 1,ENSG00000173578,ENST00000309285,ENSP00000310405,P46094,['rs777901765'],13,F,synonymous_variant,250316,1,0
52508,XCR1,X-C motif chemokine receptor 1,ENSG00000173578,ENST00000309285,ENSP00000310405,P46094,['rs1398615981'],11,T,synonymous_variant,250460,1,0
52509,XCR1,X-C motif chemokine receptor 1,ENSG00000173578,ENST00000309285,ENSP00000310405,P46094,['rs1305516943'],9,S,synonymous_variant,250126,1,0
52510,XCR1,X-C motif chemokine receptor 1,ENSG00000173578,ENST00000309285,ENSP00000310405,P46094,['rs745994520'],6,N,synonymous_variant,248548,2,0


In [57]:
# Non-coding variants on the canonical transcript: VEP impact 'MODIFIER'.
# Genomic-coordinate and protein-level annotation columns are removed before
# the table is written to CSV and displayed.
modifier_variants = (
    all_variants_filtered
    .loc[all_variants_filtered['vep_impact'] == 'MODIFIER']
    .reset_index(drop=True)  # fresh 0..n-1 index; the old index is discarded
    .drop(
        columns=[
            'Grch37 symbol',
            'chromosome',
            'genome_pos',
            'reference_bases',
            'alternate_bases',
            'vep_impact',
            'vep_gene_symbol',
            'vep_SIFT',
            'vep_PolyPhen',
            'vep_protein_pos',
            'vep_amino_acids',
            'Uniprot name',
        ]
    )
)
modifier_variants.to_csv('../data/gnomad_v2.1.1_gpcr_variants_modifier.csv')
modifier_variants

Unnamed: 0,HGNC symbol,HGNC name,variant_id,allele_number,allele_count,num_alternate_homozygous,vep_consequence,vep_ensembl_gene,vep_ensembl_transcript,vep_ensembl_protein,vep_distance_to_transcript,vep_swissprot_match
0,ACKR1,atypical chemokine receptor 1 (Duffy blood group),['rs751332400'],251396,4,0,5_prime_UTR_variant,ENSG00000213088,ENST00000368122,ENSP00000357104,,Q16570
1,ACKR1,atypical chemokine receptor 1 (Duffy blood group),['rs1800846'],251424,2,0,5_prime_UTR_variant,ENSG00000213088,ENST00000368122,ENSP00000357104,,Q16570
2,ACKR1,atypical chemokine receptor 1 (Duffy blood group),['rs774009195'],251410,2,0,5_prime_UTR_variant,ENSG00000213088,ENST00000368122,ENSP00000357104,,Q16570
3,ACKR1,atypical chemokine receptor 1 (Duffy blood group),['rs745541007'],251440,17,1,5_prime_UTR_variant,ENSG00000213088,ENST00000368122,ENSP00000357104,,Q16570
4,ACKR1,atypical chemokine receptor 1 (Duffy blood group),['rs745541007'],251440,1,0,5_prime_UTR_variant,ENSG00000213088,ENST00000368122,ENSP00000357104,,Q16570
...,...,...,...,...,...,...,...,...,...,...,...,...
62589,XCR1,X-C motif chemokine receptor 1,['rs376028715'],231044,2,0,5_prime_UTR_variant,ENSG00000173578,ENST00000309285,ENSP00000310405,,P46094
62590,XCR1,X-C motif chemokine receptor 1,['rs568812959'],228398,8,0,5_prime_UTR_variant,ENSG00000173578,ENST00000309285,ENSP00000310405,,P46094
62591,XCR1,X-C motif chemokine receptor 1,['rs766170275'],227994,2,0,5_prime_UTR_variant,ENSG00000173578,ENST00000309285,ENSP00000310405,,P46094
62592,XCR1,X-C motif chemokine receptor 1,['rs759318862'],209512,2,0,intron_variant,ENSG00000173578,ENST00000309285,ENSP00000310405,,P46094
