In [1]:
from pandas import json_normalize as json_normalize
from tqdm import tqdm
import pandas as pd
import requests
import argparse
import json
import os
from copy import deepcopy

In [6]:
query = """
    query GnomadVariant($variantId: String, $rsid: String, $datasetId: DatasetId!) {
  variant(variantId: $variantId, rsid: $rsid, dataset: $datasetId) {
    variantId
    reference_genome
    chrom
    pos
    ref
    alt
    colocatedVariants
    multiNucleotideVariants {
      combined_variant_id
      changes_amino_acids
      n_individuals
      other_constituent_snvs
    }
    exome {
      ac
      an
      ac_hemi
      ac_hom
      faf95 {
        popmax
        popmax_population
      }
      filters
      populations {
        id
        ac
        an
        ac_hemi
        ac_hom
      }
      age_distribution {
        het {
          bin_edges
          bin_freq
          n_smaller
          n_larger
        }
        hom {
          bin_edges
          bin_freq
          n_smaller
          n_larger
        }
      }
      qualityMetrics {
        alleleBalance {
          alt {
            bin_edges
            bin_freq
            n_smaller
            n_larger
          }
        }
        genotypeDepth {
          all {
            bin_edges
            bin_freq
            n_smaller
            n_larger
          }
          alt {
            bin_edges
            bin_freq
            n_smaller
            n_larger
          }
        }
        genotypeQuality {
          all {
            bin_edges
            bin_freq
            n_smaller
            n_larger
          }
          alt {
            bin_edges
            bin_freq
            n_smaller
            n_larger
          }
        }
        siteQualityMetrics {
          metric
          value
        }
      }
    }
    genome {
      ac
      an
      ac_hemi
      ac_hom
      faf95 {
        popmax
        popmax_population
      }
      filters
      populations {
        id
        ac
        an
        ac_hemi
        ac_hom
      }
      age_distribution {
        het {
          bin_edges
          bin_freq
          n_smaller
          n_larger
        }
        hom {
          bin_edges
          bin_freq
          n_smaller
          n_larger
        }
      }
      qualityMetrics {
        alleleBalance {
          alt {
            bin_edges
            bin_freq
            n_smaller
            n_larger
          }
        }
        genotypeDepth {
          all {
            bin_edges
            bin_freq
            n_smaller
            n_larger
          }
          alt {
            bin_edges
            bin_freq
            n_smaller
            n_larger
          }
        }
        genotypeQuality {
          all {
            bin_edges
            bin_freq
            n_smaller
            n_larger
          }
          alt {
            bin_edges
            bin_freq
            n_smaller
            n_larger
          }
        }
        siteQualityMetrics {
          metric
          value
        }
      }
    }
    flags
    rsid
    sortedTranscriptConsequences {
      canonical
      gene_id
      gene_version
      gene_symbol
      hgvs
      hgvsc
      hgvsp
      lof
      lof_flags
      lof_filter
      major_consequence
      polyphen_prediction
      sift_prediction
      transcript_id
      transcript_version
    }
  }
}
"""

variables = """
{
  "datasetId": "%s",
  "variantId": "%s"
}
"""

variables = variables % ("gnomad_r2_1", "4-980801-G-A")

In [7]:
end_point = "https://gnomad.broadinstitute.org/api/"
response = requests.post(end_point, data={'query': query, 'variables': variables}, timeout=None)

In [16]:
data = response.json()

In [14]:
from pynomad import DataManager

In [18]:
pd.json_normalize(data)

Unnamed: 0,data.variant.variantId,data.variant.reference_genome,data.variant.chrom,data.variant.pos,data.variant.ref,data.variant.alt,data.variant.colocatedVariants,data.variant.multiNucleotideVariants,data.variant.exome,data.variant.genome.ac,...,data.variant.genome.qualityMetrics.genotypeQuality.all.n_smaller,data.variant.genome.qualityMetrics.genotypeQuality.all.n_larger,data.variant.genome.qualityMetrics.genotypeQuality.alt.bin_edges,data.variant.genome.qualityMetrics.genotypeQuality.alt.bin_freq,data.variant.genome.qualityMetrics.genotypeQuality.alt.n_smaller,data.variant.genome.qualityMetrics.genotypeQuality.alt.n_larger,data.variant.genome.qualityMetrics.siteQualityMetrics,data.variant.flags,data.variant.rsid,data.variant.sortedTranscriptConsequences
0,4-980801-G-A,GRCh37,4,980801,G,A,[],[],,2,...,0,0,"[0, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,0,"[{'metric': 'BaseQRankSum', 'value': 2.55}, {'...",[],rs1165676079,"[{'canonical': True, 'gene_id': 'ENSG000001274..."


In [32]:
POPULATION_ID_MAP = {
            'AFR': 'African',
            'AMI': 'Amish',
            'AMR': 'Latino',
            'ASJ': 'Ashkenazi Jewish',
            'EAS': 'East Asian',
            'FIN': 'European (Finnish)',
            'NFE': 'European (non-Finnish)',
            'OTH': 'Other',
            'SAS': 'South Asian'
        }

POPULATION_ID_MAP['SAS']

'South Asian'

In [29]:
pd.json_normalize(data['data']['variant']['genome']['populations']).set_index('id')

Unnamed: 0_level_0,ac,an,ac_hemi,ac_hom
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AFR,0,8642,0.0,0
AFR_FEMALE,0,3654,0.0,0
AFR_MALE,0,4988,0.0,0
AMR,0,842,0.0,0
AMR_FEMALE,0,410,0.0,0
AMR_MALE,0,432,0.0,0
ASJ,0,286,0.0,0
ASJ_FEMALE,0,78,0.0,0
ASJ_MALE,0,208,0.0,0
EAS,0,1558,0.0,0


In [39]:
def build_variant_search_standard_df(data):
        
        reqdf = pd.json_normalize(data['data']['variant']['genome']['populations']).set_index('id')
        POPULATION_ID_MAP['FEMALE'] = 'Female'
        POPULATION_ID_MAP['MALE'] = 'Male'

        new_index = {}
        frequencies = []
        rows_to_delete = []
        for row in reqdf.iterrows():
            try:
                row_id = row[0]
                new_row_name = ""
                for piece in row_id.split('_'):
                    new_row_name += POPULATION_ID_MAP[piece] + " "
                new_row_name = new_row_name[0:-1]
                new_index[row_id] = new_row_name
                frequencies.append(row[1]['ac']/row[1]['an'])
            except:
                rows_to_delete.append(row[0])
        
        reqdf = reqdf.drop(rows_to_delete)
        print(reqdf)

        new_index['FEMALE'] = 'Total Female'
        new_index['MALE'] = 'Total Male'
        new_columns = {'ac': 'Allele Count', 'an': 'Allele Number',
                    'ac_hemi': 'Number of Hemizygotes', 'ac_hom': 'Number of Homozygotes'}

        df = reqdf.rename(columns=new_columns, index=new_index)

        total_row = df.loc['Total Female'] + df.loc['Total Male']
        total_row.name = 'Total'
        df = df.append([total_row])

        frequencies.append(df.loc['Total']['Allele Count'] / df.loc['Total']['Allele Number'])
        df['Allele Frequency'] = frequencies
        
        chromosome = data['data']['variant']['chrom']
        if chromosome != 'X' and chromosome != 'Y':
            del df['Number of Hemizygotes']

        return df

In [40]:
build_variant_search_standard_df(data)

            ac     an  ac_hemi  ac_hom
id                                    
AFR          0   8642      0.0       0
AFR_FEMALE   0   3654      0.0       0
AFR_MALE     0   4988      0.0       0
AMR          0    842      0.0       0
AMR_FEMALE   0    410      0.0       0
AMR_MALE     0    432      0.0       0
ASJ          0    286      0.0       0
ASJ_FEMALE   0     78      0.0       0
ASJ_MALE     0    208      0.0       0
EAS          0   1558      0.0       0
EAS_FEMALE   0    538      0.0       0
EAS_MALE     0   1020      0.0       0
FIN          0   3446      0.0       0
FIN_FEMALE   0   1838      0.0       0
FIN_MALE     0   1608      0.0       0
NFE          2  15334      0.0       0
NFE_FEMALE   2   6762      0.0       0
NFE_MALE     0   8572      0.0       0
OTH          0   1086      0.0       0
OTH_FEMALE   0    562      0.0       0
OTH_MALE     0    524      0.0       0
SAS          0      0      0.0       0
SAS_FEMALE   0      0      0.0       0
SAS_MALE     0      0    

  frequencies.append(row[1]['ac']/row[1]['an'])


Unnamed: 0,Allele Count,Allele Number,Number of Homozygotes,Allele Frequency
African,0.0,8642.0,0.0,0.0
African Female,0.0,3654.0,0.0,0.0
African Male,0.0,4988.0,0.0,0.0
Latino,0.0,842.0,0.0,0.0
Latino Female,0.0,410.0,0.0,0.0
Latino Male,0.0,432.0,0.0,0.0
Ashkenazi Jewish,0.0,286.0,0.0,0.0
Ashkenazi Jewish Female,0.0,78.0,0.0,0.0
Ashkenazi Jewish Male,0.0,208.0,0.0,0.0
East Asian,0.0,1558.0,0.0,0.0


In [118]:
query = """query VariantInRegion($chrom: String!, $start: Int!, $stop: Int!, $datasetId: DatasetId!, $referenceGenome: ReferenceGenomeId!) {
  region(start: $start, stop: $stop, chrom: $chrom, reference_genome: $referenceGenome) {
    clinvar_variants {
      clinical_significance
      clinvar_variation_id
      gold_stars
      major_consequence
      pos
      variant_id
    }
    variants(dataset: $datasetId) {
      consequence
      flags
      gene_id
      gene_symbol
      hgvs
      hgvsc
      hgvsp
      lof
      lof_filter
      lof_flags
      pos
      rsid
      variant_id: variantId
      exome {
        ac
        ac_hemi
        ac_hom
        an
        af
        filters
        populations {
          id
          ac
          an
          ac_hemi
          ac_hom
        }
      }
      genome {
        ac
        ac_hemi
        ac_hom
        an
        af
        filters
        populations {
          id
          ac
          an
          ac_hemi
          ac_hom
        }
      }
    }
  }
}
"""

variables= """
{
  "datasetId": "gnomad_r2_1",
  "chrom": "X",
  "start": 980801,
  "stop": 980830,
  "referenceGenome": "GRCh37"
}
"""

In [119]:
end_point = "https://gnomad.broadinstitute.org/api/"
response = requests.post(end_point, data={'query': query, 'variables': variables}, timeout=None)

In [120]:
data = response.json()

In [121]:
data

{'data': {'region': {'clinvar_variants': [],
   'variants': [{'consequence': None,
     'flags': ['lcr', 'segdup', 'par'],
     'gene_id': None,
     'gene_symbol': None,
     'hgvs': None,
     'hgvsc': None,
     'hgvsp': None,
     'lof': None,
     'lof_filter': None,
     'lof_flags': None,
     'pos': 980805,
     'rsid': None,
     'variant_id': 'X-980805-C-A',
     'exome': None,
     'genome': {'ac': 1,
      'ac_hemi': 0,
      'ac_hom': 0,
      'an': 28158,
      'af': 3.5513885929398395e-05,
      'filters': ['RF'],
      'populations': [{'id': 'AFR',
        'ac': 0,
        'an': 7962,
        'ac_hemi': 0,
        'ac_hom': 0},
       {'id': 'AMR', 'ac': 0, 'an': 812, 'ac_hemi': 0, 'ac_hom': 0},
       {'id': 'ASJ', 'ac': 0, 'an': 286, 'ac_hemi': 0, 'ac_hom': 0},
       {'id': 'EAS', 'ac': 0, 'an': 1528, 'ac_hemi': 0, 'ac_hom': 0},
       {'id': 'FIN', 'ac': 0, 'an': 2624, 'ac_hemi': 0, 'ac_hom': 0},
       {'id': 'NFE', 'ac': 1, 'an': 14002, 'ac_hemi': 0, 'ac_hom': 0},

In [83]:
data['data']['region']['clinvar_variants']

[]

In [84]:
raw_df = pd.DataFrame(data['data']['region']['variants'])

In [85]:
renamed_cols = {
                            'variant_id': 'Variant ID',
                            'gene_symbol': 'Gene',
                            'hgvs': 'Consequence',
                            'consequence': 'Annotation',
                            'flags': 'Flags'
                        }
        
standard_cols = [
            'Variant ID', 'Gene', 'Consequence', 
            'Annotation', 'Flags', 'Allele Count',
            'Allele Number', 'Allele Frequency',
            'Number of Homozygotes'
        ]

df_renamed = raw_df.rename(columns=renamed_cols)
#df_final = self._explicit_allele_informations(df_renamed)
#standard_df = df_final.loc[:, standard_cols]
#self._add_variant_columns()

df_renamed

Unnamed: 0,Annotation,Flags,gene_id,Gene,Consequence,hgvsc,hgvsp,lof,lof_filter,lof_flags,pos,rsid,Variant ID,exome,genome
0,3_prime_UTR_variant,[],ENSG00000010404,IDS,c.*44G>C,c.*44G>C,,,,,148564233,,X-148564233-C-G,"{'ac': 1, 'ac_hemi': 0, 'ac_hom': 0, 'an': 178...",
1,3_prime_UTR_variant,[],ENSG00000010404,IDS,c.*42C>T,c.*42C>T,,,,,148564235,rs782584433,X-148564235-G-A,"{'ac': 0, 'ac_hemi': 0, 'ac_hom': 0, 'an': 178...",
2,3_prime_UTR_variant,[],ENSG00000010404,IDS,c.*37C>T,c.*37C>T,,,,,148564240,rs192118605,X-148564240-G-A,"{'ac': 42, 'ac_hemi': 13, 'ac_hom': 0, 'an': 1...",
3,3_prime_UTR_variant,[],ENSG00000010404,IDS,c.*31G>A,c.*31G>A,,,,,148564246,rs782432215,X-148564246-C-T,"{'ac': 2, 'ac_hemi': 2, 'ac_hom': 0, 'an': 181...",
4,3_prime_UTR_variant,[],ENSG00000010404,IDS,c.*29A>G,c.*29A>G,,,,,148564248,,X-148564248-T-C,"{'ac': 1, 'ac_hemi': 0, 'ac_hom': 0, 'an': 181...",


In [88]:
def explicit_allele_informations(df):
        chromosome = df['Variant ID'][0][0]
        allele_count = []
        allele_number = []
        allele_freq = []
        num_homozygotes = []
        num_hemizygotes = []
        
        region = []
        
        for variant in df['genome']:
            if variant:
                allele_count.append(variant['ac'])
                allele_number.append(variant['an'])
                allele_freq.append(variant['af'])
                region.append("Genome")
            
                n_homs = 0
                n_hemi = 0
                for population in variant['populations']:
                    n_homs += population['ac_hom']
                    n_hemi += population['ac_hemi']
                num_homozygotes.append(n_homs)
                num_hemizygotes.append(n_hemi)
            else:
                region.append("Exome")
           
        
        for variant in df['exome']:
            if variant:
                allele_count.append(variant['ac'])
                allele_number.append(variant['an'])
                allele_freq.append(variant['af'])
            
                n_homs = 0
                n_hemi = 0
                for population in variant['populations']:
                    n_homs += population['ac_hom']
                    n_hemi += population['ac_hemi']
                num_homozygotes.append(n_homs)
                num_hemizygotes.append(n_hemi)
                
            
        df['Allele Count'] = allele_count
        df['Allele Number'] = allele_number
        df['Allele Frequency'] = allele_freq
        df['Number of Homozygotes'] = num_homozygotes
        if (chromosome == 'X') or (chromosome == 'Y'):
            df['Number of Hemizygotes'] = num_hemizygotes
        df['Region'] = region
        return df

In [100]:
df = explicit_allele_informations(df_renamed)
df

Unnamed: 0,Annotation,Flags,gene_id,Gene,Consequence,hgvsc,hgvsp,lof,lof_filter,lof_flags,...,rsid,Variant ID,exome,genome,Allele Count,Allele Number,Allele Frequency,Number of Homozygotes,Number of Hemizygotes,Region
0,3_prime_UTR_variant,[],ENSG00000010404,IDS,c.*44G>C,c.*44G>C,,,,,...,,X-148564233-C-G,"{'ac': 1, 'ac_hemi': 0, 'ac_hom': 0, 'an': 178...",,1,178182,6e-06,0,0,Exome
1,3_prime_UTR_variant,[],ENSG00000010404,IDS,c.*42C>T,c.*42C>T,,,,,...,rs782584433,X-148564235-G-A,"{'ac': 0, 'ac_hemi': 0, 'ac_hom': 0, 'an': 178...",,0,178926,0.0,0,0,Exome
2,3_prime_UTR_variant,[],ENSG00000010404,IDS,c.*37C>T,c.*37C>T,,,,,...,rs192118605,X-148564240-G-A,"{'ac': 42, 'ac_hemi': 13, 'ac_hom': 0, 'an': 1...",,42,180127,0.000233,0,13,Exome
3,3_prime_UTR_variant,[],ENSG00000010404,IDS,c.*31G>A,c.*31G>A,,,,,...,rs782432215,X-148564246-C-T,"{'ac': 2, 'ac_hemi': 2, 'ac_hom': 0, 'an': 181...",,2,181201,1.1e-05,0,2,Exome
4,3_prime_UTR_variant,[],ENSG00000010404,IDS,c.*29A>G,c.*29A>G,,,,,...,,X-148564248-T-C,"{'ac': 1, 'ac_hemi': 0, 'ac_hom': 0, 'an': 181...",,1,181464,6e-06,0,0,Exome


In [101]:
standard_cols = [
                    'Variant ID', 'Gene', 'Consequence', 
                    'Annotation', 'Flags', 'Allele Count',
                    'Allele Number', 'Allele Frequency',
                    'Number of Homozygotes', 'Number of Hemizygotes',
                    'Region'
                ]

In [102]:
df.loc[:, standard_cols]

Unnamed: 0,Variant ID,Gene,Consequence,Annotation,Flags,Allele Count,Allele Number,Allele Frequency,Number of Homozygotes,Number of Hemizygotes,Region
0,X-148564233-C-G,IDS,c.*44G>C,3_prime_UTR_variant,[],1,178182,6e-06,0,0,Exome
1,X-148564235-G-A,IDS,c.*42C>T,3_prime_UTR_variant,[],0,178926,0.0,0,0,Exome
2,X-148564240-G-A,IDS,c.*37C>T,3_prime_UTR_variant,[],42,180127,0.000233,0,13,Exome
3,X-148564246-C-T,IDS,c.*31G>A,3_prime_UTR_variant,[],2,181201,1.1e-05,0,2,Exome
4,X-148564248-T-C,IDS,c.*29A>G,3_prime_UTR_variant,[],1,181464,6e-06,0,0,Exome


In [114]:
data

{'data': {'region': {'clinvar_variants': [],
   'variants': [{'consequence': '3_prime_UTR_variant',
     'flags': [],
     'gene_id': 'ENSG00000010404',
     'gene_symbol': 'IDS',
     'hgvs': 'c.*44G>C',
     'hgvsc': 'c.*44G>C',
     'hgvsp': None,
     'lof': None,
     'lof_filter': None,
     'lof_flags': None,
     'pos': 148564233,
     'rsid': None,
     'variant_id': 'X-148564233-C-G',
     'exome': {'ac': 1,
      'ac_hemi': 0,
      'ac_hom': 0,
      'an': 178182,
      'af': 5.6122391711845195e-06,
      'filters': [],
      'populations': [{'id': 'AFR',
        'ac': 0,
        'an': 12922,
        'ac_hemi': 0,
        'ac_hom': 0},
       {'id': 'AMR', 'ac': 0, 'an': 27202, 'ac_hemi': 0, 'ac_hom': 0},
       {'id': 'ASJ', 'ac': 0, 'an': 7416, 'ac_hemi': 0, 'ac_hom': 0},
       {'id': 'EAS', 'ac': 0, 'an': 13658, 'ac_hemi': 0, 'ac_hom': 0},
       {'id': 'FIN', 'ac': 0, 'an': 14354, 'ac_hemi': 0, 'ac_hom': 0},
       {'id': 'NFE', 'ac': 1, 'an': 79541, 'ac_hemi': 0, 'ac_