In [2]:
import pandas as pd
import ncbi.datasets

In [29]:
api_client = ncbi.datasets.ApiClient()
ds_gene_instance = ncbi.datasets.GeneApi(api_client)

# Get gene summaries for three human GnRHR genes

In [27]:
gene_summary = ds_gene_instance.gene_metadata_by_id([2797, 114814, 404718])

In [28]:
def report_on_gene_descriptors(gene_summary, leader='\t', report_errors=True):
    if report_errors:
        for message in gene_summary.messages or []:
            print(f'{leader}Error for: ({",".join(message.error.invalid_identifiers)})')
            print(f'{leader}{leader}Reason: ({message.error.reason})')

    if not gene_summary.genes:
        print(f'{leader}No genes found')
        return

    for gene in map(lambda g: g.gene, gene_summary.genes):
        print(f'{leader}{gene.symbol} (GeneID: {gene.gene_id}), Chromosome: {gene.chromosomes}, SwissProt: {gene.swiss_prot_accessions}')

report_on_gene_descriptors(gene_summary)

	GNRHR2 (GeneID: 114814), Chromosome: ['1'], SwissProt: ['Q96P88']
	GNRH2 (GeneID: 2797), Chromosome: ['20'], SwissProt: ['O43555']
	GNRHR2P1 (GeneID: 404718), Chromosome: ['14'], SwissProt: None


# Finding vertebrate gene orthologs by gene symbol


In [30]:
gene_symbol = "GNRHR"
gene_taxon = "human"
gene_descriptor = ds_gene_instance.gene_metadata_by_tax_and_symbol(symbols=[gene_symbol], taxon=gene_taxon)
if not gene_descriptor.genes:
    print(f'No gene found for {gene_taxon} {gene_symbol}')
else:
    gene_id = int(gene_descriptor.genes[0].gene.gene_id)

    # Query the gene ortholog service to get all vertebrate orthologs
    ortholog_set = ds_gene_instance.gene_orthologs_by_id(gene_id=gene_id)

    if not ortholog_set.ortholog_set_id:
        print(f'\nUnable to find orthologs for gene {gene_id}')
    else:
        orthologs_descriptors = ortholog_set.genes
        report_on_gene_descriptors(orthologs_descriptors, report_errors=False)

	GNRHR (GeneID: 100009509), Chromosome: ['15'], SwissProt: None
	GNRHR (GeneID: 100011217), Chromosome: ['5'], SwissProt: None
	GNRHR (GeneID: 100033874), Chromosome: ['3'], SwissProt: ['O18821']
	GNRHR (GeneID: 100093333), Chromosome: ['10'], SwissProt: None
	Gnrhr (GeneID: 100135532), Chromosome: ['Un'], SwissProt: ['Q8CH60']
	GNRHR (GeneID: 100385305), Chromosome: ['3'], SwissProt: None
	GNRHR (GeneID: 100437372), Chromosome: ['4'], SwissProt: None
	GNRHR (GeneID: 100483711), Chromosome: ['11'], SwissProt: None
	gnrhr (GeneID: 100552021), Chromosome: ['Un'], SwissProt: None
	GNRHR (GeneID: 100589627), Chromosome: ['9'], SwissProt: None
	GNRHR (GeneID: 100662022), Chromosome: ['Un'], SwissProt: None
	Gnrhr (GeneID: 100758518), Chromosome: ['1'], SwissProt: None
	GNRHR (GeneID: 100860755), Chromosome: ['6'], SwissProt: None
	GNRHR (GeneID: 100916844), Chromosome: ['6'], SwissProt: None
	GNRHR (GeneID: 100947835), Chromosome: ['Un'], SwissProt: None
	GNRHR (GeneID: 100967998), Chromoso

# Build a table of key metadata for GnRHR genes across vertebrates

In [32]:
cols = '''
common_name
taxonomic_name
symbol
type
chromosome
num_transcripts
ensembl_id
omim_id
uniprot_id
nomenclature_id
nomenclature_auth
genome_coordinates
'''
cols = cols.split('\n')[1:-1]

def _range_repr(range):
    ret = []
    for interval in range:
        ret.append(f'{interval.begin}_{interval.end}')
    return ','.join(ret)

def _ranges_repr(ranges):
    ret = []
    for range in ranges:
        ret.append(f'{range.accession_version}:{_range_repr(range.range)}')
    return ','.join(ret)

# specify genes of interest and retrieve descriptors
gene_ids = [2798, 114814, 404718, 14715, 109324103, 109309182, 281798, 395368, 403718, 427517, 471226, 7226731, 100001586, 100135415, 100135416, 100135417, 100136028, 100270671, 100270672, 101318246, 101932446, 101935915, 101953943, 102193667, 102202954, 102205592, 102346610, 102363373, 102364206, 102366752, 102536567, 102687824, 102694185, 102770612, 103899900, 103899926, 105916404, 105919697, 105934126, 108392639, 109987527, 109994050, 109999298, 110488224, 110495632, 110496352, 110513414, 110520912, 112994411, 112996301, 114645297, 114667483]
gene_metadata = ds_gene_instance.gene_metadata_by_id(gene_ids)

# collect elements of the descriptor class into a dictionary based on each gene ID
table_data = {}
for g in gene_metadata.genes:
    if not g.gene:
        print(f'Gene not found: {g}')
        continue
    gene = g.gene

    table_data[gene.gene_id] = [gene.common_name]
    table_data[gene.gene_id].append(gene.taxname)
    table_data[gene.gene_id].append(gene.symbol)
    table_data[gene.gene_id].append(gene.type)
    table_data[gene.gene_id].append(gene.chromosome)
    if gene.transcripts:
        table_data[gene.gene_id].append(len(gene.transcripts))
    else:
        table_data[gene.gene_id].append(0)
    table_data[gene.gene_id].append(gene.ensembl_gene_ids)
    table_data[gene.gene_id].append(gene.omim_ids)
    table_data[gene.gene_id].append(gene.swiss_prot_accessions)
    if gene.nomenclature_authority:
        table_data[gene.gene_id].append(gene.nomenclature_authority.identifier)
        table_data[gene.gene_id].append(gene.nomenclature_authority.authority)
    else:
        table_data[gene.gene_id].append(None)
        table_data[gene.gene_id].append(None)        
    table_data[gene.gene_id].append(_ranges_repr(gene.genomic_ranges))

        
df = pd.DataFrame.from_dict(table_data, orient='index', columns=cols)
df.index.name = 'gene_id'
df.head(20)

Gene not found: {'query': ['7226731'],
               'message': 'The gene you requested, (7226731) is a valid NCBI '
                          'Gene IDs that has been discontinued. It will be '
                          'omitted from your dataset. For more information '
                          'about the discontinued genes, visit NCBI Gene.\n',
               'reason': 'This GeneID has been discontinued.',
               'unrecognized_identifier': '7226731'}]}


Unnamed: 0_level_0,common_name,taxonomic_name,symbol,type,chromosome,num_transcripts,ensembl_id,omim_id,uniprot_id,nomenclature_id,nomenclature_auth,genome_coordinates
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
100001586,zebrafish,Danio rerio,gnrhr4,PROTEIN_CODING,,2,[ENSDARG00000038116],,,ZDB-GENE-050419-76,ZFIN,"NC_007129.7:25390740_25402909,NW_018395028.1:1..."
100135415,tropical clawed frog,Xenopus tropicalis,gnrhr2,PROTEIN_CODING,,3,,,,XB-GENE-5867415,Xenbase,NC_030679.2:109001475_109014876
100135416,tropical clawed frog,Xenopus tropicalis,gnrhr2/nmi,PROTEIN_CODING,,2,,,,ENSXETG00000038283,EnsemblRapid,NC_030679.2:116455054_116472596
100135417,tropical clawed frog,Xenopus tropicalis,gnrhr,PROTEIN_CODING,,2,,,,XB-GENE-5753573,Xenbase,NC_030684.2:142203708_142211556
100136028,rainbow trout,Oncorhynchus mykiss,gnrh-r,PROTEIN_CODING,,1,[ENSOMYG00000000839],,,,,NC_048566.1:29925736_29930354
100270671,zebrafish,Danio rerio,gnrhr2,PROTEIN_CODING,,1,[ENSDARG00000003553],,,ZDB-GENE-090128-3,ZFIN,"NC_007118.7:52364748_52368843,NW_018395268.1:1..."
100270672,zebrafish,Danio rerio,gnrhr1,PROTEIN_CODING,,2,[ENSDARG00000100593],,,ZDB-GENE-090128-2,ZFIN,NC_007130.7:43213293_43235554
101318246,common bottlenose dolphin,Tursiops truncatus,GNRHR,PROTEIN_CODING,,1,,,,ENSTTRG00000007811,EnsemblRapid,NC_047038.1:85406171_85428905
101932446,Painted turtle,Chrysemys picta,LOC101932446,PROTEIN_CODING,,1,[ENSCPBG00000024942],,,,,NW_024885726.1:1139246_1141801
101935915,Painted turtle,Chrysemys picta,LOC101935915,PROTEIN_CODING,,1,[ENSCPBG00000007948],,,,,NW_024885850.1:2738385_2747372


# Build a table showing GnRHR gene copy number across vertebrates

In [33]:
gene_cnt = df.groupby('common_name')['symbol'].count().reset_index()
gene_cnt.columns = ['organism', 'gene_count']
gene_cnt.sort_values('gene_count', ascending=False, inplace=True)
gene_cnt

Unnamed: 0,organism,gene_count
16,rainbow trout,6
8,coelacanth,4
20,zebrafish,3
2,Painted turtle,3
19,tropical clawed frog,3
4,ballan wrasse,3
15,mummichog,3
14,human,3
12,emu,2
18,spotted gar,2
