# Modules advanced

## Make code that reads a protein database

In [2]:
%load_ext autoreload
%autoreload 2

# Download the Human Genome Organisation (HUGO) Gene Nomenclature Committee (HGNC)
# protein DBMS (the protein-coding gene table) from
# ftp://ftp.ebi.ac.uk/pub/databases/genenames/new/tsv/locus_groups/protein-coding_gene.txt
# and rename the downloaded protein-coding_gene.txt to the file name assigned to
# protein_dbms_file below

import pandas

# Local copy of the HGNC download (see the instructions above)
protein_dbms_file = 'HGNC_protein-coding_gene.tsv'
# index_col=1 makes the file's second column the row index
protein_dbms = pandas.read_table(
    protein_dbms_file,
    index_col=1,
    low_memory=False,
)
print(protein_dbms.shape)
protein_dbms.axes

(19109, 48)


[Index(['A1BG', 'A1CF', 'A2M', 'A2ML1', 'A3GALT2', 'A4GALT', 'A4GNT', 'AAAS',
        'AACS', 'AADAC',
        ...
        'ZWILCH', 'ZWINT', 'ZXDA', 'ZXDB', 'ZXDC', 'ZYG11A', 'ZYG11B', 'ZYX',
        'ZZEF1', 'ZZZ3'],
       dtype='object', name='symbol', length=19109),
 Index(['hgnc_id', 'name', 'locus_group', 'locus_type', 'status', 'location',
        'location_sortable', 'alias_symbol', 'alias_name', 'prev_symbol',
        'prev_name', 'gene_family', 'gene_family_id', 'date_approved_reserved',
        'date_symbol_changed', 'date_name_changed', 'date_modified',
        'entrez_id', 'ensembl_gene_id', 'vega_id', 'ucsc_id', 'ena',
        'refseq_accession', 'ccds_id', 'uniprot_ids', 'pubmed_id', 'mgd_id',
        'rgd_id', 'lsdb', 'cosmic', 'omim_id', 'mirbase', 'homeodb',
        'snornabase', 'bioparadigms_slc', 'orphanet', 'pseudogene.org',
        'horde_id', 'merops', 'imgt', 'iuphar', 'kznf_gene_catalog',
        'mamit-trnadb', 'cd', 'lncrnadb', 'enzyme_id',
        'interme

In [3]:
# Preview the first 3 rows of the HGNC protein DBMS; rows are indexed by gene symbol
protein_dbms.head(3)

Unnamed: 0_level_0,hgnc_id,name,locus_group,locus_type,status,location,location_sortable,alias_symbol,alias_name,prev_symbol,...,merops,imgt,iuphar,kznf_gene_catalog,mamit-trnadb,cd,lncrnadb,enzyme_id,intermediate_filament_db,rna_central_ids
symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A1BG,HGNC:5,alpha-1-B glycoprotein,protein-coding gene,gene with protein product,Approved,19q13.43,19q13.43,,,,...,I43.950,,,,,,,,,
A1CF,HGNC:24086,APOBEC1 complementation factor,protein-coding gene,gene with protein product,Approved,10q11.23,10q11.23,ACF|ASP|ACF64|ACF65|APOBEC1CF,,,...,,,,,,,,,,
A2M,HGNC:7,alpha-2-macroglobulin,protein-coding gene,gene with protein product,Approved,12p13.31,12p13.31,FWP007|S863-7|CPAMD5,,,...,I39.001,,,,,,,,,


In [4]:
# Look up the row for the protein PTEN by its index label; as the output shows,
# the result is a pandas Series keyed by column name rather than a dict
protein_dbms.loc['PTEN']

hgnc_id                                                             HGNC:9588
name                                           phosphatase and tensin homolog
locus_group                                               protein-coding gene
locus_type                                          gene with protein product
status                                                               Approved
location                                                             10q23.31
location_sortable                                                    10q23.31
alias_symbol                                                 MMAC1|TEP1|PTEN1
alias_name                             mutated in multiple advanced cancers 1
prev_symbol                                                          BZS|MHAM
prev_name                                                                 NaN
gene_family                 C2 tensin-type domain containing|PTEN protein ...
gene_family_id                                                  

### Make 'gene_metadata', a module that provides this protein information

In [5]:
%%writefile gene_metadata.py
# the line above tells Jupyter to write this cell to the file gene_metadata.py

import pandas
# Path is relative to the working directory at import time
protein_dbms_file = 'HGNC_protein-coding_gene.tsv'
# Read once when the module is imported; index_col=1 makes the file's
# second column the row index
protein_dbms = pandas.read_table(
    protein_dbms_file,
    index_col=1,
    low_memory=False,
)

def get_protein(protein_name):
    """Return the meta information stored for `protein_name`.

    `protein_name` is looked up as a row label in the module-level
    `protein_dbms` table exactly as given, so the match is case-sensitive.
    A name that is not in the index raises KeyError.
    """
    record = protein_dbms.loc[protein_name]
    return record

Overwriting gene_metadata.py


In [6]:
# Import the module written by the previous cell and repeat the PTEN lookup through it
import gene_metadata
gene_metadata.get_protein('PTEN')

hgnc_id                                                             HGNC:9588
name                                           phosphatase and tensin homolog
locus_group                                               protein-coding gene
locus_type                                          gene with protein product
status                                                               Approved
location                                                             10q23.31
location_sortable                                                    10q23.31
alias_symbol                                                 MMAC1|TEP1|PTEN1
alias_name                             mutated in multiple advanced cancers 1
prev_symbol                                                          BZS|MHAM
prev_name                                                                 NaN
gene_family                 C2 tensin-type domain containing|PTEN protein ...
gene_family_id                                                  

In [5]:
# Import only the function, then pick a single field out of the returned record
from gene_metadata import get_protein
print('MYC locus_type:', get_protein('MYC')['locus_type'])

MYC locus_type: gene with protein product


### get_protein() fails on lowercase gene names

In [6]:
from gene_metadata import get_protein

try:
    # Both the row lookup and the field access stay inside the try block,
    # so a KeyError from either one is reported the same way
    myc_locus_type = get_protein('myc')['locus_type']
    print('MYC locus_type:', myc_locus_type)
except KeyError:
    print("cannot find 'myc'")

cannot find 'myc'


### Put a module in the subdirectory 'test'

In [7]:
%%writefile test/toupper.py

def toupper(s):
    """Return the upper-case transform of `s`."""
    uppercased = s.upper()
    return uppercased

Overwriting test/toupper.py


### Try to use it; after creating 'test/__init__.py' the import of toupper will work

In [7]:
# test/ needs an __init__.py for this import to work. NOTE(review): presumably
# because, without it, the local directory is only a namespace-package candidate
# and Python's own standard-library `test` package is imported instead — confirm.

from gene_metadata import get_protein
from test.toupper import toupper
print('MYC locus_type:', get_protein(toupper('myc'))['locus_type'])

MYC locus_type: gene with protein product
