<a href="https://colab.research.google.com/github/ZhaochenYe999/CBB752_FinalProject/blob/main/part_2c_protein.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [96]:
import pandas as pd
import numpy as np
import time
import requests
import json
import re
import os

In [14]:
def View(df, rows=None, cols=None, width=None):
    """Display the head of ``df`` under temporarily overridden pandas display options.

    Args:
        df: DataFrame to show.
        rows: Used both as ``display.max_rows`` and as the argument to ``df.head``.
        cols: Used as ``display.max_columns``.
        width: Used as ``display.max_colwidth``.

    The option overrides only apply while the frame is being displayed.
    """
    display_options = (
        "display.max_rows", rows,
        "display.max_columns", cols,
        "display.max_colwidth", width,
        "display.expand_frame_repr", False,
    )
    preview = df.head(rows)

    # option_context restores the previous settings when the block exits.
    with pd.option_context(*display_options):
        display(preview)

In [88]:
#ref: https://www.uniprot.org/api-documentation/uniprotkb
def _parse_uniprot_entry(entry, gene):
    """Flatten one UniProtKB search hit into the row dict returned by ``get_uniprot``."""
    protein_name = (
        entry.get("proteinDescription", {})
        .get("recommendedName", {})
        .get("fullName", {})
        .get("value", "N/A")
    )

    functions_by_isoform = {}
    pmids = []

    for comment in entry.get("comments", []):
        if comment.get("commentType") != "FUNCTION":
            continue

        texts = [t["value"] for t in comment.get("texts", []) if "value" in t]
        for text in texts:
            pmids.extend(re.findall(r'PubMed:(\d+)', text))

        # FUNCTION comments without a "molecule" key are grouped under one default label.
        isoform = comment.get("molecule", "unspecified isoform")
        functions_by_isoform.setdefault(isoform, []).extend(texts)

    return {
        "gene": gene,
        "accession": entry.get("primaryAccession", "N/A"),
        "protein_name": protein_name,
        "functions_by_isoform": functions_by_isoform,
        "annotation_score": entry.get("annotationScore", None),
        # dict.fromkeys de-duplicates while keeping first-seen order (set() order is arbitrary).
        "ref": list(dict.fromkeys(pmids)),
    }


def get_uniprot(gene, max_results=20, timeout=30, exact=False):
    """Search reviewed (``reviewed:true``) human (``organism_id:9606``) UniProtKB entries.

    Args:
        gene: Search term, normally a gene symbol. It is URL-escaped before use.
        max_results: Page size sent to UniProt (``size=``).
        timeout: Seconds to wait for the HTTP response before ``requests`` raises.
        exact: If False (default, the original behaviour) ``gene`` is sent as a bare
            term, so entries that merely mention the symbol are returned too.
            If True the query is restricted with ``gene_exact:<gene>``.

    Returns:
        * ``None`` if the HTTP status is not 200 (a message is printed).
        * If ``max_results > 1``: a list of dicts with keys ``gene``, ``accession``,
          ``protein_name``, ``functions_by_isoform``, ``annotation_score`` and ``ref``
          (PubMed ids found in the FUNCTION texts); the list may be empty.
        * Otherwise: the first such dict, or ``None`` when there were no hits.
    """
    from urllib.parse import quote  # function-scope import; only needed to escape the term

    term = quote(str(gene), safe="")
    if exact:
        term = f"gene_exact%3A{term}"

    url = (
        "https://rest.uniprot.org/uniprotkb/search"
        f"?query={term}%20AND%20reviewed%3Atrue%20AND%20organism_id%3A9606"
        "&fields=accession%2Cprotein_name%2Ccc_function%2C%20annotation_score"
        "&sort=annotation_score%20desc"
        f"&size={max_results}"
    )

    r = requests.get(url, timeout=timeout)
    if r.status_code != 200:
        print(f"Lookup of {gene} failed; status: {r.status_code}")
        return None

    results = r.json().get("results", [])
    parsed_results = [_parse_uniprot_entry(entry, gene) for entry in results]

    if max_results > 1:
        return parsed_results
    # Single-result mode: previously an empty result list raised IndexError here.
    return parsed_results[0] if parsed_results else None


# UniProt Pipeline

In [15]:
# Load the non-synonymous-variant gene list and drop the "Unnamed: 0" column.
gene_df = (
    pd.read_csv("https://raw.githubusercontent.com/ZhaochenYe999/CBB752_FinalProject/refs/heads/main/genelist/genelist_NonSynoymousVariants.csv")
    .drop(columns=["Unnamed: 0"])
)

In [17]:
# Rank genes by Mutation_count (highest first) and keep the first 11 rows;
# 11 are taken because there is a tie.
gene_df_sorted = gene_df.sort_values("Mutation_count", ascending=False).iloc[:11]

In [82]:
# NOTE(review): `get_protein` is not defined in any cell visible here (the lookup helper
# above is `get_uniprot`); this looks like a leftover call to an earlier version —
# confirm, then remove the cell or update the call.
get_protein("MYO18B")

Lookup of MYO18B failed; status: 400


In [93]:
# NOTE(review): `genes` is assigned in the UniProt loop cell further down, so on a fresh
# kernel this cell only works if it runs after that one — confirm the intended cell order.
genes

['HPS4',
 'APOL1',
 'TRIOBP',
 'SFI1',
 'EFCAB6',
 'MYO18B',
 'ARSA',
 'CELSR1',
 'SUN2',
 'SEC14L3',
 'PIWIL3']

In [89]:
# Query UniProt once per selected gene and collect one row per returned entry.
genes = gene_df_sorted["Gene_symbol"].tolist()
rows = []

for gene in genes:
    try:
        protein_data = get_uniprot(gene)
    except Exception as e:
        # Best effort: an error for one gene must not abort the whole batch.
        print(f"Error processing {gene}: {e}")
        protein_data = None

    # get_uniprot returns None (after printing the HTTP status) when the lookup fails;
    # skip it explicitly instead of letting len(None) surface as a misleading TypeError.
    if protein_data is not None:
        print(f"{gene}: {len(protein_data)} UniProt entries")
        # Each entry dict already carries exactly the columns needed, including "gene".
        rows.extend(protein_data)

    time.sleep(0.4)  # pause between consecutive requests

# Explicit columns keep the schema stable even when no lookup succeeded.
df = pd.DataFrame(
    rows,
    columns=[
        "gene",
        "accession",
        "protein_name",
        "functions_by_isoform",
        "annotation_score",
        "ref",
    ],
)

HPS4: 11 UniProt entries
APOL1: 3 UniProt entries
TRIOBP: 5 UniProt entries
SFI1: 7 UniProt entries
EFCAB6: 4 UniProt entries
MYO18B: 1 UniProt entries
ARSA: 16 UniProt entries
CELSR1: 2 UniProt entries
SUN2: 17 UniProt entries
SEC14L3: 1 UniProt entries
PIWIL3: 1 UniProt entries


In [90]:
# Total number of UniProt entries collected across all queried genes.
len(df)

68

In [91]:
# Inspect the assembled UniProt table (one row per returned entry).
df

Unnamed: 0,gene,accession,protein_name,functions_by_isoform,annotation_score,ref
0,HPS4,O43147,Small G protein signaling modulator 2,{'unspecified isoform': ['Possesses GTPase act...,5.0,"[26620560, 21808068]"
1,HPS4,P20338,Ras-related protein Rab-4A,{'unspecified isoform': ['Small GTPase which c...,5.0,"[16034420, 15907487, 29425100]"
2,HPS4,P20339,Ras-related protein Rab-5A,{'unspecified isoform': ['Small GTPase which c...,5.0,"[22660413, 14978216, 10818110, 16410077, 14617..."
3,HPS4,P51149,Ras-related protein Rab-7a,{'unspecified isoform': ['Small GTPase which c...,5.0,"[22660413, 20028791, 12944476, 21255211, 33147..."
4,HPS4,P51151,Ras-related protein Rab-9A,{'unspecified isoform': ['Involved in the tran...,5.0,[]
...,...,...,...,...,...,...
63,SUN2,Q9BTV4,Transmembrane protein 43,{'unspecified isoform': ['May have an importan...,5.0,"[27991920, 34050020, 32614325]"
64,SUN2,Q9UH99,SUN domain-containing protein 2,{'unspecified isoform': ['As a component of th...,5.0,[]
65,SUN2,Q9Y4X5,E3 ubiquitin-protein ligase ARIH1,{'unspecified isoform': ['E3 ubiquitin-protein...,5.0,"[14623119, 17289916, 25624349, 27565346, 15236..."
66,SEC14L3,Q9UDX4,SEC14-like protein 3,{'unspecified isoform': ['Probable hydrophobic...,3.0,[]


In [92]:
# Write the UniProt table to uniprot.csv without the DataFrame index.
df.to_csv("uniprot.csv", index=False)
# upload to GitHub

# Gene Cards

- No API (webscraping not allowed)
- Used GeneALaCart tool: https://genealacart.genecards.org/Query
- Downloaded the results as JSON/Excel and uploaded them to the `data` folder on GitHub
- They have a max of 10 genes per query, so I put 10 in the first query and PIWIL3 for the last query


In [211]:
# List the repository's data/ folder through the GitHub contents API and keep the
# download URLs of the Excel (.xlsx) exports.
data_fp = "https://api.github.com/repos/ZhaochenYe999/CBB752_FinalProject/contents/data"
response = requests.get(data_fp, timeout=30)
# Fail loudly on a non-success status; otherwise the loop below would iterate over an
# error payload rather than a list of file entries and die with an opaque TypeError.
response.raise_for_status()
results = response.json()

files = [
    item["download_url"]
    for item in results
    if item["name"].endswith(".xlsx")
]
files

['https://raw.githubusercontent.com/ZhaochenYe999/CBB752_FinalProject/main/data/GeneALaCart-5231380-250417-150326.xlsx',
 'https://raw.githubusercontent.com/ZhaochenYe999/CBB752_FinalProject/main/data/GeneALaCart-5231380-250417-150703.xlsx']

In [214]:
# Read the sheet names of the first workbook; the context manager closes the
# ExcelFile handle once the names have been read.
with pd.ExcelFile(files[0]) as workbook:
    sheets = workbook.sheet_names

In [215]:
#most relevant ones seem to be: gene, alias,
# (the empty subscript `sheets[]` was a SyntaxError; show the full list of sheet names)
sheets

['Gene',
 'Aliases',
 'ExternalIdentifiers',
 'Summaries',
 'Genomics',
 'GeneHancers',
 'MalaCardsDisorders',
 'MalaCardsInferredDisorders',
 'UniProtDisorders',
 'UniProtSubcellularLocations',
 'CompartmentsSubcellularLocation',
 'HPASubcellularLocations',
 'Proteins',
 'Domains',
 'MolecularFunctionDescriptions',
 'Phenotypes',
 'GWASPhenotypes',
 'HumanPhenotypeOntology',
 'BiologicalProcesses',
 'CellularComponents',
 'MolecularFunctions',
 'Pathways',
 'Interactions',
 'SuperPathway',
 'UnifiedCompounds',
 'UnifiedDrugs',
 'Transcripts',
 'DifferentialExpression',
 'TissueExpression',
 'Orthologs',
 'Paralogs',
 'Variants',
 'StructureVariant',
 'Intolerance',
 'Publications']

In [228]:
def clean_colum(ncol):
    """Normalise a column label into a lowercase identifier-like string.

    Steps: lowercase, collapse runs of spaces/slashes into a single underscore,
    then drop every remaining non-word character.

    Args:
        ncol: Original column label (str).

    Returns:
        The cleaned label.
    """
    # The body previously read an unassigned local `col` instead of the parameter.
    col = ncol.lower()
    col = re.sub(r"[ /]+", "_", col)
    col = re.sub(r"[^\w]", "", col)
    return col

In [230]:
import pandas as pd
import re

def clean_column_name(col):
    """Return ``col`` lowercased, with space/slash runs turned into ``_`` and all
    other non-word characters removed."""
    underscored = re.sub(r"[ /]+", "_", col.lower())
    return re.sub(r"\W", "", underscored)

def rename_columns(file_path, sheet, prefix):
    """Read one Excel sheet and prefix its column names.

    Args:
        file_path: Path or URL handed to ``pd.read_excel``.
        sheet: Sheet to read (passed as ``sheet_name``).
        prefix: Text prepended, with an underscore, to each cleaned column name.

    Returns:
        The loaded DataFrame. "InputTerm" and "Symbol" keep their names; every other
        column becomes ``f"{prefix}_{clean_column_name(col)}"``.
    """
    key_columns = ("InputTerm", "Symbol")
    df = pd.read_excel(file_path, sheet_name=sheet)

    new_names = []
    for col in df.columns:
        if col in key_columns:
            new_names.append(col)
        else:
            new_names.append(f"{prefix}_{clean_column_name(col)}")
    df.columns = new_names

    return df


In [246]:
# Load the sheets of interest by name (as the GWASPhenotypes/SuperPathway cells do)
# rather than by position in `sheets`, so a reordered export cannot silently load the
# wrong sheet. The names are those listed at indexes 0, 1, 2, 6, 7 and 8 of `sheets`.
# NOTE: `gene_df` here rebinds the name used earlier for the mutation-count gene list.
gene_df = rename_columns(files[0], "Gene", "gene")
alias_df = rename_columns(files[0], "Aliases", "alias")
ext_df = rename_columns(files[0], "ExternalIdentifiers", "extid")
dis1_df = rename_columns(files[0], "MalaCardsDisorders", "disease_mc1")
dis2_df = rename_columns(files[0], "MalaCardsInferredDisorders", "disease_mc2")
dis3_df = rename_columns(files[0], "UniProtDisorders", "disease_up")

In [258]:
# "GWASPhenotypes" sheet with columns prefixed "gwas_".
gwas_df = rename_columns(
    files[0],
    sheet="GWASPhenotypes",
    prefix="gwas",
)


In [260]:
# "SuperPathway" sheet with columns prefixed "path_".
path_df = rename_columns(
    files[0],
    sheet="SuperPathway",
    prefix="path",
)

In [261]:
# Inspect the SuperPathway table loaded with the "path" prefix.
path_df

Unnamed: 0,InputTerm,Symbol,path_superpath,path_containedpathways
0,HPS4,HPS4,Vesicle-mediated transport,Vesicle-mediated transport:.94:Reactome:R-HSA-...
1,HPS4,HPS4,Rab regulation of trafficking,Rab regulation of trafficking:.73:Reactome:R-H...
2,APOL1,APOL1,Vesicle-mediated transport,Vesicle-mediated transport:.94:Reactome:R-HSA-...
3,APOL1,APOL1,Regulation of Insulin-like Growth Factor (IGF)...,Post-translational protein phosphorylation:.86...
4,APOL1,APOL1,Nephrotic syndrome,Nephrotic syndrome:-:WikiPathways:WP4758
5,APOL1,APOL1,Binding and Uptake of Ligands by Scavenger Rec...,Scavenging of heme from plasma:.01:Reactome:R-...
6,APOL1,APOL1,Metabolism of proteins,Post-translational protein modification:.71:Re...
7,APOL1,APOL1,Cholesterol and Sphingolipids transport / Recy...,Cholesterol and Sphingolipids transport / Recy...
8,TRIOBP,TRIOBP,Sensory processing of sound,Sensory processing of sound by inner hair cell...
9,TRIOBP,TRIOBP,Olfactory Signaling Pathway,Sensory Perception:.64:Reactome:R-HSA-9709957


In [254]:
# Inspect the disorders table loaded with the "disease_mc1" prefix.
dis1_df

Unnamed: 0,InputTerm,Symbol,disease_mc1_name,disease_mc1_accession,disease_mc1_sources,disease_mc1_aliases,disease_mc1_publications,disease_mc1_genescore,disease_mc1_disorderscore,disease_mc1_iselite,disease_mc1_iscancercensus
0,HPS4,HPS4,Sitosterolemia,sitosterolemia,GTR,Sitosterolemia||Beta-Sitosterolemia||Phytoster...,,16.223130,100.0000,True,False
1,HPS4,HPS4,Hermansky-Pudlak Syndrome Due To Bloc-3 Defici...,hermansky_pudlak_syndrome_due_to_bloc_3_defici...,Orphanet,Hermansky-Pudlak Syndrome Due To Bloc-3 Defici...,,247.487400,350.0000,True,False
2,HPS4,HPS4,Albinism,albinism,GTR||GeneCards,Albinism,,39.876600,126.0884,True,False
3,HPS4,HPS4,"Albinism, Oculocutaneous, Type Vii",albinism_oculocutaneous_type_vii,GTR,"Albinism, Oculocutaneous, Type Vii||OCA7||Ocul...",,31.622780,100.0000,True,False
4,HPS4,HPS4,"Leukemia, Acute Myeloid",leukemia_acute_myeloid,GTR,"Leukemia, Acute Myeloid||AML||Leukemia, Acute ...",,5.423262,100.0000,True,False
...,...,...,...,...,...,...,...,...,...,...,...
90,CELSR1,CELSR1,Long Qt Syndrome,long_qt_syndrome,GTR,Long Qt Syndrome||Long Q-T Syndrome||Lqt||Roma...,,11.785110,100.0000,True,False
91,SUN2,SUN2,Neuromuscular Disease,neuromuscular_disease,GTR||DISEASES||Novoseek,Neuromuscular Disease||Neuromuscular Diseases|...,12661041,16.560170,109.8428,True,False
92,SUN2,SUN2,Emery-Dreifuss Muscular Dystrophy,emery_dreifuss_muscular_dystrophy,GTR||GeneCards||DISEASES||Novoseek,Emery-Dreifuss Muscular Dystrophy||Edmd||Benig...,15639119||18646565||16904876,57.368020,128.2672,True,False
93,SUN2,SUN2,Muscular Dystrophy,muscular_dystrophy,GTR||GeneCards||DISEASES||Novoseek,Muscular Dystrophy||Muscular Dystrophies||Dyst...,18646565||19716805,16.801120,123.4136,True,False


In [242]:
# NOTE(review): no visible cell assigns `summary_df`; presumably it came from
# rename_columns(files[0], sheet="Summaries", prefix="summary") — confirm and restore
# the defining cell so the notebook runs on a fresh kernel.
summary_df

Unnamed: 0,InputTerm,Symbol,summary_ncbi_gene,summary_uniprot,summary_genecards
0,HPS4,HPS4,This gene encodes a protein component of bioge...,"Component of the BLOC-3 complex, a complex tha...",HPS4 (HPS4 Biogenesis Of Lysosomal Organelles ...
1,APOL1,APOL1,This gene encodes a secreted high density lipo...,May play a role in lipid exchange and transpor...,APOL1 (Apolipoprotein L1) is a Protein Coding ...
2,TRIOBP,TRIOBP,This gene encodes a protein with an N-terminal...,[Isoform 1]: Regulates actin cytoskeletal orga...,TRIOBP (TRIO And F-Actin Binding Protein) is a...
3,SFI1,SFI1,Enables phosphatase binding activity. Predicte...,Plays a role in the dynamic structure of centr...,SFI1 (SFI1 Centrin Binding Protein) is a Prote...
4,EFCAB6,EFCAB6,This gene encodes a protein which directly bin...,Negatively regulates the androgen receptor by ...,EFCAB6 (EF-Hand Calcium Binding Domain 6) is a...
5,MYO18B,MYO18B,The protein encoded by this gene may regulate ...,May be involved in intracellular trafficking o...,MYO18B (Myosin XVIIIB) is a Protein Coding gen...
6,ARSA,ARSA,The protein encoded by this gene hydrolyzes ce...,Hydrolyzes cerebroside sulfate.,ARSA (Arylsulfatase A) is a Protein Coding gen...
7,CELSR1,CELSR1,The protein encoded by this gene is a member o...,Receptor that may have an important role in ce...,CELSR1 (Cadherin EGF LAG Seven-Pass G-Type Rec...
8,SUN2,SUN2,SUN1 (MIM 607723) and SUN2 are inner nuclear m...,As a component of the LINC (LInker of Nucleosk...,SUN2 (Sad1 And UNC84 Domain Containing 2) is a...
9,SEC14L3,SEC14L3,The protein encoded by this gene is highly sim...,Probable hydrophobic ligand-binding protein; m...,SEC14L3 (SEC14 Like Lipid Binding 3) is a Prot...


In [241]:
# NOTE(review): no visible cell assigns `gene_hancers_df` — restore its defining cell
# (sheet and prefix to be confirmed) so the notebook runs on a fresh kernel.
gene_hancers_df

Unnamed: 0,InputTerm,Symbol,gene_hancers_bands,gene_hancers_chromosome_grch38_hg38,gene_hancers_strand_grch38_hg38,gene_hancers_locations_grch38_hg38,gene_hancers_contig_grch38_hg38,gene_hancers_chromosome_grch37_hg19_ncbi_gene,gene_hancers_strand_grch37_hg19_ncbi_gene,gene_hancers_locations_grch37_hg19_ncbi_gene,gene_hancers_chromosome_grch37_hg19_ensembl,gene_hancers_strand_grch37_hg19_ensembl,gene_hancers_locations_grch37_hg19_ensembl
0,HPS4,HPS4,"(HGNC,NCBI Gene,Ensembl||22q12.1)",22,Minus,(26443107:26483931),,22,Minus,(26839075:26879829),22,Minus,(26839389:26879803)
1,APOL1,APOL1,"(HGNC,NCBI Gene,Ensembl||22q12.3)",22,Plus,(36253071:36267530),,22,Plus,(36649117:36663577),22,Plus,(36649056:36663576)
2,TRIOBP,TRIOBP,"(HGNC,NCBI Gene,Ensembl||22q13.1)",22,Plus,(37697048:37776556),,22,Plus,(38093055:38172563),22,Plus,(38093011:38172563)
3,SFI1,SFI1,"(HGNC,NCBI Gene,Ensembl||22q12.2)",22,Plus,(31488688:31618588),,22,Plus,(31892125:32014574),22,Plus,(31884674:32014572)
4,EFCAB6,EFCAB6,"(HGNC,NCBI Gene||22q13.2-q13.31)(Ensembl||22q1...",22,Minus,(43528744:43812337),,22,Minus,(43924658:44208185),22,Minus,(43924624:44208217)
5,MYO18B,MYO18B,"(HGNC,NCBI Gene,Ensembl||22q12.1)",22,Plus,(25742144:26063847),,22,Plus,(26138155:26427011),22,Plus,(26138111:26427007)
6,ARSA,ARSA,"(HGNC,NCBI Gene,Ensembl||22q13.33)",22,Minus,(50622754:50628173),,22,Minus,(51061182:51066580),22,Minus,(51061182:51066607)
7,CELSR1,CELSR1,"(HGNC,NCBI Gene,Ensembl||22q13.31)",22,Minus,(46360834:46537620),,22,Minus,(46757071:46933517),22,Minus,(46756731:46933067)
8,SUN2,SUN2,"(HGNC,NCBI Gene,Ensembl||22q13.1)",22,Minus,(38733290:38794143),,22,Minus,(39130739:39152003),22,Minus,(39130730:39190148)
9,SEC14L3,SEC14L3,"(HGNC,NCBI Gene,Ensembl||22q12.2)",22,Minus,(30447661:30472017),,22,Minus,(30843648:30868004),22,Minus,(30843946:30868036)


In [235]:
# Show the summaries through View, which passes None for the pandas row/column/width
# display limits while rendering.
View(summary_df)

Unnamed: 0,InputTerm,Symbol,summary_ncbi_gene,summary_uniprot,summary_genecards
0,HPS4,HPS4,"This gene encodes a protein component of biogenesis of lysosome-related organelles complexes (BLOC). BLOC complexes are important for the formation of endosomal-lysosomal organelles such as melanosomes and platelet dense granules. Mutations in this gene result in subtype 4 of Hermansky-Pudlak syndrome, a form of albinism. Alternative splicing results in multiple transcript variants. [provided by RefSeq, Aug 2012]","Component of the BLOC-3 complex, a complex that acts as a guanine exchange factor (GEF) for RAB32 and RAB38, promotes the exchange of GDP to GTP, converting them from an inactive GDP-bound form into an active GTP-bound form. The BLOC-3 complex plays an important role in the control of melanin production and melanosome biogenesis and promotes the membrane localization of RAB32 and RAB38 (PubMed:23084991).",HPS4 (HPS4 Biogenesis Of Lysosomal Organelles Complex 3 Subunit 2) is a Protein Coding gene. Diseases associated with HPS4 include Hermansky-Pudlak Syndrome 4 and Hermansky-Pudlak Syndrome. Among its related pathways are Vesicle-mediated transport and Rab regulation of trafficking. Gene Ontology (GO) annotations related to this gene include protein homodimerization activity and protein dimerization activity.
1,APOL1,APOL1,"This gene encodes a secreted high density lipoprotein which binds to apolipoprotein A-I. Apolipoprotein A-I is a relatively abundant plasma protein and is the major apoprotein of HDL. It is involved in the formation of most cholesteryl esters in plasma and also promotes efflux of cholesterol from cells. This apolipoprotein L family member may play a role in lipid exchange and transport throughout the body, as well as in reverse cholesterol transport from peripheral cells to the liver. Several different transcript variants encoding different isoforms have been found for this gene. [provided by RefSeq, Nov 2008]",May play a role in lipid exchange and transport throughout the body. May participate in reverse cholesterol transport from peripheral cells to the liver.,APOL1 (Apolipoprotein L1) is a Protein Coding gene. Diseases associated with APOL1 include Focal Segmental Glomerulosclerosis 4 and Glomerulonephritis. Among its related pathways are Regulation of Insulin-like Growth Factor (IGF) transport and uptake by Insulin-like Growth Factor Binding Proteins (IGFBPs) and Metabolism of proteins. Gene Ontology (GO) annotations related to this gene include lipid binding and chloride channel activity. An important paralog of this gene is APOL2.
2,TRIOBP,TRIOBP,"This gene encodes a protein with an N-terminal pleckstrin homology domain and a C-terminal coiled-coil region. The protein interacts with trio, which is involved with neural tissue development and controlling actin cytoskeleton organization, cell motility and cell growth. The protein also associates with F-actin and stabilizes F-actin structures. Mutations in this gene have been associated with a form of autosomal recessive nonsyndromic deafness. Multiple alternatively spliced transcript variants that would encode different isoforms have been found for this gene, however some transcripts may be subject to nonsense-mediated decay (NMD). [provided by RefSeq, Nov 2008]","[Isoform 1]: Regulates actin cytoskeletal organization, cell spreading and cell contraction by directly binding and stabilizing filamentous F-actin and prevents its depolymerization (PubMed:18194665, PubMed:28438837). May also serve as a linker protein to recruit proteins required for F-actin formation and turnover (PubMed:18194665). Essential for correct mitotic progression (PubMed:22820163, PubMed:24692559).||[Isoform 4]: Plays a pivotal role in the formation of stereocilia rootlets.||[Isoform 5]: Plays a pivotal role in the formation of stereocilia rootlets.","TRIOBP (TRIO And F-Actin Binding Protein) is a Protein Coding gene. Diseases associated with TRIOBP include Deafness, Autosomal Recessive 28 and Autosomal Recessive Nonsyndromic Deafness. Among its related pathways are Sensory processing of sound and Olfactory Signaling Pathway. Gene Ontology (GO) annotations related to this gene include ubiquitin protein ligase binding and small GTPase binding. An important paralog of this gene is MPRIP."
3,SFI1,SFI1,"Enables phosphatase binding activity. Predicted to be located in cytosol. [provided by Alliance of Genome Resources, Nov 2024]",Plays a role in the dynamic structure of centrosome-associated contractile fibers via its interaction with CETN2.,"SFI1 (SFI1 Centrin Binding Protein) is a Protein Coding gene. Diseases associated with SFI1 include Female-Restricted Syndromic X-Linked Intellectual Disability 99 and Microcephaly. Among its related pathways are Loss of proteins required for interphase microtubule organization from the centrosome and Cell Cycle, Mitotic. Gene Ontology (GO) annotations related to this gene include phosphatase binding. An important paralog of this gene is CCDC191."
4,EFCAB6,EFCAB6,"This gene encodes a protein which directly binds the oncogene DJ-1 and androgen receptor to form a ternary complex in cells. This binding protein recruits histone-deacetylase complexes in order to repress transcription activity of androgen receptor. This protein may also play a role in spermatogenesis and fertilization. Multiple transcript variants encoding different isoforms have been found for this gene. [provided by RefSeq, Sep 2011]","Negatively regulates the androgen receptor by recruiting histone deacetylase complex, and protein DJ-1 antagonizes this inhibition by abrogation of this complex (PubMed:12612053). Microtubule inner protein (MIP) part of the dynein-decorated doublet microtubules (DMTs) in cilia axoneme, which is required for motile cilia beating (By similarity).",EFCAB6 (EF-Hand Calcium Binding Domain 6) is a Protein Coding gene. Diseases associated with EFCAB6 include Prostate Cancer and Androgen Insensitivity Syndrome. Gene Ontology (GO) annotations related to this gene include calcium ion binding. An important paralog of this gene is CAPS2.
5,MYO18B,MYO18B,"The protein encoded by this gene may regulate muscle-specific genes when in the nucleus and may influence intracellular trafficking when in the cytoplasm. The encoded protein functions as a homodimer and may interact with F actin. Mutations in this gene are associated with lung cancer. [provided by RefSeq, Jul 2008]","May be involved in intracellular trafficking of the muscle cell when in the cytoplasm, whereas entering the nucleus, may be involved in the regulation of muscle specific genes. May play a role in the control of tumor development and progression; restored MYO18B expression in lung cancer cells suppresses anchorage-independent growth.","MYO18B (Myosin XVIIIB) is a Protein Coding gene. Diseases associated with MYO18B include Klippel-Feil Syndrome 4, Autosomal Recessive, With Nemaline Myopathy And Facial Dysmorphism and Klippel-Feil Anomaly-Myopathy-Facial Dysmorphism Syndrome. Among its related pathways are PAK Pathway and Actin Nucleation by ARP-WASP Complex. Gene Ontology (GO) annotations related to this gene include nucleotide binding and cytoskeletal motor activity. An important paralog of this gene is MYO18A."
6,ARSA,ARSA,"The protein encoded by this gene hydrolyzes cerebroside sulfate to cerebroside and sulfate. Defects in this gene lead to metachromatic leucodystrophy (MLD), a progressive demyelination disease which results in a variety of neurological symptoms and ultimately death. Alternatively spliced transcript variants have been described for this gene. [provided by RefSeq, Dec 2010]",Hydrolyzes cerebroside sulfate.,ARSA (Arylsulfatase A) is a Protein Coding gene. Diseases associated with ARSA include Metachromatic Leukodystrophy and Leukodystrophy. Among its related pathways are Sphingolipid metabolism and Innate Immune System. Gene Ontology (GO) annotations related to this gene include calcium ion binding and arylsulfatase activity. An important paralog of this gene is ARSG.
7,CELSR1,CELSR1,"The protein encoded by this gene is a member of the flamingo subfamily, part of the cadherin superfamily. The flamingo subfamily consists of nonclassic-type cadherins; a subpopulation that does not interact with catenins. The flamingo cadherins are located at the plasma membrane and have nine cadherin domains, seven epidermal growth factor-like repeats and two laminin A G-type repeats in their ectodomain. They also have seven transmembrane domains, a characteristic unique to this subfamily. It is postulated that these proteins are receptors involved in contact-mediated communication, with cadherin domains acting as homophilic binding regions and the EGF-like domains involved in cell adhesion and receptor-ligand interactions. This particular member is a developmentally regulated, neural-specific gene which plays an unspecified role in early embryogenesis. [provided by RefSeq, Jul 2008]",Receptor that may have an important role in cell/cell signaling during nervous system formation.,"CELSR1 (Cadherin EGF LAG Seven-Pass G-Type Receptor 1) is a Protein Coding gene. Diseases associated with CELSR1 include Lymphatic Malformation 9 and Celsr1-Related Late-Onset Primary Lymphedema. Among its related pathways is GPCRs, other. Gene Ontology (GO) annotations related to this gene include G protein-coupled receptor activity and transmembrane signaling receptor activity. An important paralog of this gene is CELSR2."
8,SUN2,SUN2,"SUN1 (MIM 607723) and SUN2 are inner nuclear membrane (INM) proteins that play a major role in nuclear-cytoplasmic connection by formation of a 'bridge' across the nuclear envelope, known as the LINC complex, via interaction with the conserved luminal KASH domain of nesprins (e.g., SYNE1; MIM 608441) located in the outer nuclear membrane (ONM). The LINC complex provides a direct connection between the nuclear lamina and the cytoskeleton, which contributes to nuclear positioning and cellular rigidity (summary by Haque et al., 2010 [PubMed 19933576]).[supplied by OMIM, Nov 2010]","As a component of the LINC (LInker of Nucleoskeleton and Cytoskeleton) complex, involved in the connection between the nuclear lamina and the cytoskeleton. The nucleocytoplasmic interactions established by the LINC complex play an important role in the transmission of mechanical forces across the nuclear envelope and in nuclear movement and positioning. Specifically, SYNE2 and SUN2 assemble in arrays of transmembrane actin-associated nuclear (TAN) lines which are bound to F-actin cables and couple the nucleus to retrograde actin flow during actin-dependent nuclear movement. Required for interkinetic nuclear migration (INM) and essential for nucleokinesis and centrosome-nucleus coupling during radial neuronal migration in the cerebral cortex and during glial migration. Required for nuclear migration in retinal photoreceptor progenitors implicating association with cytoplasmic dynein-dynactin and kinesin motor complexes, and probably B-type lamins; SUN1 and SUN2 seem to act redundantly. The SUN1/2:KASH5 LINC complex couples telomeres to microtubules during meiosis; SUN1 and SUN2 seem to act at least partial redundantly. Anchors chromosome movement in the prophase of meiosis and is involved in selective gene expression of coding and non-coding RNAs needed for gametogenesis. Required for telomere attachment to nuclear envelope and gametogenesis. 
May also function on endocytic vesicles as a receptor for RAB5-GDP and participate in the activation of RAB5.","SUN2 (Sad1 And UNC84 Domain Containing 2) is a Protein Coding gene. Diseases associated with SUN2 include Emery-Dreifuss Muscular Dystrophy and Muscular Dystrophy. Among its related pathways are Meiosis and Cell Cycle, Mitotic. Gene Ontology (GO) annotations related to this gene include identical protein binding and lamin binding. An important paralog of this gene is SUN1."
9,SEC14L3,SEC14L3,"The protein encoded by this gene is highly similar to the protein encoded by the Saccharomyces cerevisiae SEC14 gene. The SEC14 protein is a phophatidylinositol transfer protein that is essential for biogenesis of Golgi-derived transport vesicles, and thus is required for the export of yeast secretory proteins from the Golgi complex. The specific function of this protein has not yet been determined. Alternatively spliced transcript variants encoding multiple isoforms have been observed for this gene. [provided by RefSeq, Jul 2012]","Probable hydrophobic ligand-binding protein; may play a role in the transport of hydrophobic ligands like tocopherol, squalene and phospholipids.",SEC14L3 (SEC14 Like Lipid Binding 3) is a Protein Coding gene. Diseases associated with SEC14L3 include Palindromic Rheumatism and Bardet-Biedl Syndrome 13. Gene Ontology (GO) annotations related to this gene include transporter activity and lipid binding. An important paralog of this gene is SEC14L2.


In [197]:
# Raw (un-prefixed) read of the second sheet. `sheet_names` is not defined anywhere in
# the visible notebook; the sheet list is bound to `sheets`, so use that.
# NOTE: this rebinds alias_df, replacing the prefixed frame built with rename_columns.
alias_df = pd.read_excel(files[0], sheet_name=sheets[1])

In [203]:
# Rebind instead of mutating in place, so frames displayed by earlier cells are not
# changed behind the reader's back and the cell stays safe to re-run.
alias_df = alias_df.rename(columns={"Alias": "alias_name"})

In [205]:
# Raw (un-prefixed) read of the third sheet. `sheet_names` is not defined anywhere in
# the visible notebook; the sheet list is bound to `sheets`, so use that.
# NOTE: this rebinds ext_df, replacing the prefixed frame built with rename_columns.
ext_df = pd.read_excel(files[0], sheet_name=sheets[2])

In [206]:
# Inspect the external-identifier table read above.
ext_df

Unnamed: 0,InputTerm,Symbol,HGNC,NCBI Gene,Ensembl,UniProtKB/Swiss-Prot,OMIM®
0,HPS4,HPS4,15844,89781,ENSG00000100099,Q9NQG7,606682
1,APOL1,APOL1,618,8542,ENSG00000100342,O14791,603743
2,TRIOBP,TRIOBP,17009,11078,ENSG00000100106,Q9H2D6,609761
3,SFI1,SFI1,29064,9814,ENSG00000198089,A8K8P3,612765
4,EFCAB6,EFCAB6,24204,64800,ENSG00000186976,Q5THR3,619664
5,MYO18B,MYO18B,18150,84700,ENSG00000133454,Q8IUG5,607295
6,ARSA,ARSA,713,410,ENSG00000100299,P15289,607574
7,CELSR1,CELSR1,1850,9620,ENSG00000075275,Q9NYQ6,604523
8,SUN2,SUN2,14210,25777,ENSG00000100242,Q9UH99,613569
9,SEC14L3,SEC14L3,18655,266629,ENSG00000100012,Q9UDX4,612824
