<a href="https://colab.research.google.com/github/ZhaochenYe999/CBB752_FinalProject/blob/main/part_2c_protein.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [13]:
import pandas as pd
import numpy as np
import time
import requests
import json

In [14]:
def View(df, rows=None, cols=None, width=None):

    with pd.option_context(
        "display.max_rows", rows,
        "display.max_columns", cols,
        "display.max_colwidth", width,
        "display.expand_frame_repr", False
    ):
        display(df.head(rows))

In [10]:
#ref: https://www.uniprot.org/api-documentation/uniprotkb
def get_protein(gene, max_results=1):
    url = f"https://rest.uniprot.org/uniprotkb/search?query=gene%3A{gene}%20AND%20reviewed%3Atrue%20AND%20organism_id%3A9606&fields=accession%2Cprotein_name%2Ccc_function%2Cft_binding&sort=annotation_score%20desc&size={max_results}"


    r = requests.get(url)
    if r.status_code != 200:
        print(f"Lookup of {gene} failed; status: {r.status_code}")
        return None

    results = r.json().get("results", [])
    parsed_results = []

    for entry in results:
        accession = entry.get("primaryAccession", "N/A")
        protein_name = (
            entry.get("proteinDescription", {})
            .get("recommendedName", {})
            .get("fullName", {})
            .get("value", "N/A")
        )

        functions_by_isoform = {}
        for comment in entry.get("comments", []):
            if comment.get("commentType") == "FUNCTION":
                isoform = comment.get("molecule", "unspecified isoform")
                texts = [
                    t.get("value")
                    for t in comment.get("texts", [])
                    if "value" in t
                ]
                if isoform in functions_by_isoform:
                    functions_by_isoform[isoform].extend(texts)
                else:
                    functions_by_isoform[isoform] = texts

        bindings = [
            f.get("description")
            for f in entry.get("features", [])
            if f.get("type") == "BINDING" and f.get("description")
        ]

        parsed_results.append({
            "gene": gene,
            "accession": accession,
            "protein_name": protein_name,
            "functions_by_isoform": functions_by_isoform,
            "bindings": bindings
        })

    return parsed_results if max_results > 1 else parsed_results[0]


In [15]:
gene_df = pd.read_csv("https://raw.githubusercontent.com/ZhaochenYe999/CBB752_FinalProject/refs/heads/main/genelist/genelist_NonSynoymousVariants.csv").drop(columns=["Unnamed: 0"])

In [17]:
#taking the top 11 since there is a tie
gene_df_sorted = gene_df.sort_values(by="Mutation_count", ascending=False).head(11)

In [19]:
get_protein("TRIOBP")

{'gene': 'TRIOBP',
 'accession': 'Q9H2D6',
 'protein_name': 'TRIO and F-actin-binding protein',
 'functions_by_isoform': {'Isoform 1': ['Regulates actin cytoskeletal organization, cell spreading and cell contraction by directly binding and stabilizing filamentous F-actin and prevents its depolymerization (PubMed:18194665, PubMed:28438837). May also serve as a linker protein to recruit proteins required for F-actin formation and turnover (PubMed:18194665). Essential for correct mitotic progression (PubMed:22820163, PubMed:24692559)'],
  'Isoform 5': ['Plays a pivotal role in the formation of stereocilia rootlets'],
  'Isoform 4': ['Plays a pivotal role in the formation of stereocilia rootlets']},
 'bindings': []}

In [18]:
gene_df_sorted

Unnamed: 0,Mutation_count,Gene_symbol,Chromosome
0,32,HPS4,chr22
1,21,APOL1,chr22
2,20,TRIOBP,chr22
3,19,SFI1,chr22
4,18,EFCAB6,chr22
5,16,MYO18B,chr22
6,15,ARSA,chr22
7,13,CELSR1,chr22
8,12,SUN2,chr22
9,12,SEC14L3,chr22


In [20]:
genes = list(gene_df_sorted["Gene_symbol"].values)

rows = []

for gene in genes:
    try:
      protein_data = get_protein(gene)
      rows.append({
          "gene": gene,
          "accession": protein_data["accession"],
          "protein_name": protein_data["protein_name"],
          "functions_by_isoform": protein_data["functions_by_isoform"],
          "bindings": protein_data["bindings"]
      })
    except Exception as e:
        print(f"Error processing {gene}: {e}")
    time.sleep(0.4)

df = pd.DataFrame(rows)

In [21]:
df

Unnamed: 0,gene,accession,protein_name,functions_by_isoform,bindings
0,HPS4,Q9NQG7,BLOC-3 complex member HPS4,{'unspecified isoform': ['Component of the BLO...,[]
1,APOL1,O14791,Apolipoprotein L1,{'unspecified isoform': ['May play a role in l...,[]
2,TRIOBP,Q9H2D6,TRIO and F-actin-binding protein,{'Isoform 1': ['Regulates actin cytoskeletal o...,[]
3,SFI1,A8K8P3,Protein SFI1 homolog,{'unspecified isoform': ['Plays a role in the ...,[]
4,EFCAB6,Q5THR3,EF-hand calcium-binding domain-containing prot...,{'unspecified isoform': ['Negatively regulates...,[]
5,MYO18B,Q8IUG5,Unconventional myosin-XVIIIb,{'unspecified isoform': ['May be involved in i...,[]
6,ARSA,O43681,ATPase GET3,{'unspecified isoform': ['ATPase required for ...,[]
7,CELSR1,Q9NYQ6,Cadherin EGF LAG seven-pass G-type receptor 1,{'unspecified isoform': ['Receptor that may ha...,[]
8,SUN2,Q9UH99,SUN domain-containing protein 2,{'unspecified isoform': ['As a component of th...,[]
9,SEC14L3,Q9UDX4,SEC14-like protein 3,{'unspecified isoform': ['Probable hydrophobic...,[]
