In [1]:
import requests
import json
import pandas as pd
import os
from langchain_openai import ChatOpenAI, AzureChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_community.document_loaders import WebBaseLoader
import re
import numpy as np
import logging
import sys
import time
from datetime import timedelta
from pprint import pprint # pretty print

# Make the local pipeline repo importable. The location can be overridden with the
# PHOSPHO_PIPELINE_ROOT environment variable; the default keeps the original path.
PIPELINE_ROOT = os.environ.get(
    "PHOSPHO_PIPELINE_ROOT", "/Users/andrewlim/Documents/Phosho_Pipeline"
)
# Guard so that re-running this cell does not append duplicate sys.path entries.
if PIPELINE_ROOT not in sys.path:
    sys.path.append(PIPELINE_ROOT)

USER_AGENT environment variable not set, consider setting it to identify your requests.


In [2]:
# Imports from our repo files
# Client functions
# NOTE(review): the wildcard import hides where names come from. The cells visible
# here use uniprot_request_data, fetch_entry, search_uniprot, get_primary_accession,
# get_gene_symbol, get_protein_name and get_ptm_texts -- presumably all provided by
# phospho.uniprot; consider importing them explicitly once confirmed.
from phospho.uniprot import *

#### Testing Request Functions

In [3]:
# Examining the UniProt data structure returned by the request helper
AKT1_ACCESSION = "P31749"
atk1_data = uniprot_request_data(AKT1_ACCESSION)
akt1_sequence_data = uniprot_request_data(AKT1_ACCESSION, "sequence")

# Handy while exploring:
# print(akt1_sequence_data)
# print(akt1_sequence_data["value"])
# for key, value in atk1_data.items():
#     print(key, value)

# No subset was requested, so drill down to the organism name by hand
first_result = atk1_data["results"][0]
first_result["organism"]["scientificName"]

'Homo sapiens'

In [4]:
# Test fetch_entry: pull a single entry (accession P00533) and list its top-level fields
egfr_entry = fetch_entry("P00533")
# print(egfr_entry)
# print(egfr_entry["primaryAccession"])

# Iterating a mapping yields its keys in order, so no manual loop (and no leaked
# loop variables) is needed to collect the field names.
cats = list(egfr_entry)

print(cats)
# print(len(cats))

['entryType', 'primaryAccession', 'secondaryAccessions', 'uniProtkbId', 'entryAudit', 'annotationScore', 'organism', 'proteinExistence', 'proteinDescription', 'genes', 'comments', 'features', 'keywords', 'references', 'uniProtKBCrossReferences', 'sequence', 'extraAttributes']


In [5]:
# Smoke-test the search helper: request a single hit for "p53" and print the raw payload
p53_search = search_uniprot("p53", size=1)
print(p53_search)

{'results': [{'entryType': 'UniProtKB reviewed (Swiss-Prot)', 'primaryAccession': 'Q9BVI0', 'secondaryAccessions': ['A7E235', 'B2RB56', 'E1P5S3', 'Q566Q2', 'Q5JWY9', 'Q66K49', 'Q9BWV4', 'Q9BXA3', 'Q9BZW3', 'Q9H421', 'Q9H4J6', 'Q9NZ22'], 'uniProtkbId': 'PHF20_HUMAN', 'entryAudit': {'firstPublicDate': '2003-07-11', 'lastAnnotationUpdateDate': '2026-01-28', 'lastSequenceUpdateDate': '2003-07-11', 'entryVersion': 207, 'sequenceVersion': 2}, 'annotationScore': 5.0, 'organism': {'scientificName': 'Homo sapiens', 'commonName': 'Human', 'taxonId': 9606, 'lineage': ['Eukaryota', 'Metazoa', 'Chordata', 'Craniata', 'Vertebrata', 'Euteleostomi', 'Mammalia', 'Eutheria', 'Euarchontoglires', 'Primates', 'Haplorrhini', 'Catarrhini', 'Hominidae', 'Homo']}, 'proteinExistence': '1: Evidence at protein level', 'proteinDescription': {'recommendedName': {'fullName': {'value': 'PHD finger protein 20'}}, 'alternativeNames': [{'fullName': {'value': 'Glioma-expressed antigen 2'}}, {'fullName': {'value': 'Hepato

#### Test UniProt Parsing Functions

In [6]:
# Create an entry first: fetch accession P31749 once and bind it to akt1_entry,
# which the parsing cells in this section take as their input.
akt1_entry = fetch_entry("P31749")
# akt1_entry

In [7]:
# primaryAccession
# AKT1 (akt1_entry was fetched with accession P31749, not the EGFR entry)
primary = get_primary_accession(akt1_entry)
print(primary)  # -> "P31749"

P31749


In [8]:
# How gene information is structured in the entry: 'genes' is a list of dicts, each
# holding a 'geneName' dict (with the symbol under 'value') and optional 'synonyms'.
genes = akt1_entry["genes"]
print(genes)
# Built query for the primary gene symbol. The {} default keeps the chained lookup
# from raising AttributeError on None when a gene record has no 'geneName' key.
print(genes[0].get("geneName", {}).get("value"))

[{'geneName': {'evidences': [{'evidenceCode': 'ECO:0000312', 'source': 'HGNC', 'id': 'HGNC:391'}], 'value': 'AKT1'}, 'synonyms': [{'value': 'PKB'}, {'value': 'RAC'}]}]
AKT1


In [9]:
# Test the gene-symbol helper on the AKT1 entry; the displayed value can be compared
# with the 'geneName' value read manually from akt1_entry["genes"].
get_gene_symbol(akt1_entry)

'AKT1'

In [10]:
# Test the protein-name helper on the AKT1 entry
get_protein_name(akt1_entry)

'RAC-alpha serine/threonine-protein kinase'

In [11]:
# Manual lookup of the recommended full name, for comparison with get_protein_name
recommended_name = akt1_entry["proteinDescription"]["recommendedName"]
recommended_name["fullName"]["value"]

'RAC-alpha serine/threonine-protein kinase'

In [12]:
# Test the helper that grabs the PTM comment texts from an entry; the cell's
# displayed result is a list of strings.
get_ptm_texts(akt1_entry)

['O-GlcNAcylation at Thr-305 and Thr-312 inhibits activating phosphorylation at Thr-308 via disrupting the interaction between AKT1 and PDPK1. O-GlcNAcylation at Ser-473 also probably interferes with phosphorylation at this site',
 'Phosphorylation on Thr-308, Ser-473 and Tyr-474 is required for full activity (PubMed:12149249, PubMed:15047712, PubMed:15262962, PubMed:16266983, PubMed:18456494, PubMed:20481595, PubMed:20978158, PubMed:8978681, PubMed:9512493, PubMed:9736715). Phosphorylation of the activation loop at Thr-308 by PDPK1/PDK1 is a prerequisite for full activation (PubMed:9512493). Phosphorylation by mTORC2 in response to growth factors plays a key role in AKT1 activation: mTORC2 phosphorylates different sites depending on the context, such as Thr-450, Ser-473, Ser-477 or Thr-479, thereby facilitating subsequent phosphorylation of the activation loop by PDPK1/PDK1 (PubMed:15718470, PubMed:24670654). Phosphorylation at Ser-473 by mTORC2 promotes ubiquitination and degradation

In [13]:
# Manually collect the PTM comment texts from the entry.
# Comments are stored in a list where each comment is a dict; a comment's
# 'commentType' identifies its kind and its 'texts' list holds dicts whose 'value'
# is the actual text.
# All optional keys are read with .get so a comment without 'commentType'/'texts',
# or a text object without 'value', is skipped instead of raising KeyError.
# NOTE(review): the name is spelled 'atk1' (not 'akt1'); kept unchanged in case
# other cells reference it.
atk1_ptm_texts = [
    text_obj["value"]
    for comment in akt1_entry["comments"]
    if comment.get("commentType") == "PTM"  # keep PTM comments only
    for text_obj in comment.get("texts", [])
    if text_obj.get("value")  # drop missing or empty texts
]

atk1_ptm_texts

['O-GlcNAcylation at Thr-305 and Thr-312 inhibits activating phosphorylation at Thr-308 via disrupting the interaction between AKT1 and PDPK1. O-GlcNAcylation at Ser-473 also probably interferes with phosphorylation at this site',
 'Phosphorylation on Thr-308, Ser-473 and Tyr-474 is required for full activity (PubMed:12149249, PubMed:15047712, PubMed:15262962, PubMed:16266983, PubMed:18456494, PubMed:20481595, PubMed:20978158, PubMed:8978681, PubMed:9512493, PubMed:9736715). Phosphorylation of the activation loop at Thr-308 by PDPK1/PDK1 is a prerequisite for full activation (PubMed:9512493). Phosphorylation by mTORC2 in response to growth factors plays a key role in AKT1 activation: mTORC2 phosphorylates different sites depending on the context, such as Thr-450, Ser-473, Ser-477 or Thr-479, thereby facilitating subsequent phosphorylation of the activation loop by PDPK1/PDK1 (PubMed:15718470, PubMed:24670654). Phosphorylation at Ser-473 by mTORC2 promotes ubiquitination and degradation

In [14]:
# Show the text of the first PTM comment. The comment is located by its type rather
# than by a hard-coded position (originally index 12), because a comment's position
# in the list may shift when the upstream record changes.
# Raises StopIteration if the entry has no PTM comment.
first_ptm_comment = next(
    comment for comment in akt1_entry["comments"] if comment.get("commentType") == "PTM"
)
first_ptm_comment["texts"][0]["value"]

'O-GlcNAcylation at Thr-305 and Thr-312 inhibits activating phosphorylation at Thr-308 via disrupting the interaction between AKT1 and PDPK1. O-GlcNAcylation at Ser-473 also probably interferes with phosphorylation at this site'