In [1]:
import requests
import json
import pandas as pd
import os
from langchain_openai import ChatOpenAI, AzureChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_community.document_loaders import WebBaseLoader
import re
import numpy as np
import logging
import sys
import time
from datetime import timedelta

USER_AGENT environment variable not set, consider setting it to identify your requests.


In [3]:
# UniProt search endpoint (also used by the lookup helper defined further down)
BASE_URL = "https://rest.uniprot.org/uniprotkb/search"

# Seconds to wait on UniProt before giving up. requests applies no timeout by
# default, so without this a stalled connection would hang the cell forever.
REQUEST_TIMEOUT_S = 30

# Search parameters: reviewed human proteins only, JSON output, 5 results
params = {
    "query": "reviewed:true AND organism_id:9606",
    "format": "json",
    "size": 5,
}

# Run the search; fail loudly on an HTTP error (bad request, server error, ...)
resp = requests.get(BASE_URL, params=params, timeout=REQUEST_TIMEOUT_S)
resp.raise_for_status()

# Decode the JSON body into a Python dictionary
data = resp.json()
print(f"Returned {len(data['results'])} entries")

# Inspect the first returned entry: accession ID and recommended full name
entry = data["results"][0]
print("Accession:", entry["primaryAccession"])
print(
    "Protein name:",
    entry["proteinDescription"]["recommendedName"]["fullName"]["value"]
)

# List every comment type attached to this entry (an entry may have none)
for c in entry.get("comments", []):
    print("-", c["commentType"])

Returned 5 entries
Accession: A0A0C5B5G6
Protein name: Mitochondrial-derived peptide MOTS-c
- FUNCTION
- SUBUNIT
- SUBCELLULAR LOCATION
- TISSUE SPECIFICITY
- DEVELOPMENTAL STAGE
- INDUCTION
- MISCELLANEOUS
- MISCELLANEOUS
- MISCELLANEOUS
- MISCELLANEOUS
- MISCELLANEOUS
- MISCELLANEOUS
- MISCELLANEOUS
- MISCELLANEOUS
- CAUTION


In [4]:
# data

In [None]:
def uniprot_request_data(accession_id, subset=None, timeout=30):
    """Look up one protein on UniProt by accession ID and return its data.

    Parameters
    ----------
    accession_id : str
        UniProt accession to query, e.g. "P31749".
    subset : str, optional
        A top-level key of the UniProt entry dict (e.g. "sequence" or
        "organism"). When given, only that part of the first matching entry
        is returned. When omitted (or falsy), the whole decoded search
        response is returned.
    timeout : float, optional
        Seconds to wait for UniProt before giving up. requests applies no
        timeout by default, which would let a stalled connection hang the
        notebook indefinitely.

    Returns
    -------
    The decoded JSON response (a dict holding a "results" list) when no
    subset is requested, otherwise the value stored under `subset` in the
    first result.

    Raises
    ------
    requests.HTTPError
        If UniProt answers with a 4xx/5xx status.
    IndexError
        If `subset` was requested but the query matched no entry.
    KeyError
        If the response has no "results" key or the entry has no `subset` key.
    """
    params = {
        "query": f"accession:{accession_id}",  # restrict the search to this accession
        "format": "json",                      # return data as JSON
    }

    response = requests.get(BASE_URL, params=params, timeout=timeout)

    # Stop on HTTP errors instead of printing the status and then trying to
    # parse and index an error body.
    response.raise_for_status()
    print("Data successfully recieved")

    # Convert the JSON response into a Python dictionary
    data = response.json()

    if not subset:
        return data

    results = data["results"]
    if not results:
        # Same exception type as indexing an empty list, but with a message
        # that says which accession came back empty.
        raise IndexError(f"UniProt returned no entry for accession {accession_id!r}")
    return results[0][subset]
    

In [47]:
# AKT1 lookup on UniProt, queried by its accession ID: P31749.
# Passing subset="sequence" returns only the sequence block of the entry
# rather than the whole search response.
akt1_sequence_data = uniprot_request_data("P31749", subset="sequence")
print(akt1_sequence_data)

# Last expression of the cell: the raw amino-acid string on its own
akt1_sequence_data["value"]

Data successfully recieved
{'value': 'MSDVAIVKEGWLHKRGEYIKTWRPRYFLLKNDGTFIGYKERPQDVDQREAPLNNFSVAQCQLMKTERPRPNTFIIRCLQWTTVIERTFHVETPEEREEWTTAIQTVADGLKKQEEEEMDFRSGSPSDNSGAEEMEVSLAKPKHRVTMNEFEYLKLLGKGTFGKVILVKEKATGRYYAMKILKKEVIVAKDEVAHTLTENRVLQNSRHPFLTALKYSFQTHDRLCFVMEYANGGELFFHLSRERVFSEDRARFYGAEIVSALDYLHSEKNVVYRDLKLENLMLDKDGHIKITDFGLCKEGIKDGATMKTFCGTPEYLAPEVLEDNDYGRAVDWWGLGVVMYEMMCGRLPFYNQDHEKLFELILMEEIRFPRTLGPEAKSLLSGLLKKDPKQRLGGGSEDAKEIMQHRFFAGIVWQHVYEKKLSPPFKPQVTSETDTRYFDEEFTAQMITITPPDQDDSMECVDSERRPHFPQFSYSASGTA', 'length': 480, 'molWeight': 55686, 'crc64': '6EAFF4F8AD436714', 'md5': '1620296E269940B19BAFA78CA71E003A'}


'MSDVAIVKEGWLHKRGEYIKTWRPRYFLLKNDGTFIGYKERPQDVDQREAPLNNFSVAQCQLMKTERPRPNTFIIRCLQWTTVIERTFHVETPEEREEWTTAIQTVADGLKKQEEEEMDFRSGSPSDNSGAEEMEVSLAKPKHRVTMNEFEYLKLLGKGTFGKVILVKEKATGRYYAMKILKKEVIVAKDEVAHTLTENRVLQNSRHPFLTALKYSFQTHDRLCFVMEYANGGELFFHLSRERVFSEDRARFYGAEIVSALDYLHSEKNVVYRDLKLENLMLDKDGHIKITDFGLCKEGIKDGATMKTFCGTPEYLAPEVLEDNDYGRAVDWWGLGVVMYEMMCGRLPFYNQDHEKLFELILMEEIRFPRTLGPEAKSLLSGLLKKDPKQRLGGGSEDAKEIMQHRFFAGIVWQHVYEKKLSPPFKPQVTSETDTRYFDEEFTAQMITITPPDQDDSMECVDSERRPHFPQFSYSASGTA'

In [None]:
# Fetch the complete UniProt search response for AKT1 (accession P31749).
# Later cells read this as `akt1_data`; assigning only to the misspelled
# `atk1_data` left that name undefined on a fresh Restart & Run All.
akt1_data = uniprot_request_data("P31749")
atk1_data = akt1_data  # legacy misspelling, kept so any existing reference still resolves

# To fetch only the organism block for AKT1 directly instead:
# uniprot_request_data("P31749", "organism")

Data successfully recieved
Data successfully recieved


{'scientificName': 'Homo sapiens',
 'commonName': 'Human',
 'taxonId': 9606,
 'lineage': ['Eukaryota',
  'Metazoa',
  'Chordata',
  'Craniata',
  'Vertebrata',
  'Euteleostomi',
  'Mammalia',
  'Eutheria',
  'Euarchontoglires',
  'Primates',
  'Haplorrhini',
  'Catarrhini',
  'Hominidae',
  'Homo']}

In [37]:
# Check which container type holds the matched entries
akt1_results = akt1_data["results"]
type(akt1_results)

list

In [41]:
# Organism annotation of the first entry returned for the AKT1 query
akt1_entry = akt1_data["results"][0]
akt1_entry["organism"]

{'scientificName': 'Homo sapiens',
 'commonName': 'Human',
 'taxonId': 9606,
 'lineage': ['Eukaryota',
  'Metazoa',
  'Chordata',
  'Craniata',
  'Vertebrata',
  'Euteleostomi',
  'Mammalia',
  'Eutheria',
  'Euarchontoglires',
  'Primates',
  'Haplorrhini',
  'Catarrhini',
  'Hominidae',
  'Homo']}

In [None]:
# Top-level fields of a UniProt entry, read from the first hit of the search response
first_entry = data["results"][0]
first_entry.keys()

dict_keys(['entryType', 'primaryAccession', 'uniProtkbId', 'entryAudit', 'annotationScore', 'organism', 'proteinExistence', 'proteinDescription', 'genes', 'comments', 'features', 'geneLocations', 'keywords', 'references', 'uniProtKBCrossReferences', 'sequence', 'extraAttributes'])