[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/biosustain/data_club/blob/main/notebooks/data_annotation/uniprot_api-solved.ipynb)


# Data Annotation -- Use Case 1

In this notebook, we will use [UniProt's API](https://www.uniprot.org/help/programmatic_access) to get gene/protein information to annotate our experimental data.


As a project dataset, we will use [Xia et al. 2022](https://www.nature.com/articles/s41467-022-30513-2): **Proteome allocations change linearly with the specific growth rate of _Saccharomyces cerevisiae_ under glucose limitation**

<div>
<img src="https://github.com/biosustain/data_club/raw/main/figures/xia_et_al_2022.png" width="900"/>
</div>



Specifically, we will work with the absolute proteome and transcriptome datasets:

<div>
<img src="https://github.com/biosustain/data_club/raw/main/figures/xia_datasets.png" width="500"/>
</div>


In [18]:
import os
import requests, sys
import json
import pandas as pd


def get_accession(gene_name, timeout=30):
    """Look up an accession for a gene name via the EBI Proteins API.

    Queries the ``/proteins`` endpoint filtered by ``gene`` (first 100 hits)
    and returns the accession of the first hit.

    Parameters
    ----------
    gene_name : str
        Gene name to search for; sent as the ``gene`` query parameter.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30), so a
        stalled connection cannot hang a long annotation run.

    Returns
    -------
    str
        The ``accession`` field of the first result, or ``gene_name``
        unchanged when the response contains no usable first result.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status code.
    """
    # Let requests build the query string so that special characters in the
    # gene name are percent-encoded instead of corrupting the URL.
    r = requests.get(
        "https://www.ebi.ac.uk/proteins/api/proteins",
        params={"offset": 0, "size": 100, "gene": gene_name},
        headers={"Accept": "application/json"},
        timeout=timeout,
    )

    # Raises on error responses and does nothing otherwise.
    r.raise_for_status()

    try:
        return json.loads(r.text)[0]["accession"]
    except (ValueError, LookupError, TypeError):
        # Invalid JSON, an empty result list, or an unexpected payload shape:
        # fall back to the input so every gene still maps to some value.
        return gene_name

In [98]:
#get_accession(gene_name="R0010W")

In [95]:
def get_protein_info(accession, timeout=30):
    """Fetch the EBI Proteins API entry for a single accession.

    Parameters
    ----------
    accession : str
        Accession to retrieve; appended to the ``/proteins/`` endpoint path.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).

    Returns
    -------
    dict
        The parsed JSON body, or an empty dict when the body is not valid
        JSON.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status code.
    """
    r = requests.get(
        f"https://www.ebi.ac.uk/proteins/api/proteins/{accession}",
        headers={"Accept": "application/json"},
        timeout=timeout,
    )

    # Raises on error responses and does nothing otherwise.
    r.raise_for_status()

    try:
        return json.loads(r.text)
    except ValueError:
        # json.JSONDecodeError is a ValueError; an unparsable body yields an
        # empty record instead of aborting the caller.
        return {}

def extract_protein_info(accession, response):
    """Flatten selected fields of a protein entry into a one-row DataFrame.

    Parameters
    ----------
    accession : str
        Label used as the index of the single returned row.
    response : dict
        Parsed protein entry, e.g. the value returned by
        ``get_protein_info``. Must contain ``id``, ``organism`` (with
        ``taxonomy`` and a non-empty ``names`` list) and ``sequence`` (with
        ``sequence``, ``length`` and ``mass``).

    Returns
    -------
    pandas.DataFrame
        One row with the columns ``id``, ``taxid``, ``organism``,
        ``comments``, ``sequence``, ``sequence_length`` and
        ``sequence_mass``. ``comments`` holds the first text value of the
        first comment, or ``None`` when that path is absent.

    Raises
    ------
    KeyError, IndexError
        If one of the required fields listed above is missing.
    """
    organism = response["organism"]
    sequence = response["sequence"]

    # The comment text is the deepest lookup in the record; when any level of
    # the path is missing, store None rather than failing the whole row.
    try:
        first_comment = response["comments"][0]["text"][0]["value"]
    except (KeyError, IndexError, TypeError):
        first_comment = None

    return pd.DataFrame(
        {
            "id": response["id"],
            "taxid": organism["taxonomy"],
            "organism": str(organism["names"][0]["value"]),
            "comments": first_comment,
            "sequence": sequence["sequence"],
            "sequence_length": sequence["length"],
            "sequence_mass": sequence["mass"],
        },
        index=[accession],
    )

In [97]:
#result = get_protein_info(accession="P03870")

#extract_protein_info(accession="P03870", response=result)


In [19]:
# Directory holding the project data files, relative to this notebook.
data_dir = "../../data"

# Read the tab-separated transcriptome table without promoting any column to
# the index.
transcriptome_df = pd.read_csv(
    os.path.join(data_dir, "trasncriptomics.tsv"),
    sep="\t",
    index_col=False,
)

# One API request per row: resolve each value of the "mRNA" column to an
# accession and keep the result in a new "Accessions" column.
transcriptome_df["Accessions"] = transcriptome_df["mRNA"].map(get_accession)

In [21]:
# Write the annotated table next to the input data as a tab-separated file,
# with a header row and without the DataFrame index.
# `doublequote` is kept at its default (True): disabling it without an
# `escapechar` makes the CSV writer fail on any field containing a quote
# character.
transcriptome_df.to_csv(
    os.path.join(data_dir, "transcriptomics_mapped.tsv"),
    sep="\t",
    index=False,
    header=True,
)