In [1]:
# loading hmm proteins set
import pandas as pd
import requests
# Read the local file 'Original_dataset.txt' as a header-less CSV, so pandas
# labels the columns with integers starting at 0.
with open('Original_dataset.txt','r') as o:
    original_dataset = pd.read_csv(o, header = None)
# Unique values of the first column (label 0); presumably the protein
# identifiers of the original set -- TODO confirm against the file contents.
original_proteins = set(original_dataset[0])

In [2]:
#################################
### Handle UniProt search API ###
#################################

# Dependencies
import requests as req
import time
import json
import tempfile
import xmltodict
import gzip


# Constants
BASE_URL = r'https://www.uniprot.org'


# Define function for getting a single protein
def get_protein(protein_id, database='uniref', params={}):
    """Request the FASTA record of a single entry.

    The URL is built as ``BASE_URL/<database>/<protein_id>.fasta`` and the
    request is sent with an ``Accept: text/x-fasta`` header; ``params`` is
    forwarded unchanged as the query string.

    Returns a 3-tuple ``(ok, text, response)`` where ``ok`` is True only for
    an HTTP 200 answer, ``text`` is the response body and ``response`` is the
    raw response object.
    """
    # Assemble the endpoint for the requested entry
    endpoint = '/'.join([BASE_URL, database, str(protein_id) + '.fasta'])
    # Make API request
    response = req.get(endpoint,
                       headers={'Accept': 'text/x-fasta'},
                       params=params)
    # Only a plain 200 counts as success
    succeeded = response.status_code == 200
    return succeeded, response.text, response


# Define function for retrieving mapping entries batches
def map_ids(ids, from_db='NF90', to_db='NF90', out_format='fasta', batch_size=100, params={}):
    """Send identifiers to the ``uploadlists`` endpoint in batches.

    Parameters
    ----------
    ids : iterable
        Identifiers to map. Any iterable is accepted (list, tuple, set, ...);
        each item is converted with ``str`` before being sent.
    from_db, to_db : str
        Values sent as the ``from`` and ``to`` request parameters.
    out_format : str
        Value sent as the ``format`` request parameter.
    batch_size : int
        Maximum number of identifiers per request.
    params : dict
        Extra request parameters. It is only read, never modified; the
        ``from``/``to``/``format``/``query`` keys set here take precedence.

    Returns
    -------
    tuple
        ``(ok, text, response)``. ``text`` is the concatenation of the bodies
        of all successful batches. Processing stops at the first batch whose
        status code is not 200, in which case ``ok`` is False and ``text``
        holds what was collected so far. ``response`` is the last response
        received, or ``None`` when ``ids`` is empty and no request was made
        (``ok`` is True and ``text`` is '' in that case).
    """
    # Materialise the ids so that sets/generators can be sliced into batches
    id_list = [str(identifier) for identifier in ids]
    # Collect batch bodies here and join once at the end
    chunks = []
    # Defaults for the "nothing to do" case: previously `res` was unbound
    # when `ids` was empty and the return statement raised UnboundLocalError
    res = None
    ok = True
    # Loop through each batch
    for start in range(0, len(id_list), batch_size):
        # Define ids batch (slicing clamps at the end of the list)
        batch_ids = id_list[start:start + batch_size]
        # Make API request
        res = req.get('/'.join([BASE_URL, 'uploadlists']),
                      params={**params,
                              'from': from_db,
                              'to': to_db,
                              'format': out_format,
                              'query': ' '.join(batch_ids)})
        # Check result: stop at the first failing batch
        ok = res.status_code == 200
        if not ok:
            break
        # Store results
        chunks.append(res.text)
    # Return results
    return ok, ''.join(chunks), res


# Define function for making a generic query
def make_query(query, params={}):
    """Run a query against the ``BASE_URL/uniprot`` endpoint.

    The request parameters start from the defaults ``sort=score``,
    ``format=tab``, ``compress=no`` and ``columns=id``; entries of ``params``
    override them. ``query`` is used only when ``params`` does not already
    carry a ``'query'`` key. The caller's ``params`` dict is not modified.

    Returns a 3-tuple ``(ok, result, response)``: ``ok`` is True only for an
    HTTP 200 answer, ``result`` is the response text (gunzipped and decoded
    as UTF-8 when ``compress`` is ``'yes'`` and the request succeeded) and
    ``response`` is the raw response object.
    """
    # Start from the defaults, then let the caller's parameters win
    request_params = {
        'sort': 'score',
        'format': 'tab',
        'compress': 'no',
        'columns': 'id'
    }
    request_params.update(params)
    # Add query unless the caller supplied one through params
    if 'query' not in request_params:
        request_params['query'] = query
    # Make query
    endpoint = '/'.join([BASE_URL, 'uniprot'])
    response = req.get(endpoint,
                       headers={'Accept': 'text/plain'},
                       params=request_params)
    # Get result
    body = response.text
    ok = response.status_code == 200
    # If compressed: uncompress
    wants_gzip = request_params.get('compress', 'no') == 'yes'
    if wants_gzip and ok:
        body = str(gzip.decompress(response.content), 'utf-8')
    return ok, body, response



#################################
#creation of GO DATASET 
def create_GO_ds(human_ds):
    """Expand the GO annotations of ``human_ds`` into one row per (protein, GO term).

    Parameters
    ----------
    human_ds : pandas.DataFrame
        Must provide the columns 'Entry', 'Length', 'Cross-reference (Pfam)'
        and 'Gene ontology (GO)'. The GO column is expected to be a
        ';'-separated string whose items end in a bracketed identifier, e.g.
        ``cell junction [GO:0030054]``.

    Returns
    -------
    pandas.DataFrame
        Columns 'Entry', 'GO_id', 'Go_description', 'Length', 'Pfam_domains'
        with a 0..n-1 index. 'GO_id' holds the seven characters before the
        closing bracket (no 'GO:' prefix); 'Go_description' is the term name
        with surrounding whitespace removed. 'Length' and 'Pfam_domains' are
        copied from the protein's row. Proteins appear in the order of their
        first occurrence in ``human_ds``; when an entry occurs more than once
        only its first row is used. A protein whose GO field is empty (or not
        a string) still yields a single row with empty 'GO_id' and
        'Go_description', so every entry stays present in the result.
    """
    columns = ['Entry', 'GO_id', 'Go_description', 'Length', 'Pfam_domains']
    # Rows are collected as plain dicts and turned into a frame once:
    # DataFrame.append is gone in pandas 2.x and was quadratic in a loop.
    records = []

    # One pass over the first row of each entry replaces a boolean filter of
    # the whole table per protein and gives a reproducible row order.
    first_rows = human_ds.drop_duplicates(subset='Entry', keep='first')
    fields = zip(first_rows['Entry'],
                 first_rows['Length'],
                 first_rows['Cross-reference (Pfam)'],
                 first_rows['Gene ontology (GO)'])

    for protein, length, pfam_id, go_field in fields:
        # Missing annotations (e.g. NaN after a CSV round trip) count as empty
        if not isinstance(go_field, str):
            go_field = ''
        # here I split the Gene ontology entry
        for raw_term in go_field.split(';'):
            # Items after the first carry a leading blank ("; " separator);
            # strip it so the fixed-width slices below line up and the same
            # term always yields the same description.
            term = raw_term.strip()
            records.append({
                'Entry': protein,
                'GO_id': term[-8:-1],               # digits of "[GO:0030054]"
                'Go_description': term[:-12].strip(),  # text before " [GO:...]"
                'Length': length,
                'Pfam_domains': pfam_id,
            })

    return pd.DataFrame(records, columns=columns)





#####################
# Helper functions for building the architecture dataset (create_architecture_dataset returns a dict; expected input: GO_human_ds).
# Both functions work on human_ds as well as on GO_human_ds; only the column name used on the filtered data has to be changed.
import numpy as np
def retrieve_domains(protein,data):
    """Return the domain identifiers listed for ``protein`` in ``data``.

    The rows of ``data`` whose 'Entry' equals ``protein`` are selected and the
    'Pfam_domains' value of the first one is split on ';'. Pieces of length
    0 or 1 (such as the empty piece after a trailing ';') are discarded.
    Order and duplicates are kept as they appear in the field.
    """
    matching_rows = data[data['Entry'] == protein]
    # Only the first matching row is consulted
    pfam_field = matching_rows['Pfam_domains'].values[0]
    domains = []
    for piece in pfam_field.split(';'):
        if len(piece) > 1:
            domains.append(piece)
    return domains


def create_architecture_dataset(proteins, data):
    """Group proteins by their set of domains.

    Parameters
    ----------
    proteins : iterable
        Protein identifiers; each one is looked up with
        ``retrieve_domains(protein, data)``.
    data : object
        Passed through unchanged to ``retrieve_domains``.

    Returns
    -------
    dict
        Maps an architecture key to the list of proteins sharing it, in the
        order they were encountered. The key is the protein's distinct domain
        identifiers in sorted order, each followed by ';' (for example
        ``'PF00046;PF00157;'``). Proteins without any domain are left out.
    """
    architecture_dataset = {}
    for protein in proteins:
        # Deduplicate, then sort: iterating a bare set has no canonical order,
        # so the same domain combination could end up under different keys.
        domains = sorted(set(retrieve_domains(protein, data)))
        dom = ''.join(d + ';' for d in domains)
        # Skip proteins for which no usable domain was found
        if len(dom) > 1:
            architecture_dataset.setdefault(dom, []).append(protein)
    return architecture_dataset



In [3]:
######## REQUEST FOR HUMAN PROTEINS DATASET FROM SWISS PROT
# Ask for every reviewed human entry, returned as a tab-separated table with
# the columns listed below. The response object is kept (instead of being
# discarded) so a failure can be reported with its status code.
status, table, response = make_query('reviewed:yes AND organism:"Homo sapiens (Human) [9606]"', params={
    'compress': 'no',
    'columns': ','.join(['id', 'entry name', 'reviewed', 'protein names', 'genes', 'length', 'go', 'database(Pfam)']),
    'sort': 'score',
    'format': 'tab'
})
# Fail fast: without this check the body of an error response would be parsed
# as if it were the protein table by the following cells.
# NOTE(review): this relies on the www.uniprot.org query interface and its
# column names -- confirm the endpoint is still served in this form.
if not status:
    raise RuntimeError('UniProt query failed with HTTP status {}'.format(response.status_code))

In [4]:
##### HUMAN DATASET
# Create DataFrame object

# Get rows and header row
# Split the response into lines, dropping a possible '\r' and any empty line.
# The previous `rows[1:-1]` slice assumed a trailing newline and silently lost
# the last record whenever the response did not end with one.
rows = [line.rstrip('\r') for line in table.split('\n')]
rows = [line for line in rows if line]
# First non-empty line is the header row (empty response -> no columns)
header = rows[0].split('\t') if rows else []

# Instantiate new dataframe; every value is kept as a string
human_ds = pd.DataFrame([row.split('\t') for row in rows[1:]], columns=header)
# The former `human_ds.to_csv(index=False)` call was removed: without a path it
# only builds the CSV text and throws it away. To persist the table, pass a
# destination, e.g. human_ds.to_csv(<path>, index=False).
human_ds.head()

Unnamed: 0,Entry,Entry name,Status,Protein names,Gene names,Length,Gene ontology (GO),Cross-reference (Pfam)
0,Q9Y263,PLAP_HUMAN,reviewed,Phospholipase A-2-activating protein (PLA2P) (...,PLAA PLAP,795,cell [GO:0005623]; cell junction [GO:0030054];...,PF09070;PF08324;PF00400;
1,Q96RE7,NACC1_HUMAN,reviewed,Nucleus accumbens-associated protein 1 (NAC-1)...,NACC1 BTBD14B NAC1,527,cell junction [GO:0030054]; cytoplasm [GO:0005...,PF10523;PF00651;
2,O43312,MTSS1_HUMAN,reviewed,Protein MTSS 1 (Metastasis suppressor YGL-1) (...,MTSS1 KIAA0429 MIM,755,actin cytoskeleton [GO:0015629]; cytoplasm [GO...,PF08397;PF02205;
3,Q9NP80,PLPL8_HUMAN,reviewed,Calcium-independent phospholipase A2-gamma (EC...,PNPLA8 IPLA22 IPLA2G BM-043,782,endoplasmic reticulum membrane [GO:0005789]; G...,PF01734;
4,Q15319,PO4F3_HUMAN,reviewed,"POU domain, class 4, transcription factor 3 (B...",POU4F3 BRN3C,338,cytoplasm [GO:0005737]; nuclear chromatin [GO:...,PF00046;PF00157;


In [None]:
###### GENE ONTOLOGY DATASET
# Build the one-row-per-GO-term table from human_ds.
# NOTE(review): the returned frame is not bound to a name here, so it is only
# available as this cell's output; the comments on the architecture helpers
# mention a `GO_human_ds` input -- confirm where that name gets assigned.
create_GO_ds(human_ds)