In [None]:
import requests

# Fetch the raw HTML of one UniProt entry page and print it.
# The "#names_and_taxonomy" fragment is resolved client-side and is not part of
# the HTTP request, so the server returns the same page as for the bare URL.
url = "https://www.uniprot.org/uniprotkb/A0A0R0ERV5/entry#names_and_taxonomy"
# A timeout keeps the cell from hanging indefinitely if the server stalls.
response = requests.get(url, timeout=30)

if response.status_code == 200:
    html_code = response.text
    print(html_code)
else:
    print(f"Error: Unable to fetch the webpage. Status code: {response.status_code}")


I have a dataframe called af2 (564446 rows × 73 columns) containing evaluation metrics for AlphaFold2-predicted protein structures, one structure per row. For each row, I want to use the string in the 'UniProt_ID' column to fetch information from the UniProt website at f'https://www.uniprot.org/uniprotkb/{UniProt_ID}/entry#names_and_taxonomy'. With that information I can add a new column called 'Domain' to the af2 dataframe that records, for each row, the taxonomic domain of the protein's source organism (Eukaryota (eukaryotes), Bacteria (eubacteria), Archaea, or Viruses).

In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
# Load the metrics table and give its columns explicit names.
af2 = pd.read_csv('destress_data_af2.csv')
af2.columns = ['design_name', 'file_name', 'full_sequence', 'dssp_assignment', 'composition_ALA', 'composition_CYS', 
               'composition_ASP', 'composition_GLU', 'composition_PHE', 'composition_GLY', 'composition_HIS', 
               'composition_ILE', 'composition_LYS', 'composition_LEU', 'composition_MET', 'composition_ASN', 
               'composition_PRO', 'composition_GLN', 'composition_ARG', 'composition_SER', 'composition_THR',
                'composition_VAL', 'composition_TRP', 'composition_UNK', 'composition_TYR', 'ss_prop_alpha_helix', 
                'ss_prop_beta_bridge', 'ss_prop_beta_strand', 'ss_prop_3_10_helix', 'ss_prop_pi_helix', 
                'ss_prop_hbonded_turn', 'ss_prop_bend', 'ss_prop_loop', 'hydrophobic_fitness', 'isoelectric_point', 
                'charge', 'mass', 'num_residues', 'packing_density', 'budeff_total', 'budeff_steric', 'budeff_desolvation', 
                'budeff_charge', 'evoef2_total', 'evoef2_ref_total', 'evoef2_intraR_total', 'evoef2_interS_total', 
                'evoef2_interD_total', 'dfire2_total', 'rosetta_total', 'rosetta_fa_atr', 'rosetta_fa_rep', 
                'rosetta_fa_intra_rep', 'rosetta_fa_elec', 'rosetta_fa_sol', 'rosetta_lk_ball_wtd', 'rosetta_fa_intra_sol_xover4', 
                'rosetta_hbond_lr_bb', 'rosetta_hbond_sr_bb', 'rosetta_hbond_bb_sc', 'rosetta_hbond_sc', 'rosetta_dslf_fa13', 
                'rosetta_rama_prepro', 'rosetta_p_aa_pp', 'rosetta_fa_dun', 'rosetta_omega', 'rosetta_pro_close', 
                'rosetta_yhh_planarity', 'aggrescan3d_total_value', 'aggrescan3d_avg_value', 'aggrescan3d_min_value', 
                'aggrescan3d_max_value']

# Derive the UniProt accession from 'design_name'.
# NOTE(review): assumes names look like "AF-<accession>-F<fragment>-model_v<version>"
# -- confirm against the data. Extracting by pattern (instead of cutting a fixed
# 12 characters) stays correct when the fragment or version number has more
# than one digit, e.g. "-F10-model_v2".
uniprot_from_pattern = af2["design_name"].str.extract(
    r"^AF-(.+?)-F\d+-model_v\d+$", expand=False
)
# Names that do not match the pattern fall back to the previous rule:
# remove 'AF-' and drop the last 12 characters.
uniprot_legacy = af2["design_name"].str.replace("AF-", "", regex=False).str[:-12]
af2["UniProt_ID"] = uniprot_from_pattern.fillna(uniprot_legacy)
#af2 = af2.drop_duplicates(subset='UniProt_ID', keep='first')


In [3]:
# Draw a fixed-seed random subset of af2; the lookup cells below work on this sample.
sample_size, sample_seed = 10000, 42
af2_random = af2.sample(n=sample_size, random_state=sample_seed)


In [9]:
from concurrent.futures import ThreadPoolExecutor

def get_domain_from_uniprot(uniprot_id):
    """Look up the taxonomic domain of a single UniProt entry.

    Downloads the entry's XML record and returns the first taxon in its
    lineage that is one of "Eukaryota", "Bacteria", "Archaea" or "Viruses".

    Args:
        uniprot_id: UniProt accession string used to build the request URL.

    Returns:
        The domain name as a string, or None when the request fails (network
        error, timeout, non-200 status) or no matching taxon is found.
    """
    url = f"https://www.uniprot.org/uniprot/{uniprot_id}.xml"
    try:
        response = requests.get(url, timeout=30)
    except requests.RequestException:
        # A transient failure for one accession must not abort a whole
        # batch run driven by executor.map; report "unknown" instead.
        return None

    if response.status_code != 200:
        return None

    soup = BeautifulSoup(response.content, "xml")
    lineage = soup.find("lineage")
    if lineage is None:
        return None

    for taxon in lineage.find_all("taxon"):
        # Strip surrounding whitespace so pretty-printed XML still matches.
        domain = taxon.get_text().strip()
        if domain in ("Eukaryota", "Bacteria", "Archaea", "Viruses"):
            return domain
    return None

def fetch_domains(uniprot_ids, max_workers):
    """Fetch the taxonomic domain for many UniProt accessions concurrently.

    Each distinct accession is looked up only once, even if it appears
    several times in the input, which avoids redundant network requests.

    Args:
        uniprot_ids: Iterable of UniProt accession strings (duplicates allowed).
        max_workers: Maximum number of worker threads used for the lookups.

    Returns:
        A list with one entry per input accession, in input order, holding
        whatever get_domain_from_uniprot returned for that accession.
    """
    uniprot_ids = list(uniprot_ids)
    # dict.fromkeys de-duplicates while keeping first-seen order.
    unique_ids = list(dict.fromkeys(uniprot_ids))

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # executor.map yields results in the order of unique_ids.
        fetched = list(executor.map(get_domain_from_uniprot, unique_ids))

    domain_by_id = dict(zip(unique_ids, fetched))
    return [domain_by_id[uniprot_id] for uniprot_id in uniprot_ids]


#af2_slice = pd.DataFrame(af2_random.iloc[0:1,:])  # Replace with 'af2' for the entire dataframe
# Collect the accession of every sampled row as a plain Python list.
uniprot_ids = af2_random["UniProt_ID"].to_list()
# Look the domains up with 10 worker threads; adjust max_workers as needed.
domains = fetch_domains(uniprot_ids=uniprot_ids, max_workers=10)
# Results come back in input order, so they line up with the sampled rows.
af2_random["Domain"] = domains
 

KeyboardInterrupt: 

In [None]:
# Write the sampled table to a CSV file in the working directory.
# Sequential alternative that annotates the whole 'af2' dataframe row by row:
#af2["Domain"] = af2["UniProt_ID"].apply(get_domain_from_uniprot)
af2_random.to_csv(path_or_buf='af2_random_database.csv')

In [None]:
def howlongittakes(timefor100, maxworkers, total_rows=100000, sample_size=100):
    """Extrapolate a measured timing to a full run and print it in hours.

    Args:
        timefor100: Measured time for `sample_size` lookups; treated as
            seconds, since the result is divided by 60 twice to get hours.
        maxworkers: Worker count used for the measurement (only echoed in
            the printed message).
        total_rows: Number of lookups to extrapolate to. Defaults to the
            previously hard-coded 100000.
        sample_size: Number of lookups the measurement covered. Defaults to
            the previously hard-coded 100.

    Returns:
        None. The estimate is printed.
    """
    totalhours = timefor100 / sample_size * total_rows / 60 / 60
    print(f'When Max Workers are {maxworkers}, it takes {totalhours}')

# NOTE: unfinished timing note kept from the original cell. As a bare statement
# it raised NameError (`take` is undefined), so it is preserved as a comment.
# max_workers = 10, take

In [None]:
# Estimate from a measurement of 3.9 per 100 lookups taken with 150 workers.
howlongittakes(timefor100=3.9, maxworkers=150)

In [None]:
# Estimate from a measurement of 10.6 per 100 lookups taken with 30 workers.
howlongittakes(timefor100=10.6, maxworkers=30)

In [None]:
import pandas as pd
import requests
from requests.adapters import HTTPAdapter, Retry
import re


# Pattern that captures the URL inside a `<...>; rel="next"` entry located at
# the start of a Link response header.
re_next_link = re.compile(r'<(.+)>; rel="next"')

# Retry policy: up to 5 retries with a 0.25 backoff factor, triggered by the
# listed server-error status codes.
retries = Retry(
    total=5,
    backoff_factor=0.25,
    status_forcelist=[500, 502, 503, 504],
)

# Shared session; every https:// request goes through an adapter that applies
# the retry policy above.
session = requests.Session()
https_adapter = HTTPAdapter(max_retries=retries)
session.mount("https://", https_adapter)

# Function to extract the next link from the response headers
def get_next_link(headers):
    """Return the URL of the next results page, or None when there is none.

    Args:
        headers: Mapping of response headers.

    Returns:
        The URL captured from a `<url>; rel="next"` entry at the start of the
        "Link" header, or None if the header is absent or does not match.
    """
    if "Link" not in headers:
        return None
    found = re_next_link.match(headers["Link"])
    return found.group(1) if found else None

# Generator function to retrieve data in batches
def get_batch(batch_url):
    """Yield every page of a paginated request, following "next" links.

    Args:
        batch_url: URL of the first page.

    Yields:
        (response, total) tuples, where `response` is the page's response
        object and `total` is the value of its "x-total-results" header.

    Raises:
        Whatever `response.raise_for_status()` raises for an error status, and
        KeyError if a page carries no "x-total-results" header.
    """
    while batch_url:
        # The timeout keeps a stalled connection from blocking the loop forever.
        response = session.get(batch_url, timeout=30)
        response.raise_for_status()
        total = response.headers["x-total-results"]
        yield response, total
        # Ends the loop once the response advertises no further page.
        batch_url = get_next_link(response.headers)

# Function to build the UniProt query string from a list of UniProt accession IDs
def build_taxonomy_query(uniprot_ids):
    """Build a query string that matches any of the given accessions.

    Args:
        uniprot_ids: Iterable of UniProt accession IDs.

    Returns:
        The terms "accession:<id>" joined with " OR "; an empty string when
        no IDs are given.
    """
    terms = []
    for accession in uniprot_ids:
        terms.append("accession:{}".format(accession))
    return " OR ".join(terms)

# Set the batch size for UniProt queries (accessions per search request).
# NOTE(review): the search endpoint caps page size (500 at time of writing) --
# keep batch_size at or below that.
batch_size = 100

# Fetched taxonomy data, keyed by accession: (taxonomy_id, superkingdom, class)
taxonomy_data = {}

# Search endpoint and the TSV columns requested from it. The previous URL put
# the query in the path and asked for XML, which the tab-splitting parser
# below could never read.
search_url = "https://rest.uniprot.org/uniprotkb/search"
search_fields = "accession,organism_id,lineage"
superkingdom_names = {"Eukaryota", "Bacteria", "Archaea", "Viruses"}


def _parse_lineage(lineage):
    """Return (superkingdom, class) parsed from a lineage string.

    NOTE(review): assumes taxa are comma-separated and optionally written as
    "Name (rank)" -- confirm against a live response. The superkingdom is also
    recognised by name alone, so it is found even when ranks are absent.
    Either element is None when it cannot be determined.
    """
    superkingdom = cls = None
    for item in lineage.split(","):
        item = item.strip()
        name, rank = item, ""
        if item.endswith(")") and " (" in item:
            name, _, rank = item[:-1].rpartition(" (")
        if superkingdom is None and (
            name in superkingdom_names or rank in ("superkingdom", "domain")
        ):
            superkingdom = name
        elif cls is None and rank == "class":
            cls = name
    return superkingdom, cls


# Query each distinct accession once; af2 may contain repeated or missing IDs.
unique_ids = af2["UniProt_ID"].dropna().unique().tolist()

# Iterate through all UniProt IDs in batches (the old range(0, 10, batch_size)
# only ever processed the first batch).
for i in range(0, len(unique_ids), batch_size):
    # Get the current batch of UniProt IDs
    uniprot_ids = unique_ids[i:i + batch_size]

    # Build the UniProt query string for the current batch of UniProt IDs
    query = build_taxonomy_query(uniprot_ids)

    # Let requests URL-encode the query (it contains spaces and colons)
    url = requests.Request(
        "GET",
        search_url,
        params={
            "query": query,
            "fields": search_fields,
            "format": "tsv",
            "size": batch_size,
        },
    ).prepare().url

    # Fetch taxonomy data for the current batch, following pagination
    for batch, _total in get_batch(url):
        # Process each line in the batch response, excluding the header line
        for line in batch.text.splitlines()[1:]:
            columns = line.split('\t')
            if len(columns) != 3:
                # Skip rows that do not have the three requested columns
                continue
            accession, taxonomy_id, lineage = columns
            superkingdom, cls = _parse_lineage(lineage)

            # Store the taxonomy data in the taxonomy_data dictionary
            taxonomy_data[accession] = (taxonomy_id, superkingdom, cls)

        # Print cumulative progress against the number of distinct accessions
        print(f'{len(taxonomy_data)} / {len(unique_ids)}')

# Map the fetched taxonomy data onto af2. Accessions without a result get None
# in all three columns instead of crashing the unpacking step.
missing = (None, None, None)
taxonomy_rows = [taxonomy_data.get(accession, missing) for accession in af2["UniProt_ID"]]
for position, column in enumerate(("Taxonomy_ID", "Superkingdom", "Class")):
    af2[column] = [row[position] for row in taxonomy_rows]
