In [3]:
import requests
import json

# Function to retrieve canonical sequence from UniProt by UniProt ID
def get_canonical_sequence(uniprot_id, timeout=30):
    """
    Downloads the FASTA record for a UniProt ID and returns its sequence.

    Parameters:
    - uniprot_id: UniProt ID as a string; it is interpolated into the request URL.
    - timeout: Seconds to wait for the server before giving up (default 30).

    Returns:
    - The text of every line after the FASTA header, joined into one string,
      or None when the request cannot be completed or the status is not 200.
    """
    url = f"https://www.uniprot.org/uniprot/{uniprot_id}.fasta"
    try:
        # A timeout keeps one unresponsive request from stalling the whole batch.
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # Report transport errors the same way as a bad status: message + None.
        print(f"Failed to retrieve sequence for UniProt ID: {uniprot_id} ({exc})")
        return None

    if response.status_code != 200:
        print(f"Failed to retrieve sequence for UniProt ID: {uniprot_id}")
        return None

    # splitlines() handles both LF and CRLF endings, so no stray '\r' ends up
    # inside the sequence. The first line is the '>' header and is skipped.
    fasta_lines = response.text.splitlines()
    return ''.join(line.strip() for line in fasta_lines[1:])

# Function to create the PDB search query
def create_pdb_blast_query(sequences, evalue_cutoff=0.001, identity_cutoff=0.7):
    """
    Assembles the search request body for a collection of protein sequences.

    One "terminal" node targeting the "sequence" service is produced per input
    sequence; the nodes are combined under a single group joined with "or".

    Parameters:
    - sequences: Iterable of sequence strings, one node is created for each.
    - evalue_cutoff: Value stored under "evalue_cutoff" in every node.
    - identity_cutoff: Value stored under "identity_cutoff" in every node.

    Returns:
    - A dict with the keys "query", "request_options" (first page of
      100 rows, "sequence" scoring) and "return_type" ("entry").
    """
    sequence_nodes = [
        {
            "type": "terminal",
            "service": "sequence",
            "parameters": {
                "evalue_cutoff": evalue_cutoff,
                "identity_cutoff": identity_cutoff,
                "sequence_type": "protein",
                "value": protein_sequence,
            },
        }
        for protein_sequence in sequences
    ]

    return {
        "query": {
            "type": "group",
            "logical_operator": "or",
            "nodes": sequence_nodes,
        },
        "request_options": {
            "scoring_strategy": "sequence",
            "paginate": {"start": 0, "rows": 100},
        },
        "return_type": "entry",
    }

# Function to perform PDB BLAST search
def perform_pdb_blast_search(query, timeout=60):
    """
    Sends a search request to the RCSB PDB search endpoint and returns the reply.

    Parameters:
    - query: JSON-serialisable request body (for example the dict built by
      create_pdb_blast_query).
    - timeout: Seconds to wait for the server before giving up (default 60).

    Returns:
    - The decoded JSON response on HTTP 200. None when the search matched
      nothing (HTTP 204), on any other status code, or when the request
      could not be completed.
    """
    url = "https://search.rcsb.org/rcsbsearch/v2/query?json"  # RCSB PDB search endpoint
    headers = {'Content-Type': 'application/json'}

    try:
        response = requests.post(url, headers=headers, data=json.dumps(query), timeout=timeout)
    except requests.RequestException as exc:
        print(f"Failed to perform PDB search. Request error: {exc}")
        return None

    if response.status_code == 200:
        return response.json()  # Returns the results in JSON format

    if response.status_code == 204:
        # 204 (No Content) means the search ran but matched nothing; that is
        # an empty result, not a failure, so it gets its own message.
        print("PDB search completed but returned no hits.")
        return None

    print(f"Failed to perform PDB search. Status code: {response.status_code}")
    return None

# Function to read UniProt IDs from a file
def read_uniprot_ids_from_file(filename):
    """
    Loads the non-blank lines of a text file, with surrounding whitespace removed.

    Parameters:
    - filename: Path of the text file to read (one ID per line).

    Returns:
    - A list of stripped, non-empty lines in file order. An empty list is
      returned (after printing a message) when the file does not exist.
    """
    uniprot_ids = []
    try:
        with open(filename, 'r') as handle:
            for raw_line in handle.readlines():
                cleaned = raw_line.strip()
                if cleaned:
                    # Whitespace-only lines are dropped.
                    uniprot_ids.append(cleaned)
    except FileNotFoundError:
        print(f"File '{filename}' not found.")
        return []
    return uniprot_ids

# Main function
def main():
    """
    Runs the pipeline: read UniProt IDs, fetch their sequences, search, print.

    IDs are read from "uniprot_ids_test.txt". Every ID whose sequence could be
    retrieved contributes to a single search query, and a non-empty search
    response is printed as indented JSON. Nothing is returned.
    """
    # Text file holding the UniProt IDs, one per line.
    uniprot_file = "uniprot_ids_test.txt"

    uniprot_ids = read_uniprot_ids_from_file(uniprot_file)
    if not uniprot_ids:
        print("No UniProt IDs found in the file.")
        return

    print(f"Number of UniProt IDs used: {len(uniprot_ids)}")

    # Fetch each sequence once, in file order, and keep only successful lookups.
    fetched = (get_canonical_sequence(uniprot_id) for uniprot_id in uniprot_ids)
    sequences = [sequence for sequence in fetched if sequence]
    if not sequences:
        return

    # Build the search request and submit it.
    pdb_query = create_pdb_blast_query(sequences)
    results = perform_pdb_blast_search(pdb_query)
    if not results:
        return

    print(json.dumps(results, indent=4))

# Run the pipeline only when this code executes as the top-level program
# (__name__ is "__main__"), not when it is imported as a module.
if __name__ == "__main__":
    main()


{
    "query_id": "fe810f97-5471-4403-9a3a-ef158c27f10c",
    "result_type": "entry",
    "total_count": 10,
    "result_set": [
        {
            "identifier": "3ONA",
            "score": 1.0
        },
        {
            "identifier": "4XT3",
            "score": 1.0
        },
        {
            "identifier": "7RKF",
            "score": 1.0
        },
        {
            "identifier": "7RKM",
            "score": 1.0
        },
        {
            "identifier": "7RKN",
            "score": 1.0
        },
        {
            "identifier": "1B2T",
            "score": 0.9090909090909091
        },
        {
            "identifier": "4XT1",
            "score": 0.8636363636363636
        },
        {
            "identifier": "1F2L",
            "score": 0.8181818181818182
        },
        {
            "identifier": "7XBX",
            "score": 0.7272727272727273
        },
        {
            "identifier": "5WB2",
            "score": 0.0
        }
    ]
}


In [3]:
import requests
import json
import pandas as pd

# Function to retrieve canonical sequence from UniProt by UniProt ID
def get_canonical_sequence(uniprot_id, timeout=30):
    """
    Retrieves the sequence of a UniProt entry from its FASTA download.

    Parameters:
    - uniprot_id: UniProt ID as a string; it is interpolated into the request URL.
    - timeout: Seconds to wait for the server before giving up (default 30).

    Returns:
    - All lines following the FASTA header concatenated into one string, or
      None when the request fails or the response status is not 200.
    """
    url = f"https://www.uniprot.org/uniprot/{uniprot_id}.fasta"

    try:
        # Without a timeout a single stalled connection would block the loop
        # that fetches every ID.
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        print(f"Failed to retrieve sequence for UniProt ID: {uniprot_id} ({exc})")
        return None

    if response.status_code == 200:
        # Skip the '>' header. splitlines() also removes '\r' from CRLF
        # endings, which split('\n') would have left in the sequence.
        body_lines = response.text.splitlines()[1:]
        return ''.join(part.strip() for part in body_lines)

    print(f"Failed to retrieve sequence for UniProt ID: {uniprot_id}")
    return None

# Function to create the PDB search query
def create_pdb_blast_query(sequences, evalue_cutoff=0.01, identity_cutoff=None):
    """
    Builds the search request body for a collection of protein sequences.

    Each sequence becomes one "terminal" node of the "sequence" service and
    all nodes are combined under a single group joined with "or".

    Parameters:
    - sequences: Iterable of sequence strings.
    - evalue_cutoff: Value stored under "evalue_cutoff" in every node.
    - identity_cutoff: Optional value stored under "identity_cutoff" in every
      node. With the default (None) the key is left out of the request, which
      is what this function always sent before; previously a supplied value
      was silently ignored.

    Returns:
    - A dict with the keys "query", "request_options" (first page of
      500 rows, "sequence" scoring) and "return_type" ("entry").
    """
    query = {
        "query": {
            "type": "group",
            "logical_operator": "or",
            "nodes": []
        },
        "request_options": {
            "scoring_strategy": "sequence",
            "paginate": {
                "start": 0,
                "rows": 500
            }
        },
        "return_type": "entry"
    }

    # Add each sequence to the query
    for seq in sequences:
        parameters = {"evalue_cutoff": evalue_cutoff}
        if identity_cutoff is not None:
            # Only send the identity filter when the caller asked for one.
            parameters["identity_cutoff"] = identity_cutoff
        parameters["sequence_type"] = "protein"
        parameters["value"] = seq

        query["query"]["nodes"].append({
            "type": "terminal",
            "service": "sequence",
            "parameters": parameters
        })

    return query

# Function to perform PDB BLAST search
def perform_pdb_blast_search(query, timeout=60):
    """
    Posts a search request to the RCSB PDB search endpoint.

    Parameters:
    - query: JSON-serialisable request body (for example the dict built by
      create_pdb_blast_query).
    - timeout: Seconds to wait for the server before giving up (default 60).

    Returns:
    - The decoded JSON response on HTTP 200. None when nothing matched
      (HTTP 204), on any other status code, or when the request itself
      could not be completed.
    """
    url = "https://search.rcsb.org/rcsbsearch/v2/query?json"  # RCSB PDB search endpoint
    headers = {'Content-Type': 'application/json'}

    try:
        response = requests.post(url, headers=headers, data=json.dumps(query), timeout=timeout)
    except requests.RequestException as exc:
        print(f"Failed to perform PDB search. Request error: {exc}")
        return None

    status = response.status_code
    if status == 200:
        return response.json()  # Returns the results in JSON format

    if status == 204:
        # "No Content" is how an empty result is reported; say so instead of
        # describing a successful search as a failure.
        print("PDB search completed but returned no hits.")
    else:
        print(f"Failed to perform PDB search. Status code: {status}")
    return None

# Function to read UniProt IDs from a file
def read_uniprot_ids_from_file(filename):
    """
    Reads a text file and returns its non-blank lines without padding.

    Parameters:
    - filename: Path of the text file to read (one ID per line).

    Returns:
    - A list of stripped, non-empty lines in file order, or an empty list
      (after printing a message) when the file cannot be found.
    """
    try:
        with open(filename, 'r') as source:
            # Strip first, filter afterwards: whitespace-only lines become ''.
            stripped_lines = [entry.strip() for entry in source.readlines()]
    except FileNotFoundError:
        print(f"File '{filename}' not found.")
        return []

    return [entry for entry in stripped_lines if entry]

# Function to process results into a pandas DataFrame
def process_results_to_dataframe(results, output_csv='pdb_blast_results.csv'):
    """
    Writes the identifiers of a search response to a CSV file and prints them.

    Parameters:
    - results: Decoded search response; its "result_set" entries must each
      carry an "identifier" key. Scores are not kept.
    - output_csv: Path of the CSV file to write (default
      'pdb_blast_results.csv').

    Returns:
    - None. When "result_set" is absent a message is printed and no file is
      written.
    """
    if "result_set" not in results:
        print("No results found in the response.")
        return

    # Extract only the 'identifier' of each hit.
    identifiers = [entry["identifier"] for entry in results["result_set"]]

    # Naming the column explicitly keeps the 'identifier' header even when
    # there are no hits; a header-less file cannot be read back by pd.read_csv.
    df = pd.DataFrame({"identifier": identifiers}, columns=["identifier"])
    df.to_csv(output_csv, index=False)  # To save as CSV

    # Display the DataFrame as a table
    print(df)

# Main function
def main():
    """
    Runs the pipeline: read UniProt IDs, fetch sequences, search, tabulate.

    IDs are read from "uniprot_ids_human.txt". Sequences that could be
    retrieved are combined into one search query, and a non-empty response is
    handed to process_results_to_dataframe. Nothing is returned.
    """
    # Text file holding the UniProt IDs, one per line.
    uniprot_file = "uniprot_ids_human.txt"

    uniprot_ids = read_uniprot_ids_from_file(uniprot_file)
    if not uniprot_ids:
        print("No UniProt IDs found in the file.")
        return

    print(f"Number of UniProt IDs used: {len(uniprot_ids)}")

    # One lookup per ID, in file order; failed lookups (falsy results) are dropped.
    sequences = list(filter(None, map(get_canonical_sequence, uniprot_ids)))

    if sequences:
        results = perform_pdb_blast_search(create_pdb_blast_query(sequences))
        if results:
            # Turn the hits into a table (and CSV file).
            process_results_to_dataframe(results)

# Run the pipeline only when this code executes as the top-level program
# (__name__ is "__main__"), not when it is imported as a module.
if __name__ == "__main__":
    main()


Number of UniProt IDs used: 46
    identifier
0         3KBX
1         2X69
2         3H44
3         5COR
4         5D65
..         ...
227       3GV3
228       8K3Z
229       4LMQ
230       5IZB
231       5L7M

[232 rows x 1 columns]


In [1]:
import requests
import csv

def fetch_pdb_ids_by_pfam(pfam_accession, rows=1000, timeout=30):
    """
    Fetches PDB IDs associated with a given PFAM accession number.

    Parameters:
    - pfam_accession: PFAM accession number as a string.
    - rows: Maximum number of documents to request (default 1000).
    - timeout: Seconds to wait for the server before giving up (default 30).

    Returns:
    - A list with the 'pdb_id' of every returned document that has one.
      An empty list is returned when the request fails, the status is not
      200, or the response carries no documents.
    """
    # Define the base URL for the PDBe search API
    url = "https://www.ebi.ac.uk/pdbe/search/pdb/select"

    # Define the search query and parameters
    params = {
        'q': f"pfam_accession:{pfam_accession}",  # Query parameter
        'wt': 'json',                             # Requesting JSON format
        'rows': rows                              # Maximum number of rows
    }

    # Send a GET request to the PDBe search API
    try:
        response = requests.get(url, params=params, timeout=timeout)
    except requests.RequestException as exc:
        print(f"Failed to fetch data from PDBe API. Request error: {exc}")
        return []

    # Check if the request was successful
    if response.status_code != 200:
        print(f"Failed to fetch data from PDBe API. Status code: {response.status_code}")
        return []

    # Tolerate a payload without the expected keys instead of raising KeyError.
    payload = response.json().get('response') or {}
    docs = payload.get('docs') or []

    # 'numFound', when present, is the total number of matches; warn if the
    # 'rows' limit cut the result short so the truncation is not silent.
    total = payload.get('numFound')
    if isinstance(total, int) and total > len(docs):
        print(f"Warning: only {len(docs)} of {total} matching documents were returned; increase 'rows'.")

    # Extract and return PDB IDs from the "docs" field
    return [doc['pdb_id'] for doc in docs if doc.get('pdb_id')]

def save_pdb_ids_to_csv(pdb_ids, filename='pdb_ids.csv'):
    """
    Writes PDB IDs to a CSV file, one ID per row, in sorted order.

    Parameters:
    - pdb_ids: Iterable of PDB IDs.
    - filename: Name of the CSV file (default is 'pdb_ids.csv'); an existing
      file is overwritten.
    """
    # Sort first, then wrap every ID in its own single-column row.
    single_column_rows = [[pdb_id] for pdb_id in sorted(pdb_ids)]

    with open(filename, mode='w', newline='') as handle:
        csv.writer(handle).writerows(single_column_rows)


# Example usage: look up the PDB IDs recorded for Pfam accession PF00048.
pfam_accession = "PF00048"
pdb_ids = fetch_pdb_ids_by_pfam(pfam_accession)

# Report how many IDs came back, write them to the default CSV file
# ('pdb_ids.csv'), then tell the user where they went.
print(f"Number of PDB IDs fetched: {len(pdb_ids)}")
save_pdb_ids_to_csv(pdb_ids)
print("PDB IDs have been saved to 'pdb_ids.csv'.")


Number of PDB IDs fetched: 247
PDB IDs have been saved to 'pdb_ids.csv'.


In [2]:
import requests
import json
import pandas as pd
import csv

# Function to retrieve canonical sequence from UniProt by UniProt ID
def get_canonical_sequence(uniprot_id, timeout=30):
    """
    Fetches the FASTA record of a UniProt entry and returns only the sequence.

    Parameters:
    - uniprot_id: UniProt ID as a string; it is interpolated into the request URL.
    - timeout: Seconds to wait for the server before giving up (default 30).

    Returns:
    - The lines after the FASTA header joined into one string, or None when
      the request cannot be completed or the status is not 200.
    """
    url = f"https://www.uniprot.org/uniprot/{uniprot_id}.fasta"

    try:
        # Bound the wait so a dead connection cannot hang the caller.
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        print(f"Failed to retrieve sequence for UniProt ID: {uniprot_id} ({exc})")
        return None

    if response.status_code != 200:
        print(f"Failed to retrieve sequence for UniProt ID: {uniprot_id}")
        return None

    # Skip the header line. splitlines() treats '\n' and '\r\n' alike, so
    # carriage returns never leak into the returned sequence.
    _header, *residue_lines = response.text.splitlines() or [""]
    return ''.join(chunk.strip() for chunk in residue_lines)

# Function to create the PDB search query
def create_pdb_blast_query(sequences, evalue_cutoff=0.01):
    """
    Builds the search request body for a collection of protein sequences.

    Parameters:
    - sequences: Iterable of sequence strings; each becomes one "terminal"
      node of the "sequence" service.
    - evalue_cutoff: Value stored under "evalue_cutoff" in every node.

    Returns:
    - A dict whose "query" is an "or" group over the nodes, together with
      "request_options" (first page of 500 rows, "sequence" scoring) and
      "return_type" ("entry").
    """
    def build_node(residues):
        """Wrap one sequence in a terminal sequence-service node."""
        parameters = {
            "evalue_cutoff": evalue_cutoff,
            "sequence_type": "protein",
            "value": residues,
        }
        return {"type": "terminal", "service": "sequence", "parameters": parameters}

    paging = {"start": 0, "rows": 500}
    options = {"scoring_strategy": "sequence", "paginate": paging}
    group = {
        "type": "group",
        "logical_operator": "or",
        "nodes": list(map(build_node, sequences)),
    }
    return {"query": group, "request_options": options, "return_type": "entry"}

# Function to perform PDB BLAST search
def perform_pdb_blast_search(query, timeout=60):
    """
    Submits a search request to the RCSB PDB search endpoint.

    Parameters:
    - query: JSON-serialisable request body (for example the dict built by
      create_pdb_blast_query).
    - timeout: Seconds to wait for the server before giving up (default 60).

    Returns:
    - The decoded JSON response on HTTP 200. None when the search matched
      nothing (HTTP 204), on any other status code, or when the request
      could not be completed.
    """
    url = "https://search.rcsb.org/rcsbsearch/v2/query?json"
    headers = {'Content-Type': 'application/json'}

    try:
        response = requests.post(url, headers=headers, data=json.dumps(query), timeout=timeout)
    except requests.RequestException as exc:
        print(f"Failed to perform PDB search. Request error: {exc}")
        return None

    if response.status_code == 200:
        return response.json()

    if response.status_code == 204:
        # An empty result comes back as 204 (No Content); report it as
        # "no hits" rather than as a failed search.
        print("PDB search completed but returned no hits.")
        return None

    print(f"Failed to perform PDB search. Status code: {response.status_code}")
    return None

# Function to fetch PDB IDs associated with a PFAM accession
def fetch_pdb_ids_by_pfam(pfam_accession, rows=1000, timeout=30):
    """
    Fetches lower-cased PDB IDs associated with a PFAM accession number.

    Parameters:
    - pfam_accession: PFAM accession number as a string.
    - rows: Maximum number of documents to request (default 1000).
    - timeout: Seconds to wait for the server before giving up (default 30).

    Returns:
    - A list with the lower-cased 'pdb_id' of every returned document that
      has one. An empty list is returned when the request fails, the status
      is not 200, or the response carries no documents.
    """
    url = "https://www.ebi.ac.uk/pdbe/search/pdb/select"
    params = {
        'q': f"pfam_accession:{pfam_accession}",
        'wt': 'json',
        'rows': rows
    }

    try:
        response = requests.get(url, params=params, timeout=timeout)
    except requests.RequestException as exc:
        print(f"Failed to fetch data from PDBe API. Request error: {exc}")
        return []

    if response.status_code != 200:
        print(f"Failed to fetch data from PDBe API. Status code: {response.status_code}")
        return []

    # Tolerate a payload without the expected keys instead of raising KeyError.
    payload = response.json().get('response') or {}
    docs = payload.get('docs') or []

    # 'numFound', when present, is the total number of matches; warn if the
    # 'rows' limit cut the result short so the truncation is not silent.
    total = payload.get('numFound')
    if isinstance(total, int) and total > len(docs):
        print(f"Warning: only {len(docs)} of {total} matching documents were returned; increase 'rows'.")

    return [doc['pdb_id'].lower() for doc in docs if doc.get('pdb_id')]

# Function to read UniProt IDs from a file
def read_uniprot_ids_from_file(filename):
    """
    Returns the stripped, non-empty lines of a text file.

    Parameters:
    - filename: Path of the text file to read (one ID per line).

    Returns:
    - A list of lines with surrounding whitespace removed, in file order;
      blank lines are skipped. When the file does not exist a message is
      printed and an empty list is returned.
    """
    try:
        with open(filename, 'r') as id_file:
            trimmed = map(str.strip, id_file.readlines())
            # filter(None, ...) discards the empty strings left by blank lines.
            return list(filter(None, trimmed))
    except FileNotFoundError:
        print(f"File '{filename}' not found.")
        return []

# Function to process results and extract unique PDB IDs
def extract_pdb_ids_from_results(results):
    """
    Collects the distinct, lower-cased identifiers from a search response.

    Parameters:
    - results: Decoded search response; entries of its "result_set" must
      each carry an "identifier" key.

    Returns:
    - A list of unique lower-cased identifiers in no particular order, or an
      empty list (after printing a message) when "result_set" is absent.
    """
    if "result_set" not in results:
        print("No results found in the response.")
        return []

    # A set comprehension removes duplicates, including case-only variants.
    unique_ids = {hit["identifier"].lower() for hit in results["result_set"]}
    return list(unique_ids)

# Function to save PDB IDs to a CSV file
def save_pdb_ids_to_csv(pdb_ids, filename='pdb_ids.csv'):
    """
    Saves PDB IDs to a CSV file in sorted order, one ID per row.

    Parameters:
    - pdb_ids: Iterable of PDB IDs.
    - filename: Name of the CSV file (default is 'pdb_ids.csv'); an existing
      file is overwritten.
    """
    ordered_ids = list(pdb_ids)
    ordered_ids.sort()

    with open(filename, mode='w', newline='') as out_file:
        # Each ID becomes a one-column row.
        csv.writer(out_file).writerows([one_id] for one_id in ordered_ids)

# Main function
def main():
    """
    Combines sequence-search hits with PFAM-annotated PDB IDs and saves them.

    UniProt IDs are read from "uniprot_ids_human.txt" and their sequences are
    searched against the PDB. The resulting IDs are merged with the IDs
    returned for PFAM accession "PF00048", de-duplicated, and written to the
    default CSV file of save_pdb_ids_to_csv. Nothing is returned.
    """
    # File path to the txt file containing UniProt IDs
    uniprot_file = "uniprot_ids_human.txt"

    # Read UniProt IDs from the file
    uniprot_ids = read_uniprot_ids_from_file(uniprot_file)

    # Retrieve the sequence for each UniProt ID exactly once. Calling the
    # lookup both in the filter and in the output expression (as before)
    # doubled the HTTP requests and printed every failure twice.
    sequences = []
    for uniprot_id in uniprot_ids:
        sequence = get_canonical_sequence(uniprot_id)
        if sequence:
            sequences.append(sequence)

    # Perform PDB BLAST search if sequences are available
    pdb_ids_from_blast = []
    if sequences:
        pdb_query = create_pdb_blast_query(sequences)
        results = perform_pdb_blast_search(pdb_query)
        if results:
            pdb_ids_from_blast = extract_pdb_ids_from_results(results)

    # Fetch PDB IDs by PFAM accession
    pfam_accession = "PF00048"
    pdb_ids_from_pfam = fetch_pdb_ids_by_pfam(pfam_accession)

    # Combine and deduplicate PDB IDs
    combined_pdb_ids = list(set(pdb_ids_from_blast + pdb_ids_from_pfam))

    # Save the combined PDB IDs to a CSV file
    save_pdb_ids_to_csv(combined_pdb_ids)

    # Print the results
    print(f"Number of unique PDB IDs found: {len(combined_pdb_ids)}")
    print("Combined PDB IDs have been saved to 'pdb_ids.csv'.")

# Run the pipeline only when this code executes as the top-level program
# (__name__ is "__main__"), not when it is imported as a module.
if __name__ == "__main__":
    main()


Number of unique PDB IDs found: 247
Combined PDB IDs have been saved to 'pdb_ids.csv'.


In [26]:
import pandas as pd
import requests

# Function to get the entry name and organism type for a PDB ID
def get_pdb_entry_details(pdb_id, timeout=30):
    """
    Looks up the title and a source-organism name for one PDB entry.

    The title is read from the entry record. The organism is read from
    'rcsb_entity_source_organism': first from the entry record (the original
    lookup), then from the entry's polymer-entity records, which is where the
    RCSB Data API schema places that field.

    Parameters:
    - pdb_id: PDB entry ID as a string; it is interpolated into the request URLs.
    - timeout: Seconds to wait for each request before giving up (default 30).

    Returns:
    - A tuple (entry_name, organism). Each element is "Unknown" when the
      corresponding field is absent. (None, None) is returned when the entry
      record itself cannot be fetched.
    """
    base_url = "https://data.rcsb.org/rest/v1/core"

    def _get_json(url):
        """GET a URL and return its decoded JSON body, or None on any failure."""
        try:
            response = requests.get(url, timeout=timeout)
        except requests.RequestException:
            return None
        return response.json() if response.status_code == 200 else None

    def _scientific_name(record):
        """Return the first organism's scientific name in a record, or None."""
        sources = (record or {}).get('rcsb_entity_source_organism') or []
        if not sources:
            return None
        return sources[0].get('scientific_name') or sources[0].get('ncbi_scientific_name')

    data = _get_json(f"{base_url}/entry/{pdb_id}")
    if data is None:
        print(f"Failed to fetch data for PDB ID: {pdb_id}")
        return None, None

    # Extract the entry name (title)
    entry_name = (data.get('struct') or {}).get('title', "Unknown")

    # .get() avoids the KeyError the direct subscript raised when the
    # identifiers section is missing from the response.
    identifiers = data.get('rcsb_entry_container_identifiers') or {}

    # Extract the organism scientific name (taxonomic information)
    organism = _scientific_name(identifiers)
    if organism is None:
        # Fall back to the polymer entities, stopping at the first one that
        # names an organism.
        for entity_id in identifiers.get('polymer_entity_ids') or []:
            entity = _get_json(f"{base_url}/polymer_entity/{pdb_id}/{entity_id}")
            organism = _scientific_name(entity)
            if organism:
                break

    return entry_name, organism or "Unknown"

# Function to update CSV with PDB entry names and organism types
def add_pdb_entry_details_to_csv(input_csv, output_csv, id_column='identifier'):
    """
    Adds 'Entry_Name' and 'Organism_Type' columns to a CSV of PDB IDs.

    Parameters:
    - input_csv: Path of the CSV file to read; it must contain `id_column`.
    - output_csv: Path of the CSV file to write.
    - id_column: Name of the column holding the PDB IDs (default 'identifier').

    Returns:
    - None. When `id_column` is missing a message is printed and no file is
      written.
    """
    # Read every column as text so that IDs made of digits and a single 'E'
    # (e.g. '1E12') are not converted to floating-point numbers.
    df = pd.read_csv(input_csv, dtype=str)

    # Ensure the file contains the ID column; the message names the column
    # that is actually checked.
    if id_column not in df.columns:
        print(f"The file '{input_csv}' does not contain a '{id_column}' column.")
        return

    # Fetch the details once per row, then attach both columns in one step
    # instead of mutating the frame cell by cell.
    details = [get_pdb_entry_details(pdb_id) for pdb_id in df[id_column]]
    df['Entry_Name'] = [entry_name for entry_name, _organism in details]
    df['Organism_Type'] = [organism for _entry_name, organism in details]

    # Save the updated DataFrame to a new CSV file
    df.to_csv(output_csv, index=False)
    print(f"Updated CSV saved to {output_csv}")

# Main function
def main():
    """
    Adds entry names and organism types to the saved search results.

    Reads 'pdb_blast_results.csv' and writes the augmented table to
    'pdb_ids_with_details.csv' via add_pdb_entry_details_to_csv.
    """
    # (input CSV with PDB IDs, output CSV with the added detail columns)
    input_csv, output_csv = 'pdb_blast_results.csv', 'pdb_ids_with_details.csv'

    add_pdb_entry_details_to_csv(input_csv, output_csv)

# Run the CSV update only when this code executes as the top-level program
# (__name__ is "__main__"), not when it is imported as a module.
if __name__ == "__main__":
    main()


Updated CSV saved to pdb_ids_with_details.csv
