In [75]:
#Import necessary packages 
import os
import requests
import subprocess
from Bio.PDB import MMCIFParser, PDBIO





In [76]:
# Look up AlphaFold structures, where available, for each protein. These structures will later be used to extract the features that are fed to the model for predictions.

#First read FASTA file and extract the protein sequence using the header as a guide
def read_fasta(fasta_file):
    """Parse a FASTA file into a list of ``{'header', 'sequence'}`` records.

    The identifier stored under ``'header'`` is the second ``|``-separated
    field of the header line (text after ``>``), e.g. ``P12345`` for
    ``>sp|P12345|NAME_HUMAN ...``.

    Records whose header has fewer than two ``|``-separated fields are skipped
    entirely: neither the header nor the residue lines that follow it are kept.

    Args:
        fasta_file: Path of the FASTA file to read.

    Returns:
        A list of dicts in file order, each with keys ``'header'`` (the
        extracted identifier) and ``'sequence'`` (all residue lines of the
        record concatenated, with surrounding whitespace stripped).
    """
    entries = []   # (identifier, [sequence chunks]) in file order
    chunks = None  # chunk list of the record being read; None while skipping

    with open(fasta_file, 'r') as file:
        for line in file:
            line = line.strip()
            if not line:
                continue
            if line.startswith('>'):
                header_parts = line[1:].split('|')
                if len(header_parts) >= 2:
                    chunks = []
                    entries.append((header_parts[1], chunks))
                else:
                    # Unusable header: drop this record's residues instead of
                    # letting them leak into the previous record's sequence.
                    chunks = None
            elif chunks is not None:
                chunks.append(line)

    return [
        {'header': identifier, 'sequence': ''.join(parts)}
        for identifier, parts in entries
    ]

In [77]:
#First try extracting the PDB CIF file using the given URL

def get_predicted_structure(uniprot_id, api_key, output_dir, timeout=30):
    """Download the AlphaFold mmCIF file for a UniProt ID into ``output_dir``.

    Queries the AlphaFold prediction endpoint for ``uniprot_id``, takes the
    first entry of the returned list, downloads the file referenced by its
    ``'cifUrl'`` field and stores it as ``<output_dir>/<uniprot_id>.cif``.
    Every failure is reported with ``print`` and results in ``None``.

    Args:
        uniprot_id: UniProt accession used in the endpoint URL and file name.
        api_key: Value sent as the ``key`` query parameter.
        output_dir: Directory that receives the file; created if missing.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        The path of the written ``.cif`` file, or ``None`` on any failure.
    """
    url = f'https://alphafold.ebi.ac.uk/api/prediction/{uniprot_id}?key={api_key}'

    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # Only the exception class is printed: the message may embed the
        # request URL, which contains the API key.
        print(f"Failed to retrieve predicted structure for UniProt ID {uniprot_id}. Request error: {type(exc).__name__}")
        return None

    if response.status_code != 200:
        print(f"Failed to retrieve predicted structure for UniProt ID {uniprot_id}. Status code: {response.status_code}")
        return None

    try:
        prediction_data = response.json()
    except ValueError:
        # A body that is not valid JSON is handled like an invalid payload.
        prediction_data = None

    if not (isinstance(prediction_data, list) and prediction_data):
        print(f"Empty or invalid prediction data received for UniProt ID {uniprot_id}")
        return None

    prediction_entry = prediction_data[0]  # Only the first entry is used
    cif_url = prediction_entry.get('cifUrl') if isinstance(prediction_entry, dict) else None
    if not cif_url:
        print(f"No CIF URL found for UniProt ID {uniprot_id}")
        return None

    try:
        cif_response = requests.get(cif_url, timeout=timeout)
    except requests.RequestException as exc:
        print(f"Failed to retrieve predicted structure for UniProt ID {uniprot_id}. Request error: {type(exc).__name__}")
        return None

    if cif_response.status_code != 200:
        print(f"Failed to retrieve predicted structure for UniProt ID {uniprot_id}. Status code: {cif_response.status_code}")
        return None

    # Create the target directory on demand so a fresh checkout does not fail
    # with FileNotFoundError on the first download.
    os.makedirs(output_dir, exist_ok=True)
    cif_filename = os.path.join(output_dir, f"{uniprot_id}.cif")
    # Write the raw bytes: avoids re-encoding with the platform's default
    # encoding and newline translation.
    with open(cif_filename, 'wb') as cif_file:
        cif_file.write(cif_response.content)

    print(f"Successfully downloaded predicted structure for UniProt ID {uniprot_id} to {cif_filename}")
    return cif_filename


    

In [78]:
#Process predicted structure by saving it in the output directory

def process_predicted_structure(uniprot_id, api_key, output_dir):
    """Fetch the AlphaFold mmCIF file for ``uniprot_id`` and save it to ``output_dir``.

    This function previously carried a verbatim copy of
    ``get_predicted_structure``; it now delegates to that function so the two
    cannot drift apart. No processing beyond download-and-save is performed.

    Args:
        uniprot_id: UniProt accession to look up.
        api_key: Value sent as the ``key`` query parameter of the lookup.
        output_dir: Directory that receives ``<uniprot_id>.cif``.

    Returns:
        The path of the written ``.cif`` file, or ``None`` if the lookup or
        download failed.
    """
    return get_predicted_structure(uniprot_id, api_key, output_dir)



In [79]:
def find_homolog(sequence, timeout=60):
    """Ask HHpred for the single best homolog of ``sequence``.

    Posts the sequence as JSON to the HHpred URL below, requesting one hit
    from the ``uniclust30_2022_03`` database, and reads the first element of
    the ``'results'`` list in the JSON reply.

    NOTE(review): the endpoint URL and the request/response schema are taken
    as-is from the original code; confirm them against the service's API docs.

    Args:
        sequence: Query protein sequence.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        A tuple ``(hit_id, hit_sequence, hit_evalue)`` for the top hit, or
        ``(None, None, None)`` if the request fails, the reply is not usable,
        or no hit is returned.
    """
    no_hit = (None, None, None)

    hhpred_url = "https://toolkit.tuebingen.mpg.de/api/hhpred"
    payload = {
        "query": sequence,
        "database": "uniclust30_2022_03",
        "n_hits": 1  # Number of hits to return
    }

    try:
        response = requests.post(hhpred_url, json=payload, timeout=timeout)
    except requests.RequestException:
        print("Error: Unable to retrieve homolog from HHpred")
        return no_hit

    if response.status_code != 200:
        print("Error: Unable to retrieve homolog from HHpred")
        return no_hit

    try:
        result = response.json()
    except ValueError:
        print("Error: Unable to retrieve homolog from HHpred")
        return no_hit

    hits = result.get('results') if isinstance(result, dict) else None
    if not hits:
        # An empty hit list used to raise IndexError on the [0] lookup.
        print("Error: HHpred returned no hits")
        return no_hit

    top_hit = hits[0]
    hit_id = top_hit.get('id')
    # `or {}` also covers an explicit null alignment in the reply.
    hit_sequence = (top_hit.get('alignment') or {}).get('hit_sequence')
    hit_evalue = top_hit.get('evalue')
    return hit_id, hit_sequence, hit_evalue

In [80]:
def request_prediction(sequence, timeout=30):
    """Submit ``sequence`` for structure prediction and return the prediction ID.

    Posts ``{"target": sequence}`` as JSON to the URL below and reads the
    ``'prediction_id'`` field of the JSON reply.

    NOTE(review): the submission URL and reply schema are taken as-is from the
    original code; confirm that the service actually offers this endpoint.

    Args:
        sequence: Protein sequence to submit.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The value of ``'prediction_id'`` in the reply (``None`` if absent), or
        ``None`` if the request fails or the reply is not a JSON object.
    """
    alphafold_url = "https://alphafold.ebi.ac.uk/submit"
    payload = {
        "target": sequence
    }

    try:
        response = requests.post(alphafold_url, json=payload, timeout=timeout)
    except requests.RequestException:
        # Network failures follow the same print-and-return-None convention
        # as a non-200 status.
        print("Error: Unable to request protein structure prediction from AlphaFold")
        return None

    if response.status_code != 200:
        print("Error: Unable to request protein structure prediction from AlphaFold")
        return None

    try:
        body = response.json()
    except ValueError:
        print("Error: Unable to request protein structure prediction from AlphaFold")
        return None

    if not isinstance(body, dict):
        print("Error: Unable to request protein structure prediction from AlphaFold")
        return None
    return body.get('prediction_id')

In [81]:
def main(fasta_file_path, api_key, output_dir):
    """Fetch an AlphaFold structure for every record of a FASTA file.

    For each record returned by ``read_fasta`` the predicted structure is
    requested via ``get_predicted_structure``. If that succeeds,
    ``process_predicted_structure`` is called for the same ID. Otherwise a
    homolog is looked up with ``find_homolog`` and, when one is found, a
    prediction is requested for the homolog's sequence.

    Args:
        fasta_file_path: Path of the input FASTA file.
        api_key: API key forwarded to the structure lookups.
        output_dir: Directory for downloaded files; created if missing.
    """
    # Make sure the download target exists before any file is written.
    os.makedirs(output_dir, exist_ok=True)

    sequences = read_fasta(fasta_file_path)
    for sequence_data in sequences:
        uniprot_id = sequence_data['header']
        mmcif_file = get_predicted_structure(uniprot_id, api_key, output_dir)

        if mmcif_file:
            print(f"Successfully downloaded predicted structure for UniProt ID {uniprot_id}")
            # Optionally, process the predicted structure here.
            # Pass the API key, not the downloaded file path, as the second
            # argument (the path used to be sent as the key).
            # NOTE(review): this call currently fetches and saves the same
            # file a second time; confirm whether it is still wanted.
            process_predicted_structure(uniprot_id, api_key, output_dir)
            continue

        hit_id, hit_sequence, hit_evalue = find_homolog(sequence_data['sequence'])
        # Compare the e-value against None: 0.0 is a valid (best possible)
        # e-value and must not be treated as "no hit".
        if hit_id and hit_sequence and hit_evalue is not None:
            prediction_id = request_prediction(hit_sequence)
            if prediction_id:
                print("AlphaFold prediction requested successfully. Prediction ID:", prediction_id)
        else:
            print("Error: Unable to find a homolog for the given sequence")



In [82]:
if __name__ == "__main__":
    import getpass  # only needed for the interactive fallback below

    fasta_file_path = "idmapping_2024_03_23.fasta"
    # Never store the API key in the notebook. Export it as the
    # ALPHAFOLD_API_KEY environment variable before starting the kernel;
    # if it is not set, you are prompted for it (the input is not echoed).
    api_key = os.environ.get("ALPHAFOLD_API_KEY") or getpass.getpass("AlphaFold API key: ")
    output_dir = "output_dir"
    main(fasta_file_path, api_key, output_dir)

Successfully downloaded predicted structure for UniProt ID Q14738 to output_dir/Q14738.cif
Successfully downloaded predicted structure for UniProt ID Q14738
Successfully downloaded predicted structure for UniProt ID Q14738 to output_dir/Q14738.cif
Successfully downloaded predicted structure for UniProt ID Q13362 to output_dir/Q13362.cif
Successfully downloaded predicted structure for UniProt ID Q13362
Successfully downloaded predicted structure for UniProt ID Q13362 to output_dir/Q13362.cif
Successfully downloaded predicted structure for UniProt ID Q9NRA8 to output_dir/Q9NRA8.cif
Successfully downloaded predicted structure for UniProt ID Q9NRA8
Successfully downloaded predicted structure for UniProt ID Q9NRA8 to output_dir/Q9NRA8.cif
Successfully downloaded predicted structure for UniProt ID P42684 to output_dir/P42684.cif
Successfully downloaded predicted structure for UniProt ID P42684
Successfully downloaded predicted structure for UniProt ID P42684 to output_dir/P42684.cif
Successf