In [2]:
from pymol import cmd
import os
import re

In [2]:
# Root folder that is searched recursively for .pdb files further down.
# NOTE(review): hard-coded absolute local path; adjust before running elsewhere.
path = "/Users/choderalab/asapdiscovery/temp_storage/arborvirus/"

## These are the structures I experimented with while trying to find matching entries in the PDB directly from the primary sequence

In [68]:
def find_pdb_files(folder_path):
    """Recursively collect every file whose name ends in ".pdb".

    Parameters
    ----------
    folder_path : str or path-like
        Directory at which the recursive walk starts.

    Returns
    -------
    list
        One joined path (containing directory + file name) per matching file,
        in the order in which ``os.walk`` visits them. The suffix check is
        case-sensitive.
    """
    return [
        os.path.join(directory, filename)
        for directory, _subdirs, filenames in os.walk(folder_path)
        for filename in filenames
        if filename.endswith(".pdb")
    ]

In [69]:
viruses = find_pdb_files(path)

In [70]:
def find_name_in_file(path):
    """Derive a structure name from the path of a ``.pdb`` file.

    The name is the final path component with its ``.pdb`` extension removed,
    e.g. ``"/some/dir/2FP7.pdb"`` gives ``"2FP7"``.

    Parameters
    ----------
    path : str or path-like
        Path (absolute, relative, or a bare file name) of a PDB file.

    Returns
    -------
    str or None
        The file stem, or ``None`` (after printing "No match found.") when
        ``path`` does not name a ``.pdb`` file.
    """
    # Look at the final path component only, so that a directory whose name
    # happens to end in ".pdb" can never be mistaken for the file itself, and
    # a bare file name without any "/" is handled as well.
    stem, extension = os.path.splitext(os.path.basename(path))

    # The earlier regex only accepted stems made of word characters; taking
    # the stem directly also supports names containing "-", "." or spaces.
    if extension == ".pdb" and stem:
        return stem

    print("No match found.")
    return None

In [71]:
# Save all fastas of the found pdb files
for virus in viruses:
    name = find_name_in_file(virus)
    if name is None:
        # No name could be derived from this path; skip the file instead of
        # failing on `None + ".fasta"` further down.
        continue
    # Load the structure into the PyMOL session as an object called `name`
    cmd.load(virus, name)
    fasta_file = name + ".fasta"
    # Positional arguments: selection = the object just loaded, state = 1,
    # output format = 'fasta'. The file lands in the current working directory.
    cmd.save(fasta_file, name, 1, 'fasta')

In [20]:
# Fetch entry 2FP7 in PDB format and load it as the PyMOL object "2FP7".
cmd.fetch("2FP7", name='2FP7', type = 'pdb')

'2FP7'

In [21]:
# Write state 1 of the "2FP7" object to 2FP7.fasta in FASTA format.
cmd.save("2FP7.fasta","2FP7",1,'fasta')

## Realized that trying to pull structures directly from the PDB was a bad idea
## Now BLASTing against RefSeq instead: find the sequences with the highest similarity and fold those sequences

In [3]:
from pathlib import Path

In [5]:
# Get just the protein
structure_name = 'zikv'
# NOTE(review): hard-coded absolute local path; adjust before running elsewhere.
local_path = Path("/Users/choderalab/asapdiscovery/temp_storage/")
# Load the structure file as a PyMOL object named after `structure_name`.
cmd.load(local_path / "ZIKV_NS2B3_ligands.pdb", structure_name)
# Create a named selection "protein" limited to the polymer.protein atoms of that object.
cmd.select("protein", structure_name + " and polymer.protein")
#cmd.save(local_path / "protein.pdb", "protein")

1599

In [8]:
# Save the fasta of that structure
# Only chains A and B of the "protein" selection are written out.
cmd.save(local_path/"zikv_ns2b3.fasta", "protein and chain A+B")

In [9]:
from Bio.Blast import NCBIWWW
from Bio.Blast import NCBIXML

In [10]:
# Define parameters
# Query sequence: two segments joined by ":".
# NOTE(review): presumably the two chains saved above (chain A+B); confirm how the
# BLAST service treats the ":" separator inside the query.
sequence = "DMYIERAGDITWEKDAEVTGNSPRLDVALDESGDFSLVE:GETTDGVYRVMTRRLLGSTQVGVGVMQEGVFHTMWHVTKGAALRSGEGRLDPYWGDVKQDLVSYCGPWKLDAAWDGLSEVQLLAVPPGERAKNIQTLPGIFKTKDGDIGAVALDYPAGTSGSPILDKCGRVIGLYGNGVVIKNGSYVSAITQGKRE"
# Protein-vs-protein search against the RefSeq protein database.
blast_program = "blastp"
database = "refseq_protein"

In [18]:
# Perform BLAST query
# Submits the query to NCBI's online BLAST service (requires network access and can be slow).
# The result is requested as XML with the hit list capped at 150 entries.
result_handle = NCBIWWW.qblast(blast_program, database, sequence, format_type="XML",hitlist_size = 150)


In [19]:
# Save result to file
# Persisting the XML lets the following cells re-parse the hits from disk
# instead of repeating the remote query.
# NOTE(review): read() presumably exhausts the handle, so re-running this cell
# without re-running the query would write an empty file; confirm.
with open("zikv_blast_both_chain_results.xml", "w") as out_handle:
    out_handle.write(result_handle.read())


In [20]:
# For viewing all the outputs
counter = 1  # 1-based running number of the HSP being printed
with open("zikv_blast_both_chain_results.xml", 'r') as file:
    # Parse the saved BLAST XML one record at a time
    blast_records = NCBIXML.parse(file)
    for blast_record in blast_records:
        for alignment in blast_record.alignments:
            # These two lines are the same for every HSP of this alignment
            title_line = f"sequence: {alignment.title}"
            length_line = f"length: {alignment.length}"
            for hsp in alignment.hsps:
                # Show only the first 75 columns of query / match / subject
                report = [
                    '****Alignment****',
                    title_line,
                    length_line,
                    f"e value: {hsp.expect}",
                    hsp.query[0:75] + '...',
                    hsp.match[0:75] + '...',
                    hsp.sbjct[0:75] + '...',
                    str(counter),
                ]
                print("\n".join(report))
                counter += 1

****Alignment****
sequence: ref|YP_009227202.1| nonstructural protein NS3 [Zika virus]
length: 617
e value: 2.00467e-104
EGETTDGVYRVMTRRLLGSTQVGVGVMQEGVFHTMWHVTKGAALRSGEGRLDPYWGDVKQDLVSYCGPWKLDAAW...
+GETTDGVYRVMTRRLLGSTQVGVGVMQEGVFHTMWHVTKGAALRSGEGRLDPYWGDVKQDLVSYCGPWKLDAAW...
KGETTDGVYRVMTRRLLGSTQVGVGVMQEGVFHTMWHVTKGAALRSGEGRLDPYWGDVKQDLVSYCGPWKLDAAW...
1
****Alignment****
sequence: ref|YP_009430304.1| nonstructural protein NS3 [Zika virus]
length: 617
e value: 1.50265e-102
EGETTDGVYRVMTRRLLGSTQVGVGVMQEGVFHTMWHVTKGAALRSGEGRLDPYWGDVKQDLVSYCGPWKLDAAW...
+GETTDGVYRVMTRRLLGSTQVGVGVMQEGVFHTMWHVTKG+ALRSGEGRLDPYWGDVKQDLVSYCGPWKLDAAW...
KGETTDGVYRVMTRRLLGSTQVGVGVMQEGVFHTMWHVTKGSALRSGEGRLDPYWGDVKQDLVSYCGPWKLDAAW...
2
****Alignment****
sequence: ref|YP_002790881.1| polyprotein [Zika virus]
length: 3419
e value: 3.11026e-98
EGETTDGVYRVMTRRLLGSTQVGVGVMQEGVFHTMWHVTKGAALRSGEGRLDPYWGDVKQDLVSYCGPWKLDAAW...
+GETTDGVYRVMTRRLLGSTQVGVGVMQEGVFHTMWHVTKGAALRSGEGRLDPYWGDVKQDLVSYCGPWKLDAAW...
KGETTDGVYRVMTRR

In [21]:
# Find the organisms interested in
# Define a set of organisms you are interested in (compared verbatim, case-sensitive,
# against the text inside the first "[...]" of each hit title)
organisms_of_interest = {
    "West Nile virus",
    "dengue virus type 4",
    "dengue virus type 3",
    "dengue virus type 2",
    "dengue virus type 1",
}

# Store hits from organisms of interest as (title, aligned subject sequence) tuples
hits_from_interesting_organisms = []
# Species indicated between brackets; non-greedy so only the first bracket pair is used
species_pattern = r'\[(.*?)\]'

# Iterate over blast records
with open("zikv_blast_both_chain_results.xml", 'r') as file:
    # Parse the saved BLAST XML one record at a time
    blast_records = NCBIXML.parse(file)
    for blast_record in blast_records:
        for alignment in blast_record.alignments:
            title = alignment.title
            species_match = re.search(species_pattern, title)
            if species_match is None:
                # A title without a "[species]" tag cannot be matched to an
                # organism of interest; skip it instead of raising on `.group`.
                continue
            species = species_match.group(1)
            # Only keep the output if the species is within organisms_of_interest
            if species in organisms_of_interest:
                for hsp in alignment.hsps:
                    hits_from_interesting_organisms.append((alignment.title, hsp.sbjct))


In [22]:
# Show every kept hit: its title on one line, the aligned subject sequence on the next
for hits in hits_from_interesting_organisms:
    print(hits[0], hits[1], sep="\n")

ref|NP_776018.1| non-structural protein NS3 [West Nile virus]
KGDTTTGVYRIMTRGLLGSYQAGAGVMVEGVFHTLWHTTKGAALMSGEGRLDPYWGSVKEDRLCYGGPWKLQHKWNGHDEVQMIVVEPGKNVKNVQTKPGVFKTPEGEIGAVTLDYPTGTSGSPIVDKNGDVIGLYGNGVIMPNGSYISAIVQGER
ref|YP_001527884.1| nonstructural protein 3 [West Nile virus]
KGDTTTGVYRIMTRGLLGSYQAGAGVMVEGVFHTLWHTTKGAALMSGEGRLDPYWGSVKEDRLCYGGPWKLQHKWNGQDEVQMIVVEPGKNVKNVQTKPGVFKTPEGEIGAVTLDFPTGTSGSPIVDKNGDVIGLYGNGVIMPNGSYISAIVQGER
ref|NP_041724.2| polyprotein [West Nile virus]
KGDTTTGVYRIMTRGLLGSYQAGAGVMVEGVFHTLWHTTKGAALMSGEGRLDPYWGSVKEDRLCYGGPWKLQHKWNGHDEVQMIVVEPGKNVKNVQTKPGVFKTPEGEIGAVTLDYPTGTSGSPIVDKNGDVIGLYGNGVIMPNGSYISAIVQGER
ref|NP_041724.2| polyprotein [West Nile virus]
DMWIERTADITWESDAEITGSSERVDVRLDDDGNFQLM
ref|YP_009246337.1| truncated flavivirus polyprotein WARF4 [West Nile virus]
KGDTTTGVYRIMTRGLLGSYQAGAGVMVEGVFHTLWHTTKGAALMSGEGRLDPYWGSVKEDRLCYGGPWKLQHKWNGQDEVQMIVVEPGKNVKNVQTKPGVFKTPEGEIGAVTLDFPTGTSGSPIVDKNGDVIGLYGNGVIMPNGSYISAIVQGER
ref|YP_009246337.1| truncated flavivir

In [44]:
# Write this to a fasta format
# The context manager closes the file even if one of the writes fails.
# NOTE(review): the stored sequences are the aligned subject strings (hsp.sbjct),
# which may contain "-" gap characters; confirm before using them for folding.
with open("found_fasta.txt", "w") as ofile:
    for title, subject_seq in hits_from_interesting_organisms:
        # One two-line FASTA entry per hit: ">title" then the sequence
        ofile.write(">" + title + "\n" + subject_seq + "\n")

## Remove duplicates

In [46]:
from Bio import SeqIO

### This is removing all duplicates from only the file with organisms of interest

In [55]:
# Remove duplicated sequences from a FASTA file
# Adapted from https://biopython.org/wiki/Sequence_Cleaner
def sequence_cleaner(fasta_file, min_length=0, por_n=100):
    """Collect the unique sequences found in a FASTA file.

    Parameters
    ----------
    fasta_file : str
        Path of the FASTA file to read.
    min_length : int, optional
        Accepted for signature compatibility; not used by this implementation.
    por_n : int, optional
        Accepted for signature compatibility; not used by this implementation.

    Returns
    -------
    dict
        Maps each distinct upper-cased sequence string to the description of
        the first record in which that sequence was seen.
    """
    first_description_by_sequence = {}
    for record in SeqIO.parse(fasta_file, "fasta"):
        # setdefault stores the description only when the sequence is new,
        # so later duplicates are ignored and the first description wins.
        first_description_by_sequence.setdefault(
            str(record.seq).upper(), record.description
        )
    return first_description_by_sequence

# De-duplicate found_fasta.txt; the result goes to "clear_<input name>" in the working directory
fasta_file = "found_fasta.txt"
sequences = sequence_cleaner(fasta_file)
with open("clear_" + fasta_file, "w+") as output_file:
    # Each unique sequence becomes a two-line FASTA entry: ">description" then the residues
    for sequence, description in sequences.items():
        output_file.write(f">{description}\n{sequence}\n")

print("CLEAN!!!\nPlease check clear_" + fasta_file)

CLEAN!!!
Please check clear_found_fasta.txt


### Now removing all duplicates from the entirety of the blast results

In [56]:
# First get the xml into a fasta file format
fastas = []
# NOTE(review): this reads "zikv_blast_results.xml", whereas the BLAST cell in this
# notebook saves "zikv_blast_both_chain_results.xml"; confirm which file is intended.
with open("zikv_blast_results.xml", 'r') as file:
    # Parse the saved BLAST XML one record at a time
    blast_records = NCBIXML.parse(file)
    for blast_record in blast_records:
        for alignment in blast_record.alignments:
            for hsp in alignment.hsps:
                # Keep every HSP as (hit title, aligned subject sequence)
                fastas.append((alignment.title, hsp.sbjct))

# Write this to a fasta format
# The context manager closes the file even if one of the writes fails.
with open("found_fasta_all.txt", "w") as ofile:
    for title, subject_seq in fastas:
        ofile.write(">" + title + "\n" + subject_seq + "\n")

In [57]:
# De-duplicate the FASTA built from all BLAST hits; the result goes to "clear_<input name>"
fasta_file = "found_fasta_all.txt"
sequences = sequence_cleaner(fasta_file)
with open("clear_" + fasta_file, "w+") as output_file:
    # Each unique sequence becomes a two-line FASTA entry: ">description" then the residues
    for sequence, description in sequences.items():
        output_file.write(f">{description}\n{sequence}\n")

print("CLEAN!!!\nPlease check clear_" + fasta_file)

CLEAN!!!
Please check clear_found_fasta_all.txt


In [59]:
# All sequences that are not identical
# (dict returned by sequence_cleaner: sequence string -> description of its first record)
sequences

{'GETTDGVYRVMTRRLLGSTQVGVGVMQEGVFHTMWHVTKGAALRSGEGRLDPYWGDVKQDLVSYCGPWKLDAAWDGLSEVQLLAVPPGERARNIQTLPGIFKTKDGDIGAVALDYPAGTSGSPILDKCGRVIGLYGNGVVIKNGSYVSAITQGKRE': 'ref|YP_009227202.1| nonstructural protein NS3 [Zika virus]',
 'GETTDGVYRVMTRRLLGSTQVGVGVMQEGVFHTMWHVTKGSALRSGEGRLDPYWGDVKQDLVSYCGPWKLDAAWDGHSEVQLLAVPPGERARNIQTLPGIFKTKDGDIGAVALDYPAGTSGSPILDKCGRVIGLYGNGVVIKNGSYVSAITQGRRE': 'ref|YP_009430304.1| nonstructural protein NS3 [Zika virus]',
 'GETTAGVYRIMTRKLLGSTQVGAGVMHEGVFHTMWHVTKGSALRSGEGRLDPYWGNVKQDLISYCGPWKLDGKWDGVSEVQLIAVAPGERARNVQTKPGVFKTTDGEIGALALDFPGGSSGSPIIDKNGHVIGLYGNGVVVKSGSYVSAIMQTEK': 'ref|YP_009227191.1| nonstructural protein NS3 [Spondweni virus]',
 'GDTTTGVYRIMTRGLLGSYQAGAGVMVEGVFHTLWHTTKGAALMSGEGRLDPYWGSVKEDRLCYGGPWKLQHKWNGHDEVQMIVVEPGKNVKNVQTKPGVFKTPEGEIGAVTLDYPTGTSGSPIVDKNGDVIGLYGNGVIMPNGSYISAIVQGER': 'ref|NP_776018.1| non-structural protein NS3 [West Nile virus]',
 'GVYRVMKETILGSKQIGVGIMENGVFHTMWHVTQGASLKLEEGRIDPFWASVQEDLISYGGAWQLKGQWDGIEEVQLLAVPPKKNPENVQTKPGIFKTKE

## Put into csv for Colabfold

In [84]:
# Turn the {sequence: description} mapping into a list of (description, sequence) pairs
data = [(description, sequence_text) for sequence_text, description in sequences.items()]
data

[('ref|NP_776018.1|',
  'GDTTTGVYRIMTRGLLGSYQAGAGVMVEGVFHTLWHTTKGAALMSGEGRLDPYWGSVKEDRLCYGGPWKLQHKWNGHDEVQMIVVEPGKNVKNVQTKPGVFKTPEGEIGAVTLDYPTGTSGSPIVDKNGDVIGLYGNGVIMPNGSYISAIVQGER'),
 ('ref|YP_001527884.1|',
  'GDTTTGVYRIMTRGLLGSYQAGAGVMVEGVFHTLWHTTKGAALMSGEGRLDPYWGSVKEDRLCYGGPWKLQHKWNGQDEVQMIVVEPGKNVKNVQTKPGVFKTPEGEIGAVTLDFPTGTSGSPIVDKNGDVIGLYGNGVIMPNGSYISAIVQGER'),
 ('ref|NP_740321.1|',
  'SEGVYRIMQRGLFGKTQVGVGIHMEGVFHTMWHVTRGSVICHETGRLEPSWADVRNDMISYGGGWRLGDKWDKEEDVQVLAIEPGKNPKHVQTKPGLFKTLTGEIGAVTLDFKPGTSGSPIINRKGKVIGLYGNGVVTKSGDYVSAITQAER'),
 ('ref|NP_722463.1|',
  'DGIYRILQRGLLGRSQVGVGVFQEGVFHTMWHVTRGAVLMYQGKRLEPSWASVKKDLISYGGGWRFQGSWNAGEEVQVIAVEPGKNPKNVQTAPGTFKTPEGEVGAIALDFKPGTSGSPIVNREGKIVGLYGNGVVTTSGTYVSAIAQAK')]

In [114]:
# Reduce each description to its accession ("ref|<accession>|" prefix) and pair it with the sequence
regex = r'ref\|(\w+\.\d+)\|'
accession_re = re.compile(regex)
# The "_{}" suffix is appended verbatim; it is not formatted anywhere in this cell
id_seqs = [
    (accession_re.match(description).group(1) + "_{}", sequence_text)
    for description, sequence_text in data
]
id_seqs

[('NP_776018.1_{}',
  'GDTTTGVYRIMTRGLLGSYQAGAGVMVEGVFHTLWHTTKGAALMSGEGRLDPYWGSVKEDRLCYGGPWKLQHKWNGHDEVQMIVVEPGKNVKNVQTKPGVFKTPEGEIGAVTLDYPTGTSGSPIVDKNGDVIGLYGNGVIMPNGSYISAIVQGER'),
 ('YP_001527884.1_{}',
  'GDTTTGVYRIMTRGLLGSYQAGAGVMVEGVFHTLWHTTKGAALMSGEGRLDPYWGSVKEDRLCYGGPWKLQHKWNGQDEVQMIVVEPGKNVKNVQTKPGVFKTPEGEIGAVTLDFPTGTSGSPIVDKNGDVIGLYGNGVIMPNGSYISAIVQGER'),
 ('NP_740321.1_{}',
  'SEGVYRIMQRGLFGKTQVGVGIHMEGVFHTMWHVTRGSVICHETGRLEPSWADVRNDMISYGGGWRLGDKWDKEEDVQVLAIEPGKNPKHVQTKPGLFKTLTGEIGAVTLDFKPGTSGSPIINRKGKVIGLYGNGVVTKSGDYVSAITQAER'),
 ('NP_722463.1_{}',
  'DGIYRILQRGLLGRSQVGVGVFQEGVFHTMWHVTRGAVLMYQGKRLEPSWASVKKDLISYGGGWRFQGSWNAGEEVQVIAVEPGKNPKNVQTAPGTFKTPEGEVGAIALDFKPGTSGSPIVNREGKIVGLYGNGVVTTSGTYVSAIAQAK')]

In [115]:
import csv
# Export the (id, sequence) pairs as a CSV file
# Destination file, relative to the working directory
file_path = "arborviruses.csv"

# newline='' leaves line-ending handling to the csv module
with open(file_path, mode='w', newline='') as file:
    writer = csv.writer(file)
    # Header row first, then one row per (id, sequence) pair
    writer.writerows([['id', 'sequence'], *id_seqs])


## In ColabFold, when you supply a template PDB file, do not use characters such as `_` or `.` in the file name: they are parsed out, so the script gets confused and only looks for the part of the name that comes before the first `_`

### Create dictionary that matches the reference code with the virus and protein it matches

In [110]:
# Map each reference code ("ref|<accession>|") to the protein / virus description that follows it
# Groups: 1 = full "ref|...|" tag, 2 = bare accession, 3 = remaining description
reference_pattern = r'(ref\|(\w+\.\d+)\|) (.+)'
dict_found_proteins = {}

for hits in hits_from_interesting_organisms:
    match = re.match(reference_pattern, hits[0])
    if not match:
        print("No match found.")
        continue
    reference, _accession, description = match.groups()
    # A reference seen more than once simply overwrites its earlier entry
    dict_found_proteins[reference] = description

dict_found_proteins

{'ref|NP_776018.1|': 'non-structural protein NS3 [West Nile virus]',
 'ref|YP_001527884.1|': 'nonstructural protein 3 [West Nile virus]',
 'ref|NP_041724.2|': 'polyprotein [West Nile virus]',
 'ref|YP_009246337.1|': 'truncated flavivirus polyprotein WARF4 [West Nile virus]',
 'ref|YP_001527877.1|': 'polyprotein [West Nile virus]',
 'ref|NP_740321.1|': 'NS3 protein [dengue virus type 4]',
 'ref|NP_722463.1|': 'nonstructural protein 3 [dengue virus type 1]',
 'ref|NP_073286.1|': 'polyprotein [dengue virus type 4]',
 'ref|NP_059433.1|': 'polyprotein [dengue virus type 1]'}

The sequences identified above are to be put into ColabFold to predict their structures, which are then aligned to the original structure. 
The fragments are then placed back into the ColabFold structures to see how well the interactions are preserved and to compute the FINTScore. 

Pull the related PDB files from the RCSB PDB to check how similar the ColabFold structures are to the deposited experimental ones, if they correspond at all. (This comes at a later time point than putting the fragments back into the new ColabFold structures.)

In [27]:
# Try to get pdb entry_id by the sequence input
import requests
import json

In [60]:
# https://search.rcsb.org/#search-api
#https://education.molssi.org/python-scripting-biochemistry/chapters/rcsb_api.html
def get_pdb_id_from_sequence(sequence, return_type, evalue_cutoff=1,
                             identity_cutoff=0.9, timeout=60):
    """Run a protein sequence search against the RCSB PDB search API.

    Parameters
    ----------
    sequence : str
        Protein sequence used as the search query.
    return_type : str
        Value sent as the request's ``return_type`` (e.g. ``"entry"``).
    evalue_cutoff : float, optional
        Value sent as the ``evalue_cutoff`` search parameter (default 1).
    identity_cutoff : float, optional
        Value sent as the ``identity_cutoff`` search parameter (default 0.9).
    timeout : float, optional
        Seconds to wait for the server before ``requests`` gives up (default 60).

    Returns
    -------
    dict
        The decoded JSON response. When the service answers with HTTP 204
        (no content, i.e. nothing matched) an empty result of the form
        ``{"result_type": return_type, "total_count": 0, "result_set": []}``
        is returned instead of failing on the empty body.
    """
    search_request = {
        "query": {
            "type": "terminal",
            "service": "sequence",
            "parameters": {
                "evalue_cutoff": evalue_cutoff,
                "identity_cutoff": identity_cutoff,
                "sequence_type": "protein",
                "value": sequence,
            },
        },
        "request_options": {
            "scoring_strategy": "sequence",
        },
        "return_type": return_type,
    }

    # Pass the JSON through `params` so that it is URL-encoded properly rather
    # than being pasted verbatim into the query string.
    response = requests.get(
        "https://search.rcsb.org/rcsbsearch/v2/query",
        params={"json": json.dumps(search_request)},
        timeout=timeout,
    )

    # A search without hits comes back with an empty body, which cannot be
    # decoded as JSON; report it as an empty result set instead.
    if response.status_code == 204 or not response.content:
        return {"result_type": return_type, "total_count": 0, "result_set": []}

    return response.json()

# Example usage:
sequence = "GETTDGVYRVMTRRLLGSTQVGVGVMQEGVFHTMWHVTKGAALRSGEGRLDPYWGDVKQDLVSYCGPWKLDAAWDGLSEVQLLAVPPGERAKNIQTLPGIFKTKDGDIGAVALDYPAGTSGSPILDKCGRVIGLYGNGVVIKNGSYVSAITQGKRE"
# `return_type` is also read by the later cell that searches with every BLAST hit.
return_type = "entry"
# Despite its name, `pdb_id` holds the whole decoded response returned by the function.
pdb_id = get_pdb_id_from_sequence(sequence, return_type)
pdb_id

{'query_id': 'ced9e180-26e6-4c8b-8ee1-27304b0b7913',
 'result_type': 'entry',
 'total_count': 63,
 'result_set': [{'identifier': '5GJ4', 'score': 1.0},
  {'identifier': '5GPI', 'score': 1.0},
  {'identifier': '5H4I', 'score': 1.0},
  {'identifier': '5ZMS', 'score': 1.0},
  {'identifier': '5ZOB', 'score': 1.0},
  {'identifier': '6JPW', 'score': 1.0},
  {'identifier': '6KK2', 'score': 1.0},
  {'identifier': '6KK4', 'score': 1.0},
  {'identifier': '6KK5', 'score': 1.0},
  {'identifier': '6KK6', 'score': 1.0}]}

In [58]:
# Query the PDB search once per kept BLAST hit and pair each hit title with its search response
pdbs = [
    [title, get_pdb_id_from_sequence(subject_seq, return_type)]
    for title, subject_seq in hits_from_interesting_organisms
]

In [59]:
# Display the collected [hit title, search response] pairs
pdbs

[['ref|NP_776018.1| non-structural protein NS3 [West Nile virus]',
  {'query_id': 'a155b8e7-de49-4ac3-a17d-22184f90d080',
   'result_type': 'entry',
   'total_count': 7,
   'result_set': [{'identifier': '2FP7', 'score': 1.0},
    {'identifier': '2YOL', 'score': 1.0},
    {'identifier': '5IDK', 'score': 1.0},
    {'identifier': '2IJO', 'score': 0.3333333333333333},
    {'identifier': '3E90', 'score': 0.3333333333333333},
    {'identifier': '8CO8', 'score': 0.25},
    {'identifier': '2GGV', 'score': 0.0}]}],
 ['ref|YP_001527884.1| nonstructural protein 3 [West Nile virus]',
  {'query_id': '38de6344-aba9-464b-9140-fc6dc7a46174',
   'result_type': 'entry',
   'total_count': 7,
   'result_set': [{'identifier': '2FP7', 'score': 1.0},
    {'identifier': '2IJO', 'score': 1.0},
    {'identifier': '2YOL', 'score': 1.0},
    {'identifier': '3E90', 'score': 1.0},
    {'identifier': '5IDK', 'score': 1.0},
    {'identifier': '8CO8', 'score': 0.75},
    {'identifier': '2GGV', 'score': 0.0}]}],
 ['ref

## Want to grab the ColabFold results from lilac
## Use OpenEye's superpose_molecule to align and compare the structures
https://github.com/choderalab/asapdiscovery/blob/main/asapdiscovery-modeling/asapdiscovery/modeling/modeling.py#L225-L243

In [112]:
# Save only the pdb of the folded molecule with the least rmsd compared to the template diamond structure
# NOTE(review): placeholder string; replace it with the actual ColabFold results
# directory before running.
folder_path = "the results folder from colabfold"
# Recursively gather every .pdb file below that folder.
all_folded = find_pdb_files(folder_path)


In [None]:
# Based on which sequence the input is from and save the best structure from superpose_molecule


In [None]:
# Would probably like to know the different rmsd of each structure
# Would probably also like to know just the rmsd of all the best fitted structures

In [113]:
# Would like to know if can compare to the pulled down pdb id structures found by blasting the sequence on pdb database
# That may be a better benchmark, but the sequence alignment percentage is not very high

## Want to be able to put the ligands, in the same pose as in the Diamond screen, back into the ColabFold structures

## Score the structure with FINTScore
### Would be an issue. There is no identified target score yaml available

## Score the structure by 1. identifying all interactions from the original Diamond fragment screen and 2. checking whether those interactions are preserved in the new protein structure (more preserved interactions is better)

In [None]:
# Iterate over blast records
# NOTE(review): this cell has no execution count and appears to be an earlier draft of
# the organism-filtering cell. `organisms_pattern` is not defined anywhere in the visible
# notebook, and `blast_record` is whatever record an earlier parsing loop left behind,
# so only that single record would be scanned. Confirm intent before running.
for alignment in blast_record.alignments:
    for hsp in alignment.hsps:
        match = re.search(organisms_pattern, alignment.title)
        if match:
            # Store the alignment title and the sequence of the found sequence
            # (appended as a list here, whereas the filtering cell appends tuples)
            hits_from_interesting_organisms.append([alignment.title,hsp.sbjct])