# Process culled PDB list



The following code reads the original PDB list and extracts just the PDB codes and chain IDs.

In [None]:
# Prefix of the PDB list files to process; the alternative value is 'first'.
tag = "longer"

In [None]:
import sys

# Read the culled PDB list and write one "<pdb> <chain>" pair per line
# to a new file that the later processing steps use as their input.
input_filename = f"{tag}_PDB_files_list.txt"
print(f'reading from: >{input_filename}<')

output_filename = f"{tag}_pdb_chain.txt"
print(f'writing to: >{output_filename}<')

# 'with' closes both files even if a line fails to parse
with open(input_filename, 'r') as input_file, \
        open(output_filename, 'w') as output_file:
    for line in input_file:
        # check whether we are reading the header line
        if 'PDBchain' in line:
            print(f'skipping header: {line}')
            continue

        items = line.split()
        # blank lines (e.g. at the end of the list) carry no entry
        if not items:
            continue

        # the first column holds the 4-character PDB code immediately
        # followed by the chain ID, so at least 5 characters are needed
        pdb_chain = items[0]
        if len(pdb_chain) < 5:
            print(f'skipping malformed entry: {line}')
            continue

        pdb = pdb_chain[0:4]
        chain = pdb_chain[4]
        print(f'found pdb code: {pdb}  chain: {chain}')
        output_file.write(f'{pdb} {chain}\n')

The next piece of code reads the file 'longer_pdb_chain.txt' (or 'first_pdb_chain.txt') and downloads the corresponding PDB files from the Internet using wget.

In [None]:
import os
import subprocess

# Download the PDB file of every entry in the PDB/chain list into
# pdb_directory, skipping files that have already been downloaded.
output_filename = f"{tag}_pdb_chain.txt"

pdb_directory="downloaded_pdbs"

os.makedirs(pdb_directory,exist_ok=True)

with open(output_filename) as pdblist:
    for line in pdblist:
        items = line.split()
        # ignore blank lines in the list
        if not items:
            continue
        pdb = items[0]
        pdb_filename = f"{pdb_directory}/{pdb}.pdb"

        # An empty file is what 'wget -O' leaves behind after a failed
        # download, so only a non-empty file counts as downloaded.
        if os.path.isfile(pdb_filename) and os.path.getsize(pdb_filename) > 0:
            print(f"{pdb} has already been downloaded and will be skipped.")
            continue

        print(f"downloading {pdb}")
        # Pass the arguments as a list so that nothing read from the list
        # file is interpreted by a shell. '-N' is dropped because
        # timestamping has no effect in combination with '-O'.
        # subprocess.run raises FileNotFoundError if wget is not installed.
        result = subprocess.run(
            ["wget", "-q", "-O", pdb_filename,
             f"https://files.rcsb.org/download/{pdb}.pdb"]
        )
        if result.returncode != 0:
            print(f"download of {pdb} failed (wget exit code {result.returncode})")
            # remove the partial/empty file so a later run retries the download
            if os.path.isfile(pdb_filename):
                os.remove(pdb_filename)
    

In the following we will read each PDB file and extract the ATOM records for the specified chains.

input: downloaded_pdbs/ABCD.pdb

output: chains/ABCD_X.pdb

In [None]:
import os

# For every PDB/chain pair in the list, copy the ATOM records of that chain
# from downloaded_pdbs/<pdb>.pdb into chains/<pdb>_<chain>.pdb.
output_filename = f"{tag}_pdb_chain.txt"

pdb_directory="downloaded_pdbs"
chain_directory="chains"

os.makedirs(chain_directory,exist_ok=True)

with open(output_filename) as pdblist:
    for line in pdblist:
        items = line.split()
        # ignore blank or incomplete lines in the list
        if len(items) < 2:
            continue
        pdb = items[0]
        chain = items[1]

        pdb_filename = f"{pdb_directory}/{pdb}.pdb"
        chain_filename = f"{chain_directory}/{pdb}_{chain}.pdb"

        # check whether we can find the downloaded PDB file
        if not os.path.isfile(pdb_filename):
            print(f"file for {pdb} has not been downloaded, skipping")
            continue

        # check whether we have already extracted the chain
        if os.path.isfile(chain_filename):
            print(f"{pdb}_{chain} has already been generated, skipping")
            continue

        # PDB exists, file not present, let's go!
        print(f"processing {pdb} chain {chain}")

        written_lines = 0
        # 'with' makes sure both files are closed, also on errors
        with open(pdb_filename, "r") as pdb_file, \
                open(chain_filename, "w") as chain_file:
            for record in pdb_file:
                # only ATOM records are of interest
                if not record.startswith('ATOM'):
                    continue
                # check that we have the correct chain (column 22 of the record)
                if len(record) <= 21 or record[21] != chain:
                    continue
                # keep only atoms whose alternate location character is 'A' or ' '
                if record[16] == 'A' or record[16] == ' ':
                    # blank out the alternate location character in the output
                    chain_file.write(record[0:16] + ' ' + record[17:])
                    written_lines += 1

            if written_lines > 0:
                chain_file.write('TER\n')
                chain_file.write('END\n')

        # no matching ATOM record was found: do not leave an empty file behind
        if written_lines == 0:
            os.remove(chain_filename)
                
        
                

Now we will check the PDB files to see whether there are any gaps in the residue indices, as these indicate missing parts of the structure.

In [None]:
import os

# Check every extracted chain for gaps in the residue numbering and write the
# PDB/chain IDs of chains that form one contiguous segment to a new list.
output_filename = f"{tag}_pdb_chain.txt"
new_pdbchain = f"{tag}_clean_pdb_chain.txt"

pdb_directory="downloaded_pdbs"
chain_directory="chains"

# 'with' closes both list files even if a chain file fails to parse
with open(output_filename) as pdblist, open(new_pdbchain, "w") as newlist:
    for line in pdblist:
        items = line.split()
        # ignore blank or incomplete lines in the list
        if len(items) < 2:
            continue
        pdb = items[0]
        chain = items[1]

        chain_filename = f"{chain_directory}/{pdb}_{chain}.pdb"

        # check whether we can find the prepared PDB file
        if not os.path.isfile(chain_filename):
            print(f"file {chain_filename} has not been generated, skipping")
            continue

        # variable to hold list of segments as [first, last] residue numbers
        seglist = []
        # None marks "no C-alpha seen yet"; a numeric sentinel could be
        # mistaken for a real (negative) residue number
        first = last = None
        with open(chain_filename, "r") as chain_file:
            for record in chain_file:
                # we will only check C-alpha atoms
                if len(record) > 30 and record[13:16] == 'CA ':
                    # get residue number
                    residue_number = int(record[22:26])
                    if first is None:
                        # very first residue starts the first segment
                        first = residue_number
                    elif residue_number > last + 1 or residue_number < last:
                        # gap (or numbering going backwards): record the
                        # previous segment and start a new one
                        seglist.append([first, last])
                        first = residue_number
                    # save current residue number as last residue
                    last = residue_number

        # a chain file without any C-alpha atom contains no usable segment
        if first is None:
            print(f"skipping {pdb} {chain}: no C-alpha atoms found")
            continue

        # record last segment
        seglist.append([first, last])

        if len(seglist) > 1:
            # we have more than one segment
            print(f"skipping {pdb} {chain} with multiple segments: {seglist}")
        else:
            # write PDB/chain ID to new list to be used later for further processing
            newlist.write(f'{pdb} {chain}\n')