In [103]:
import os
import Bio.PDB
import numpy as np
import pandas as pd
import warnings
from biopandas import pdb as ppdb
import concurrent.futures 

warnings.filterwarnings("ignore") # this was done only for the sake of the pandaspdb I/O warnings. 


In [104]:
# Parse the ID list: each non-blank line is read as a 4-character PDB ID
# immediately followed by a 1-character chain ID (e.g. "5D8VA").
pdb_ids = []
chain_ids = []
with open('server_data_pdbs', 'r') as f:
    for line in f:
        entry = line.strip()
        if not entry:
            # A blank (e.g. trailing) line would otherwise raise IndexError below.
            continue
        pdb_ids.append(entry[:4])
        chain_ids.append(entry[4])

# One row per entry: column 0 = PDB ID, column 1 = chain ID.
metadata = np.array([pdb_ids, chain_ids]).T
print('PDBs:', len(pdb_ids))
print(pdb_ids[:5])

PDBs: 9832
['5D8V', '3NIR', '5NW3', '1UCS', '3X2M']


In [105]:
def clean_pdb(pdb_file_path, chain_id, output_dir):
    '''
    Extract only the chain of interest from a PDB file and save it.

    Keeps the ATOM records whose ``chain_id`` equals ``chain_id`` and whose
    ``residue_name`` is one of the 20 standard amino acids, then writes them
    to ``<output_dir>/<pdb_id>_<chain_id>.pdb``.

    Parameters
    ----------
    pdb_file_path : str
        Path of the input PDB file; the part of the base name before the
        first '.' is used as the PDB ID in the output file name.
    chain_id : str
        Chain identifier to keep.
    output_dir : str
        Directory the cleaned file is written to.

    Returns
    -------
    PandasPdb
        Object whose ``df['ATOM']`` holds the filtered records.
    '''
    aa_list = ['ALA', 'CYS', 'ASP', 'GLU', 'PHE', 'GLY', 'HIS', 'ILE', 'LYS', 'LEU',
               'MET', 'ASN', 'PRO', 'GLN', 'ARG', 'SER', 'THR', 'VAL', 'TRP', 'TYR']

    pdb_id = os.path.basename(pdb_file_path).split('.')[0]
    atoms = ppdb.PandasPdb().read_pdb(pdb_file_path).df['ATOM']

    # Single boolean mask: requested chain AND standard residue.
    keep = (atoms['chain_id'] == chain_id) & atoms['residue_name'].isin(aa_list)

    clean_data = ppdb.PandasPdb()
    # .copy() so the stored frame is independent of the frame that was read.
    clean_data.df['ATOM'] = atoms[keep].copy()

    output_path = os.path.join(output_dir, f'{pdb_id}_{chain_id}.pdb')
    clean_data.to_pdb(output_path)
    return clean_data

In [110]:
def get_angles(pdb_id, file_path, chain_id):
    """
    Collect backbone phi/psi angles for one chain of a structure file.

    For every model in the parsed structure, the first chain whose id equals
    ``chain_id`` is wrapped in a Polypeptide and its phi/psi list is read.
    Residues where both angles are missing are dropped; a single missing
    angle is replaced by 0.

    Returns a list of ``[phi, psi, residue_letter]`` entries (empty when the
    chain is not present in any model).
    """
    collected = []
    structure = Bio.PDB.PDBParser().get_structure(pdb_id, file_path)

    for model in structure:
        # First chain with the requested id, or None if the model lacks it.
        target = next((candidate for candidate in model if candidate.id == chain_id), None)
        if target is None:
            continue

        polypeptide = Bio.PDB.Polypeptide.Polypeptide(target)
        print ("Model %s Chain %s" % (str(model.id), str(target.id)))
        torsions = polypeptide.get_phi_psi_list()
        letters = polypeptide.get_sequence()

        for position, (phi, psi) in enumerate(torsions):
            if phi is None and psi is None:
                continue
            # At most one of the two is None here; substitute 0 for it.
            collected.append([
                0 if phi is None else phi,
                0 if psi is None else psi,
                letters[position],
            ])

    return collected

def get_sequence(pdb_id, file_path, chain_id):
    """
    Return the sequence of the chain ``chain_id`` in a structure file.

    Each model is scanned for the first chain with a matching id; the
    sequence obtained from the last model that contains such a chain is
    returned. An empty string is returned when no model has the chain.
    """
    found = ''
    structure = Bio.PDB.PDBParser().get_structure(pdb_id, file_path)
    for model in structure:
        match = next((candidate for candidate in model if candidate.id == chain_id), None)
        if match is not None:
            found = Bio.PDB.Polypeptide.Polypeptide(match).get_sequence()
    return found

def generate_angle_data(pdb_file_path, output_dir):
    """
    Compute phi/psi angles for one pruned PDB file and save them as CSV.

    The file's base name must look like ``<pdb_id>_<chain>.<ext>`` (the
    naming used for files written by ``clean_pdb``). The angles returned by
    ``get_angles`` are written to ``<output_dir>/<pdb_id>.csv`` with the
    columns ``phi``, ``psi`` and ``sequence``.

    Problems are reported with ``print`` rather than raised: a missing
    output directory, a file name without a chain suffix, or a chain for
    which no angles were found. Nothing is written in those cases.

    Parameters
    ----------
    pdb_file_path : str
        Path to the pruned PDB file.
    output_dir : str
        Existing directory that receives the CSV file.

    Returns
    -------
    None
    """
    if not os.path.isdir(output_dir):
        print('Output directory not found:', output_dir)
        return

    stem = os.path.basename(pdb_file_path).split('.')[0]
    name_parts = stem.split('_')
    if len(name_parts) < 2:
        # Without this guard the chain lookup below raises IndexError, which
        # is easy to lose when the function runs inside an executor.
        print('Skipping file without a chain suffix:', pdb_file_path)
        return
    pdb_id, chain = name_parts[0], name_parts[1]

    angles = get_angles(pdb_id, file_path=pdb_file_path, chain_id=chain)
    if len(angles) == 0:
        print('No angles found for:', pdb_id)
        return

    # Transpose the list of [phi, psi, residue] rows into three columns.
    phi_angles, psi_angles, sequence = zip(*angles)
    data = pd.DataFrame({'phi': phi_angles, 'psi': psi_angles, 'sequence': sequence})
    data_file_name = os.path.join(output_dir, pdb_id + '.csv')
    data.to_csv(data_file_name, index=False)
    print('Saved:', data_file_name)
            
# Create both output folders up front; exist_ok makes re-running the cell harmless.
for output_folder in ('pruned_pdb_files', 'angles'):
    os.makedirs(output_folder, exist_ok=True)

def prune_pdb_files(file_path):
    """
    Run ``clean_pdb`` on every ``.pdb`` file found in ``file_path``.

    The chain to keep for each file is looked up from the module-level
    ``pdb_ids`` / ``chain_ids`` lists, using the part of the file name before
    the first '.' as the PDB ID. Cleaned files are written to
    ``pruned_pdb_files``. Files whose ID is not in ``pdb_ids`` are reported
    and skipped so one unexpected file does not abort the whole batch.

    Parameters
    ----------
    file_path : str
        Directory containing the raw PDB files.
    """
    # Build the ID -> chain map once; setdefault keeps the first occurrence,
    # matching what list.index() would have selected.
    chain_lookup = {}
    for known_id, known_chain in zip(pdb_ids, chain_ids):
        chain_lookup.setdefault(known_id, known_chain)

    for file in os.listdir(file_path):
        if not file.endswith('.pdb'):
            continue
        pdb_id = file.split('.')[0]
        chain_id = chain_lookup.get(pdb_id)
        if chain_id is None:
            print('No chain listed for:', pdb_id)
            continue
        # Read from the directory that was actually listed, not a fixed path.
        clean_pdb(os.path.join(file_path, file), chain_id, 'pruned_pdb_files')
        print('Cleaned:', pdb_id)

# prune_pdb_files('./pdb_files')
# generate_angle_data('./pdb_files/4N2P.pdb', 'angles')

# Expected time for cleaning: 26.33480230967204 minutes (sequential)
# Expected time for angle calculation: 16.18500550587972 minutes (sequential)

In [None]:
# List the directory once so the printed count and the submitted jobs agree.
pdb_file_list = os.listdir('./pruned_pdb_files')
print('Total files:', len(pdb_file_list))

with concurrent.futures.ThreadPoolExecutor() as executor:
    # Map each future back to its file name so failures can be attributed.
    futures = {
        executor.submit(generate_angle_data, f'./pruned_pdb_files/{file}', 'angles'): file
        for file in pdb_file_list
    }
    for future in concurrent.futures.as_completed(futures):
        try:
            # result() re-raises any exception from the worker; without this
            # call a failing file would disappear silently.
            future.result()
        except Exception as error:
            print('Failed:', futures[future], repr(error))

Total files: 9461
Model 0 Chain A
Model 0 Chain I
Saved: angles/5NHU.csv
Model 0 Chain A
Model 0 Chain A
Model 0 Chain A
Model 0 Chain A
Saved: angles/1CY5.csv
Model 0 Chain A
Saved:Saved: angles/1ZPS.csv
 angles/6NX3.csv
Model 0 Chain A
Saved: angles/7ALC.csv
Saved: angles/6B9H.csv
Model 0 Chain A
Model 0 Chain A
Saved: angles/2ONF.csv
Model 0 Chain A
Model 0 Chain A
Model 0 Chain A
Saved: angles/2O4A.csv
Model 0 Chain A
Saved: angles/6BO0.csv
Model 0 Chain A
Saved: angles/1TE5.csv
Saved: angles/2XLG.csv
Saved: angles/4V00.csv
Model 0 Chain A
Saved: angles/3SU6.csv
Model 0 Chain A
Model 0 Chain A
Model 0 Chain A
Model 0 Chain A
Saved: angles/3WX7.csv
Model 0 Chain A
Saved: angles/1OFL.csv
Model 0 Chain ASaved: angles/3SK7.csv
Model 0 Chain A
Saved: angles/1PV5.csv

Saved: angles/7OUZ.csv
Model 0 Chain A
Saved: angles/6RYG.csv
Model 0 Chain A
Model 0 Chain A
Saved: angles/7O5Y.csv
Saved: angles/5Z5M.csv
Saved: angles/4Q68.csv
Saved: angles/3S9J.csv
Model 0 Chain A
Saved: angles/1Z0P.cs

KeyboardInterrupt: 

Saved:Saved: angles/4KFU.csv
Saved: angles/2MY1.csv
Model 0 Chain A
 angles/1ZN6.csv
Model 0 Chain A
Saved: angles/6E7E.csv
Saved: angles/3PDD.csv
Saved:Model 0 Chain A
 angles/4OQP.csv
Model 0 Chain A
Model 0 Chain A
Model 0 Chain A
Saved: angles/4B9G.csv


Saved: angles/3ON9.csv
Saved: angles/1WPB.csv
Model 0 Chain A
Saved: angles/4RO3.csv
Saved: angles/4MAK.csv
Model 0 Chain A
Model 0 Chain A
Saved: angles/5E75.csv
Saved: angles/8P37.csv
Model 0 Chain A
Model 0 Chain A
Model 0 Chain A
Model 0 Chain A
Model 0 Chain A
Saved: angles/4RK6.csv
Model 0 Chain A
Saved: angles/4K8W.csv
Saved: angles/8OK7.csv
Saved: angles/8BUX.csv
Saved: angles/5OQK.csv
Model 0 Chain A
Model 0 Chain A
Model 0 Chain A
Saved: angles/3KOG.csv
Saved: angles/2WFI.csv
Model 0 Chain A
Saved: angles/3G48.csv
Model 0 Chain A
Saved: angles/3RPZ.csv
Model 0 Chain A
Model 0 Chain B
Saved: angles/1KQ6.csv
Model 0 Chain A
Saved: angles/1NKZ.csv
Model 0 Chain A
Saved: angles/5EU0.csv
Saved: angles/2OFC.csv
Saved: angles/8G53.csv
Model 0 Chain A
Saved: angles/5I2H.csv
Model 0 Chain A
Model 0 Chain A
Saved:Saved: angles/3HH1.csv
Saved: angles/4RGI.csv
 angles/5FJD.csv
Model 0 Chain A
Model 0 Chain A
Model 0 Chain A
Model 0 Chain A
Model 0 Chain A
Saved: angles/1KO7.csv
Model 0 C