In [2]:
import os
import pandas as pd
from Bio.PDB import PDBParser, PDBIO, Select

In [3]:
# Directory of the design run whose results are collected in this notebook
path = "/net/bs-gridfs/export/grid/scratch/lmerlicek/design/LM_res99_PMPNN_planar_tAB50_cstopt_2_012"

# Load the per-design score table that sits at the top of the run directory
csv_file = path + '/all_scores.csv'
df = pd.read_csv(csv_file)

# Order the designs by ascending catalytic score; `df` itself is left untouched
df_sorted = df.sort_values('catalytic_score', ascending=True)

# One parser instance, created with QUIET=True, is shared by every structure read below
parser = PDBParser(QUIET=True)

class LigandCatResSelect(Select):
    """Residue filter that keeps one catalytic residue plus the ligand.

    A residue is accepted when either
      * its id has a blank first field (' '), its sequence number equals
        ``res_id`` and its parent chain id equals ``chain_id``; or
      * its residue name equals ``ligand_name`` (no chain check is applied
        to the ligand).

    Args:
        chain_id: Id of the chain that holds the catalytic residue.
        res_id: Sequence number of the catalytic residue.
        ligand_name: Residue name of the ligand to keep.
    """

    def __init__(self, chain_id, res_id, ligand_name):
        self.chain_id, self.res_id, self.ligand_name = chain_id, res_id, ligand_name

    def accept_residue(self, residue):
        """Return True for the catalytic residue or the ligand, False otherwise."""
        is_catalytic_residue = (
            residue.id[0] == ' '
            and residue.id[1] == self.res_id
            and residue.parent.id == self.chain_id
        )
        # bool() keeps the return value a plain True/False even when the
        # comparisons above produce non-builtin boolean types.
        return bool(is_catalytic_residue or residue.resname == self.ligand_name)
def read_scores(score_file_path):
    """Read the constraint scores from the first data row of a score file.

    The second line of the file is treated as the whitespace-separated
    header and the third line as the matching row of values. Only the
    columns 'atom_pair_constraint', 'angle_constraint' and
    'dihedral_constraint' are extracted.

    Args:
        score_file_path: Path of the score file to read.

    Returns:
        A dict mapping each of the three constraint names that occurs in the
        header to its value as a float (names absent from the header are
        simply missing from the dict), or None when the file is not fully
        written yet: fewer than three lines, or a data row with fewer
        fields than the header.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a constraint value is not parseable as a float.
    """
    wanted_headers = ('atom_pair_constraint', 'angle_constraint', 'dihedral_constraint')

    with open(score_file_path, "r") as f:
        lines = f.readlines()

    # If the timing is bad, the score file is not fully written.
    if len(lines) < 3:
        return None

    headers = lines[1].split()
    values = lines[2].split()

    # A data row that is still being written has fewer fields than the
    # header; its last field may also be a truncated number. Treat it as
    # incomplete instead of raising IndexError or returning a wrong value.
    if len(values) < len(headers):
        return None

    return {
        header: float(values[idx])
        for idx, header in enumerate(headers)
        if header in wanted_headers
    }

# Output PDB file
output_pdb = '/links/grid/scratch/lmerlicek/design/Input/cst_out/catres_sorted_res99_planar_tAB50_cstopt_2_012.pdb'
io = PDBIO()

# List the candidate structure files once: the directory listing does not
# depend on the design being processed, so it need not be repeated per row.
best_structures_dir = f'{path}/best_structures'
best_structure_files = os.listdir(best_structures_dir)

# List to store remark information (one dict per structure written)
remark_list = []

# State number of the next structure written to the combined file
i = 1

# Opening in write mode clears any existing content; every state is then
# written through this single handle instead of reopening the file per design.
with open(output_pdb, 'w') as f:
    # Iterate over the designs in ascending catalytic_score order
    for _row_label, row in df_sorted.iterrows():
        # Keep the integer index in its own variable. Writing it back into
        # `row` is unreliable: iterrows() yields a Series that may have float
        # dtype, in which case the value is stored as e.g. 8289.0 and no
        # longer matches the file and directory names built below.
        design_index = int(row["index"])

        # First file in best_structures whose name ends in _<index>.pdb
        pdb_name = next(
            (name for name in best_structure_files if name.endswith(f'_{design_index}.pdb')),
            None,
        )
        if pdb_name is None:
            continue
        pdb_file = f'{best_structures_dir}/{pdb_name}'

        # Determine the score file path: prefer the relax score file and
        # fall back to the design score file; skip the design if neither exists.
        score_file_path = f'{path}/{design_index}/score_rosetta_relax.sc'
        if not os.path.exists(score_file_path):
            score_file_path = f'{path}/{design_index}/score_rosetta_design.sc'
        if not os.path.exists(score_file_path):
            continue

        # Read the scores; None means the score file is not fully written yet
        scores = read_scores(score_file_path)
        if scores is None:
            continue

        structure = parser.get_structure(design_index, pdb_file)
        # Extract ligand and catalytic residue
        select = LigandCatResSelect(chain_id='A', res_id=row['cat_resi'], ligand_name='5TS')
        io.set_structure(structure)

        # Collect the values once so the REMARK lines and the summary table
        # cannot drift apart.
        remark = {
            'index': design_index,
            'state': i,
            'catalytic_score': row['catalytic_score'],
            'atom_pair_constraint': scores.get('atom_pair_constraint', 'N/A'),
            'angle_constraint': scores.get('angle_constraint', 'N/A'),
            'dihedral_constraint': scores.get('dihedral_constraint', 'N/A')
        }

        # Append each structure as a new state.
        # NOTE(review): this script writes ENDMDL after every structure but
        # never a matching MODEL record itself - confirm that the downstream
        # viewer accepts states delimited by ENDMDL alone.
        f.write(f"REMARK 999 CATALYTIC SCORE: {remark['catalytic_score']} STATE: {i} INDEX: {design_index}\n")
        f.write(f"REMARK 999 ATOM PAIR CONSTRAINT: {remark['atom_pair_constraint']}\n")
        f.write(f"REMARK 999 ANGLE CONSTRAINT: {remark['angle_constraint']}\n")
        f.write(f"REMARK 999 DIHEDRAL CONSTRAINT: {remark['dihedral_constraint']}\n")
        io.save(f, select=select, write_end=False)
        f.write("TER\n")
        f.write("ENDMDL\n")

        # Store remark information in the list
        remark_list.append(remark)

        i += 1

# Convert the remark list to a DataFrame
remark_df = pd.DataFrame(remark_list)

print(f'Combined PDB file saved as {output_pdb}')

Combined PDB file saved as /links/grid/scratch/lmerlicek/design/Input/cst_out/catres_sorted_res99_planar_tAB50_cstopt_2_012.pdb


In [8]:
# Print the 5 best and 5 worst structures for each constraint
constraints = ['atom_pair_constraint', 'angle_constraint', 'dihedral_constraint']
for constraint in constraints:
    # A constraint that was missing from a score file is stored in remark_df
    # as the string 'N/A', which makes the column object dtype; nsmallest and
    # nlargest raise TypeError on such columns. Rank on a copy in which the
    # column is coerced to numbers (placeholders become NaN). remark_df itself
    # is not modified, and fully numeric columns are unaffected.
    ranked = remark_df.assign(
        **{constraint: pd.to_numeric(remark_df[constraint], errors='coerce')}
    )

    print(f"\nTop 5 structures for {constraint}:")
    print(ranked.nsmallest(5, constraint))

    print(f"\nBottom 5 structures for {constraint}:")
    print(ranked.nlargest(5, constraint))

print(f'Combined PDB file saved as {output_pdb}')


Top 5 structures for atom_pair_constraint:
   index  state  catalytic_score  atom_pair_constraint  angle_constraint  \
0   8289      1            1.308                   0.0             0.585   
1   6843      2            1.566                   0.0             0.497   
2   8486      3            1.630                   0.0             0.739   
3   7091      4            1.634                   0.0             0.709   
4   7807      5            1.642                   0.0             0.751   

   dihedral_constraint  
0                0.723  
1                1.069  
2                0.891  
3                0.925  
4                0.891  

Bottom 5 structures for atom_pair_constraint:
    index  state  catalytic_score  atom_pair_constraint  angle_constraint  \
89   7369     90            2.875                 1.048             0.829   
86   7058     87            2.743                 0.936             0.876   
84   7062     85            2.722                 0.893             0.9