### Measuring the structures
2024-12-16

In [1]:
# Imports
import os
import pandas as pd
import pathlib
from pymol import cmd
from Bio.PDB import PDBParser, ShrakeRupley
from Bio.PDB.PDBExceptions import PDBConstructionException
from chempy import cpv
import numpy as np
import biotite.structure as struc
import biotite.structure.io.pdb as pdb
import biotite.structure.bonds as bonds

# Root directory that contains the structure folders listed below. It is built
# from separate path components so it resolves the same way on Windows and
# POSIX (a single literal with backslashes is one file name on POSIX).
structure_basePath = pathlib.Path('AF_DMI_structures', 'AF_DMI_structures')
# Every known structure folder underneath the root directory.
structure_folders = ['AF_DMI_structures1', 'AF_DMI_structures2', 'AF_DMI_structures3']
# NOTE(review): this second assignment overrides the list above, so only the
# first folder is processed. Remove it to process every folder.
structure_folders = ['AF_DMI_structures1']

In [None]:
# Read in data
#
# Walk every configured folder and index the .pdb files found one level below
# it as pdb_structures_path[folder][prediction name][file stem] -> file path.

pdb_structures_path = {}
structures_count = 0

for folder in structure_folders:
    folder_path = (structure_basePath / folder).absolute()
    if not folder_path.is_dir():
        print(f"\tERROR: {folder_path} is not a folder")
        continue
    pdb_structures_path[folder] = {}
    for prediction_path in folder_path.iterdir():
        # Only sub-directories are treated as predictions; loose files are skipped.
        if prediction_path.is_dir():
            structure_name = prediction_path.stem
            pdb_structures_path[folder][structure_name] = {}
            for pdb_file in prediction_path.iterdir():
                if str(pdb_file).endswith(".pdb"):
                    pdb_structures_path[folder][structure_name][pdb_file.stem] = pdb_file
                    structures_count += 1
print(f"Found {structures_count} structures")

In [None]:
# Read the files into structure objects
#
# Every indexed file is parsed twice: once with the Bio.PDB parser and once
# with biotite. Files that fail to parse, or that do not contain exactly two
# chains, are reported and left out of pdb_structures.

pdb_structures = {}
parser = PDBParser(QUIET=True)

for folder, structureDict in pdb_structures_path.items():
    print(f"Reading in folder {folder}")
    pdb_structures[folder] = {}
    for structure_name, fileArray in structureDict.items():
        pdb_structures[folder][structure_name] = {}
        for file_name, path in fileArray.items():
            try:
                structure = parser.get_structure("structure", file=path)
            except PDBConstructionException:
                print(f"Can't parse structure {structure_name} (file {path.stem})")
            except ValueError as ex:
                print(f"Can't parse structure {structure_name} (file {path.stem}) due to the following reason: {ex}")
            else:
                chains = list(structure.get_chains())
                if len(chains) == 2:
                    structure_biotite = pdb.get_structure(pdb.PDBFile.read(path))
                    # Stored as a [Bio.PDB structure, biotite structure] pair.
                    pdb_structures[folder][structure_name][file_name] = [structure, structure_biotite]
                else:
                    print(f"Can't parse structure {structure_name} (file {path.stem}) because it has not 2 chains")
print("---Finished---")

In [4]:
# Cells for calculations of the interactions

# Three-letter codes of the residues counted as hydrophobic; glycine (GLY) and
# proline (PRO) are deliberately part of the set.
hydrophobic_residues = set("ALA VAL LEU ILE MET PHE PRO TRP GLY".split())

# Function to calculate buried area using Shrake-Rupley
def calculate_buried_area(structure):
    """Return the surface area that is buried when the two chains are together.

    The solvent-accessible surface area (SASA) is computed at atom level for
    the whole structure and for each chain on its own; the result is
    ``area(chain 1) + area(chain 2) - area(whole structure)``.

    Args:
        structure: Structure object with exactly two chains.

    Returns:
        The buried area rounded to three decimals.

    Note:
        The per-atom ``sasa`` attribute is overwritten by every computation,
        so afterwards the atoms carry the values of the per-chain runs.
    """
    chain_list = list(structure.get_chains())
    assert len(chain_list) == 2

    calculator = ShrakeRupley()

    def exposed_area(entity):
        # Compute atom-level SASA for this entity and add up all its atoms.
        calculator.compute(entity, level="A")
        return sum(atom.sasa for atom in entity.get_atoms())

    complex_area = exposed_area(structure)

    # The same chains, now evaluated in isolation from each other.
    separate_area = (
        exposed_area(structure[0][chain_list[0].id])
        + exposed_area(structure[0][chain_list[1].id])
    )

    return round(separate_area - complex_area, 3)
 
# Function to calculate the minimum distance of interface residues
def minimum_interface_distance(structure):
    """Return the smallest CA-CA distance between the two chains.

    All atoms named "CA" of the first chain are compared with all atoms named
    "CA" of the second chain. The pairwise distances are computed in one
    vectorised NumPy operation instead of a Python double loop.

    Args:
        structure: Structure object exposing ``get_chains()`` and
            ``structure[0][chain_id]``; it must contain exactly two chains.

    Returns:
        The minimum distance as a float, in the units of the atom
        coordinates, or 0 if either chain has no "CA" atom.
    """
    chains = list(structure.get_chains())
    assert len(chains) == 2

    def ca_coordinates(chain):
        # Collect the coordinates of every atom named "CA" as float64 rows.
        return np.array(
            [atom.coord for atom in chain.get_atoms() if atom.get_name() == "CA"],
            dtype=float,
        )

    coords_chain1 = ca_coordinates(structure[0][chains[0].id])
    coords_chain2 = ca_coordinates(structure[0][chains[1].id])

    # Without CA atoms on both sides there is no pair to measure; 0 is kept
    # as the "nothing found" value for backward compatibility.
    if coords_chain1.size == 0 or coords_chain2.size == 0:
        return 0

    # Broadcasting yields an (n1, n2, 3) array of coordinate differences.
    deltas = coords_chain1[:, np.newaxis, :] - coords_chain2[np.newaxis, :, :]
    return float(np.linalg.norm(deltas, axis=-1).min())


# Function to calculate hydrogen bonds using Biotite's Baker-Hubbard algorithm
def find_h_bonds(structure, structure_biotite):
    """Count the hydrogen bonds that biotite detects between the two chains.

    Args:
        structure: Structure object with exactly two chains; only the chain
            ids are taken from it.
        structure_biotite: The same structure loaded with biotite as a stack
            that holds a single model.

    Returns:
        The number of hydrogen-bond triplets reported by ``struc.hbond``, or
        0 if that call fails (the error is printed).

    Raises:
        ValueError: If ``structure_biotite`` holds more than one model.
    """
    chain_ids = [chain.id for chain in structure.get_chains()]
    assert len(chain_ids) == 2

    # Only single-model stacks are supported; reject anything else up front.
    if structure_biotite.stack_depth() != 1:
        raise ValueError("The provided structure contains multiple models. Please provide a structure with a single model.")
    atom_array = structure_biotite[0]

    # Bonds are inferred from inter-atomic distances and attached to the
    # atom array before the hydrogen-bond search.
    atom_array.bonds = bonds.connect_via_distances(atom_array)

    # Boolean masks that select the atoms of each chain.
    first_chain_mask = atom_array.chain_id == chain_ids[0]
    second_chain_mask = atom_array.chain_id == chain_ids[1]

    try:
        triplets = struc.hbond(
            atom_array,
            selection1=first_chain_mask,
            selection2=second_chain_mask,
        )
        # A tuple result carries the triplets as its first element.
        if isinstance(triplets, tuple):
            triplets = triplets[0]
    except Exception as e:
        print(f"Error calculating hydrogen bonds: {e}")
        return 0

    return len(triplets)

# Function to calculate salt bridges using BioPython
def find_salt_bridges_biopython(structure, cutoff=4.0):
    """Count the inter-chain salt-bridge atom pairs within ``cutoff``.

    A pair consists of a side-chain oxygen of ASP/GLU on one chain and a
    side-chain nitrogen of ARG/LYS on the other chain. Both orientations are
    counted (acid on chain 1 with base on chain 2, and acid on chain 2 with
    base on chain 1), so the result does not depend on the chain order.

    Args:
        structure: Structure object with exactly two chains.
        cutoff: Maximum atom-atom distance (inclusive) for a pair to count.

    Returns:
        The number of qualifying atom pairs. Several pairs between the same
        two residues are counted separately.
    """
    chains = list(structure.get_chains())
    assert len(chains) == 2

    chain1 = structure[0][chains[0].id]
    chain2 = structure[0][chains[1].id]

    acidic_residues = {'ASP': ['OD1', 'OD2'], 'GLU': ['OE1', 'OE2']}
    basic_residues = {'ARG': ['NH1', 'NH2', 'NE'], 'LYS': ['NZ']}

    def charged_atoms(chain, atom_names_by_residue):
        # Gather the listed atoms of every matching residue that has them.
        selected = []
        for residue in chain:
            for atom_name in atom_names_by_residue.get(residue.resname, ()):
                if atom_name in residue:
                    selected.append(residue[atom_name])
        return selected

    def count_close_pairs(acid_atoms, base_atoms):
        # Subtracting two atoms yields their distance.
        return sum(
            1
            for acid_atom in acid_atoms
            for base_atom in base_atoms
            if acid_atom - base_atom <= cutoff
        )

    acid_on_1 = count_close_pairs(
        charged_atoms(chain1, acidic_residues),
        charged_atoms(chain2, basic_residues),
    )
    # The mirrored orientation was previously ignored.
    acid_on_2 = count_close_pairs(
        charged_atoms(chain2, acidic_residues),
        charged_atoms(chain1, basic_residues),
    )
    return acid_on_1 + acid_on_2

# Function to calculate hydrophobic interactions considering only carbon atoms
def find_hydrophobic_interactions(structure, cutoff=5.0):
    """Count close carbon-carbon pairs between hydrophobic residues of the two chains.

    Args:
        structure: Structure object with exactly two chains.
        cutoff: Maximum atom-atom distance (inclusive) for a pair to count.

    Returns:
        The number of atom pairs (one carbon per chain, both belonging to
        residues listed in ``hydrophobic_residues``) within ``cutoff``.
    """
    chain_list = list(structure.get_chains())
    assert len(chain_list) == 2

    def hydrophobic_carbons(chain):
        # Carbon atoms of every residue whose name is in hydrophobic_residues.
        carbons = []
        for residue in chain:
            if residue.resname not in hydrophobic_residues:
                continue
            carbons.extend(atom for atom in residue if atom.element == 'C')
        return carbons

    carbons_chain1 = hydrophobic_carbons(structure[0][chain_list[0].id])
    carbons_chain2 = hydrophobic_carbons(structure[0][chain_list[1].id])

    # Subtracting two atoms yields their distance.
    return sum(
        1
        for carbon1 in carbons_chain1
        for carbon2 in carbons_chain2
        if carbon1 - carbon2 <= cutoff
    )

def evaluate_structure(structure_name, file_name, structure):
    """Run all interface measurements on one structure and bundle the results.

    Args:
        structure_name: Value stored under 'prediction_name'.
        file_name: Value stored under 'structure_file'.
        structure: Indexable pair. Element 0 is passed to every measurement
            function; element 1 is additionally passed to ``find_h_bonds``.

    Returns:
        A dict with the two names and one entry per measurement.
    """
    primary = structure[0]

    # The measurements run in a fixed order.
    buried = calculate_buried_area(primary)
    hbond_count = find_h_bonds(primary, structure[1])
    closest = minimum_interface_distance(primary)
    salt_bridge_count = find_salt_bridges_biopython(primary)
    hydrophobic_count = find_hydrophobic_interactions(primary)

    return dict(
        prediction_name=structure_name,
        structure_file=file_name,
        hbonds=hbond_count,
        salt_bridges=salt_bridge_count,
        buried_area=buried,
        min_distance=closest,
        hydrophobic_interactions=hydrophobic_count,
    )

In [None]:
# Evaluate structures
#
# Runs evaluate_structure on the loaded structures and collects one result
# row per structure file in a DataFrame.

results = []

for folder, structureDict in pdb_structures.items():
    print(f"Processing folder {folder}")
    for structure_name, structureArray in structureDict.items():
        print(f"\t{structure_name}")
        for file_name, structure in structureArray.items():
            # evaluate_structure takes (structure_name, file_name, structure);
            # the prediction name was missing from this call before.
            results.append(evaluate_structure(structure_name, file_name, structure))
        # NOTE(review): the two breaks limit the run to the first prediction
        # of the first folder; remove them to evaluate everything.
        break
    break

results = pd.DataFrame(results)

print("---Finished---")

In [None]:
# Standard-library multiprocessing helpers; Process, Queue and queue are used
# by the in-notebook worker cell further down.
import multiprocessing
from multiprocessing import Process, Queue, freeze_support
import queue
# Print the CPU count reported by multiprocessing.cpu_count().
print("Number of cpu : ", multiprocessing.cpu_count())

In [None]:
import multiprocessing_libary
from multiprocessing import Queue

# One queue for pending tasks and one for the results handed back.
tasks_queued, tasks_finished = Queue(), Queue()

# Each task is [structure_name, file_name, structure]. The two breaks stop
# after the first prediction of the first folder.
for folder, structureDict in pdb_structures.items():
    for structure_name, structureArray in structureDict.items():
        for file_name in structureArray:
            structure = structureArray[file_name]
            tasks_queued.put([structure_name, file_name, structure])
        break
    break

# Hand both queues to the project's helper module for processing.
multiprocessing_libary.runQueue(tasks_queued, tasks_finished)

In [5]:
# Move every item currently on the finished queue into a list, then turn the
# list into a DataFrame.
results = []
while True:
    if tasks_finished.empty():
        break
    results.append(tasks_finished.get())

results = pd.DataFrame(data=results)

In [None]:
# Bare expression: shows the current value of `results` as the cell output.
results

In [None]:
# In-notebook variant of the parallel evaluation: tasks are put on a queue,
# a worker process is meant to evaluate them, and the results are collected
# into a DataFrame.
results = []
tasks_queued = Queue()
tasks_finished = Queue()

processes = []

def run_task():
    """Worker entry point: evaluate queued tasks until the queue is empty.

    Returns:
        Always True.
    """
    # NOTE(review): this early return makes the worker a no-op and the loop
    # below unreachable. It looks like a deliberate switch-off (presumably in
    # favour of the multiprocessing_libary helper) - confirm before removing.
    return True
    while True:
        try:
            task = tasks_queued.get_nowait()
        except queue.Empty:
            break
        else:
            # Tasks are [structure_name, file_name, structure], matching the
            # three parameters of evaluate_structure.
            structure_name, file_name, structure = task
            tasks_finished.put(evaluate_structure(structure_name, file_name, structure))
    return True

# The two breaks stop after the first prediction of the first folder.
for folder, structureDict in pdb_structures.items():
    for structure_name, structureArray in structureDict.items():
        for file_name, structure in structureArray.items():
            # Queue all three values that evaluate_structure needs; the
            # prediction name was missing from the task before.
            tasks_queued.put([structure_name, file_name, structure])
        break
    break

if __name__ ==  '__main__':
    for w in range(1):
        p = Process(target=run_task, )
        processes.append(p)
        p.start()

    # completing process
    # NOTE(review): if the worker is re-enabled, drain tasks_finished before
    # joining; joining a process that still has queued items can block.
    for p in processes:
        p.join()
        print(p.exitcode)

    while not tasks_finished.empty():
        results.append(tasks_finished.get())

    results = pd.DataFrame(results)

In [None]:
# Sequential evaluation that appends to the existing `results`.
# Earlier cells leave `results` as a DataFrame, which cannot be appended to
# in place, so it is turned back into a list of row dicts first.
if not isinstance(results, list):
    results = results.to_dict("records")

for folder, structureDict in pdb_structures.items():
    print(f"Processing folder {folder}")
    for structure_name, structureArray in structureDict.items():
        print(f"\t{structure_name}")
        for file_name, structure in structureArray.items():
            # evaluate_structure takes (structure_name, file_name, structure);
            # the prediction name was missing from this call before.
            results.append(evaluate_structure(structure_name, file_name, structure))
        # NOTE(review): the two breaks limit the run to the first prediction
        # of the first folder; remove them to evaluate everything.
        break
    break

In [None]:
# Pick one loaded structure of a fixed prediction for manual inspection.
# Both lookup tables are dicts keyed by file stem, so they cannot be indexed
# with 0; the first successfully loaded file is used instead, and the same
# key selects its path so that `path` and `structure` belong together.
prediction_files = pdb_structures["AF_DMI_structures1"]["LIG_Clathr_ClatBox_1_1C9I"]
file_name = next(iter(prediction_files))
path = pdb_structures_path["AF_DMI_structures1"]["LIG_Clathr_ClatBox_1_1C9I"][file_name]
# `structure` is the stored pair; element 0 provides get_chains().
structure = prediction_files[file_name]
name = path.stem
chains = list(structure[0].get_chains())