### Measure structures
Modified from the ISS code.

Created 2024-12-16.

In [1]:
import pathlib
import dataset_measure

# Root directory of the AF_DMI structure folders, made absolute against the
# current working directory.
structure_basePath = pathlib.Path("../ressources/ISS AF_DMI_structures").absolute()
# The three structure sub-folders below that root.
structure_folders = [
    structure_basePath / 'AF_DMI_structures1',
    structure_basePath / 'AF_DMI_structures2',
    structure_basePath / 'AF_DMI_structures3',
]
# Directories of the solved structures and of their hydrogen-completed copies.
solved_basePath = pathlib.Path("../ressources/ISS DMI_solved_structures").absolute()
solvedHydrogen_basePath = pathlib.Path("../ressources/ISS DMI_solved_structures hydrogens").absolute()

# Shared name -> entry mapping that the cells below fill.
pathObj = dict()

In [2]:
def WalkFolder(basePath: str, 
               pathObj:dict[str, dict[str, pathlib.Path]]={},
               structures: None|str|list[str] = None,
               files: None|bool|str|list[str] = None
               ) -> dict[str, dict[str, pathlib.Path]]:
    """
        Add the path basePath/structure/file.pdb to the pathObj provided (or create a new one if omitted).
        If files and/or structures are None, search inside the directory for all pdb files.
        Returns:
            pathObj: dict[name:str, tuple[path: pathlib.Path, structure_name: str]]
    """

    structures_count = 0
    basePath = pathlib.Path(basePath).absolute()
    if not basePath.is_dir():
        raise ValueError("The given basePath is not a valid directory")
    
    if structures is None:
        structures: list[pathlib.Path] = [p for p in basePath.iterdir()]
    elif isinstance(structures, str):
        structures: list[pathlib.Path] = [basePath / structures]
    elif isinstance(structures, list):
        structures: list[pathlib.Path] = [basePath / p for p in structures]
    else:
        raise ValueError("Invalid argument for structures")

    for structure in structures:
        if not structure.exists():
            raise ValueError(f"The structure {structure} does not point to a valid path")
        structure_name = str(structure.stem)
        if structure.is_file() and structure.suffix.lower() == ".pdb":
            if structure_name in pathObj.keys():
                raise ValueError(f"Duplicate structure and file {structure}")
            pathObj[structure_name] = (structure.absolute(), structure_name)
            structures_count += 1
            continue

        if files is None:
            filesF: list[pathlib.Path] = [f for f in structure.iterdir() if f.is_file()]
        elif isinstance(files, str):
            filesF: list[pathlib.Path] = [structure / f"{files}.pdb"]
        elif isinstance(files, list):
            filesF: list[pathlib.Path] = [structure / f"{f}.pdb" for f in files]
        else:
            raise ValueError("Invalid argument for files")
        
        for file in filesF:
            if not file.exists() or not file.is_file():
                raise ValueError(f"{structure}/{file} does not point to a valid file")
            if not file.suffix.lower() == ".pdb":
                continue
            file_name = file.stem
            name = f"{structure_name}-{file_name}"
            if name in pathObj.keys():
                raise ValueError(f"Duplicate structure and file {structure}/{file_name}.pdb")
            pathObj[name] = (file.absolute(), structure_name)
            structures_count += 1
    print(f"Found {structures_count} structures")
    return pathObj

In [3]:
# Start from an empty mapping and let WalkFolder fill it in place with every
# .pdb file found in the hydrogen-completed solved-structure directory.
pathObj = dict()
WalkFolder(basePath=solvedHydrogen_basePath, pathObj=pathObj)
print(pathObj)

Found 138 structures
{'1ATP_min_DMI': (WindowsPath('d:/Eigene Datein/Programmieren/Git/abrilka/bachelorthesis/2024-12-16/../ressources/ISS DMI_solved_structures hydrogens/1ATP_min_DMI.pdb'), '1ATP_min_DMI'), '1AXC_min_DMI': (WindowsPath('d:/Eigene Datein/Programmieren/Git/abrilka/bachelorthesis/2024-12-16/../ressources/ISS DMI_solved_structures hydrogens/1AXC_min_DMI.pdb'), '1AXC_min_DMI'), '1B72_min_DMI': (WindowsPath('d:/Eigene Datein/Programmieren/Git/abrilka/bachelorthesis/2024-12-16/../ressources/ISS DMI_solved_structures hydrogens/1B72_min_DMI.pdb'), '1B72_min_DMI'), '1B8Q_min_DMI': (WindowsPath('d:/Eigene Datein/Programmieren/Git/abrilka/bachelorthesis/2024-12-16/../ressources/ISS DMI_solved_structures hydrogens/1B8Q_min_DMI.pdb'), '1B8Q_min_DMI'), '1BXX_min_DMI': (WindowsPath('d:/Eigene Datein/Programmieren/Git/abrilka/bachelorthesis/2024-12-16/../ressources/ISS DMI_solved_structures hydrogens/1BXX_min_DMI.pdb'), '1BXX_min_DMI'), '1C9I_min_DMI': (WindowsPath('d:/Eigene Datein/P

In [None]:
# Hand the collected (path, structure_name) entries to the measuring routine.
measure_inputs = list(pathObj.values())
result = dataset_measure.Run(measure_inputs, silent=False)
print(result)

In [6]:
# Persist the measurement table without the index column.
output_csv = pathlib.Path("output/solved_structures.csv")
result.to_csv(output_csv, index=False)

#### Validation

In [3]:
import pandas as pd
import pathlib

In [None]:
# Reference results from the ISS code and the results written by this notebook.
iss_csv = pathlib.Path("../ISS Code/output/structure_analysis_results_biopython_with_hydrophobic_and_final.csv")
my_csv = pathlib.Path("output/structures_measured.csv")
issData = pd.read_csv(iss_csv)
myData = pd.read_csv(my_csv)
# Preview the first rows of both tables.
for frame in (issData, myData):
    print(frame.head())

In [25]:
# Cross-check: for every row of myData look up the row of issData with the
# same prediction name and structure file, and report names without a match.
# iterrows() is used instead of iterating the `.iloc` indexer, which only
# works through the legacy __getitem__/IndexError iteration fallback.
for _, x1 in myData.iterrows():
    prediction_name = x1["prediction_name"]
    # presumably issData stores the file name with its ".pdb" extension
    # while myData stores the bare stem -- verify against both CSV files
    structure_file = str(x1["structure_file"]) + ".pdb"
    x2 = issData.loc[(issData["prediction_name"] == prediction_name) & (issData["structure_file"] == structure_file)]
    if x2.empty:
        print(prediction_name)
    # Printed for every row, matched or not.
    print(x1["hbonds"])

### Sample structures

In [None]:
# Load a single sample: structure "DEG_Kelch_Keap1_1_2FLU", file "ranked_3",
# taken from the first AF_DMI structure folder.
pathObj = {}
dataset_measure.WalkFolder(structure_folders[0], pathObj, "DEG_Kelch_Keap1_1_2FLU", "ranked_3")
paths = dataset_measure.PathObj_ToList(pathObj)
structures = []
# NOTE(review): this loop treats pathObj as structure -> {file -> path}, i.e.
# the layout produced by dataset_measure.WalkFolder, not the tuple layout of
# the WalkFolder defined in this notebook -- confirm.
for structure, fileDict in pathObj.items():
    for file, path in fileDict.items():
        s = dataset_measure.OpenStructure(path)
        # keep the first two items returned by OpenStructure together with
        # the structure and file names they belong to
        structures.append((s[0], s[1], structure, file))

In [None]:
import biotite.structure as struc

# Second item returned by OpenStructure for the first loaded sample;
# presumably a biotite structure object -- TODO confirm.
structure = structures[0][1]
print(structure)

In [11]:
# Derive bonds for the first element of `structure` from inter-atomic
# distances and attach them to that element.
bonds = struc.bonds.connect_via_distances(structure[0])
structure[0].bonds = bonds

In [None]:
# Compare what is stored on the structure with the bond list computed above.
# NOTE(review): structure[0] is evaluated anew here; whether the assignment in
# the previous cell is visible depends on structure[0] returning a view or a
# copy -- confirm.
print(structure[0].bonds)
print(bonds)

In [None]:
# Boolean masks selecting the atoms of chain "A" and chain "B".
selection1 = structure[0].chain_id == "A"
selection2 = structure[0].chain_id == "B"

# Hydrogen bonds between the two selections.
triplets = struc.hbond(structure[0], selection1=selection1, selection2=selection2)


### Adding H to solved structures

In [1]:
import biotite.structure.io as biotiteIO
import biotite.structure as struc
import matplotlib.pyplot as plt
import pymol

In [None]:
# Read every .pdb file of the solved-structure directory, keyed by file stem.
structures = {}
for path in solved_basePath.iterdir():
    # Skip everything that is not a regular .pdb file. The parentheses are
    # required: `not a and b` parses as `(not a) and b`, which would only skip
    # directories named *.pdb and try to parse every other file.
    if not (path.is_file() and path.suffix.lower() == ".pdb"):
        continue
    structure_name = path.stem
    structures[structure_name] = biotiteIO.pdb.get_structure(biotiteIO.pdb.PDBFile.read(path))

In [None]:
# First element of the first loaded structure.
s = structures[list(structures.keys())[0]][0]
# NOTE(review): the variable is called `h` but the filter selects element "C";
# confirm whether "H" was intended here.
h = s[(s.element == "C")]
print(len(h))

In [46]:
# Fraction of hydrogen atoms per loaded structure, keyed by structure name.
hydrogens_rel = dict()
for name, structure in structures.items():
    # Every entry is expected to hold exactly one element.
    if not len(structure) == 1:
        raise ValueError("Multiple structures")
    s = structure[0]
    hydrogen_mask = s.element == "H"
    h_count = len(s[hydrogen_mask])
    count = len(s)
    hydrogens_rel[name] = h_count / count

In [None]:
# Histogram of the hydrogen fractions over all structures (100 bins).
dist = list(hydrogens_rel.values())
plt.hist(dist, bins=100)
plt.show()

In [None]:
# List the structures in which fewer than 5 % of the atoms are hydrogens.
for name, structure in structures.items():
    ratio = hydrogens_rel[name]
    if ratio < 0.05:
        print(name, ratio)

In [None]:
# Load one solved structure into the PyMOL session.
sample_pdb = solved_basePath / "1ATP_min_DMI.pdb"
pymol.cmd.load(sample_pdb)

In [None]:
# Add hydrogens; called without a selection argument, so presumably this
# applies to everything currently loaded -- confirm against the PyMOL docs.
pymol.cmd.h_add()

In [None]:
# Save the hydrogen-completed sample. The original line had no left operand
# for `/` (a syntax error); the hydrogens directory defined at the top of the
# notebook is used as target.
# NOTE(review): confirm this is the intended output directory.
pymol.cmd.save(solvedHydrogen_basePath / "1ATP_min_DMI.pdb")

In [None]:
# Add hydrogens to every solved structure and write the result, under the same
# file name, into the sibling "ISS DMI_solved_structures hydrogens" directory.
exportPath = solved_basePath / ".." / "ISS DMI_solved_structures hydrogens"
# Make sure the target directory exists before PyMOL writes into it.
exportPath.mkdir(parents=True, exist_ok=True)
for path in solved_basePath.iterdir():
    # Skip everything that is not a regular .pdb file. The parentheses are
    # required: `not a and b` parses as `(not a) and b`, which would let every
    # non-.pdb file through.
    if not (path.is_file() and path.suffix.lower() == ".pdb"):
        continue
    print(path.stem)
    pymol.cmd.load(path, path.stem)
    pymol.cmd.h_add()
    pymol.cmd.save(exportPath / path.name)
    # Remove the object again so the next iteration starts from an empty
    # session and h_add/save only see the structure just loaded.
    pymol.cmd.delete(path.stem)

In [5]:
# Read every exported (hydrogen-completed) .pdb file, keyed by file stem.
structures = {}
for path in exportPath.iterdir():
    # Skip everything that is not a regular .pdb file. The parentheses are
    # required: `not a and b` parses as `(not a) and b`, which would only skip
    # directories named *.pdb and try to parse every other file.
    if not (path.is_file() and path.suffix.lower() == ".pdb"):
        continue
    structure_name = path.stem
    structures[structure_name] = biotiteIO.pdb.get_structure(biotiteIO.pdb.PDBFile.read(path))

In [6]:
# Fraction of hydrogen atoms per reloaded structure, keyed by structure name.
hydrogens_rel = dict()
for name, structure in structures.items():
    # Every entry is expected to hold exactly one element.
    if not len(structure) == 1:
        raise ValueError("Multiple structures")
    s = structure[0]
    hydrogen_mask = s.element == "H"
    h_count = len(s[hydrogen_mask])
    count = len(s)
    hydrogens_rel[name] = h_count / count

In [None]:
# Histogram of the hydrogen fractions after adding hydrogens (default bins).
dist = list(hydrogens_rel.values())
plt.hist(dist)
plt.show()