## Adding hydrogen to the structures
Created 2025-01-07

Use this notebook to add hydrogen atoms to PDB structures that lack them. The processed structures are saved as new files in separate `*_hydrogens` folders.

In [4]:
import pymol
from pathlib import Path
import time
import logging
import sys
import datetime

In [6]:
# Settings

# Root of the resource folder that holds the structures and the metadata tables
path_resources = Path(r"D:\Eigene Datein\dev\Uni\JGU Bio Bachelorthesis\Daten\resources")

# Sub-folders of the resource folder for the solved and the AF3 structures
path_solved = path_resources.joinpath("solved")
path_AF3 = path_resources.joinpath("AF3")

In [3]:
# Notebook-wide logger that writes its messages to the cell output (stdout).
logger = logging.getLogger("add_hydrogens")
logger.setLevel(logging.INFO)

# NOTE: inside a notebook %(module)s appears to resolve to the kernel's internal name of the
# executing cell (a numeric id in the logged output) rather than a readable module name.
formatter = logging.Formatter(fmt="[%(asctime)s | %(module)s | %(levelname)s] %(message)s")
streamHandler = logging.StreamHandler(sys.stdout)
streamHandler.setFormatter(formatter)

# logging.getLogger returns the same logger object for the same name, so re-running this
# cell would otherwise stack one more handler per run and print every message several times.
# Detach the previously attached handlers first so that the cell can be re-run safely.
for _old_handler in list(logger.handlers):
    logger.removeHandler(_old_handler)
logger.addHandler(streamHandler)

In [None]:
def WalkFolder(path: Path) -> list[Path]:
    """
        Recursively collect all .pdb files inside a directory.

        The file suffix is compared case-insensitively, so ".PDB" files are found as well.

        Args:
            path: Directory to search.

        Returns:
            The paths of all .pdb files in `path` and its subdirectories. The entries of each
            directory are visited in sorted order, so the result is reproducible.

        Raises:
            ValueError: If `path` is not a pathlib.Path, does not exist or is not a directory.
    """
    if not isinstance(path, Path):
        raise ValueError(f"The path argument must be a pathlib.Path. You provided {type(path)}")
    if not path.exists():
        raise ValueError("The provided path does not exist")
    if not path.is_dir():
        # Without this check iterdir() would fail with an unrelated NotADirectoryError
        raise ValueError("The provided path is not a directory")

    found = []
    # iterdir() yields entries in arbitrary, OS-dependent order; sort them for stable results
    for entry in sorted(path.iterdir()):
        if entry.is_dir():
            found.extend(WalkFolder(entry))
        elif entry.is_file() and entry.suffix.lower() == ".pdb":
            found.append(entry)
    return found

def ProcessPaths(paths: list[Path], exportPath: Path):
    """
        Add hydrogens to a list of pdb files and save the results in an export directory.

        Every input file is passed to AddHydrogen and written to `exportPath` under its original
        file name. Entries that do not point to an existing file and files whose output already
        exists in `exportPath` are skipped with a warning. A progress message is logged when more
        than 5 seconds have passed since the previous one.

        Args:
            paths: The input files.
            exportPath: Existing directory that receives the new pdb files.

        Returns:
            False if `exportPath` is not a pathlib.Path or not an existing directory,
            otherwise None.
    """

    if not isinstance(exportPath, Path):
        logger.error(f"The exportPath arguments must be a pathlib.Path. You provided {type(exportPath)}")
        return False
    if not exportPath.exists() or not exportPath.is_dir():
        logger.error(f"The provided output path is not a directory")
        return False

    n_total = len(paths)
    logger.info(f"Started adding hydrogens to {n_total} files")

    t0 = time.perf_counter()
    t_report = t0           # time of the last progress message
    n_since_report = 0      # files processed since the last progress message
    for i, p in enumerate(paths):
        if not isinstance(p, Path) or not p.exists() or not p.is_file():
            # p can be any object here, so fall back to the object itself if it has no .name
            logger.warning(f"File {getattr(p, 'name', p)} does not point to a valid file. Skip")
            continue
        epath = exportPath / p.name
        if epath.exists():
            logger.warning(f"File {p.name} already exists in the output directory. Skip")
            # Actually skip the file; without this the existing output would be overwritten
            continue
        AddHydrogen(p, epath)

        n_since_report += 1
        now = time.perf_counter()
        if now - t_report > 5:
            # i is zero-based, so i + 1 entries of the list have been handled at this point.
            # Both time spans below exceed 5 s, hence none of the divisions can hit zero.
            n_done = i + 1
            _speed = n_since_report / (now - t_report)
            _speed_avg = n_done / (now - t0)
            _eta = (n_total - n_done) / _speed_avg
            logger.info(f"{int(100*n_done/n_total)}% - ETA {str(datetime.timedelta(seconds=int(_eta)))} | current speed {round(_speed, 3)} s⁻¹ | average speed {round(_speed_avg, 3)} s⁻¹")
            t_report = time.perf_counter()
            n_since_report = 0

    elapsed = time.perf_counter() - t0
    _speed_avg = n_total / elapsed if n_total > 0 and elapsed > 0 else 0
    logger.info(f"Finished processing {n_total} files in {str(datetime.timedelta(seconds=int(elapsed)))} | average speed {round(_speed_avg, 3)} s⁻¹")



def AddHydrogen(path: Path, exportPath: Path):
    """
        Load a structure file into PyMOL, add hydrogens to it and save the result.

        The structure is loaded as a temporary PyMOL object that is removed again afterwards,
        also when adding hydrogens or saving fails.

        Args:
            path: Existing structure file to read.
            exportPath: File path to write. Its parent directory must exist.

        Raises:
            ValueError: If one of the arguments is not a pathlib.Path, if `path` is not an
                existing file or if the parent directory of `exportPath` does not exist.
    """
    if not isinstance(path, Path) or not isinstance(exportPath, Path):
        raise ValueError(f"The path and exportPath arguments must be a pathlib.Path. You provided {type(path)} and {type(exportPath)}")
    if not path.exists() or not path.is_file():
        raise ValueError(f"The provided path is not a file")
    if not exportPath.parent.exists():
        raise ValueError(f"The provided output path is not a valid file path")

    # PyMOL may rewrite characters it does not accept in object names. Ask for the legal form
    # up front so that the selections and the delete below address the loaded object.
    objName = pymol.cmd.get_legal_name(path.stem)
    pymol.cmd.load(str(path), objName)
    try:
        # Restrict both steps to the object loaded above instead of everything in the session,
        # so that leftovers of other structures can never end up in the exported file.
        pymol.cmd.h_add(objName)
        pymol.cmd.save(str(exportPath), objName)
    finally:
        # Always clean up, otherwise a failed file would stay loaded for all following calls
        pymol.cmd.delete(objName)

### Sample code

In [6]:
# Add hydrogens to all pdb files below "solved/DMI" and write them to "solved/DMI_hydrogens"
paths = WalkFolder(path_solved / "DMI")
exportPath = (path_solved / "DMI_hydrogens").resolve()
# ProcessPaths only logs an error and does nothing if the export directory is missing,
# so create it on the first run
exportPath.mkdir(parents=True, exist_ok=True)
print("Exporting to", exportPath)

ProcessPaths(paths, exportPath)

Exporting to D:\Eigene Datein\dev\Uni\JGU Bio Bachelorthesis\Daten\resources\solved\DMI_hydrogens
[2025-04-29 18:15:52,781 | 3664360824 | INFO] Started adding hydrogens to 140 files
[2025-04-29 18:15:55,059 | 3664360824 | INFO] Finished processing 140 files in 0:00:02 | average speed 61.518 s⁻¹


In [7]:
# Add hydrogens to all pdb files below "solved/DDI" and write them to "solved/DDI_hydrogens"
paths = WalkFolder(path_solved / "DDI")
exportPath = (path_solved / "DDI_hydrogens").resolve()
# ProcessPaths only logs an error and does nothing if the export directory is missing,
# so create it on the first run
exportPath.mkdir(parents=True, exist_ok=True)
print("Exporting to", exportPath)

ProcessPaths(paths, exportPath)

Exporting to D:\Eigene Datein\dev\Uni\JGU Bio Bachelorthesis\Daten\resources\solved\DDI_hydrogens
[2025-04-29 18:15:57,912 | 3664360824 | INFO] Started adding hydrogens to 48 files
[2025-04-29 18:15:59,549 | 3664360824 | INFO] Finished processing 48 files in 0:00:01 | average speed 29.319 s⁻¹


### Converting AF3 structures

In [33]:
# Mirror the AF3 folder tree into "AF3_hydrogens" and add hydrogens to every pdb file in it
paths = [found.relative_to(path_AF3) for found in WalkFolder(path_AF3)]
exportPath = path_resources / "AF3_hydrogens"
for rel_path in paths:
    src_path = path_AF3 / rel_path
    dst_path = exportPath / rel_path
    # Recreate the sub-folder of this file below the export root
    dst_path.parent.mkdir(parents=True, exist_ok=True)
    # Print the name of the containing folder whenever a ranked_0.pdb file is reached
    if src_path.name == "ranked_0.pdb":
        print(src_path.parent.name)
    AddHydrogen(src_path, dst_path)

PF00009_PF01873_2D74_A_resi12_resi200.B_resi21_resi133
PF00026_PF06394_1F34_A_resi13_resi326.B_resi62_resi120
PF00059_PF00041_1TDQ_B_resi10_resi125.A_resi85_resi186
PF00089_PF00095_1FLE_E_resi16_resi243.I_resi12_resi56
PF00137_PF07850_6VQG_i_resi7_resi86.p_resi292_resi343
PF00244_PF01161_3AXY_J_resi4_resi233.H_resi19_resi169
PF00289_PF02436_5VYW_A_resi1_resi120.D_resi810_resi1030
PF00454_PF00017_2Y3A_A_resi794_resi1010.B_resi616_resi690
PF00514_PF00104_3TX7_A_resi148_resi661.B_resi316_resi533
PF00675_PF02271_1PP9_B_resi35_resi180.S_resi12_resi105
PF00787_PF03643_5F0L_B_resi8_resi283.C_resi58_resi147
PF00858_PF00087_7CFT_A_resi48_resi461.D_resi1_resi56
PF00890_PF13085_1L0V_M_resi1_resi406.N_resi2_resi121
PF01298_PF00405_3VE1_A_resi174_resi345.B_resi342_resi664
PF02351_PF17812_6Q2N_D_resi243_resi337.F_resi265_resi379
PF02372_PF18707_4GS7_A_resi2_resi112.B_resi6_resi97
PF02747_PF00752_1UL1_X_resi2_resi99.A_resi127_resi254
PF03166_PF11409_1DEV_A_resi272_resi443.B_resi671_resi709
PF03962_PF