## Adding hydrogen to the structures
Created 2025-01-07

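Use this notebook to add hydrogen atoms to structures that lack them: it collects the `.pdb` files of an input folder and, using PyMOL, writes a copy of each one with hydrogens added to an export folder.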

In [1]:
import pymol
from pathlib import Path
import time
import logging
import sys
import datetime

In [2]:
# Notebook-wide logger that writes INFO-and-above records to stdout.
logger = logging.getLogger("add_hydrogens")
# NOTE(review): inside a notebook %(module)s appears to resolve to an opaque numeric
# cell id rather than a readable name; %(name)s may be more useful - confirm before changing.
formatter = logging.Formatter(fmt="[%(asctime)s | %(module)s | %(levelname)s] %(message)s")
streamHandler = logging.StreamHandler(sys.stdout)
streamHandler.setFormatter(formatter)
logger.setLevel(logging.INFO)
# logging.getLogger returns the same logger object on every call, so re-running this
# cell would otherwise stack one more handler per run and print every record several
# times. Detach the handlers left over from earlier runs before attaching the new one.
for staleHandler in tuple(logger.handlers):
    logger.removeHandler(staleHandler)
logger.addHandler(streamHandler)

In [3]:
def WalkFolder(path: Path) -> list[Path]:
    """
    Collect the .pdb files that sit directly inside a directory.

    Sub-directories are not descended into, and the ".pdb" suffix is matched
    case-insensitively. The entries are returned in the order in which
    ``Path.iterdir`` yields them.

    Args:
        path: Directory to scan.

    Returns:
        The paths of all regular files in ``path`` whose suffix is ".pdb".

    Raises:
        ValueError: If ``path`` is not a ``pathlib.Path`` or does not point to
            an existing directory.
    """
    if not isinstance(path, Path):
        raise ValueError(f"The path argument must be a pathlib.Path. You provided {type(path)}")
    if not (path.exists() and path.is_dir()):
        raise ValueError("The provided path is not a directory")

    pdbFiles = []
    for entry in path.iterdir():
        # Directories (even ones named "something.pdb") are never reported.
        if not entry.is_file():
            continue
        if entry.suffix.lower() != ".pdb":
            continue
        pdbFiles.append(entry)
    return pdbFiles

def ProcessPaths(paths: list[Path], exportPath: Path):
    """
    Add hydrogens to every file in ``paths`` and store the results in ``exportPath``.

    Each input is handed to ``AddHydrogen`` and written under its original file
    name inside ``exportPath``. Entries that do not point to an existing file and
    entries whose output file already exists are skipped with a warning; existing
    outputs are never overwritten. While running, a progress line (percentage,
    ETA, current and average speed) is logged at most once every 5 seconds, and a
    summary line is logged at the end.

    Args:
        paths: Input files to process.
        exportPath: Existing directory that receives the output files.

    Returns:
        ``False`` when ``exportPath`` is not a ``pathlib.Path`` pointing to an
        existing directory (an error is logged); ``None`` otherwise.
    """

    if not isinstance(exportPath, Path):
        logger.error(f"The exportPath arguments must be a pathlib.Path. You provided {type(exportPath)}")
        return False
    if not exportPath.exists() or not exportPath.is_dir():
        logger.error(f"The provided output path is not a directory")
        return False

    total = len(paths)
    logger.info(f"Started adding hydrogens to {total} files")

    t0 = time.perf_counter()
    windowStart = t0   # start of the current progress-report window
    windowDone = 0     # files actually processed inside the current window
    for i, p in enumerate(paths):
        if not isinstance(p, Path) or not p.is_file():
            # Entries that are not Path objects have no .name attribute; show them as-is.
            label = p.name if isinstance(p, Path) else repr(p)
            logger.warning(f"File {label} does not point to a valid file. Skip")
            continue
        epath = exportPath / p.name
        if epath.exists():
            logger.warning(f"File {p.name} already exists in the output directory. Skip")
            continue
        AddHydrogen(p, epath)
        windowDone += 1

        now = time.perf_counter()
        if now - windowStart > 5:
            # i is zero-based, so i + 1 entries of the list have been handled so far.
            seen = i + 1
            speed = windowDone / (now - windowStart)
            # seen >= 1 and more than 5 s have elapsed, so the average speed is never zero.
            speedAvg = seen / (now - t0)
            eta = (total - seen) / speedAvg
            windowStart = now
            windowDone = 0
            logger.info(f"{int(100*seen/total)}% - ETA {str(datetime.timedelta(seconds=int(eta)))} | current speed {round(speed, 3)} s⁻¹ | average speed {round(speedAvg, 3)} s⁻¹")

    elapsed = time.perf_counter() - t0
    speedAvg = total / elapsed if total > 0 and elapsed > 0 else 0
    logger.info(f"Finished processing {total} files in {str(datetime.timedelta(seconds=int(elapsed)))} | average speed {round(speedAvg, 3)} s⁻¹")



def AddHydrogen(path: Path, exportPath: Path):
    """
    Load one structure file into PyMOL, add hydrogens to it and save the result.

    The structure is loaded as a temporary PyMOL object named after the file
    stem. Hydrogen addition and saving are restricted to that object, and the
    object is removed from the PyMOL session afterwards, even if one of the
    PyMOL calls fails.

    Args:
        path: Existing input structure file.
        exportPath: File path to write; its parent directory must exist.

    Raises:
        ValueError: If either argument is not a ``pathlib.Path``, if ``path``
            is not an existing file, or if the parent directory of
            ``exportPath`` does not exist.
    """
    if not isinstance(path, Path) or not isinstance(exportPath, Path):
        raise ValueError(f"The path and exportPath arguments must be a pathlib.Path. You provided {type(path)} and {type(exportPath)}")
    if not path.exists() or not path.is_file():
        raise ValueError(f"The provided path is not a file")
    if not exportPath.parent.exists():
        raise ValueError(f"The provided output path is not a valid file path")

    # Ask PyMOL for the object name it will actually use for this stem, so the
    # selection and the final delete refer to the object that was loaded.
    objectName = pymol.cmd.get_legal_name(path.stem)
    # The "%" prefix marks the word as an object/selection name, so a stem that
    # happens to equal a selection keyword is not misread.
    selection = f"%{objectName}"

    pymol.cmd.load(str(path), objectName)
    try:
        # Limit both steps to the object just loaded; without a selection they
        # act on everything currently present in the PyMOL session.
        pymol.cmd.h_add(selection)
        pymol.cmd.save(str(exportPath), selection)
    finally:
        # Always unload, otherwise a failed file would leak into later exports.
        pymol.cmd.delete(objectName)

### Sample code

In [4]:
# Gather the .pdb files of the input folder (resolved relative to the working directory).
sourceDir = Path("../ressources/ISS DMI_solved_structures").resolve()
paths = WalkFolder(sourceDir)

# Folder that receives the structures with added hydrogens.
exportPath = Path("../tmp").resolve()
print(f"Exporting to {exportPath}")

ProcessPaths(paths, exportPath)

Exporting to D:\Eigene Datein\Programmieren\Git\abrilka\bachelorthesis\tmp
[2025-01-07 11:37:05,206 | 3664360824 | INFO] Started adding hydrogens to 138 files
[2025-01-07 11:37:07,379 | 3664360824 | INFO] Finished processing 138 files in 0:00:02 | average speed 63.512 s⁻¹
