This notebook will run DeepTMHMM on your GPCR and trim the GPCR pdb based on residue identity and pLDDT.

Set the path to your PDB file (`pdb_file`) and give your PDB a 4-character ID (required for ColabFold).

In [17]:
import torch
import pandas as pd
import numpy as np
import os
import json
!pip install biopython

from Bio.PDB import PDBParser, PDBIO
from Bio.SeqUtils import seq1
from datetime import datetime

!pip3 install -qU pybiolib
import biolib
import re



In [18]:
# Choose a 4-character ID for the resulting template (required for ColabFold).
# The same ID is used as the FASTA header of the DeepTMHMM query and as the
# base name of the trimmed PDB file written at the end of the notebook.
pdb_id = 'xxxx'

In [19]:
# Path to the input GPCR structure (PDB file) that will be analysed and trimmed.
pdb_file = "/content/GPCR_ACTIVE_STATE_PDB.pdb"

In [20]:
# Folder in which the trimmed PDB file (<pdb_id>.pdb) is saved.
output_folder = "/content/"

# Identify the GPCR sequence from the PDB and run DeepTMHMM

In [21]:
# Parse the input PDB and derive the one-letter amino-acid sequence that is
# submitted to DeepTMHMM.
parser = PDBParser(QUIET=True)
structure = parser.get_structure("protein", pdb_file)
# Use the first model and the first chain only; any further chains are ignored here.
model = structure[0]
chain = list(model.get_chains())[0]  # Assuming there's only one chain

# Keep residues whose first id field is blank (presumably standard, non-hetero
# residues — confirm against the Biopython residue-id convention) and convert
# their three-letter names to one-letter codes.
residues = [residue for residue in chain.get_residues() if residue.get_id()[0] == " "]
sequence = "".join([seq1(residue.get_resname()) for residue in residues])

# Display the extracted sequence as the cell output.
sequence

'MARGGAGAEEASLRSNALSWLACGLLALLANAWIILSISAKQQKHKPLELLLCFLAGTHILMAAVPLTTFAVVQLRRQASSDYDWNESICKVFVSTYYTLALATCFTVASLSYHRMWMVRWPVNYRLSNAKKQALHAVMGIWMVSFILSTLPSIGWHNNGERYYARGCQFIVSKIGLGFGVCFSLLLLGGIVMGLVCVAITFYQTLWARPRRARQARRVGGGGGTKAGGPGALGTRPAFEVPAIVVEDARGKRRSSLDGSESAKTSLQVTNLVSAIVFLYDSLTGVPILVVSFFSLKSDSAPPWMVLAVLWCSMAQTLLLPSFIWSCERYRADVRTVWEQCVAIMSEEDGDDDGGCDDYAEGRVCKVRFDANGATGPGSRDPAQVKLLPGRHMLFPPLERVHYLQVPLSRRLSHDETNIFSTPREPGSFLHKWSSSDDIRVLPAQSRALGGPPEYLGQRHRLEDEEDEEEAEGGGLASLRQFLESGVLGSGGGPPRGPGFFREEITTFIDETPLPSPTASPGHSPRRPRPLGLSPRRLSLGSPESRAVGLPLGLSAGRRCSLTGGEESARAWGGSWGPGNPIFPQLTL'

In [22]:
# Write the query as a single-record FASTA file (header = pdb_id) for DeepTMHMM.
# Done in pure Python rather than `!echo -e ...`: `echo -e` is not portable across
# shells (a shell whose echo lacks -e writes a literal "-e " into the header),
# and this avoids interpolating Python values into a shell command line.
with open("query.fasta", "w") as fasta_file:
    fasta_file.write(f">{pdb_id}\n{sequence}\n")

In [23]:
# Load the DeepTMHMM application from BioLib (project 'DTU/DeepTMHMM').
deeptmhmm = biolib.load('DTU/DeepTMHMM')

INFO:biolib:Loaded project DTU/DeepTMHMM:1.0.42


In [24]:
# Run DeepTMHMM on the FASTA query written above (query.fasta) and save the
# job's output files into the 'result' folder (relative to the working directory).
deeptmhmm_job = deeptmhmm.cli(args='--fasta query.fasta')
deeptmhmm_job.save_files('result')

INFO:biolib:View the result in your browser at: https://biolib.com/results/02d2a7af-a169-4d7c-8daa-097eb9efbf58/
INFO:biolib:Cloud: Initializing
INFO:biolib:Cloud: Pulling images...
INFO:biolib:Cloud: Computing...
INFO:biolib:Cloud: Computation finished


2025-03-26 22:17:35,944 | INFO : Extracted zip file to: output/
2025-03-26 22:17:35,944 | INFO : Done in 3.11 seconds


INFO:biolib:Cloud: Result Ready
INFO:biolib:Waiting for job 02d2a7af-a169-4d7c-8daa-097eb9efbf58 to finish...
INFO:biolib:Job 02d2a7af-a169-4d7c-8daa-097eb9efbf58 has finished.
INFO:biolib:Saving 5 files to result...


In [25]:
def extract_outside_ranges(md_filename):
    """Return the ``outside`` (extracellular) ranges listed in a DeepTMHMM report.

    Every line of the file is searched for the pattern
    ``outside <start> <end>`` (whitespace-separated integers).

    Args:
        md_filename: Path to the DeepTMHMM results file to scan.

    Returns:
        A list of ``(start, end)`` integer tuples in the order they appear in
        the file; an empty list if no line matches.

    Raises:
        OSError: If the file cannot be opened.
    """
    outside_pattern = re.compile(r"outside\s+(\d+)\s+(\d+)")
    ranges_to_delete = []
    # Iterate over the file lazily; there is no need to hold all lines in memory.
    with open(md_filename, 'r') as file:
        for line in file:
            outside_match = outside_pattern.search(line)
            if outside_match:
                start = int(outside_match.group(1))
                end = int(outside_match.group(2))
                ranges_to_delete.append((start, end))
    return ranges_to_delete

def extract_inside_ranges(md_filename):
    """Collect the ``inside`` (intracellular) ranges from a DeepTMHMM report.

    Each line of the file is searched for ``inside <start> <end>``
    (whitespace-separated integers).

    Args:
        md_filename: Path to the DeepTMHMM results file to scan.

    Returns:
        A list of ``(start, end)`` integer tuples, in file order.
    """
    span_regex = re.compile(r"inside\s+(\d+)\s+(\d+)")
    with open(md_filename, 'r') as report:
        report_lines = report.readlines()
    # Search every line once, then keep only the lines that actually matched.
    hits = (span_regex.search(text) for text in report_lines)
    return [(int(hit.group(1)), int(hit.group(2))) for hit in hits if hit]

# Parse the DeepTMHMM report into intracellular ("inside") and extracellular
# ("outside") ranges.
# NOTE(review): the results were saved to the relative folder 'result', while this
# path is absolute — it assumes the working directory is /content; confirm when
# running outside Colab.
md_filename = "/content/result/deeptmhmm_results.md"
inside_ranges = extract_inside_ranges(md_filename)
outside_ranges = extract_outside_ranges(md_filename)

In [26]:
# Print the template ID and the extracted topology ranges so they can be checked
# by eye before the structure is trimmed.
print(pdb_id)
print("Extracellular:", outside_ranges)
print("Intracellular:", inside_ranges)

xxx2
Extracellular: [(1, 16), (72, 91), (155, 178), (294, 304)]
Intracellular: [(37, 49), (112, 133), (201, 274), (326, 588)]


# Trim the pdb
- Remove all extracellular residues
- Remove intracellular residues with pLDDT <= 70

In [27]:
def remove_residues(structure, ranges):
    """Detach, in place, every residue whose number falls inside one of ``ranges``.

    Args:
        structure: Iterable of models, each an iterable of chains; every chain
            must be iterable over residues and provide ``detach_child(id)``
            (e.g. a Bio.PDB structure).
        ranges: Iterable of inclusive ``(start, end)`` pairs. They are compared
            against ``residue.id[1]`` (the residue number), in every chain of
            every model.

    Returns:
        None. ``structure`` is modified in place.
    """
    # Materialise once so a one-shot iterable can be reused for every residue.
    ranges = list(ranges)
    for model in structure:
        for chain in model:
            # any() marks each residue at most once, even when ranges overlap;
            # queuing it twice would make the second detach_child fail.
            residues_to_delete = [
                residue
                for residue in chain
                if any(start <= residue.id[1] <= end for start, end in ranges)
            ]
            # Detach after the scan so the chain is not mutated while iterating.
            for residue in residues_to_delete:
                chain.detach_child(residue.id)

def remove_low_plddt_residues(structure, ranges, plddt_threshold=70):
    """Detach, in place, low-confidence residues located inside ``ranges``.

    A residue is removed when its number (``residue.id[1]``) lies within one of
    the inclusive ``(start, end)`` ranges and the B-factor of its first atom
    (used here as the pLDDT value) is less than or equal to ``plddt_threshold``.

    Args:
        structure: Iterable of models -> chains -> residues; chains must
            provide ``detach_child(id)``.
        ranges: Iterable of inclusive ``(start, end)`` pairs.
        plddt_threshold: Residues at or below this value are removed.

    Returns:
        None. ``structure`` is modified in place.
    """
    def _is_low_confidence(res):
        # Decide on the first range that contains the residue number.
        position = res.id[1]
        for lo, hi in ranges:
            if lo <= position <= hi:
                # The first atom's B-factor stands in for the residue's pLDDT.
                return res.child_list[0].bfactor <= plddt_threshold
        return False

    for model in structure:
        for chain in model:
            # Collect first, detach afterwards: never mutate while iterating.
            doomed = [res for res in chain if _is_low_confidence(res)]
            for res in doomed:
                chain.detach_child(res.id)


In [28]:
# Re-parse the input file so trimming starts from a fresh structure (the removal
# helpers modify the structure in place).
parser = PDBParser(QUIET=True)
structure = parser.get_structure("gpcr", pdb_file)

# Remove residues in the "outside" ranges
# NOTE(review): the ranges are matched against PDB residue numbers (residue.id[1]).
# They presumably are 1-based positions in the sequence sent to DeepTMHMM, so this
# assumes the PDB is numbered 1..N without gaps — confirm for your input file.
remove_residues(structure, outside_ranges)

# Remove "inside" residues with pLDDT <= 70
remove_low_plddt_residues(structure, inside_ranges, plddt_threshold=70)

# Save the trimmed pdb
# The file is written to <output_folder>/<pdb_id>.pdb.
io = PDBIO()
output_pdb_filename = os.path.join(output_folder, f"{pdb_id}.pdb")
io.set_structure(structure)
io.save(output_pdb_filename)

print(f"Saved modified PDB file: {output_pdb_filename}")


Saved modified PDB file: /content/xxx2.pdb
