<a href="https://colab.research.google.com/github/Zebreu/DeorphaNN/blob/main/preprocessing/template_trim.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This notebook runs DeepTMHMM on your GPCR and trims the GPCR PDB file based on each residue's predicted topology (extracellular or intracellular) and pLDDT.


In [None]:
#@title Upload PDB file (GPCR only)
from google.colab import files

# Open the Colab upload dialog. The returned mapping is keyed by file name.
uploaded = files.upload()

# Fail with a clear message if nothing came back (e.g. the dialog was
# dismissed); otherwise the indexing below raises an opaque IndexError.
if not uploaded:
    raise ValueError(
        "No file was uploaded. Re-run this cell and select a GPCR PDB file."
    )

# Only the first uploaded file is used; any additional uploads are ignored.
pdb_file = list(uploaded.keys())[0]

print(f"Uploaded PDB file: {pdb_file}")


In [None]:
#@title Dependencies
%%capture
import torch
import pandas as pd
import numpy as np
import os
import json
!pip install biopython

from Bio.PDB import PDBParser, PDBIO
from Bio.SeqUtils import seq1
from datetime import datetime

!pip3 install -qU pybiolib
import biolib
import re

import random
import string
# Random four-character label (uppercase letters and digits). It is used as
# the FASTA record name and as the stem of the trimmed output file.
_id_alphabet = string.ascii_uppercase + string.digits
pdb_id = ''.join(random.choices(_id_alphabet, k=4))

# Directory the trimmed PDB file is written to.
output_folder = "/content/"

# Run DeepTMHMM

In [None]:



# Parse the uploaded file and take the first chain of the first model.
parser = PDBParser(QUIET=True)
structure = parser.get_structure("protein", pdb_file)
model = structure[0]
chain = list(model.get_chains())[0]  # assumes the file holds a single chain

# Keep only residues whose id starts with a blank field (the first element of
# the residue id tuple), skipping everything else in the chain.
residues = []
for residue in chain.get_residues():
    hetero_field = residue.get_id()[0]
    if hetero_field == " ":
        residues.append(residue)

# Convert each three-letter residue name to one letter and concatenate.
one_letter_codes = [seq1(residue.get_resname()) for residue in residues]
sequence = "".join(one_letter_codes)

sequence

In [None]:
# Write the single-record FASTA query in pure Python instead of `echo -e`:
# `-e` is only interpreted by some shells, and a direct write avoids
# interpolating notebook variables into a shell command line.
with open("query.fasta", "w") as fasta_file:
    fasta_file.write(f">{pdb_id}\n{sequence}\n")

In [None]:
# Load the 'DTU/DeepTMHMM' application through pybiolib; the returned handle
# is what the run step below calls `.cli(...)` on.
deeptmhmm = biolib.load('DTU/DeepTMHMM')

In [None]:
# Run DeepTMHMM on the FASTA query file (query.fasta).
deeptmhmm_job = deeptmhmm.cli(args='--fasta query.fasta')
# Save the job's files into the 'result' directory. The parsing step later in
# the notebook reads /content/result/deeptmhmm_results.md, so this presumably
# relies on the working directory being /content — TODO confirm outside Colab.
deeptmhmm_job.save_files('result')

In [None]:
def extract_outside_ranges(md_filename):
    """Collect the residue ranges labelled ``outside`` in a DeepTMHMM report.

    Each line of the file is searched for the word ``outside`` followed by
    two whitespace-separated integers. At most one match is taken per line.

    Args:
        md_filename: Path to the DeepTMHMM results file.

    Returns:
        A list of ``(start, end)`` integer tuples in file order; empty when
        no line matches.

    Raises:
        OSError: If the file cannot be opened (e.g. it does not exist).
    """
    outside_pattern = re.compile(r"outside\s+(\d+)\s+(\d+)")
    ranges_to_delete = []
    with open(md_filename, 'r') as file:
        # Stream the file line by line; the whole report is never needed at once.
        for line in file:
            outside_match = outside_pattern.search(line)
            if outside_match:
                start = int(outside_match.group(1))
                end = int(outside_match.group(2))
                ranges_to_delete.append((start, end))
    return ranges_to_delete

def extract_inside_ranges(md_filename):
    """Collect the residue ranges labelled ``inside`` in a DeepTMHMM report.

    Every line is searched for the word ``inside`` followed by two
    whitespace-separated integers; at most one match is taken per line.

    Args:
        md_filename: Path to the DeepTMHMM results file.

    Returns:
        A list of ``(start, end)`` integer tuples in file order; empty when
        no line matches.
    """
    inside_pattern = re.compile(r"inside\s+(\d+)\s+(\d+)")
    with open(md_filename, 'r') as handle:
        rows = handle.readlines()
    # One search per line; lines without a match yield None and are skipped.
    hits = map(inside_pattern.search, rows)
    return [(int(hit.group(1)), int(hit.group(2))) for hit in hits if hit]

# DeepTMHMM report to parse.
md_filename = "/content/result/deeptmhmm_results.md"

# Residue ranges per topology label, both read from the same report.
outside_ranges = extract_outside_ranges(md_filename)
inside_ranges = extract_inside_ranges(md_filename)

In [None]:
# Show the run label followed by the ranges found for each topology label.
print(pdb_id)
for label, found in (
    ("Extracellular:", outside_ranges),
    ("Intracellular:", inside_ranges),
):
    print(label, found)

# Trim the pdb
- Remove all extracellular residues
- Remove intracellular residues with pLDDT <= 70

In [None]:
def remove_residues(structure, ranges):
    """Detach every residue whose number falls inside any of ``ranges``.

    Args:
        structure: Object iterated as models -> chains -> residues (the
            notebook passes a Biopython structure). Modified in place.
        ranges: Iterable of ``(start, end)`` pairs, inclusive at both ends,
            compared against ``residue.id[1]``.

    Returns:
        None.
    """
    # NOTE(review): the notebook passes ranges parsed from the DeepTMHMM
    # report, which are presumably 1-based positions in the query sequence.
    # They line up with residue.id[1] only if the PDB file is numbered
    # contiguously from 1 -- confirm for inputs that are not.

    # Materialise once so a one-shot iterable can be re-scanned per residue.
    ranges = list(ranges)
    for model in structure:
        for chain in model:
            # Collect first and detach afterwards so the chain is not modified
            # while it is being iterated. `any` queues a residue at most once
            # even when ranges overlap; queuing it twice would make the second
            # detach fail because the residue is already gone.
            ids_to_detach = [
                residue.id
                for residue in chain
                if any(start <= residue.id[1] <= end for start, end in ranges)
            ]
            for residue_id in ids_to_detach:
                chain.detach_child(residue_id)

def remove_low_plddt_residues(structure, ranges, plddt_threshold=70):
    """Detach in-range residues whose pLDDT is at or below a threshold.

    A residue is removed when its number (``residue.id[1]``) lies inside any
    ``(start, end)`` pair of ``ranges`` (inclusive) and the ``bfactor`` of its
    first child is ``<= plddt_threshold``. Residues outside every range are
    never inspected.

    Args:
        structure: Object iterated as models -> chains -> residues (the
            notebook passes a Biopython structure). Modified in place.
        ranges: List of inclusive ``(start, end)`` residue-number pairs.
        plddt_threshold: Cut-off; values equal to it are removed as well.

    Returns:
        None.
    """
    def _within_ranges(number):
        return any(low <= number <= high for low, high in ranges)

    def _is_low_confidence(res):
        # The pLDDT is read from the b-factor field of the residue's first child.
        return res.child_list[0].bfactor <= plddt_threshold

    for model in structure:
        for chain in model:
            # Decide on all removals before detaching anything from the chain.
            doomed_ids = [
                res.id
                for res in chain
                if _within_ranges(res.id[1]) and _is_low_confidence(res)
            ]
            for res_id in doomed_ids:
                chain.detach_child(res_id)


In [None]:
# Destination of the trimmed structure: <output_folder>/<pdb_id>.pdb
output_pdb_filename = os.path.join(output_folder, f"{pdb_id}.pdb")

# Parse the uploaded file again into a fresh structure object.
parser = PDBParser(QUIET=True)
structure = parser.get_structure("gpcr", pdb_file)

# Trim in place: first every residue in the "outside" ranges, then the
# "inside" residues with pLDDT <= 70.
remove_residues(structure, outside_ranges)
remove_low_plddt_residues(structure, inside_ranges, plddt_threshold=70)

# Write the trimmed structure to disk.
io = PDBIO()
io.set_structure(structure)
io.save(output_pdb_filename)

print(f"Saved modified PDB file: {output_pdb_filename}")
