In [None]:
import torch
print(f"CUDA available: {torch.cuda.is_available()}")

  from .autonotebook import tqdm as notebook_tqdm


PyTorch version: 1.13.1+cu117
CUDA available: False


In [10]:
import os
import pandas as pd
import torch
import esm
from Bio.PDB import PDBParser
from Bio.Data import IUPACData
from pathlib import Path

from proteinttt.models.esmfold import ESMFoldTTT, DEFAULT_ESMFOLD_TTT_CFG

import warnings
from Bio.PDB.PDBExceptions import PDBConstructionWarning
warnings.simplefilter('ignore', PDBConstructionWarning)

pd.set_option("display.max_columns", 500)
pd.set_option("display.max_rows", 10)
pd.set_option("max_colwidth", 100)

In [3]:
SABDAB_SUMMARY_PATH = Path("/scratch/project/open-32-14/pimenol1/ProteinTTT/ProteinTTT/data/20250714_0035730_summary.tsv")
OUTPUT_PATH = 'predicted_structures'
PDB_FILES_PATH = Path("./scratch/project/open-32-14/pimenol1/ProteinTTT/ProteinTTT/data/20250714_0035730_PDB/chothia")
os.makedirs(OUTPUT_PATH, exist_ok=True)

print(f"Summary file: {SABDAB_SUMMARY_PATH}")
print(f"PDB files path: {PDB_FILES_PATH}")
print(f"Output will be saved to: {OUTPUT_PATH}")

Summary file: /scratch/project/open-32-14/pimenol1/ProteinTTT/ProteinTTT/data/20250714_0035730_summary.tsv
PDB files path: scratch/project/open-32-14/pimenol1/ProteinTTT/ProteinTTT/data/20250714_0035730_PDB/chothia
Output will be saved to: predicted_structures


In [4]:
def get_sequence_from_pdb(pdb_file, h_chain_id, l_chain_id):
    """
    Extracts heavy and light chain sequences from a PDB file.
    """
    parser = PDBParser()
    try:
        structure = parser.get_structure("antibody", pdb_file)
    except FileNotFoundError:
        print(f"Warning: PDB file not found: {pdb_file}")
        return None

    sequences = {}
    for model in structure:
        for chain in model:
            chain_id = chain.get_id()
            if chain_id in [h_chain_id, l_chain_id]:
                seq = ""
                for residue in chain:
                    if residue.get_resname() in IUPACData.protein_letters_3to1:
                        seq += IUPACData.protein_letters_3to1[residue.get_resname()]
                sequences[chain_id] = seq

    if h_chain_id in sequences and l_chain_id in sequences:
        return sequences[h_chain_id] + sequences[l_chain_id]
    else:
        print(f"Warning: Could not find both H({h_chain_id}) and L({l_chain_id}) chains in {pdb_file}")
        return None

In [11]:
try:
    df = pd.read_csv(SABDAB_SUMMARY_PATH, sep='\t')
    print(f"Successfully loaded {len(df)} entries from the summary file.")
    df_subset = df.head(100)
    print(f"Processing a subset of {len(df_subset)} entries.")
except FileNotFoundError:
    print(f"Error: The summary file was not found at '{SABDAB_SUMMARY_PATH}'")
    df_subset = pd.DataFrame()

df_subset

Successfully loaded 437 entries from the summary file.
Processing a subset of 100 entries.


Unnamed: 0,pdb,Hchain,Lchain,model,antigen_chain,antigen_type,antigen_het_name,antigen_name,short_header,date,compound,organism,heavy_species,light_species,antigen_species,authors,resolution,method,r_free,r_factor,scfv,engineered,heavy_subclass,light_subclass,light_ctype,affinity,delta_g,affinity_method,temperature,pmid
0,7uow,H,L,0,A,protein,,spike glycoprotein,VIRAL PROTEIN,04/19/23,SARS-Cov2 S protein structure in complex with neutralizing monoclonal antibody 034_32,Homo sapiens; Severe acute respiratory syndrome coronavirus,homo sapiens,homo sapiens,severe acute respiratory syndrome coronavirus,"Patel, A., Ortlund, E.",0.0,ELECTRON MICROSCOPY,,,False,True,IGHV3,IGKV1,Kappa,,,,,
1,7uow,J,I,0,C,protein,,spike glycoprotein,VIRAL PROTEIN,04/19/23,SARS-Cov2 S protein structure in complex with neutralizing monoclonal antibody 034_32,Homo sapiens; Severe acute respiratory syndrome coronavirus,homo sapiens,homo sapiens,severe acute respiratory syndrome coronavirus,"Patel, A., Ortlund, E.",0.0,ELECTRON MICROSCOPY,,,False,True,IGHV3,IGKV1,Kappa,,,,,
2,7xj6,E,D,0,A,protein,,spike glycoprotein,VIRAL PROTEIN/IMMUNE SYSTEM,04/19/23,SARS-CoV-2 BA.1 Spike trimer in complex with 55A8 Fab and 58G6 Fab in the class 1 conformation,Homo sapiens; Severe acute respiratory syndrome coronavirus 2,homo sapiens,homo sapiens,severe acute respiratory syndrome coronavirus2,"Guo, H., Gao, Y., Lu, Y., Yang, H., Ji, X.",0.0,ELECTRON MICROSCOPY,,,False,True,IGHV1,IGKV1,Kappa,,,,,
3,7xj6,G,F,0,B,protein,,spike glycoprotein,VIRAL PROTEIN/IMMUNE SYSTEM,04/19/23,SARS-CoV-2 BA.1 Spike trimer in complex with 55A8 Fab and 58G6 Fab in the class 1 conformation,Homo sapiens; Severe acute respiratory syndrome coronavirus 2,homo sapiens,homo sapiens,severe acute respiratory syndrome coronavirus2,"Guo, H., Gao, Y., Lu, Y., Yang, H., Ji, X.",0.0,ELECTRON MICROSCOPY,,,False,True,IGHV1,IGKV1,Kappa,,,,,
4,7xj6,K,J,0,B,protein,,spike glycoprotein,VIRAL PROTEIN/IMMUNE SYSTEM,04/19/23,SARS-CoV-2 BA.1 Spike trimer in complex with 55A8 Fab and 58G6 Fab in the class 1 conformation,Homo sapiens; Severe acute respiratory syndrome coronavirus 2,homo sapiens,homo sapiens,severe acute respiratory syndrome coronavirus2,"Guo, H., Gao, Y., Lu, Y., Yang, H., Ji, X.",0.0,ELECTRON MICROSCOPY,,,False,True,IGHV1,IGKV3,Kappa,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,7x9y,S,S,0,A | B | G,protein | protein | protein,NA | NA | NA,guanine nucleotide-binding protein g(i) subunit alpha-1 | guanine nucleotide-binding protein g(i...,MEMBRANE PROTEIN,08/24/22,Cryo-EM structure of the apo CCR3-Gi complex,Homo sapiens,homo sapiens,,homo sapiens | homo sapiens | homo sapiens,"Shao, Z., Tan, Y., Shen, Q., Yao, B., Hou, L., Qin, J., Xu, P., Mao, C., Chen, L., Zhang, H., Sh...",0.0,ELECTRON MICROSCOPY,,,True,True,unknown,unknown,unknown,,,,,
96,7vkt,E,E,0,B | D | C,protein | protein | protein,NA | NA | NA,guanine nucleotide-binding protein g(i) subunit alpha-1 | guanine nucleotide-binding protein g(i...,MEMBRANE PROTEIN,03/09/22,cryo-EM structure of LTB4-bound BLT1 in complex with Gi protein,Homo sapiens,homo sapiens,,homo sapiens | homo sapiens | homo sapiens,"He, Y., Wang, N.",0.0,ELECTRON MICROSCOPY,,,True,True,unknown,unknown,unknown,,,,,
97,7ru3,H,L,0,A,protein,,spike glycoprotein,VIRAL PROTEIN/Immune System,08/24/22,CC6.33 IgG in complex with SARS-CoV-2-6P-Mut7 S protein (non-uniform refinement),Homo sapiens; Severe acute respiratory syndrome coronavirus 2,homo sapiens,homo sapiens,severe acute respiratory syndrome coronavirus2,"Ozorowski, G., Turner, H.L., Ward, A.B.",0.0,ELECTRON MICROSCOPY,,,False,True,IGHV1,IGKV3,Kappa,,,,,
98,7ru3,E,F,0,B,protein,,spike glycoprotein,VIRAL PROTEIN/Immune System,08/24/22,CC6.33 IgG in complex with SARS-CoV-2-6P-Mut7 S protein (non-uniform refinement),Homo sapiens; Severe acute respiratory syndrome coronavirus 2,homo sapiens,homo sapiens,severe acute respiratory syndrome coronavirus2,"Ozorowski, G., Turner, H.L., Ward, A.B.",0.0,ELECTRON MICROSCOPY,,,False,True,IGHV1,IGKV3,Kappa,,,,,
