In [1]:
from Bio.PDB import PDBParser, PDBIO
from Bio.PDB.Structure import Structure as BStructure
from Bio.PDB.Model import Model as BModel
from Bio.PDB.Chain import Chain as BChain
from Bio.PDB.Residue import Residue as BResidue
from Bio.PDB.Atom import Atom as BAtom

In [2]:
import json
import pickle
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
from multiprocessing import Pool
from tqdm import tqdm

from utils import *

Random seed set as 42


In [3]:
# Load the sequence-pair records. The context manager closes the file handle
# as soon as parsing is done instead of leaving it open for the kernel's lifetime.
with open("./data/sequence_pairs.json", "rb") as f:
    seqs = json.load(f)
len(seqs)

5388

In [12]:
# Collect per-residue backbone coordinates for the heavy, light and antigen chains.
# Debug run: only the entry whose pdb id is "5w08" is processed, and the loop stops
# right after it. `i` and `j` are deliberately plain loop variables because later
# cells read them after this cell has run.
for i in tqdm(range(len(seqs))):
    record = seqs[i]

    # Skip everything except the structure currently being inspected.
    if record["pdb"] != "5w08":
        continue

    raw_pdb = "../SAbDab_20221124/all_structures/raw/{}.pdb".format(record["pdb"])

    # Heavy chain
    record["Hpos"] = get_residue_pos(raw_pdb, chain=record["Hchain"])

    # Light chain
    record["Lpos"] = get_residue_pos(raw_pdb, chain=record["Lchain"])

    # Antigen chains: "Achain" is indexed element by element here.
    # NOTE(review): an earlier, now-disabled variant split it on " | ", so it was
    # presumably a delimited string at some point -- confirm the current format.
    record["Apos"] = []
    for j in range(len(record["Achain"])):
        record["Apos"].append(get_residue_pos(raw_pdb, chain=record["Achain"][j]))

    break

  0%|                                                                                 | 3/5388 [00:01<59:44,  1.50it/s]


In [15]:
def get_residue_pos(pdb_path="../SAbDab_20221124/all_structures/raw/7k5y.pdb", chain="M"):
    """Return backbone coordinates and the residue-letter string for one chain.

    The PDB file is parsed with Biopython and only the first model
    (``structure[0]``) is read.

    Args:
        pdb_path: Path of the PDB file to parse.
        chain: Id of the chain to read from the first model.

    Returns:
        A tuple ``(AA_coord, sequence)``:

        * ``AA_coord`` -- list with one ``(4, 3)`` array per residue of the chain,
          rows ordered N, CA, C, O. A backbone atom that is missing from a residue
          is replaced by the coordinate of that residue's first atom. A residue
          whose name is in neither ``AA_abbr`` nor ``AA_abbr_alias`` (or that has
          no atoms at all) gets a copy of the previous residue's array, or zeros
          when it is the first residue, so the list stays aligned with the string.
        * ``sequence`` -- concatenation of ``to_abbr(resname)`` over every residue
          of the chain, in iteration order.

    Raises:
        KeyError: If ``chain`` is not present in the first model.
    """
    structure = PDBParser().get_structure('input', pdb_path)

    AA_coord = []
    res_names = []
    for residue in structure[0][chain]:
        resname = residue.get_resname()
        res_names.append(to_abbr(resname))

        # First atom of *this* residue is the stand-in for missing backbone atoms.
        # Using next() avoids silently reusing an atom left over from a previous
        # residue when the current one contains no atoms.
        first_atom = next(iter(residue), None)
        is_amino_acid = (resname in AA_abbr) or (resname in AA_abbr_alias)

        if not is_amino_acid or first_atom is None:
            # Placeholder entry: repeat the previous coordinates (copied so the two
            # entries do not share memory) or start with zeros.
            AA_coord.append(AA_coord[-1].copy() if AA_coord else np.zeros((4, 3)))
            continue

        fallback = first_atom.get_coord()
        AA_coord.append(
            np.vstack([_backbone_atom_coord(residue, name, fallback)
                       for name in ('N', 'CA', 'C', 'O')])
        )

    return AA_coord, "".join(res_names)


def _backbone_atom_coord(residue, atom_name, fallback):
    """Return the coordinate of ``atom_name`` in ``residue``, or ``fallback`` if absent."""
    try:
        return residue[atom_name].get_coord()
    except KeyError:
        # Only a missing atom is tolerated; any other error should surface.
        return fallback

coord, res_names = get_residue_pos("../SAbDab_20221124/all_structures/raw/{}.pdb".format(seqs[i]["pdb"]), chain=seqs[i]["Achain"][j])

In [16]:
res_names

'ELVQNSSIGEICDSPHQILDGENCTLIDALLGDPQCDGFQNKKWDLFVERSKAYSNCYPYDVPDYASLRSLVASSGTLEFNNESFNWNGVTQNGTSSACIRRSNNSFFSRLNWLTHLNFKYPALNVTMPNNEQFDKLYIWGVHHPVTDKDQIFLYAQPSGRITVSTKRSQQAVIPNIGFRPRIRNIPSRISIYWTIVKPGDILLINSTGNLIAPRGYFKIRSGKSSIMRSDAPIGKCKSECITPNGSIPNDKPFQNVNRITYGACPRYVKQS******************************************************'

In [18]:
seqs[i]["Aseq"]

['TNATELVQNSSIGEICDSPHQILDGENCTLIDALLGDPQCDGFQNKKWDLFVERSKAYSNCYPYDVPDYASLRSLVASSGTLEFNNESFNWNGVTQNGTSSACIRRSNNSFFSRLNWLTHLNFKYPALNVTMPNNEQFDKLYIWGVHHPVTDKDQIFLYAQPSGRITVSTKRSQQAVIPNIGFRPRIRNIPSRISIYWTIVKPGDILLINSTGNLIAPRGYFKIRSGKSSIMRSDAPIGKCKSECITPNGSIPNDKPFQNVNRITYGACPRYVKQSTLKLATGGALEVLFQ']

In [17]:
seqs[i]["Achain"]

['C']

In [6]:
seqs[i]

{'pdb': '5w08',
 'Hchain': 'K',
 'Lchain': 'L',
 'Achain': ['C'],
 'Hseq': ['EVQLVQSGAEVKKPGASVKVSCKTSGYTFTAYYLHWVRQAPGQGFEWMAWINPNTGDTNYAQKFQGRVTLSRDTSITTAYMELTRLRSDDTAVYYCAKDLTLMYVFDSGWARGAHDYYGMDVWGQGTTVAVSGASTKGPSVFPLAPSSKSTSGGTAALGCLVKDYFPEPVTVSWNSGALTSGVHTFPAVLQSSGLYSLSSVVTVPSSSLGTQTYICNVNHKPSNTKVDKRVEPKSCDKHHHHHH'],
 'Lseq': ['PSALTQPASVSGSPGQSVTISCTGTNSDVGTFDLVSWYQQYPGKAPKLIIYEGSRRPSGVSDRFSGSKSGNTASLTISGLQAEDEADYYCSSYAGSVVFGGGTKLTVLGQPKGAPSVTLFPPSSEELQANKATLVCLISDFYPGAVTVAWKADSSPVKAGVETTTPSKQSNNKYAASSYLSLTPEQWKSHRSYSCQVTHEGSTVEKTVAPTECS'],
 'Aseq': ['TNATELVQNSSIGEICDSPHQILDGENCTLIDALLGDPQCDGFQNKKWDLFVERSKAYSNCYPYDVPDYASLRSLVASSGTLEFNNESFNWNGVTQNGTSSACIRRSNNSFFSRLNWLTHLNFKYPALNVTMPNNEQFDKLYIWGVHHPVTDKDQIFLYAQPSGRITVSTKRSQQAVIPNIGFRPRIRNIPSRISIYWTIVKPGDILLINSTGNLIAPRGYFKIRSGKSSIMRSDAPIGKCKSECITPNGSIPNDKPFQNVNRITYGACPRYVKQSTLKLATGGALEVLFQ'],
 'L1': 'NSDVGTFDL',
 'L2': 'EGS',
 'L3': 'SSYAGSVV',
 'H1': 'GYTFTAYY',
 'H2': 'INPNTGDT',
 'H3': 'AKDLTLMYVFDSGWARGAHDYYGMDV',
 'Hpos': [ar

In [7]:
len(seqs[i]["Aseq"][0])

291

In [8]:
len(seqs[i]["Apos"][0])

326

In [69]:
seqs[i]["Aseq"]

['TNATELVQNSSIGEICDSPHQILDGENCTLIDALLGDPQCDGFQNKKWDLFVERSKAYSNCYPYDVPDYASLRSLVASSGTLEFNNESFNWNGVTQNGTSSACIRRSNNSFFSRLNWLTHLNFKYPALNVTMPNNEQFDKLYIWGVHHPVTDKDQIFLYAQPSGRITVSTKRSQQAVIPNIGFRPRIRNIPSRISIYWTIVKPGDILLINSTGNLIAPRGYFKIRSGKSSIMRSDAPIGKCKSECITPNGSIPNDKPFQNVNRITYGACPRYVKQSTLKLATGGALEVLFQ']

In [70]:
seqs[i]["Apos"]

[[array([[-114.924,  -43.922,   14.948],
         [-114.946,  -43.396,   13.59 ],
         [-113.503,  -43.441,   13.061],
         [-112.672,  -44.184,   13.585]], dtype=float32),
  array([[-113.208,  -42.638,   12.036],
         [-111.908,  -42.608,   11.38 ],
         [-111.216,  -41.255,   11.526],
         [-110.336,  -40.926,   10.724]], dtype=float32),
  array([[-111.639,  -40.433,   12.484],
         [-110.934,  -39.204,   12.832],
         [-110.603,  -39.267,   14.32 ],
         [-111.493,  -39.492,   15.151]], dtype=float32),
  array([[-109.325,  -39.11 ,   14.652],
         [-108.911,  -39.109,   16.051],
         [-109.34 ,  -37.802,   16.709],
         [-108.913,  -36.722,   16.286]], dtype=float32),
  array([[-110.176,  -37.892,   17.745],
         [-110.81 ,  -36.71 ,   18.314],
         [-110.377,  -36.395,   19.739],
         [-110.878,  -35.425,   20.317]], dtype=float32),
  array([[-109.469,  -37.172,   20.326],
         [-109.098,  -36.968,   21.718],
         [-10

In [63]:
# Print the pdb id of every entry that has no "Aseqs" key and count those entries.
# NOTE(review): the other cells use the key "Aseq" (no trailing "s"); if that is the
# key meant here, this check matches every entry -- confirm the intended key.
cnt = 0
for i, entry in enumerate(seqs):
    if "Aseqs" in entry:
        continue
    print(entry["pdb"])
    cnt += 1
cnt

7t17
7t17
4o51
5w08
7lg6
7t6s
5mi0
2ny3
4yo0
7ugq
3qa3
6x1t
1nca
4ypg
7mxl
6hga
6wj1
3dvn
6mph
2vxs
6oor
7v4w
5bk2
7bh8
5hys
6x87
7rgp
6o2b
2ny0
7rlw
6vi0
7um3
6myy
7t4r
6xox
6bdz
6qb3
7dc8
7yqz
7rda
4f9p
7v23
6glw
4j8r
6mto
4ydv
7lja
3x3f
5zia
7p02
5vxr
6okm
2r56
5x2o
6z7w
7s0x
6cde
2qhr
5e2w
5dup
4hc1
4lmq
7t9b
5dsc
6wxl
3mlt
4ffv
3ulu
6w51
1vfb
3kj6
5kov
6cde
6xsk
6rp8
6awo
8err
6pzw
7s0x
6kva
6x5b
4qhu
6nqd
1ggi
4ypg
6x3x
6mfp
4xx1
7urc
6nms
4cad
4yr6
3q3g
2ht2
3opz
2b1h
7xav
6vel
5lbs
6s3t
7pc2
3vi3
7lue
2vdm
6mhg
6adc
6opo
7x26
5drz
5x0t
6mhg
3ks0
7fci
6v4n
8d5c
6d0u
2vwe
7x38
7khf
6wxl
7vgy
6vmj
5ifj
6h3t
4ers
2vc2
7ty0
7luc
6xp6
7rew
1zea
6q18
5ibl
7yu3
5a2i
1bog
7mdj
5cus
3eob
4ffw
7mly
3nid
3sge
6nf5
6xbk
1sy6
5xmh
5tkj
4ut6
3etb
8dpg
7yhk
1xct
6mb3
1yqv
7lo7
6uye
7ure
6z7y
7ue0
5mhr
3iu3
5drz
3ifp
5myk
4xgz
4rau
7z3a
7ew5
7eo4
6vmj
7zwf
6o9h
6m38
7vux
6x1v
2h2p
7njz
7rbt
5cws
6wo5
6bf4
4f15
4dn4
7v05
7ly9
6cf2
6x9r
2zcl
3wih
4yhp
7lex
8dpi
7jkt
6b3m
5ebl
4mqx
7tjq
5csz
7amr


5388

In [14]:
# Scratch attempt at reading residue coordinates by index.
# NOTE(review): as the recorded traceback shows, `chain` is a plain string when this
# cell runs, so `chain.get_residue` raises AttributeError. `len(["N"])` is 1, so the
# loop body is attempted exactly once.
for idx in range(len(["N"])):  # some chains do not participate
    residue = chain.get_residue(idx)
    coord = residue.get_coord_map()

AttributeError: 'str' object has no attribute 'get_residue'

In [26]:
list(chain.get_residues())

[<Residue ALA het=  resseq=231 icode= >,
 <Residue PRO het=  resseq=232 icode= >,
 <Residue GLU het=  resseq=233 icode= >,
 <Residue LEU het=  resseq=234 icode= >,
 <Residue LEU het=  resseq=235 icode= >,
 <Residue GLY het=  resseq=236 icode= >,
 <Residue HOH het=W resseq=301 icode= >]

In [50]:
for res in chain.get_residues():
#     res["N"].get_coord()
    break

In [51]:
res.get_resname()

'ALA'

In [53]:
res["N"].get_coord()

array([49.747, 17.411,  5.857], dtype=float32)

In [49]:
for res in chain.get_residues():
    print(res.get_resname())

ALA
PRO
GLU
LEU
LEU
GLY
HOH


In [43]:
for atom in res.get_atoms():
    break

In [47]:
t = 0
for atom in res.get_atoms():
    print(atom.get_name())
t

N
CA
C
O
CB


0

In [44]:
atom.get_name()

'N'

In [45]:
atom.get_coord()

array([49.747, 17.411,  5.857], dtype=float32)

In [24]:
# Scratch attempt: take chain "N" of the first model and try to read coordinates.
chain = structure[0]["N"]

# NOTE(review): `chain.get_residues()` yields a generator (see the recorded
# AttributeError), so `get_coord_map` is looked up on the generator and fails;
# the loop never gets past its first iteration.
for idx in range(len(chain)):  # some chains do not participate
    residue = chain.get_residues()
    coord = residue.get_coord_map()
    break

AttributeError: 'generator' object has no attribute 'get_coord_map'

In [15]:
# Scratch version of the backbone extraction for chain "N" of 4o51.pdb.
# Residues whose name is not in AA_abbr are skipped entirely (no placeholder).
p = PDBParser()
pdb_path = "../SAbDab_20221124/all_structures/raw/4o51.pdb"
chain = "N"
structure = p.get_structure('input', pdb_path)

AA_coord = []

for residue in structure[0][chain]:
    if residue.get_resname() not in AA_abbr:
        continue

    # First atom of this residue stands in for any missing backbone atom.
    # next() keeps an atom from a previous residue from being reused by accident.
    temp_atom = next(iter(residue), None)
    if temp_atom is None:
        continue  # residue without atoms: nothing to record

    backbone = []
    for atom_name in ('N', 'CA', 'C', 'O'):
        try:
            backbone.append(residue[atom_name].get_coord())
        except KeyError:
            # Only "atom not present" is tolerated; other errors should surface.
            backbone.append(temp_atom.get_coord())

    # Rows are ordered N, CA, C, O.
    coord = np.vstack(backbone)
    AA_coord.append(coord)

In [12]:
AA_coord

[array([[49.747, 17.411,  5.857],
        [49.37 , 18.293,  4.764],
        [48.776, 19.579,  5.295],
        [48.005, 19.539,  6.249]], dtype=float32),
 array([[49.154, 20.706,  4.68 ],
        [50.175, 20.896,  3.659],
        [51.491, 21.359,  4.239],
        [51.563, 21.519,  5.443]], dtype=float32),
 array([[52.497, 21.59 ,  3.398],
        [53.811, 22.09 ,  3.812],
        [54.439, 22.935,  2.701],
        [53.956, 22.96 ,  1.59 ]], dtype=float32),
 array([[55.523, 23.632,  2.984],
        [56.138, 24.515,  2.001],
        [56.72 , 23.751,  0.837],
        [57.497, 22.851,  1.016]], dtype=float32),
 array([[56.343, 24.135, -0.362],
        [56.765, 23.421, -1.568],
        [57.57 , 24.328, -2.507],
        [58.376, 23.855, -3.31 ]], dtype=float32),
 array([[57.322, 25.632, -2.402],
        [57.999, 26.647, -3.187],
        [59.301, 27.105, -2.557],
        [59.541, 26.917, -1.366]], dtype=float32)]

In [7]:
seqs[i]

{'pdb': '3vi3',
 'Hchain': 'H',
 'Lchain': 'L',
 'Achain': ['D'],
 'Hseq': ['QVHLQQSGAELMKPGASVKISCKATGYTFTSYWIEWVKQRPGHGLEWLGEILPGSGYIHYNEKFKGKATFTTDTSSNTAYMQLSSLTSEDSAVYYCSRALALYAMDYWGQGTSVTVSSAKTTPPSVYPLAPGSAAQTNSMVTLGCLVKGYFPEPVTVTWNSGSLSSGVHTFPAVLQSDLYTLSSSVTVPSSTWPSETVTCNVAHPASSTKVDKKIVPR'],
 'Lseq': ['DIVMTQATPSIPVTPGESVSISCRSNKSLLHSNGNTYLYWFLQRPGQSPRLLIFRMSNLASGVPDRFSGSGSGTAFTLRISRVEAADVGIYFCLQHLEYPFTFGAGTKLELKRADAAPTVSIFPPSSEQLTSGGASVVCFLNNFYPKDINVKWKIDGSERQNGVLNSWTDQDSKDSTYSMSSTLTLTKDEYERHNSYTCEATHKTSTSPIVKSFNRNEC'],
 'Aseq': ['QTDENRCLKANAKSCGECIQAGPNCGWCTNSTFLQEGMPTSARCDDLEALKKKGCPPDDIENPRGSKDIKKNKNVTNRSKGTAEKLKPEDIHQIQPQQLVLRLRSGEPQTFTLKFKRAEDYPIDLYYLMDLSYSMKDDLENVKSLGTDLMNEMRRITSDFRIGFGSFVEKTVMPYISTTPAKLRNPCTSEQNCTTPFSYKNVLSLTNKGEVFNELVGKQRISGNLDSPEGGFDAIMQVAVCGSLIGWRNVTRLLVFSTDAGFHFAGDGKLGGIVLPNDGQCHLENNMYTMSHYYDYPSIAHLVQKLSENNIQTIFAVTEEFQPVYKELKNLIPKSAVGTLSANSSNVIQLIIDAYNSLSSEVILENGKLSEGVTISYKSYCKNGVNGTGENGRKCSNISIGDEVQFEISITSNKCPKKDSDSFKIRPLGFTEEVEVILQYICECEGGLENLYFQ']

In [8]:
"../SAbDab_20221124/all_structures/raw/{}.pdb".format(seqs[i]["pdb"]), seqs[i]["Hchain"]

('../SAbDab_20221124/all_structures/raw/3vi3.pdb', 'H')

In [9]:
# Pick one entry by index for manual debugging; `pdb_path` and `chain` set here are
# module-level names that other cells read.
i = 777

pdb_path = "../SAbDab_20221124/all_structures/raw/{}.pdb".format(seqs[i]["pdb"])

# Heavy chain is selected; the disabled lines are the light-chain / antigen alternatives.
chain=seqs[i]["Hchain"]
# chain=seqs[i]["Lchain"]
# chain=Achains[j]

In [10]:
get_residue_pos("../SAbDab_20221124/all_structures/raw/{}.pdb".format(seqs[i]["pdb"]), chain=seqs[i]["Hchain"])

[array([[-40.098, -20.685,  41.029],
        [-39.076, -19.763,  40.541],
        [-37.703, -20.43 ,  40.481],
        [-37.115, -20.774,  41.509]], dtype=float32),
 array([[-37.207, -20.616,  39.261],
        [-35.934, -21.284,  39.034],
        [-35.347, -20.869,  37.69 ],
        [-36.01 , -20.965,  36.651]], dtype=float32),
 array([[-34.105, -20.395,  37.724],
        [-33.357, -20.096,  36.509],
        [-32.333, -21.195,  36.261],
        [-31.256, -21.201,  36.858]], dtype=float32),
 array([[-32.677, -22.132,  35.387],
        [-31.792, -23.246,  35.083],
        [-30.902, -22.903,  33.897],
        [-31.363, -22.326,  32.918]], dtype=float32),
 array([[-29.624, -23.254,  33.983],
        [-28.684, -22.925,  32.918],
        [-28.251, -24.158,  32.134],
        [-28.091, -25.242,  32.697]], dtype=float32),
 array([[-28.067, -23.982,  30.831],
        [-27.597, -25.058,  29.969],
        [-26.602, -24.5  ,  28.961],
        [-26.634, -23.31 ,  28.637]], dtype=float32),
 array([[-

In [11]:
# Scratch version of the backbone extraction for the `pdb_path` / `chain` chosen in an
# earlier cell. Residues whose name is not in AA_abbr are skipped (no placeholder).
p = PDBParser()

structure = p.get_structure('input', pdb_path)

AA_coord = []
# Fall back to the lower-case chain id when the given id is not in the first model.
chain = chain.lower() if chain not in [c.get_id() for c in structure[0].get_list()] else chain
print(chain, pdb_path)
for residue in structure[0][chain]:
    if residue.get_resname() not in AA_abbr:
        continue

    # First atom of this residue stands in for any missing backbone atom.
    # next() keeps an atom from a previous residue from being reused by accident.
    temp_atom = next(iter(residue), None)
    if temp_atom is None:
        continue  # residue without atoms: nothing to record

    backbone = []
    for atom_name in ('N', 'CA', 'C', 'O'):
        try:
            backbone.append(residue[atom_name].get_coord())
        except KeyError:
            # Only "atom not present" is tolerated; other errors should surface.
            backbone.append(temp_atom.get_coord())

    # Rows are ordered N, CA, C, O.
    coord = np.vstack(backbone)
    AA_coord.append(coord)

C ../SAbDab_20221124/all_structures/raw/4g6a.pdb


In [12]:
residue.get_resname(), residue.get_id(), len(AA_coord), AA_coord[0], AA_coord[-1]

('HOH',
 ('W', 345, ' '),
 220,
 array([[-40.098, -20.685,  41.029],
        [-39.076, -19.763,  40.541],
        [-37.703, -20.43 ,  40.481],
        [-37.115, -20.774,  41.509]], dtype=float32),
 array([[ -3.305, -53.304,  57.853],
        [ -2.185, -54.12 ,  57.393],
        [ -2.309, -55.574,  57.853],
        [ -2.136, -56.495,  57.055]], dtype=float32))

In [13]:
# Persist the annotated records. NOTE: the payload is a pickle even though the file
# is named ".json"; the name is kept because later cells read this exact path.
# The `with` block already closes the file, so no explicit close() is needed.
with open("./data/data.json", "wb") as f:
    pickle.dump(seqs, f)

# Filter out entries whose Aseq, Hseq, or Lseq is empty

In [14]:
# Reload the records written by the earlier dump cell. The context manager closes
# the handle right away.
# NOTE: pickle.load executes arbitrary code from the file -- only use it on files
# produced by this notebook.
with open("./data/data.json", "rb") as f:
    data = pickle.load(f)
len(data)

5388

In [15]:
# Keep only entries whose first antigen sequence is neither "" nor "**"; the index of
# every dropped entry is printed.
# NOTE(review): only data[i]["Aseq"][0] is inspected here -- Hseq/Lseq are not checked.
data1 = []

for i in tqdm(range(len(data))):
    first_antigen_seq = data[i]["Aseq"][0]
    if first_antigen_seq in ("", "**"):
        print("delete ", i)
        continue
    data1.append(data[i])
len(data1)

100%|██████████████████████████████████████████████████████████████████████████| 5388/5388 [00:00<00:00, 769036.61it/s]

delete  118
delete  377
delete  410
delete  413
delete  610
delete  693
delete  1181
delete  1302
delete  1328
delete  2033
delete  2661
delete  2687
delete  2774
delete  3123
delete  3214
delete  3295
delete  3505
delete  3560
delete  3570
delete  4176
delete  4223
delete  4235
delete  4253
delete  4351
delete  4532
delete  4776
delete  4910
delete  5034
delete  5175





5359

In [16]:
# Overwrite the stored records with the filtered list. NOTE: the payload is a pickle
# even though the file is named ".json".
# The `with` block already closes the file, so no explicit close() is needed.
with open("./data/data.json", "wb") as f:
    pickle.dump(data1, f)