In [1]:
from Bio.PDB import PDBParser, PDBIO
from Bio.PDB.Structure import Structure as BStructure
from Bio.PDB.Model import Model as BModel
from Bio.PDB.Chain import Chain as BChain
from Bio.PDB.Residue import Residue as BResidue
from Bio.PDB.Atom import Atom as BAtom

In [2]:
import json
import math
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
from tqdm import tqdm

from utils import *

# data

In [3]:
# Load the per-PDB CDR annotations. The parsed object is a nested dict:
# pdb id -> chain-pair key -> CDR name -> sequence string.
path = "./data/cdr_seqs_20221128.json"

with open(path, "rb") as f:
    cdr_seqs = json.load(f)

# Number of PDB entries with CDR annotations, followed by the raw mapping.
print(len(cdr_seqs))
cdr_seqs

6030


{'2hh0': {'HL': {'L1': 'QDIGNN',
   'L2': 'ATS',
   'L3': 'LQHDTFPLT',
   'H1': 'GFNIEDSY',
   'H2': 'IDPEDGET',
   'H3': 'GRGAYYIKEDF'}},
 '1mhp': {'XY': {'L1': 'SSVNH',
   'L2': 'LTS',
   'L3': 'QQWSGNPWT',
   'H1': 'GFTFSRYT',
   'H2': 'ISGGGHT',
   'H3': 'TRGFGDGGYFDV'},
  'HL': {'L1': 'SSVNH',
   'L2': 'LTS',
   'L3': 'QQWSGNPWT',
   'H1': 'GFTFSRYT',
   'H2': 'ISGGGHT',
   'H3': 'TRGFGDGGYFDV'}},
 '1mhh': {'DC': {'L1': 'QSLLNSRTRKNY',
   'L2': 'WAS',
   'L3': 'KQAYIPPLT',
   'H1': 'GYTFTDFS',
   'H2': 'VNTETGEP',
   'H3': 'ARFLLRQYFDV'},
  'BA': {'L1': 'QSLLNSRTRKNY',
   'L2': 'WAS',
   'L3': 'KQAYIPPLT',
   'H1': 'GYTFTDFS',
   'H2': 'VNTETGEP',
   'H3': 'ARFLLRQYFDV'}},
 '7st3': {'Z': {'H1': 'GSIFSINT', 'H2': 'ISSGGST', 'H3': 'YGLSYSNDDY'},
  'D': {'H1': 'GSIFSINT', 'H2': 'ISSGGST', 'H3': 'YGLSYSNDDY'},
  'N': {'H1': 'GSIFSINT', 'H2': 'ISSGGST', 'H3': 'YGLSYSNDDY'},
  'X': {'H1': 'GSIFSINT', 'H2': 'ISSGGST', 'H3': 'YGLSYSNDDY'},
  'B': {'H1': 'GSIFSINT', 'H2': 'ISSGGST', 'H3': 

In [4]:
cdr_seqs["7sk7"]

{'DC': {'L1': 'QSVSSA',
  'L2': 'SAS',
  'L3': 'QQYYYPLFT',
  'H1': 'GFNFSYSS',
  'H2': 'IYSSYGYT',
  'H3': 'ARVYPWWYYKYYHGALDY'},
 'K': {'H1': 'GRTISRYA', 'H2': 'ARRSGDGA', 'H3': 'AIDSDTFYSGSYDY'}}

In [5]:
summary_file = pd.read_csv("../../MSAI_Project/SAbDab_20221124/sabdab_summary_all.tsv", sep="\t")
summary_file

Unnamed: 0,pdb,Hchain,Lchain,model,antigen_chain,antigen_type,antigen_het_name,antigen_name,short_header,date,compound,organism,heavy_species,light_species,antigen_species,authors,resolution,method,r_free,r_factor,scfv,engineered,heavy_subclass,light_subclass,light_ctype,affinity,delta_g,affinity_method,temperature,pmid
0,7t17,J,K,0,C,protein,,core protein,VIRUS,11/23/22,Zika Virus asymmetric unit bound with IgM anti...,Homo sapiens; Zika virus,homo sapiens,homo sapiens,zika virus,"Miller, A.S., Kuhn, R.J.",0,ELECTRON MICROSCOPY,,,False,True,IGHV4,IGLV1,Lambda,,,,,
1,7t17,F,G,0,A,protein,,core protein,VIRUS,11/23/22,Zika Virus asymmetric unit bound with IgM anti...,Homo sapiens; Zika virus,homo sapiens,homo sapiens,zika virus,"Miller, A.S., Kuhn, R.J.",0,ELECTRON MICROSCOPY,,,False,True,IGHV4,IGLV1,Lambda,,,,,
2,6fe4,F,,0,A,protein,,shiga-like toxin 2 subunit b,TOXIN,12/29/17,Crystal structure of the complex between Shiga...,ENTEROBACTERIA PHAGE 933W; VICUGNA PACOS,vicugna pacos,,enterobacteria phage 933w,"Bernedo, R., Muyldermans, S., Sterckx, Y.G.J.",3.0,X-RAY DIFFRACTION,0.207,0.169,False,True,IGHV3,,,9.6e-09,-10.938139080820335,SPR,,TBD
3,7jmo,H,L,0,A,protein,,spike protein s1,IMMUNE SYSTEM,08/02/20,Crystal structure of SARS-CoV-2 receptor bindi...,SEVERE ACUTE RESPIRATORY SYNDROME CORONAVIRUS ...,homo sapiens,homo sapiens,severe acute respiratory syndrome coronavirus2,"Wu, N.C., Yuan, M., Liu, H., Zhu, X., Wilson, ...",2.359,X-RAY DIFFRACTION,0.238,0.195,False,True,IGHV3,IGKV3,Kappa,,,,,
4,7sgf,H,L,0,A | a,protein | protein,NA | NA,gpc-i53-50a | unknown,VIRAL PROTEIN/IMMUNE SYSTEM,10/12/22,Lassa virus glycoprotein construct (Josiah GPC...,Lassa mammarenavirus; Oryctolagus cuniculus,oryctolagus cuniculus,oryctolagus cuniculus,lassa mammarenavirus | lassa mammarenavirus,"Antanasijevic, A., Brouwer, P.J.M., Ward, A.B.",4.41,ELECTRON MICROSCOPY,,,False,True,unknown,unknown,unknown,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13426,7lo6,J,I,0,C,protein,,envelope glycoprotein bg505 sosip.664 gp120,VIRAL PROTEIN/IMMUNE SYSTEM,04/14/21,Structure of CD4 mimetic BNM-III-170 in comple...,HUMAN IMMUNODEFICIENCY VIRUS 1; HOMO SAPIENS,homo sapiens,homo sapiens,human immunodeficiency virus 1,"Jette, C.A., Bjorkman, P.J.",3.9,ELECTRON MICROSCOPY,,,False,True,IGHV1,IGKV3,Kappa,,,,,
13427,3vi3,H,L,0,D,protein,,integrin beta-1,CELL ADHESION/IMMUNE SYSTEM,09/21/11,Crystal structure of alpha5beta1 integrin head...,HOMO SAPIENS; MUS MUSCULUS,mus musculus,mus musculus,homo sapiens,"Nagae, M., Nogi, T., Takagi, J.",2.9,X-RAY DIFFRACTION,0.267,0.207,False,True,IGHV1,IGKV2,Kappa,,,,,
13428,6zdg,F,G,0,D,protein,,spike glycoprotein,VIRAL PROTEIN,06/14/20,Association of three complexes of largely stru...,SEVERE ACUTE RESPIRATORY SYNDROME CORONAVIRUS ...,homo sapiens,homo sapiens,severe acute respiratory syndrome coronavirus2,"Duyvesteyn, H.M.E., Zhou, D., Zhao, Y., Fry, E...",4.7,ELECTRON MICROSCOPY,,,False,True,IGHV3,IGKV1,Kappa,,,,,
13429,7sk7,K,,0,,,,,SIGNALING PROTEIN/IMMUNE SYSTEM,07/27/22,Cryo-EM structure of human ACKR3 in complex wi...,Homo sapiens; Lama glama,lama glama,,,"Yen, Y.C., Schafer, C.T., Gustavsson, M., Hand...",0,ELECTRON MICROSCOPY,,,False,True,IGHV1,,,,,,,


In [6]:
summary_file[summary_file["pdb"]=="7sk7"]

Unnamed: 0,pdb,Hchain,Lchain,model,antigen_chain,antigen_type,antigen_het_name,antigen_name,short_header,date,compound,organism,heavy_species,light_species,antigen_species,authors,resolution,method,r_free,r_factor,scfv,engineered,heavy_subclass,light_subclass,light_ctype,affinity,delta_g,affinity_method,temperature,pmid
4898,7sk7,D,C,0,B | A,protein | protein,NA | NA,stromal cell-derived factor 1 | atypical chemo...,SIGNALING PROTEIN/IMMUNE SYSTEM,07/27/22,Cryo-EM structure of human ACKR3 in complex wi...,Homo sapiens; Lama glama,homo sapiens,homo sapiens,homo sapiens | homo sapiens,"Yen, Y.C., Schafer, C.T., Gustavsson, M., Hand...",0,ELECTRON MICROSCOPY,,,False,True,IGHV3,IGKV1,Kappa,,,,,
13429,7sk7,K,,0,,,,,SIGNALING PROTEIN/IMMUNE SYSTEM,07/27/22,Cryo-EM structure of human ACKR3 in complex wi...,Homo sapiens; Lama glama,lama glama,,,"Yen, Y.C., Schafer, C.T., Gustavsson, M., Hand...",0,ELECTRON MICROSCOPY,,,False,True,IGHV1,,,,,,,


### Filtering protocol (Parapred, reference below)
- resolution better than 3 Å
- both VH and VL chains present
- pairwise antibody sequence similarity below 95%
- at least 5 residues in contact with the antigen

E. Liberis, P. Veličković, P. Sormanni, M. Vendruscolo, and P. Liò, "Parapred: antibody paratope prediction using convolutional and recurrent neural networks," Bioinformatics, vol. 34, no. 17, pp. 2944-2950, 2018.


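### My protocol:
- resolution < 4 Å
- heavy (VH), light (VL) and antigen (VA) chains are all annotated
- heavy, light and antigen chain IDs are all distinct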

In [7]:
def filtering(row):
    """Return 1 if a summary row passes the dataset filter, else 0.

    A row is accepted when all of the following hold:
      * "Hchain", "Lchain" and "antigen_chain" are all present
        (their string form is not "nan");
      * "resolution" is not the literal "NOT" and its last
        whitespace-separated token parses to a float below 4.0;
      * the heavy and light chain IDs differ;
      * neither the heavy nor the light chain ID appears among the
        " | "-separated antigen chain IDs.

    All chain comparisons are case-sensitive, so "H" and "h" count as
    different chains. A resolution of 0 passes the threshold.
    Q: what does it mean when resolution=0?
    """
    heavy = str(row["Hchain"])
    light = str(row["Lchain"])
    antigen = str(row["antigen_chain"])
    resolution = str(row["resolution"])

    # Missing annotations show up as float NaN, i.e. the string "nan".
    if "nan" in (heavy, light, antigen):
        return 0

    # Must be checked before float(): "NOT" is not a parseable number.
    if resolution == "NOT":
        return 0

    # `not (x < 4.0)` rather than `x >= 4.0` so that a NaN resolution is rejected.
    if not float(resolution.split()[-1]) < 4.0:
        return 0

    if heavy == light:
        return 0

    antigen_ids = antigen.split(" | ")
    if heavy in antigen_ids or light in antigen_ids:
        return 0

    return 1

# Flag every row with filtering() (1 = keep, 0 = drop), store the flag in a
# "valid" column, then keep only the rows flagged 1.
valid_flags = summary_file.apply(filtering, axis=1)
summary_file["valid"] = valid_flags
summary_file = summary_file[valid_flags == 1]
summary_file

Unnamed: 0,pdb,Hchain,Lchain,model,antigen_chain,antigen_type,antigen_het_name,antigen_name,short_header,date,compound,organism,heavy_species,light_species,antigen_species,authors,resolution,method,r_free,r_factor,scfv,engineered,heavy_subclass,light_subclass,light_ctype,affinity,delta_g,affinity_method,temperature,pmid,valid
0,7t17,J,K,0,C,protein,,core protein,VIRUS,11/23/22,Zika Virus asymmetric unit bound with IgM anti...,Homo sapiens; Zika virus,homo sapiens,homo sapiens,zika virus,"Miller, A.S., Kuhn, R.J.",0,ELECTRON MICROSCOPY,,,False,True,IGHV4,IGLV1,Lambda,,,,,,1
1,7t17,F,G,0,A,protein,,core protein,VIRUS,11/23/22,Zika Virus asymmetric unit bound with IgM anti...,Homo sapiens; Zika virus,homo sapiens,homo sapiens,zika virus,"Miller, A.S., Kuhn, R.J.",0,ELECTRON MICROSCOPY,,,False,True,IGHV4,IGLV1,Lambda,,,,,,1
3,7jmo,H,L,0,A,protein,,spike protein s1,IMMUNE SYSTEM,08/02/20,Crystal structure of SARS-CoV-2 receptor bindi...,SEVERE ACUTE RESPIRATORY SYNDROME CORONAVIRUS ...,homo sapiens,homo sapiens,severe acute respiratory syndrome coronavirus2,"Wu, N.C., Yuan, M., Liu, H., Zhu, X., Wilson, ...",2.359,X-RAY DIFFRACTION,0.238,0.195,False,True,IGHV3,IGKV3,Kappa,,,,,,1
6,4o51,B,A,0,N,peptide,,ides hinge peptide,IMMUNE SYSTEM,12/19/13,Crystal structure of the QAA variant of anti-h...,"ORYCTOLAGUS CUNICULUS, HOMO SAPIENS; SYNTHETIC...","oryctolagus cuniculus, homo sapiens","oryctolagus cuniculus, homo sapiens",homo sapiens,"Malia, T.J., Luo, J., Teplyakov, A., Gilliland...",2.204,X-RAY DIFFRACTION,0.213,0.182,False,True,IGHV1,IGKV1,Kappa,,,,,,1
9,7orb,E,F,0,X,protein,,spike protein s1,VIRAL PROTEIN,07/07/21,Crystal structure of the L452R mutant receptor...,SEVERE ACUTE RESPIRATORY SYNDROME CORONAVIRUS ...,homo sapiens,homo sapiens,severe acute respiratory syndrome coronavirus2,"Zhou, D., Ren, J., Stuart, D.I.",2.5,X-RAY DIFFRACTION,0.251,0.213,False,True,IGHV3,IGKV1,Kappa,,,,,,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13423,7tas,H,L,0,E,protein,,spike glycoprotein,VIRAL PROTEIN/IMMUNE SYSTEM,01/12/22,SARS-CoV-2 spike in complex with the S2K146 ne...,HOMO SAPIENS; SEVERE ACUTE RESPIRATORY SYNDROM...,homo sapiens,homo sapiens,severe acute respiratory syndrome coronavirus2,"Park, Y.J., Veesler, D., Seattle Structural Ge...",3.2,ELECTRON MICROSCOPY,,,False,True,IGHV3,IGLV1,Lambda,,,,,,1
13424,7wk0,A,B,0,C,protein,,spike protein s1,VIRAL PROTEIN,07/13/22,Local refine of Omicron spike bitrimer with 6m...,Homo sapiens; Severe acute respiratory syndrom...,homo sapiens,homo sapiens,severe acute respiratory syndrome coronavirus2,"Zhan, W.Q., Zhang, X., Chen, Z.G., Sun, L.",0,ELECTRON MICROSCOPY,,,False,True,IGHV3,IGLV3,Lambda,,,,,,1
13425,6ejm,H,h,0,B,protein,,cd81 antigen,CELL ADHESION,09/22/17,CRYSTAL STRUCTURE OF HUMAN CD81 LARGE EXTRACEL...,HOMO SAPIENS; MUS MUSCULUS,mus musculus,,homo sapiens,"Kuglstatter, A., Harris, S.F., Villasenor, A.",2.15,X-RAY DIFFRACTION,0.278,0.215,True,True,unknown,unknown,unknown,8.6e-10,-12.367556905430469,SPR,,TBD,1
13426,7lo6,J,I,0,C,protein,,envelope glycoprotein bg505 sosip.664 gp120,VIRAL PROTEIN/IMMUNE SYSTEM,04/14/21,Structure of CD4 mimetic BNM-III-170 in comple...,HUMAN IMMUNODEFICIENCY VIRUS 1; HOMO SAPIENS,homo sapiens,homo sapiens,human immunodeficiency virus 1,"Jette, C.A., Bjorkman, P.J.",3.9,ELECTRON MICROSCOPY,,,False,True,IGHV1,IGKV3,Kappa,,,,,,1


In [8]:
# Capitalise Hchain and Lchain, and upper-case the antigen chain IDs.
# `assign` builds a new DataFrame, so no columns are written into the
# boolean-filtered slice produced by the previous cell (which is what
# triggers pandas' SettingWithCopyWarning).
# NOTE(review): this collapses case-distinct chain IDs, e.g. a row with
# Hchain "H" / Lchain "h" ends up with "H" / "H" - confirm downstream
# lookups expect that.
summary_file = summary_file.assign(
    Hchain=summary_file["Hchain"].str.capitalize(),
    Lchain=summary_file["Lchain"].str.capitalize(),
    antigen_chain=summary_file["antigen_chain"].str.upper(),
)
summary_file

Unnamed: 0,pdb,Hchain,Lchain,model,antigen_chain,antigen_type,antigen_het_name,antigen_name,short_header,date,compound,organism,heavy_species,light_species,antigen_species,authors,resolution,method,r_free,r_factor,scfv,engineered,heavy_subclass,light_subclass,light_ctype,affinity,delta_g,affinity_method,temperature,pmid,valid
0,7t17,J,K,0,C,protein,,core protein,VIRUS,11/23/22,Zika Virus asymmetric unit bound with IgM anti...,Homo sapiens; Zika virus,homo sapiens,homo sapiens,zika virus,"Miller, A.S., Kuhn, R.J.",0,ELECTRON MICROSCOPY,,,False,True,IGHV4,IGLV1,Lambda,,,,,,1
1,7t17,F,G,0,A,protein,,core protein,VIRUS,11/23/22,Zika Virus asymmetric unit bound with IgM anti...,Homo sapiens; Zika virus,homo sapiens,homo sapiens,zika virus,"Miller, A.S., Kuhn, R.J.",0,ELECTRON MICROSCOPY,,,False,True,IGHV4,IGLV1,Lambda,,,,,,1
3,7jmo,H,L,0,A,protein,,spike protein s1,IMMUNE SYSTEM,08/02/20,Crystal structure of SARS-CoV-2 receptor bindi...,SEVERE ACUTE RESPIRATORY SYNDROME CORONAVIRUS ...,homo sapiens,homo sapiens,severe acute respiratory syndrome coronavirus2,"Wu, N.C., Yuan, M., Liu, H., Zhu, X., Wilson, ...",2.359,X-RAY DIFFRACTION,0.238,0.195,False,True,IGHV3,IGKV3,Kappa,,,,,,1
6,4o51,B,A,0,N,peptide,,ides hinge peptide,IMMUNE SYSTEM,12/19/13,Crystal structure of the QAA variant of anti-h...,"ORYCTOLAGUS CUNICULUS, HOMO SAPIENS; SYNTHETIC...","oryctolagus cuniculus, homo sapiens","oryctolagus cuniculus, homo sapiens",homo sapiens,"Malia, T.J., Luo, J., Teplyakov, A., Gilliland...",2.204,X-RAY DIFFRACTION,0.213,0.182,False,True,IGHV1,IGKV1,Kappa,,,,,,1
9,7orb,E,F,0,X,protein,,spike protein s1,VIRAL PROTEIN,07/07/21,Crystal structure of the L452R mutant receptor...,SEVERE ACUTE RESPIRATORY SYNDROME CORONAVIRUS ...,homo sapiens,homo sapiens,severe acute respiratory syndrome coronavirus2,"Zhou, D., Ren, J., Stuart, D.I.",2.5,X-RAY DIFFRACTION,0.251,0.213,False,True,IGHV3,IGKV1,Kappa,,,,,,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13423,7tas,H,L,0,E,protein,,spike glycoprotein,VIRAL PROTEIN/IMMUNE SYSTEM,01/12/22,SARS-CoV-2 spike in complex with the S2K146 ne...,HOMO SAPIENS; SEVERE ACUTE RESPIRATORY SYNDROM...,homo sapiens,homo sapiens,severe acute respiratory syndrome coronavirus2,"Park, Y.J., Veesler, D., Seattle Structural Ge...",3.2,ELECTRON MICROSCOPY,,,False,True,IGHV3,IGLV1,Lambda,,,,,,1
13424,7wk0,A,B,0,C,protein,,spike protein s1,VIRAL PROTEIN,07/13/22,Local refine of Omicron spike bitrimer with 6m...,Homo sapiens; Severe acute respiratory syndrom...,homo sapiens,homo sapiens,severe acute respiratory syndrome coronavirus2,"Zhan, W.Q., Zhang, X., Chen, Z.G., Sun, L.",0,ELECTRON MICROSCOPY,,,False,True,IGHV3,IGLV3,Lambda,,,,,,1
13425,6ejm,H,H,0,B,protein,,cd81 antigen,CELL ADHESION,09/22/17,CRYSTAL STRUCTURE OF HUMAN CD81 LARGE EXTRACEL...,HOMO SAPIENS; MUS MUSCULUS,mus musculus,,homo sapiens,"Kuglstatter, A., Harris, S.F., Villasenor, A.",2.15,X-RAY DIFFRACTION,0.278,0.215,True,True,unknown,unknown,unknown,8.6e-10,-12.367556905430469,SPR,,TBD,1
13426,7lo6,J,I,0,C,protein,,envelope glycoprotein bg505 sosip.664 gp120,VIRAL PROTEIN/IMMUNE SYSTEM,04/14/21,Structure of CD4 mimetic BNM-III-170 in comple...,HUMAN IMMUNODEFICIENCY VIRUS 1; HOMO SAPIENS,homo sapiens,homo sapiens,human immunodeficiency virus 1,"Jette, C.A., Bjorkman, P.J.",3.9,ELECTRON MICROSCOPY,,,False,True,IGHV1,IGKV3,Kappa,,,,,,1


In [9]:
# Keep only the identifier columns, rename "antigen_chain" to "Achain" and
# renumber the rows from 0. The chain returns a fresh DataFrame, so nothing is
# mutated in place on a slice of `summary_file`.
data = (
    summary_file[["pdb", "Hchain", "Lchain", "antigen_chain"]]
    .rename(columns={"antigen_chain": "Achain"})
    .reset_index(drop=True)
)
data

Unnamed: 0,pdb,Hchain,Lchain,Achain
0,7t17,J,K,C
1,7t17,F,G,A
2,7jmo,H,L,A
3,4o51,B,A,N
4,7orb,E,F,X
...,...,...,...,...
6534,7tas,H,L,E
6535,7wk0,A,B,C
6536,6ejm,H,H,B
6537,7lo6,J,I,C


In [10]:
# Distribution of antigen-chain counts among rows with more than one antigen chain.
# regex=False matters: str.contains treats its pattern as a regular expression by
# default, and " | " as a regex means "a space OR a space", i.e. it would match any
# value containing a space rather than the literal " | " separator.
antigen_chains = data.iloc[:, 3]
has_multiple = antigen_chains.str.contains(" | ", regex=False)
num_chains = antigen_chains[has_multiple].map(lambda x: len(x.split(" | ")))
num_chains.value_counts()

2    578
3     89
4      7
5      1
Name: Achain, dtype: int64

In [11]:
data[data.iloc[:, 3].str.len()>9]

Unnamed: 0,pdb,Hchain,Lchain,Achain
216,5cws,H,G,K | I | J | L
485,5y9c,H,L,D | A | C | E
2155,6e0c,M,M,C | A | B | D
2582,6lht,H,L,C | A | B | E | D
4857,6e0p,M,M,C | A | B | D
4993,7ssv,H,L,B | A | C | D
5281,5cws,B,A,E | C | D | F
6230,6e0p,N,N,G | E | F | H


In [13]:
# get_residue_seqs("../../MSAI_Project/SAbDab_20221124/all_structures/raw/7t6s.pdb", ["A", "B"])

In [14]:
# Add the sequence / CDR columns as placeholders. Each new column starts as a
# copy of a chain-ID column (directly, or via the Hseq/Lseq copies made first);
# insertion order below fixes the resulting column order.
placeholder_sources = {
    "Hseq": "Hchain",
    "Lseq": "Lchain",
    "Aseq": "Achain",
    "L1": "Lseq",
    "L2": "Lseq",
    "L3": "Lseq",
    "H1": "Hseq",
    "H2": "Hseq",
    "H3": "Hseq",
}
for new_col, source_col in placeholder_sources.items():
    data[new_col] = data[source_col]

data

Unnamed: 0,pdb,Hchain,Lchain,Achain,Hseq,Lseq,Aseq,L1,L2,L3,H1,H2,H3
0,7t17,J,K,C,J,K,C,K,K,K,J,J,J
1,7t17,F,G,A,F,G,A,G,G,G,F,F,F
2,7jmo,H,L,A,H,L,A,L,L,L,H,H,H
3,4o51,B,A,N,B,A,N,A,A,A,B,B,B
4,7orb,E,F,X,E,F,X,F,F,F,E,E,E
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6534,7tas,H,L,E,H,L,E,L,L,L,H,H,H
6535,7wk0,A,B,C,A,B,C,B,B,B,A,A,A
6536,6ejm,H,H,B,H,H,B,H,H,H,H,H,H
6537,7lo6,J,I,C,J,I,C,I,I,I,J,J,J


In [None]:
# names = {}

# for i in range(data.shape[0]):
#     pdb = data["pdb"].iloc[i]
#     if pdb in names:
#         names[pdb] += 1
#     else:
#         names[pdb] = 1


In [None]:
# data[data["pdb"]=="5w08"]

In [None]:
# names

In [None]:
# data.drop_duplicates(subset=['pdb'], keep="last")

In [15]:
data

Unnamed: 0,pdb,Hchain,Lchain,Achain,Hseq,Lseq,Aseq,L1,L2,L3,H1,H2,H3
0,7t17,J,K,C,J,K,C,K,K,K,J,J,J
1,7t17,F,G,A,F,G,A,G,G,G,F,F,F
2,7jmo,H,L,A,H,L,A,L,L,L,H,H,H
3,4o51,B,A,N,B,A,N,A,A,A,B,B,B
4,7orb,E,F,X,E,F,X,F,F,F,E,E,E
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6534,7tas,H,L,E,H,L,E,L,L,L,H,H,H
6535,7wk0,A,B,C,A,B,C,B,B,B,A,A,A
6536,6ejm,H,H,B,H,H,B,H,H,H,H,H,H
6537,7lo6,J,I,C,J,I,C,I,I,I,J,J,J


In [16]:
# Keep only the rows whose PDB id has an entry in `cdr_seqs`, then renumber the
# rows from 0. A vectorised membership test replaces the temporary
# "intersection" flag column and the row-wise apply.
data = data[data["pdb"].isin(list(cdr_seqs))].reset_index(drop=True)
data

Unnamed: 0,pdb,Hchain,Lchain,Achain,Hseq,Lseq,Aseq,L1,L2,L3,H1,H2,H3
0,7t17,J,K,C,J,K,C,K,K,K,J,J,J
1,7t17,F,G,A,F,G,A,G,G,G,F,F,F
2,4o51,B,A,N,B,A,N,A,A,A,B,B,B
3,5w08,K,L,C,K,L,C,L,L,L,K,K,K
4,7lg6,J,M,E | B,J,M,E | B,M,M,M,J,J,J
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5398,6d6u,K,L,D,K,L,D,L,L,L,K,K,K
5399,5yax,B,B,C,B,B,C,B,B,B,B,B,B
5400,6ejm,H,H,B,H,H,B,H,H,H,H,H,H
5401,7lo6,J,I,C,J,I,C,I,I,I,J,J,J


In [17]:
data.drop_duplicates(["pdb"])

Unnamed: 0,pdb,Hchain,Lchain,Achain,Hseq,Lseq,Aseq,L1,L2,L3,H1,H2,H3
0,7t17,J,K,C,J,K,C,K,K,K,J,J,J
2,4o51,B,A,N,B,A,N,A,A,A,B,B,B
3,5w08,K,L,C,K,L,C,L,L,L,K,K,K
4,7lg6,J,M,E | B,J,M,E | B,M,M,M,J,J,J
5,7t6s,E,E,A | B,E,E,A | B,E,E,E,E,E,E
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5392,1eo8,H,L,A,H,L,A,L,L,L,H,H,H
5393,4ps4,H,L,A,H,L,A,L,L,L,H,H,H
5396,6t3f,H,L,F,H,L,F,L,L,L,H,H,H
5397,4od2,B,A,S,B,A,S,A,A,A,B,B,B


In [18]:
# get_residue_seqs("../../MSAI_Project/SAbDab_20221124/all_structures/raw/4o51.pdb", ["B"])

In [21]:
# Build one plain-dict record per row of `data`: the chain identifiers, the
# residue sequences returned by get_residue_seqs() for the heavy / light /
# antigen chains, and the six CDR sequences looked up in `cdr_seqs`.
data_list = []

# Create the parser once and reuse it for every file instead of rebuilding it
# on each iteration.
p = PDBParser()

# The index is kept as `i` because later inspection cells read it after the loop.
for i in tqdm(range(data.shape[0])):
    row = data.iloc[i]
    record = {
        "pdb": row["pdb"],
        "Hchain": row["Hchain"],
        "Lchain": row["Lchain"],
        "Achain": row["Achain"],
    }

    pdb_path = "../../MSAI_Project/SAbDab_20221124/all_structures/raw/{}.pdb".format(record["pdb"])
    structure = p.get_structure('input', pdb_path)
    # Chain IDs present in the first model of the structure.
    all_chains = [c.get_id() for c in structure[0].get_list()]

    # At this point the *seq columns still hold chain IDs (" | "-separated when
    # there are several), which are passed on as the chains to extract.
    for seq_col in ("Hseq", "Lseq", "Aseq"):
        record[seq_col] = get_residue_seqs(pdb_path, row[seq_col].split(" | "), all_chains)

    # CDR annotations are keyed by the heavy + light chain IDs concatenated.
    # A missing pdb or chain-pair key raises KeyError, as before.
    cdrs = cdr_seqs[record["pdb"]][record["Hchain"] + record["Lchain"]]
    for cdr_name in ("L1", "L2", "L3", "H1", "H2", "H3"):
        record[cdr_name] = cdrs[cdr_name]

    data_list.append(record)

len(data_list)

100%|██████████████████████████████████████████████████████████████████████████████| 5403/5403 [25:23<00:00,  3.55it/s]


5403

In [22]:
data.iloc[i, :]

pdb       3vi3
Hchain       H
Lchain       L
Achain       D
Hseq         H
Lseq         L
Aseq         D
L1           L
L2           L
L3           L
H1           H
H2           H
H3           H
Name: 5402, dtype: object

In [23]:
cdr_seqs[data.iloc[i, 0]]

{'FE': {'L1': 'KSLLHSNGNTY',
  'L2': 'RMS',
  'L3': 'LQHLEYPFT',
  'H1': 'GYTFTSYW',
  'H2': 'ILPGSGYI',
  'H3': 'SRALALYAMDY'},
 'HL': {'L1': 'KSLLHSNGNTY',
  'L2': 'RMS',
  'L3': 'LQHLEYPFT',
  'H1': 'GYTFTSYW',
  'H2': 'ILPGSGYI',
  'H3': 'SRALALYAMDY'}}

In [24]:
# Print every record built for PDB id "7ks7"; nothing is printed when no
# record carries that id.
for entry in data_list:
    if entry["pdb"] != "7ks7":
        continue
    print(entry)

In [25]:
# Print every record built for PDB id "7lg6".
for entry in data_list:
    if entry["pdb"] != "7lg6":
        continue
    print(entry)

{'pdb': '7lg6', 'Hchain': 'J', 'Lchain': 'M', 'Achain': 'E | B', 'Hseq': ['EVQLVESGPGLVRPSETLSLTCAVSGDSISTNNGWSWIRQTPGKGLEWIGYINGRSGSTRYNPSLQSRVTISTDTSGNQFSLKVNSVTAADTAKYYCAFFWSTYYKRFDVWGPGVRVTVSSASTKGPSVFPLAPSSKSTSGGTAALGCLVKDYFPEPVTVSWNSGALTSGVHTFPAVLQSSGLYSLSSVVTVPSSSLGTQTYICNVNHKPSNTKVDKRVEPKSCD'], 'Lseq': ['AIRMTQSPAILSLSPGERATLSCRASQSVDSRLAWYQQKPGQSPRLLIYDVSSRATGIPDRFSGSGSGTEFTLTISSLEPEDVAVYFCHQENDWPWTFGQGTKVEIKRTVAAPSVFIFPPSDEQLKSGTASVVCLLNNFYPREAKVQWKVDNALQSGNSQESVTEQDSKDSTYSLSSTLTLSKADYEKHKVYACEVTHQGLSSPVTKSFNRGEC'], 'Aseq': ['AENLWVTVYYGVPVWKDAETTLFCASDAKAYETKKHNVWATHCCVPTDPNPQEIHLENVTEEFNMWKNNMVEQMHTDIISLWDQSLKPCVKLTPLCVTLQCTNVTNNITDDMRGELKNCSFNMTTELRDKKQKVYSLFYRLDVVQINENQGNRSNNSNKEYRLINCNTSAITQACPKVSFEPIPIHYCAPAGFAILKCKDKKFNGTGPCPSVSTVQCTHGIKPVVSTQLLLNGSLAEEEVMIRSENITNNAKNILVQFNTPVQINCTRPNNNTRKSIRIGPGQWFYATGDIIGDIRQAHCNVSKATWNETLGKVVKQLRKHFGNNTIIRFANSSGGDLEVTTHSFNCGGEFFYCNTSGLFNSTWISNTSVQGSNSTGSNDSITLPCRIKQIINMWQRIGQAMYAPPIQGVIRCVSNITGLILTRDGGSTNSTTETFRPGGGDMRDNWRSELYKYKVVK

In [26]:
data_list[0]

{'pdb': '7t17',
 'Hchain': 'J',
 'Lchain': 'K',
 'Achain': 'C',
 'Hseq': ['QVQLQESGPGLVKPSQTLSLTCAVSGGSISSGDSYWSWIRQHPGKGLEWIGSIYYSGSTYYNPSLKSRVTIPIDTSKNQFSLKLSSVTAADTAVYYCARHVGDLRVNDAFDIWGQGTMVTVSS'],
 'Lseq': ['QSVLTQPPSVSAAPGQKVTISCSGSSSNIGNNFVSWYQRLPGTPPKLLIYDSDKRPSGIPDRFSGSKSGTSATLGITGLQTGDEGDYYCGTWDRSLSVVVFGGGTKLTVL'],
 'Aseq': ['IRCIGVSNRDFVEGMSGGTWVDVVLEHGGCVTVMAQDKPTVDIELVTTTVSNMAEVRSYCYEASISDMASDSRCPTQGEAYLDKQSDTQYVCKRTLVDRGWGNGCGLFGKGSLVTCAKFACSKKMTGKSIQPENLEYRIMLSVHGSQHSGMIVNDTGHETDENRAKVEITPNSPRAEATLGGFGSLGLDCEPRTGLDFSDLYYLTMNNKHWLVHKEWFHDIPLPWHAGADTGTPHWNNKEALVEFKDAHAKRQTVVVLGSQEGAVHTALAGALEAEMDGAKGRLSSGHLKCRLKMDKLRLKGVSYSLCTAAFTFTKIPAETLHGTVTVEVQYAGTDGPCKVPAQMAVDMQTLTPVGRLITANPVITESTENSKMMLELDPPFGDSYIVIGVG'],
 'L1': 'SSNIGNNF',
 'L2': 'DSD',
 'L3': 'GTWDRSLSVVV',
 'H1': 'GGSISSGDSY',
 'H2': 'IYYSGST',
 'H3': 'ARHVGDLRVNDAFDI'}

In [27]:
data_list[4]

{'pdb': '7lg6',
 'Hchain': 'J',
 'Lchain': 'M',
 'Achain': 'E | B',
 'Hseq': ['EVQLVESGPGLVRPSETLSLTCAVSGDSISTNNGWSWIRQTPGKGLEWIGYINGRSGSTRYNPSLQSRVTISTDTSGNQFSLKVNSVTAADTAKYYCAFFWSTYYKRFDVWGPGVRVTVSSASTKGPSVFPLAPSSKSTSGGTAALGCLVKDYFPEPVTVSWNSGALTSGVHTFPAVLQSSGLYSLSSVVTVPSSSLGTQTYICNVNHKPSNTKVDKRVEPKSCD'],
 'Lseq': ['AIRMTQSPAILSLSPGERATLSCRASQSVDSRLAWYQQKPGQSPRLLIYDVSSRATGIPDRFSGSGSGTEFTLTISSLEPEDVAVYFCHQENDWPWTFGQGTKVEIKRTVAAPSVFIFPPSDEQLKSGTASVVCLLNNFYPREAKVQWKVDNALQSGNSQESVTEQDSKDSTYSLSSTLTLSKADYEKHKVYACEVTHQGLSSPVTKSFNRGEC'],
 'Aseq': ['AENLWVTVYYGVPVWKDAETTLFCASDAKAYETKKHNVWATHCCVPTDPNPQEIHLENVTEEFNMWKNNMVEQMHTDIISLWDQSLKPCVKLTPLCVTLQCTNVTNNITDDMRGELKNCSFNMTTELRDKKQKVYSLFYRLDVVQINENQGNRSNNSNKEYRLINCNTSAITQACPKVSFEPIPIHYCAPAGFAILKCKDKKFNGTGPCPSVSTVQCTHGIKPVVSTQLLLNGSLAEEEVMIRSENITNNAKNILVQFNTPVQINCTRPNNNTRKSIRIGPGQWFYATGDIIGDIRQAHCNVSKATWNETLGKVVKQLRKHFGNNTIIRFANSSGGDLEVTTHSFNCGGEFFYCNTSGLFNSTWISNTSVQGSNSTGSNDSITLPCRIKQIINMWQRIGQAMYAPPIQGVIRCVSNITGLILTRDGGSTNSTTETFRPGGGDMRDNWRSELY

In [28]:
# Persist the assembled records as JSON. The `with` block closes the file on
# exit, so no explicit close() is needed; json.dump streams straight to the
# file instead of building the whole document as one string first.
with open("./data/sequence_pairs.json", "w") as f:
    json.dump(data_list, f)

# remove repeated or similar (>95%) samples
- criterion:
- TODO

In [29]:
# Read the saved records back. The `with` block closes the file on exit, so no
# explicit close() is needed.
# NOTE: this rebinds `cdr_seqs`, which previously held the per-PDB CDR dict,
# to whatever is stored in sequence_pairs.json.
with open("./data/sequence_pairs.json", "rb") as f:
    cdr_seqs = json.load(f)

In [30]:
type(cdr_seqs), len(cdr_seqs)

(list, 5403)