## Imports

In [1]:
from JSU_lib import *

## Functions

In [751]:
def get_chains(df):
    prot_rows = df.query('group_PDB == "ATOM"')
    chains = set(sorted(prot_rows.label_asym_id.unique().tolist()))
    return chains

def get_ligs(df): # equivalent to pockets in a single structure
    hetatm_df = df.query('group_PDB != "ATOM" & label_comp_id not in @exclude').drop_duplicates(["label_comp_id", "label_asym_id", "label_seq_id"]) # excluding crystal by-products
    hetatm_tuples = list(zip(hetatm_df['label_comp_id'], hetatm_df['label_asym_id'], hetatm_df['label_seq_id']))
    return hetatm_tuples

def get_chain_lengths(df, chains):
    chain_len_dict = {}
    for chain in chains:
        chain_df = df.query('label_asym_id == @chain & group_PDB == "ATOM"')
        n_ress = len(chain_df.drop_duplicates('label_seq_id'))
        chain_len_dict[chain] = n_ress
    return chain_len_dict

def extract_dataset_info(ds_dir, ds_name, struc_files = None, lig_tuples_dict = None, pdb_fmt_filt = True, threshold = 6, exclude_ligands = None, n_atoms_t = None):
    if struc_files == None:
        if pdb_fmt_filt:
            struc_files = [el for el in os.listdir(ds_dir) if el.endswith(".pdb")]
        else:
            struc_files = [el for el in os.listdir(ds_dir)]
    #print(len(struc_files))
    if exclude_ligands != None:
        print(f'Exluding: {exclude_ligands}')
    if n_atoms_t != None:
        print(f'Exluding ligands with less than {n_atoms_t} atoms')
    ds_data = []
    for i, struc in enumerate(struc_files):
        struc_name, _ = os.path.splitext(struc)
        struc_path = os.path.join(ds_dir, struc)
        df = PDBXreader(inputfile = struc_path).atoms(format_type = "pdb", excluded = ())
        
        if lig_tuples_dict == None:
            lig_tuples = get_ligs(df)
        else:
            lig_tuples = lig_tuples_dict[struc_name]
            
        if exclude_ligands != None:
            lig_tuples = [lig_tup for lig_tup in lig_tuples if lig_tup[0] not in exclude_ligands]
            
        if n_atoms_t != None:
            lig_tuples_filt = []
            for lig_tup in lig_tuples:
                try:
                    if lig_n_atoms_dict[lig_tup[0]] > n_atoms_t:
                        lig_tuples_filt.append(lig_tup)
                    else:
                        continue
                except:
                    print(f'{lig_tup[0]} not found in dictionary!')
                    continue
        else:
            lig_tuples_filt = lig_tuples
            #lig_tuples = [lig_tup for lig_tup in lig_tuples if lig_n_atoms_dict[lig_tup[0]] > n_atoms_t]

        #print(len(lig_tuples))
        ligs = []#[lig_tup[0] for lig_tup in lig_tuples]
        binding_ress = get_binding_ress(df, lig_tuples_filt, struc, threshold)
        if sum([len(v) for v in binding_ress.values()]) == 0:
            print(F"CAREFUL: NO INTERACTIONS AT ALL FOR {struc}")
        binding_chains = {k: set(sorted([res[1] for res in v])) for k,v in binding_ress.items()}
        n_binding_ress = {k: len(v) for k,v in binding_ress.items()}
        
        chains = []
        flat_chains = []
        n_ress = []
        for k, v in binding_chains.items():
            #print(v)
            if v != []: # this is to only use those ligands with residues within 6A
                chains.append(v)
                flat_chains.extend(v)
                ligs.append(k[0])
                n_ress.append(n_binding_ress[k])
        un_chains = sorted(list(set(flat_chains)))
        chain_lens = get_chain_lengths(df, un_chains)
        n_chains = [len(el) for el in chains]
        ds_data.append([ds_name, struc_name, chains, n_chains, n_ress, ligs, len(ligs), chain_lens])
    ds_df = pd.DataFrame(ds_data, columns = ["dataset", "ID", "chains", "n_chains", "n_ress", "ligs", "n_ligs", "chain_lens"])
    return ds_df

def filter_by_distance(df, ligand_tuple, ID, threshold = 6):
    # Filter the DataFrame to find rows matching the ligand tuple
    ligand_df = df[(df['label_comp_id'] == ligand_tuple[0]) &
                   (df['label_asym_id'] == ligand_tuple[1]) &
                   (df['label_seq_id'] == ligand_tuple[2])]
    
    # Filter the DataFrame for rows where label_comp_id is in the list of amino acids
    prot_df = df[df['label_comp_id'].isin(aas)]
    
    # If either DataFrame is empty, return an empty DataFrame
    if ligand_df.empty or prot_df.empty:
        #print(len(ligand_df))
        print(ID, ligand_tuple)
        print("LIGAND ATOMS NOT FOUND")
        return []
    
    # Get coordinates for ligand and protein
    lig_coords = ligand_df[['Cartn_x', 'Cartn_y', 'Cartn_z']]
    prot_coords = prot_df[['Cartn_x', 'Cartn_y', 'Cartn_z']]
    
    # Calculate the distance matrix between ligand and protein coordinates
    dist_matrix = cdist(lig_coords, prot_coords, 'euclidean')
    
    # Check if any distances are below the threshold
    if np.any(dist_matrix < threshold):
        # If so, return the rows from the protein DataFrame where the condition is met
        # Get indices where the condition is met
        prot_indices = np.where(dist_matrix < threshold)[1]  # Get column indices from distance matrix
        unique_ress = prot_df.iloc[prot_indices].drop_duplicates(["label_comp_id", "label_asym_id", "label_seq_id"])
        unique_ress_tuples = list(zip(unique_ress['label_comp_id'], unique_ress['label_asym_id'], unique_ress['label_seq_id']))
        return unique_ress_tuples
    
    # Return empty DataFrame if no distances meet the criteria
    print(f"NO PROTEIN ATOMS WITHIN DISTANCE THRESHOLD for {ligand_tuple} of {ID}")
    return []

def get_binding_ress(df, ligand_tuples, ID, threshold = 6):
    binding_ress = {}
    for ligand_tuple in ligand_tuples:
        binding_ress_tuples = filter_by_distance(df, ligand_tuple, ID, threshold = threshold)
        #print(binding_ress_tuples)
        if binding_ress_tuples != []:
            binding_ress[ligand_tuple] = binding_ress_tuples
    return binding_ress
    
def get_scPDB_lig_name(file_path):
    with open(file_path, 'r') as file:
        lines = file.readlines()
    for line in lines:
        if line.startswith('@<TRIPOS>MOLECULE'):
            next_line_index = lines.index(line) + 1
            molecule_line = lines[next_line_index].strip()
            parts = molecule_line.split('_')
            if len(parts) > 1:
                molecule_name = parts[1]  # Extract the second element
    return molecule_name

def get_scPDB_site_residues(filepath):
    # Flag to start capturing data after finding the target section
    start_capture = False
    residues = []
    
    with open(filepath, 'r') as file:
        for line in file:
            line = line.strip()
            # Check for the start of the target section
            if line.startswith("@<TRIPOS>SUBSTRUCTURE"):
                start_capture = True
                continue
            # Check for the end of the target section
            if line.startswith("@<TRIPOS>SET"):
                break

            # Start capturing data after the section header
            if start_capture:
                if line:
                    parts = line.split()
                    if len(parts) < 2:
                        print(filepath)
                        print(parts)
                        raise
                    # Extract fields
                    res_id = parts[1]
                    chain_id1 = parts[5]
                    res_name_1 = parts[6]
                    res_num_1 = res_id[len(res_name_1):]

                    # Append the tuple to the list
                    residues.append((chain_id1, res_num_1, res_name_1))
    
    return residues

def get_scPDB_chain_lens(filepath):
    # Prepare a dictionary to hold the counts per chain
    chain_lens = {}
    
    # Variable to track when to start and stop reading residue data
    read_data = False
    
    #try:
    with open(filepath, 'r') as file:
        for line in file:
            # Check if it's time to start reading data
            if "@<TRIPOS>SUBSTRUCTURE" in line:
                read_data = True
                continue
            
            # Check if it's time to stop reading data
            if "@<TRIPOS>SET" in line:
                break
            
            # Process the lines between start and stop markers
            if read_data:
                # Split line into components
                parts = line.split()
                residue = parts[6]
                # Ensure line is properly formatted
                if len(parts) >= 7 and residue in aas:
                    chain_id = parts[5]
                    # Initialize or increment the chain ID and residue count
                    if chain_id not in chain_lens:
                        chain_lens[chain_id] = 0
                    chain_lens[chain_id] += 1
    
    #except Exception as e:
        #print(f"An error occurred: {e}")
    
    return chain_lens

def get_pdbbind_pdb2lig(file_path):
    with open(file_path, 'r') as file:
        lines = file.readlines()[1:]
    separator_index = lines.index('# ==============================================================================\n')
    data = lines[separator_index + 1:]
    df = pd.DataFrame([x.strip().split() for x in data if x.strip() != ''])
    df.drop(columns=[1, 2, 3, 4, 5, 6], inplace = True)
    df.columns = ["pdb_id", "ligand_name"]
    lig_names = df.ligand_name.tolist()
    lig_names_rf = [el.replace("(", '').replace(")", '') for el in lig_names]
    df.ligand_name = lig_names_rf
    pdb_to_lig = dict(zip(df.pdb_id, df.ligand_name))
    return pdb_to_lig

def parse_moad_csv(moad_csv):
    rows = []
    with open(moad_csv, "r") as f:
        for line in f:
            fields = line.strip().split(",")[:-1]
            f0 = fields[0]
            f2 = fields[2]
            f3 = fields[3]
            if f0 != '':
                FAM_ID = f0
                continue
            if f2 != '':
                REP_STRUC = f2
                continue
            if f3 != '':
                row = [FAM_ID, REP_STRUC,] + fields[3:]
                rows.append(row)
                
    columns = ["EC", "pdb_id", "lig", "validity", "aff_metric", "eqs", "aff_val", "aff_unit", "SMILES"]
    
    MOAD_leaders_df = pd.DataFrame(rows, columns = columns)
    MOAD_leaders_df.drop(columns= ["eqs", ], inplace = True)
    return MOAD_leaders_df

def parse_fasta(filename):
    with open(filename, 'r') as file:
        fasta_dict = {}
        current_id = ''
        for line in file:
            line = line.strip()
            if line.startswith('>'):
                current_id = line[1:]  # Remove the ">" character
                fasta_dict[current_id] = ''
            else:
                fasta_dict[current_id] += line
    return fasta_dict

def create_MOAD_lig_tuples_dict(df):
    grouped = df.groupby('pdb_id')
    ligand_dict = {pdb_id: list(zip(group['lig_resname'], group['lig_chain'], group['lig_resnum']))
                   for pdb_id, group in grouped}
    return ligand_dict

In [6]:
aas = [
    'ALA', 'ARG', 'ASN', 'ASP', 'CYS',
    'GLN', 'GLU', 'GLY', 'HIS', 'ILE',
    'LEU', 'LYS', 'MET', 'PHE', 'PRO',
    'SER', 'THR', 'TRP', 'TYR', 'VAL'
]

In [238]:
exclude = ["HOH", "SO4", "MSE", "GOL", "PO4", "EDO", "ACT", "DOD", "MES", "FMT", "DMS", "UNK", "ACY", "TRS"]

#HOH: 1,425,778
#SO4: 1,664
#MSE: 1,359
#GOL: 683
#PO4: 438
#EDO: 323
#PLP: 322
#ACT: 312
#DOD: 112
#MES: 92
#FMT: 91
#DMS: 80
#UNK: 79
#ACY: 65
#TRS: 56

In [145]:
ions = [
    "CA", "MG", "ZN", "CL", "MN", "NA", "K", "CD", "IOD", "CO", "FE",
    "CU", "FE2", "NI", "TL", "XE", "O", "LU", "CS", "BR", "OH", "TE",
    "GD", "AL", "YB", "SB", "PT", "F", "RB", "SM", "SR"]
#CA: 1,174
#MG: 1,087
#ZN: 1,033
#CL: 482
#MN: 319
#NA: 288
#K: 195
#CD: 178
#IOD: 113
#CO: 91
#FE: 72
#CU: 56
#FE2: 55
#NI: 50
#TL: 39
#XE: 22
#O: 19
#LU: 9
#CS: 8
#BR: 7
#OH: 7
#TE: 6
#GD: 5
#AL: 4
#YB: 4
#MO: 3
#SB: 3
#PT: 3
#F:2
#RB: 1
#SM: 1
#SR: 1

## Input data

In [3]:
DATASETS_dir = "./../DATA/DATASETS/"

In [4]:
datasets = [
    "CHEN11", "COACH420", "fptrain", "HOLO4K",
    "JOINED/ASTEX", "JOINED/B210", "JOINED/BU48", "JOINED/DT198",
]

## What do we want to know?

For each dataset:
1. Number of chains per structure
2. Chain length
3. Ligands (pockets) per structure
4. Ligand types (need to save ligand names)

## P2Rank datasets (folders containint PDB files)

In [612]:
lig_n_atoms_dict = read_from_pickle("./../DATA/lig_n_atoms_dict.pkl")

### CHEN11

In [561]:
p2rank_exclude = ["HOH", "DOD", "WAT", "NAG", "MAN", "UNK", "GLC", "ABA", "MPD", "GOL", "SO4", "PO4"]

In [700]:
CHEN11_df = extract_dataset_info(os.path.join(DATASETS_dir, "CHEN11"), "CHEN11", threshold = 4, exclude_ligands = p2rank_exclude, n_atoms_t = 4)

Exluding: ['HOH', 'DOD', 'WAT', 'NAG', 'MAN', 'UNK', 'GLC', 'ABA', 'MPD', 'GOL', 'SO4', 'PO4']
Exluding ligands with less than 4 atoms
CAREFUL: NO INTERACTIONS AT ALL FOR b.002.003.005_1oioa.pdb
CAREFUL: NO INTERACTIONS AT ALL FOR b.089.001.001_1iiya.pdb
CAREFUL: NO INTERACTIONS AT ALL FOR b.029.001.010_6cela.pdb
CAREFUL: NO INTERACTIONS AT ALL FOR c.055.001.003_1czan.pdb
CAREFUL: NO INTERACTIONS AT ALL FOR b.077.003.001_2hyra.pdb
CAREFUL: NO INTERACTIONS AT ALL FOR b.001.018.002_1uh4a.pdb
CAREFUL: NO INTERACTIONS AT ALL FOR b.003.001.001_1d3ca.pdb


In [703]:
len(CHEN11_df.query('n_ligs == 0')) #7

7

In [749]:
CHEN11_df.head(3)

Unnamed: 0,dataset,ID,chains,n_chains,n_ress,ligs,n_ligs,chain_lens
0,CHEN11,b.081.001.001_2jf2a,[{A}],[1],[17],[U20],1,{'A': 262}
1,CHEN11,d.142.001.009_2r84a,"[{A}, {A}]","[1, 1]","[11, 4]","[AMP, AMZ]",2,{'A': 230}
2,CHEN11,c.037.001.001_1e2fa,"[{A}, {A}]","[1, 1]","[12, 12]","[TMP, ADP]",2,{'A': 210}


In [704]:
CHEN11_df.query('n_ligs == 0')

Unnamed: 0,dataset,ID,chains,n_chains,n_ress,ligs,n_ligs,chain_lens
21,CHEN11,b.002.003.005_1oioa,[],[],[],[],0,{}
50,CHEN11,b.089.001.001_1iiya,[],[],[],[],0,{}
120,CHEN11,b.029.001.010_6cela,[],[],[],[],0,{}
152,CHEN11,c.055.001.003_1czan,[],[],[],[],0,{}
164,CHEN11,b.077.003.001_2hyra,[],[],[],[],0,{}
220,CHEN11,b.001.018.002_1uh4a,[],[],[],[],0,{}
244,CHEN11,b.003.001.001_1d3ca,[],[],[],[],0,{}


In [705]:
CHEN11_df.shape

(251, 8)

In [706]:
CHEN11_df.to_pickle("./results/DATASETS/CHEN11_df.pkl")

### B210

In [707]:
B210_df = extract_dataset_info(os.path.join(DATASETS_dir, "JOINED/B210"), "B210", threshold = 4, exclude_ligands = p2rank_exclude, n_atoms_t = 4)

Exluding: ['HOH', 'DOD', 'WAT', 'NAG', 'MAN', 'UNK', 'GLC', 'ABA', 'MPD', 'GOL', 'SO4', 'PO4']
Exluding ligands with less than 4 atoms
CAREFUL: NO INTERACTIONS AT ALL FOR 1ac0.pdb
NO PROTEIN ATOMS WITHIN DISTANCE THRESHOLD for ('PGH', 'B', '249') of 7tim.pdb
CAREFUL: NO INTERACTIONS AT ALL FOR 1anf.pdb
NO PROTEIN ATOMS WITHIN DISTANCE THRESHOLD for ('F89', 'B', '2') of 1tlc.pdb
CAREFUL: NO INTERACTIONS AT ALL FOR 4mbp.pdb
NO PROTEIN ATOMS WITHIN DISTANCE THRESHOLD for ('PGH', '2', '250') of 1tph.pdb
NO PROTEIN ATOMS WITHIN DISTANCE THRESHOLD for ('2GP', 'B', '105') of 2aad.pdb
NO PROTEIN ATOMS WITHIN DISTANCE THRESHOLD for ('HAR', '', '1012') of 3nos.pdb
NO PROTEIN ATOMS WITHIN DISTANCE THRESHOLD for ('HEM', '', '1010') of 3nos.pdb
CAREFUL: NO INTERACTIONS AT ALL FOR 1byb.pdb
CAREFUL: NO INTERACTIONS AT ALL FOR 1lmo.pdb
CAREFUL: NO INTERACTIONS AT ALL FOR 2gbp.pdb
CAREFUL: NO INTERACTIONS AT ALL FOR 2msb.pdb
NO PROTEIN ATOMS WITHIN DISTANCE THRESHOLD for ('CB3', 'B', '2') of 2tsc.pdb
C

In [708]:
len(B210_df.query('n_ligs == 0')) #8

8

In [709]:
B210_df.head(3)

Unnamed: 0,dataset,ID,chains,n_chains,n_ress,ligs,n_ligs,chain_lens
0,B210,1hfc,[{}],[1],[15],[HAP],1,{'': 157}
1,B210,1a9u,[{}],[1],[14],[SB2],1,{'': 351}
2,B210,1aoe,"[{A}, {A}]","[1, 1]","[24, 10]","[NDP, GW3]",2,{'A': 192}


In [710]:
B210_df.shape

(210, 8)

In [711]:
B210_df.to_pickle("./results/DATASETS/B210_df.pkl")

### BU48

In [712]:
BU48_df = extract_dataset_info(os.path.join(DATASETS_dir, "JOINED/BU48"), "BU48", threshold = 4, exclude_ligands = p2rank_exclude, n_atoms_t = 4)

Exluding: ['HOH', 'DOD', 'WAT', 'NAG', 'MAN', 'UNK', 'GLC', 'ABA', 'MPD', 'GOL', 'SO4', 'PO4']
Exluding ligands with less than 4 atoms
CAREFUL: NO INTERACTIONS AT ALL FOR 1hel.pdb
CAREFUL: NO INTERACTIONS AT ALL FOR 1apu.pdb
CAREFUL: NO INTERACTIONS AT ALL FOR 3app.pdb
CAREFUL: NO INTERACTIONS AT ALL FOR 1bya.pdb
CAREFUL: NO INTERACTIONS AT ALL FOR 1byb.pdb
CAREFUL: NO INTERACTIONS AT ALL FOR 1hew.pdb


In [713]:
len(BU48_df.query('n_ligs == 0')) #6

6

In [714]:
BU48_df.head(3)

Unnamed: 0,dataset,ID,chains,n_chains,n_ress,ligs,n_ligs,chain_lens
0,BU48,1hfc,[{}],[1],[15],[HAP],1,{'': 157}
1,BU48,1srf,"[{A}, {B}]","[1, 1]","[16, 16]","[MTB, MTB]",2,"{'A': 116, 'B': 116}"
2,BU48,5cpa,[{}],[1],[14],[FVF],1,{'': 307}


In [715]:
BU48_df.shape

(96, 8)

In [716]:
BU48_df.to_pickle("./results/DATASETS/BU48_df.pkl")

### DT198

In [717]:
DT198_df = extract_dataset_info(os.path.join(DATASETS_dir, "JOINED/DT198"), "DT198", threshold = 4, exclude_ligands = p2rank_exclude, n_atoms_t = 4)

Exluding: ['HOH', 'DOD', 'WAT', 'NAG', 'MAN', 'UNK', 'GLC', 'ABA', 'MPD', 'GOL', 'SO4', 'PO4']
Exluding ligands with less than 4 atoms
CAREFUL: NO INTERACTIONS AT ALL FOR 1dp0_A.pdb
CAREFUL: NO INTERACTIONS AT ALL FOR 1r6n_A.pdb
CAREFUL: NO INTERACTIONS AT ALL FOR 1ltq_A.pdb
DVV not found in dictionary!
CAREFUL: NO INTERACTIONS AT ALL FOR 1pn3_A.pdb
CAREFUL: NO INTERACTIONS AT ALL FOR 1q1c_A.pdb
CAREFUL: NO INTERACTIONS AT ALL FOR 2biu_X.pdb
CAREFUL: NO INTERACTIONS AT ALL FOR 3k4v_A.pdb
CAREFUL: NO INTERACTIONS AT ALL FOR 2ch5_A.pdb
NO PROTEIN ATOMS WITHIN DISTANCE THRESHOLD for ('GAI', 'A', '132') of 1rbw_A.pdb
CAREFUL: NO INTERACTIONS AT ALL FOR 1kmv_A.pdb
CAREFUL: NO INTERACTIONS AT ALL FOR 2agd_A.pdb
CAREFUL: NO INTERACTIONS AT ALL FOR 1tz8_A.pdb
CAREFUL: NO INTERACTIONS AT ALL FOR 1tpf_A.pdb
CAREFUL: NO INTERACTIONS AT ALL FOR 1lj5_A.pdb
CAREFUL: NO INTERACTIONS AT ALL FOR 1c1p_A.pdb
CAREFUL: NO INTERACTIONS AT ALL FOR 2jhf_A.pdb
CAREFUL: NO INTERACTIONS AT ALL FOR 2of1_A.pdb
CAR

In [661]:
#DVV not found in dictionary!

In [718]:
len(DT198_df.query('n_ligs == 0')) #18

18

In [719]:
DT198_df.head(3)

Unnamed: 0,dataset,ID,chains,n_chains,n_ress,ligs,n_ligs,chain_lens
0,DT198,1dp0_A,[],[],[],[],0,{}
1,DT198,2ab2_A,[{A}],[1],[19],[SNL],1,{'A': 259}
2,DT198,1j3j_A,[{A}],[1],[12],[CP6],1,{'A': 221}


In [720]:
DT198_df.shape

(198, 8)

In [721]:
DT198_df.to_pickle("./results/DATASETS/DT198_df.pkl")

### ASTEX

In [722]:
ASTEX_df = extract_dataset_info(os.path.join(DATASETS_dir, "JOINED/ASTEX"), "ASTEX", threshold = 4, exclude_ligands = p2rank_exclude, n_atoms_t = 4)

Exluding: ['HOH', 'DOD', 'WAT', 'NAG', 'MAN', 'UNK', 'GLC', 'ABA', 'MPD', 'GOL', 'SO4', 'PO4']
Exluding ligands with less than 4 atoms
NO PROTEIN ATOMS WITHIN DISTANCE THRESHOLD for ('IOH', '', '502') of 1q1g.pdb
NO PROTEIN ATOMS WITHIN DISTANCE THRESHOLD for ('BMA', 'B', '1202') of 1l7f.pdb
NO PROTEIN ATOMS WITHIN DISTANCE THRESHOLD for ('FUC', 'D', '1150') of 1n1m.pdb


In [723]:
len(ASTEX_df.query('n_ligs == 0')) #0

0

In [724]:
ASTEX_df.head(3)

Unnamed: 0,dataset,ID,chains,n_chains,n_ress,ligs,n_ligs,chain_lens
0,ASTEX,1j3j,"[{A}, {A}, {C}]","[1, 1, 1]","[12, 27, 11]","[CP6, NDP, UMP]",3,"{'A': 221, 'C': 326}"
1,ASTEX,1n2j,"[{A}, {A}]","[1, 1]","[10, 2]","[PAF, BAL]",2,{'A': 287}
2,ASTEX,1ia1,"[{A}, {A}]","[1, 1]","[25, 13]","[NDP, TQ3]",2,{'A': 192}


In [725]:
ASTEX_df.shape

(85, 8)

In [726]:
ASTEX_df.to_pickle("./results/DATASETS/ASTEX_df.pkl")

### JOINED

In [727]:
JOINED_df = pd.concat(
    [
        B210_df, BU48_df, DT198_df, ASTEX_df
    ]
).reset_index(drop = True)

JOINED_df.dataset = "JOINED"

In [728]:
len(JOINED_df.query('n_ligs == 0')) #29

32

In [729]:
JOINED_df.head(3)

Unnamed: 0,dataset,ID,chains,n_chains,n_ress,ligs,n_ligs,chain_lens
0,JOINED,1hfc,[{}],[1],[15],[HAP],1,{'': 157}
1,JOINED,1a9u,[{}],[1],[14],[SB2],1,{'': 351}
2,JOINED,1aoe,"[{A}, {A}]","[1, 1]","[24, 10]","[NDP, GW3]",2,{'A': 192}


In [730]:
JOINED_df.shape

(589, 8)

In [731]:
JOINED_df.to_pickle("./results/DATASETS/JOINED_df.pkl")

### fptrain

In [732]:
fptrain_df = extract_dataset_info(os.path.join(DATASETS_dir, "fptrain"), "fptrain", threshold = 4, exclude_ligands = p2rank_exclude, n_atoms_t = 4) # what did they filter?

Exluding: ['HOH', 'DOD', 'WAT', 'NAG', 'MAN', 'UNK', 'GLC', 'ABA', 'MPD', 'GOL', 'SO4', 'PO4']
Exluding ligands with less than 4 atoms
NO PROTEIN ATOMS WITHIN DISTANCE THRESHOLD for ('REA', '', '301') of 1K74.pdb
CAREFUL: NO INTERACTIONS AT ALL FOR 1E8U.pdb
CAREFUL: NO INTERACTIONS AT ALL FOR 1GUZ.pdb
NO PROTEIN ATOMS WITHIN DISTANCE THRESHOLD for ('ACO', '', '800') of 1P7T.pdb
NO PROTEIN ATOMS WITHIN DISTANCE THRESHOLD for ('PYR', '', '810') of 1P7T.pdb
NO PROTEIN ATOMS WITHIN DISTANCE THRESHOLD for ('PEG', '', '701') of 1P7T.pdb
NO PROTEIN ATOMS WITHIN DISTANCE THRESHOLD for ('BMA', '', '304') of 1AXZ.pdb
NO PROTEIN ATOMS WITHIN DISTANCE THRESHOLD for ('XYP', '', '305') of 1AXZ.pdb
NO PROTEIN ATOMS WITHIN DISTANCE THRESHOLD for ('SF4', '', '7290') of 1FP6.pdb
NO PROTEIN ATOMS WITHIN DISTANCE THRESHOLD for ('URE', '', '214') of 1DDR.pdb
NO PROTEIN ATOMS WITHIN DISTANCE THRESHOLD for ('URE', '', '215') of 1DDR.pdb
CAREFUL: NO INTERACTIONS AT ALL FOR 1BD4.pdb
CAREFUL: NO INTERACTIONS AT

In [735]:
len(fptrain_df.query('n_ligs == 0')) #51

51

In [733]:
fptrain_df.head(3)

Unnamed: 0,dataset,ID,chains,n_chains,n_ress,ligs,n_ligs,chain_lens
0,fptrain,1H0S,[{A}],[1],[13],[FA6],1,{'A': 137}
1,fptrain,1A9U,[{A}],[1],[14],[SB2],1,{'A': 351}
2,fptrain,1A80,[{A}],[1],[27],[NAP],1,{'A': 277}


In [734]:
fptrain_df.shape

(263, 8)

In [736]:
fptrain_df.to_pickle("./results/DATASETS/fptrain_df.pkl")

### COACH420

In [737]:
COACH420_df = extract_dataset_info(os.path.join(DATASETS_dir, "COACH420"), "COACH420", threshold = 4, exclude_ligands = p2rank_exclude, n_atoms_t = 4)

Exluding: ['HOH', 'DOD', 'WAT', 'NAG', 'MAN', 'UNK', 'GLC', 'ABA', 'MPD', 'GOL', 'SO4', 'PO4']
Exluding ligands with less than 4 atoms
CAREFUL: NO INTERACTIONS AT ALL FOR 1teiA.pdb
NO PROTEIN ATOMS WITHIN DISTANCE THRESHOLD for ('BMA', 'A', '471') of 2qwiA.pdb
CBS not found in dictionary!
CAREFUL: NO INTERACTIONS AT ALL FOR 1zu0A.pdb
CAREFUL: NO INTERACTIONS AT ALL FOR 3ct5A.pdb
NO PROTEIN ATOMS WITHIN DISTANCE THRESHOLD for ('BGC', 'A', '2299') of 2vu9A.pdb
NO PROTEIN ATOMS WITHIN DISTANCE THRESHOLD for ('GAL', 'A', '2300') of 2vu9A.pdb
CAREFUL: NO INTERACTIONS AT ALL FOR 2j72B.pdb
CAREFUL: NO INTERACTIONS AT ALL FOR 2v8lA.pdb
CAREFUL: NO INTERACTIONS AT ALL FOR 1lzsA.pdb
NO PROTEIN ATOMS WITHIN DISTANCE THRESHOLD for ('GAL', 'A', '181') of 1cqfB.pdb
NO PROTEIN ATOMS WITHIN DISTANCE THRESHOLD for ('BGC', 'A', '182') of 1cqfB.pdb
NO PROTEIN ATOMS WITHIN DISTANCE THRESHOLD for ('BGC', 'B', '282') of 1cqfB.pdb
NO PROTEIN ATOMS WITHIN DISTANCE THRESHOLD for ('BGC', 'B', '272') of 1cqfB.pd

In [738]:
len(COACH420_df.query('n_ligs == 0')) #7

7

In [739]:
COACH420_df.head(3)

Unnamed: 0,dataset,ID,chains,n_chains,n_ress,ligs,n_ligs,chain_lens
0,COACH420,1ogoX,[{X}],[1],[4],[BGC],1,{'X': 572}
1,COACH420,830cA,[{A}],[1],[16],[RS1],1,{'A': 164}
2,COACH420,2e6uX,[{X}],[1],[22],[COA],1,{'X': 142}


In [740]:
COACH420_df.shape

(420, 8)

In [741]:
COACH420_df.to_pickle("./results/DATASETS/COACH420_df.pkl")

### HOLO4K

In [742]:
HOLO4K_df = extract_dataset_info(os.path.join(DATASETS_dir, "HOLO4K"), "HOLO4K", threshold = 4, exclude_ligands = p2rank_exclude, n_atoms_t = 4)

Exluding: ['HOH', 'DOD', 'WAT', 'NAG', 'MAN', 'UNK', 'GLC', 'ABA', 'MPD', 'GOL', 'SO4', 'PO4']
Exluding ligands with less than 4 atoms
NO PROTEIN ATOMS WITHIN DISTANCE THRESHOLD for ('BMA', 'A', '471') of 2qwe.pdb
NO PROTEIN ATOMS WITHIN DISTANCE THRESHOLD for ('PH1', 'A', '504') of 1qhj.pdb
NO PROTEIN ATOMS WITHIN DISTANCE THRESHOLD for ('LI1', 'A', '605') of 1c8s.pdb
NO PROTEIN ATOMS WITHIN DISTANCE THRESHOLD for ('BMA', 'A', '476') of 1inh.pdb
NO PROTEIN ATOMS WITHIN DISTANCE THRESHOLD for ('BMA', 'B', '476') of 1inh.pdb
NO PROTEIN ATOMS WITHIN DISTANCE THRESHOLD for ('BMA', 'B', '471') of 1inh.pdb
NO PROTEIN ATOMS WITHIN DISTANCE THRESHOLD for ('BMA', 'A', '304') of 1axz.pdb
NO PROTEIN ATOMS WITHIN DISTANCE THRESHOLD for ('XYP', 'A', '305') of 1axz.pdb
NO PROTEIN ATOMS WITHIN DISTANCE THRESHOLD for ('LI1', 'A', '605') of 1c8r.pdb
NO PROTEIN ATOMS WITHIN DISTANCE THRESHOLD for ('BMA', 'A', '471') of 2qwd.pdb
NO PROTEIN ATOMS WITHIN DISTANCE THRESHOLD for ('BMA', 'A', '471') of 2qwf.

In [743]:
len(HOLO4K_df.query('n_ligs == 0')) #7

7

In [744]:
HOLO4K_df.head(3)

Unnamed: 0,dataset,ID,chains,n_chains,n_ress,ligs,n_ligs,chain_lens
0,HOLO4K,1drk,[{A}],[1],[11],[RIP],1,{'A': 271}
1,HOLO4K,1dqp,"[{A}, {A}, {B}, {B}]","[1, 1, 1, 1]","[6, 2, 6, 6]","[IMG, IPA, IMG, IPA]",4,"{'A': 230, 'B': 230}"
2,HOLO4K,1qin,"[{B, A}, {B, A}]","[2, 2]","[22, 19]","[GIP, GIP]",2,"{'A': 176, 'B': 176}"


In [745]:
HOLO4K_df.shape

(4543, 8)

In [746]:
HOLO4K_df.to_pickle("./results/DATASETS/HOLO4K_df.pkl")

## scPDB (one subfolder per entry with .mol2 files and .txt within)

In [747]:
scpdb_subset_dir = os.path.join(DATASETS_dir, "scPDB_2017_subset")

entries = os.listdir(scpdb_subset_dir)

ligs, error_ligs, sites, error_sites, chain_lens, error_chain_lens, scpdb_data = ([] for _ in range(7))

for i, entry in enumerate(entries):
    lig = site_ress = chain_len = None
    if i % 100 == 0:
       print(i)
    entry_dir = os.path.join(scpdb_subset_dir, entry)
    
    try:
        lig = get_scPDB_lig_name(os.path.join(entry_dir, "ligand.mol2"))
        ligs.append(lig)
    except:
        error_ligs.append(entry)
    try:
        site_ress = get_scPDB_site_residues(os.path.join(entry_dir, "site.mol2"))
        sites.append(site_ress)
    except:
        error_sites.append(entry)   
    try:
        chain_len = get_scPDB_chain_lens(os.path.join(entry_dir, "protein.mol2"))
        chain_lens.append(chain_len)
    except:
        error_chain_lens.append(entry) 
    if lig != None and site_ress != None and chain_len != None:
        n_ligs = 1
        prot_site_ress = [el for el in site_ress if el[2] in aas]
        n_ress = len(prot_site_ress)
        prot_site_chains = [set([el[0] for el in prot_site_ress]), ]
        n_chains = [len(el) for el in prot_site_chains]
        scpdb_data.append(["scPDB_filt", entry, prot_site_chains, n_chains, [n_ress, ], [lig,], n_ligs, chain_len])

scPDB_filt_df = pd.DataFrame(scpdb_data, columns = ["dataset", "ID", "chains", "n_chains", "n_ress", "ligs", "n_ligs", "chain_lens"])

0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400
3500
3600
3700
3800
3900
4000
4100
4200
4300
4400
4500
4600
4700
4800
4900
5000


In [754]:
n_ress_lists = [[el, ] for el in scPDB_filt_df.n_ress.tolist()]

In [756]:
scPDB_filt_df.n_ress = n_ress_lists

In [757]:
scPDB_filt_df.head(3)

Unnamed: 0,dataset,ID,chains,n_chains,n_ress,ligs,n_ligs,chain_lens
0,scPDB_filt,2zb4_1,[{A}],[1],[21],[5OP],1,{'A': 351}
1,scPDB_filt,4yfy_4,[{B}],[1],[31],[1YJ],1,{'B': 241}
2,scPDB_filt,4lt6_2,[{B}],[1],[33],[3AT],1,{'B': 462}


In [761]:
#scPDB_filt_df['pdb_id'] = scPDB_filt_df['ID'].apply(lambda x: x.split('_')[0])

In [759]:
scPDB_filt_df.to_pickle("./results/DATASETS/scPDB_filt_df.pkl")

In [758]:
scPDB_filt_df.shape

(5020, 8)

In [760]:
#len(scPDB_filt_df.drop_duplicates("pdb_id"))

## scPDB (whole)

In [762]:
scpdb_full_dir = os.path.join(DATASETS_dir, "scPDB_2017_full")

entries_full = os.listdir(scpdb_full_dir)

ligs_full, error_ligs_full, sites_full, error_sites_full, chain_lens_full, error_chain_lens_full, scpdb_data_full = ([] for _ in range(7))

for i, entry in enumerate(entries_full):
    lig = site_ress = chain_len = None
    if i % 100 == 0:
       print(i)
    entry_dir = os.path.join(scpdb_full_dir, entry)
    
    try:
        lig = get_scPDB_lig_name(os.path.join(entry_dir, "ligand.mol2"))
        ligs_full.append(lig)
    except:
        error_ligs_full.append(entry)
    try:
        site_ress = get_scPDB_site_residues(os.path.join(entry_dir, "site.mol2"))
        sites_full.append(site_ress)
    except:
        error_sites_full.append(entry)   
    try:
        chain_len = get_scPDB_chain_lens(os.path.join(entry_dir, "protein.mol2"))
        chain_lens_full.append(chain_len)
    except:
        error_chain_lens_full.append(entry) 
    if lig != None and site_ress != None and chain_len != None:
        n_ligs = 1
        prot_site_ress = [el for el in site_ress if el[2] in aas]
        n_ress = len(prot_site_ress)
        prot_site_chains = [set([el[0] for el in prot_site_ress]), ]
        n_chains = [len(el) for el in prot_site_chains]
        scpdb_data_full.append(["scPDB_full", entry, prot_site_chains, n_chains, [n_ress, ], [lig,], n_ligs, chain_len])

scPDB_full_df = pd.DataFrame(scpdb_data_full, columns = ["dataset", "ID", "chains", "n_chains", "n_ress", "ligs", "n_ligs", "chain_lens"])

0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400
3500
3600
3700
3800
3900
4000
4100
4200
4300
4400
4500
4600
4700
4800
4900
5000
5100
5200
5300
5400
5500
5600
5700
5800
5900
6000
6100
6200
6300
6400
6500
6600
6700
6800
6900
7000
7100
7200
7300
7400
7500
7600
7700
7800
7900
8000
8100
8200
8300
8400
8500
8600
8700
8800
8900
9000
9100
9200
9300
9400
9500
9600
9700
9800
9900
10000
10100
10200
10300
10400
10500
10600
10700
10800
10900
11000
11100
11200
11300
11400
11500
11600
11700
11800
11900
12000
12100
12200
12300
12400
12500
12600
12700
12800
12900
13000
13100
13200
13300
13400
13500
13600
13700
13800
13900
14000
14100
14200
14300
14400
14500
14600
14700
14800
14900
15000
15100
15200
15300
15400
15500
15600
15700
15800
15900
16000
16100
16200
16300
16400
16500
16600
16700
16800
16900
17000
17100
17200
17300
17400
17500


In [90]:
#scPDB_full_df['pdb_id'] = scPDB_full_df['ID'].apply(lambda x: x.split('_')[0])

In [548]:
#scPDB_full_df.dataset = "scPDB_full"

In [763]:
scPDB_full_df.head(5)

Unnamed: 0,dataset,ID,chains,n_chains,n_ress,ligs,n_ligs,chain_lens
0,scPDB_full,1iki_1,[{A}],[1],[39],[REY],1,{'A': 345}
1,scPDB_full,2zb4_1,[{A}],[1],[21],[5OP],1,{'A': 351}
2,scPDB_full,3gfe_1,[{A}],[1],[37],[P37],1,{'A': 352}
3,scPDB_full,3isj_2,[{B}],[1],[26],[A8D],1,{'B': 279}
4,scPDB_full,4bu9_2,[{B}],[1],[26],[08C],1,{'B': 211}


In [549]:
scPDB_full_df.head(5)

Unnamed: 0,dataset,ID,chains,n_chains,ligs,n_ligs,chain_lens,pdb_id
0,scPDB_full,1iki_1,[{A}],[1],[REY],1,{'A': 345},1iki
1,scPDB_full,2zb4_1,[{A}],[1],[5OP],1,{'A': 351},2zb4
2,scPDB_full,3gfe_1,[{A}],[1],[P37],1,{'A': 352},3gfe
3,scPDB_full,3isj_2,[{B}],[1],[A8D],1,{'B': 279},3isj
4,scPDB_full,4bu9_2,[{B}],[1],[08C],1,{'B': 211},4bu9


In [765]:
scPDB_full_df.to_pickle("./results/DATASETS/scPDB_full_df.pkl")

In [764]:
scPDB_full_df.shape

(17594, 8)

In [94]:
#len(scPDB_full_df.drop_duplicates("pdb_id"))

16612

## PDBbind  v2020

In [766]:
pdb2lig_pdbbind = get_pdbbind_pdb2lig(os.path.join(DATASETS_dir, "INDEX_refined_data.2020"))

In [767]:
pdbbind_2020_dir = os.path.join(DATASETS_dir, "PDBbind_v2020_refined")

pdbbind_entries = os.listdir(pdbbind_2020_dir)

pdbbind_data = []
pdbbind_errors = []
for i, entry in enumerate(pdbbind_entries):
    if i % 100 == 0:
       print(i)
    try:
        entry_dir = os.path.join(pdbbind_2020_dir, entry)
        protein_path = os.path.join(entry_dir, f"{entry}_protein.pdb")
        protein_df = PDBXreader(inputfile = protein_path).atoms(format_type = "pdb", excluded = ())
        pocket_path = os.path.join(entry_dir, f"{entry}_pocket.pdb")
        pocket_df = PDBXreader(inputfile = pocket_path).atoms(format_type = "pdb", excluded = ())
        protein_pocket_df = pocket_df[pocket_df['label_comp_id'].isin(aas)]
        n_ress = len(protein_pocket_df.drop_duplicates(["label_comp_id", "label_asym_id", "label_seq_id"])) # number of unique protein ligand-binding residues
        chains = get_chains(protein_pocket_df)
        chain_lengths = get_chain_lengths(protein_df, chains)
        lig = pdb2lig_pdbbind[entry]
        n_ligs = 1
        pdbbind_data.append(["PDBbind", entry, [chains, ], [len(chains), ], [n_ress, ], [lig,], n_ligs, chain_lengths])
    except:
        pdbbind_errors.append(entry)
        print(f"There was an error with {entry}")

pdbbind2020_df = pd.DataFrame(pdbbind_data, columns = ["dataset", "ID", "chains", "n_chains", "n_ress", "ligs", "n_ligs", "chain_lens"])

0
100
200
300
400
500
600
700
There was an error with .DS_Store
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
There was an error with readme
2700
2800
2900
3000
3100
3200
3300
3400
3500
3600
3700
3800
There was an error with index
3900
4000
4100
4200
4300
4400
4500
4600
4700
4800
4900
5000
5100
5200
5300


In [768]:
pdbbind2020_df.head(3)

Unnamed: 0,dataset,ID,chains,n_chains,n_ress,ligs,n_ligs,chain_lens
0,PDBbind,6ugp,[{A}],[1],[45],[Q7A],1,{'A': 257}
1,PDBbind,4rdn,[{A}],[1],[39],[6MD],1,{'A': 149}
2,PDBbind,4mo4,[{A}],[1],[43],[ACP],1,{'A': 345}


In [770]:
pdbbind2020_df.to_pickle("./results/DATASETS/pdbbind_refined_df.pkl")

In [771]:
pdbbind2020_df.shape #(5316, 7)

(5316, 8)

## SC6K

In [772]:
sc6k_dir = os.path.join(DATASETS_dir, "SC6K_processed")

entries_sc6k = os.listdir(sc6k_dir)

ligs_sc6k, error_ligs_sc6k, sites_sc6k, error_sites_sc6k, chain_lens_sc6k, error_chain_lens_sc6k, sc6k_data = ([] for _ in range(7))

sc6k_errors = []
for i, entry in enumerate(entries_sc6k):
    lig = site_ress = chain_len = None
    if i % 100 == 0:
       print(i)
    entry_dir = os.path.join(sc6k_dir, entry)
    try:
        files = [f for f in os.listdir(entry_dir) if "CAVITY" not in f and f.endswith(".mol2")]
        subentries = sorted(list(set(["_".join(f.split(".")[0].split("_")[:3]) for f in files])))
    except:
        sc6k_errors.append(entry)
        print(f"There was an error with {entry}")
        #raise
    

    for subentry in subentries:
        ds = subentry.split("_")
        try:
            subentry_id = f'{ds[0]}_{ds[1]}'
        except:
            print(files)
            print(entry, subentry)
            raise
        lig = ds[2]
        ligs_sc6k.append(lig)
        
        try:
            site_ress = get_scPDB_site_residues(os.path.join(entry_dir, f"{subentry}_SITE.mol2"))
            sites_full.append(site_ress)
        except:
            error_sites_full.append(entry)   
        try:
            chain_len = get_scPDB_chain_lens(os.path.join(entry_dir, f"{subentry}_PROT.mol2"))
            chain_lens_full.append(chain_len)
        except:
            error_chain_lens_full.append(entry) 
        if lig != None and site_ress != None and chain_len != None:
            n_ligs = 1
            prot_site_ress = [el for el in site_ress if el[2] in aas]
            n_ress = len(prot_site_ress)
            prot_site_chains = [set([el[0] for el in prot_site_ress]), ]
            n_chains = [len(el) for el in prot_site_chains]
            sc6k_data.append(["SC6K", subentry_id, prot_site_chains, n_chains, [n_ress, ], [lig,], n_ligs, chain_len])
    #break
    
sc6k_data_df = pd.DataFrame(sc6k_data, columns = ["dataset", "ID", "chains", "n_chains", "n_ress", "ligs", "n_ligs", "chain_lens"])

0
100
200
300
There was an error with .DS_Store
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400


In [323]:
#sc6k_data_df['pdb_id'] = sc6k_data_df['ID'].apply(lambda x: x.split('_')[0])

In [773]:
sc6k_data_df.head(3)

Unnamed: 0,dataset,ID,chains,n_chains,n_ress,ligs,n_ligs,chain_lens
0,SC6K,6pxs_1,[{A}],[1],[65],[FAD],1,{'A': 370}
1,SC6K,6pxs_2,[{B}],[1],[64],[FAD],1,{'B': 370}
2,SC6K,6pxs_4,[{D}],[1],[64],[FAD],1,{'D': 369}


In [774]:
sc6k_data_df.to_pickle("./results/DATASETS/SC6K_df.pkl")

## Binding MOAD (leaders only, non-redundant)

In [775]:
moad_fasta = os.path.join(DATASETS_dir, "binding_MOAD_subset.fasta")

moad_csv = os.path.join(DATASETS_dir, "binding_MOAD_leaders.csv")

In [776]:
MOAD_leaders_df = parse_moad_csv(moad_csv)

MOAD_subset_seq_dict = parse_fasta(moad_fasta)
MOAD_subset_pdbs = list(MOAD_subset_seq_dict.keys())

In [777]:
len(MOAD_subset_pdbs)

6650

In [778]:
print(len(MOAD_leaders_df))
print(len(MOAD_leaders_df.EC.unique()))
print(len(MOAD_leaders_df.pdb_id.unique()))
print(len(MOAD_leaders_df.drop_duplicates(["EC", "pdb_id"])))

83526
1933
11058
11058


In [779]:
MOAD_leaders_df.validity.value_counts()

invalid            39242
valid              28693
Part of Protein    15591
Name: validity, dtype: int64

In [780]:
MOAD_leaders_not_invalid_df = MOAD_leaders_df.query('validity != "invalid"').copy().reset_index(drop = True)

In [781]:
print(len(MOAD_leaders_not_invalid_df))
print(len(MOAD_leaders_not_invalid_df.EC.unique()))
print(len(MOAD_leaders_not_invalid_df.pdb_id.unique()))
print(len(MOAD_leaders_not_invalid_df.drop_duplicates(["EC", "pdb_id"])))

44284
1933
11058
11058


In [782]:
MOAD_leaders_valid_df = MOAD_leaders_not_invalid_df.query('validity == "valid"').copy().reset_index(drop = True)

In [783]:
print(len(MOAD_leaders_valid_df))
print(len(MOAD_leaders_valid_df.EC.unique()))
print(len(MOAD_leaders_valid_df.pdb_id.unique()))
print(len(MOAD_leaders_valid_df.drop_duplicates(["EC", "pdb_id"])))

28693
1933
11058
11058


In [784]:
MOAD_leaders_valid_df.head(3)

Unnamed: 0,EC,pdb_id,lig,validity,aff_metric,aff_val,aff_unit,SMILES
0,5.3.1.-,2PA7,TYD:A:141,valid,,,,CC1=CN(C(=O)NC1=O)[C@H]2C[C@@H]([C@H](O2)CO[P@...
1,5.3.1.-,2PA7,TYD:B:140,valid,,,,CC1=CN(C(=O)NC1=O)[C@H]2C[C@@H]([C@H](O2)CO[P@...
2,5.3.1.-,2HK1,FUD:A:1300,valid,,,,C([C@H]([C@H]([C@@H](C(=O)CO)O)O)O)O


In [785]:
MOAD_leaders_valid_df[['lig_resname', 'lig_chain', 'lig_resnum']] = MOAD_leaders_valid_df['lig'].str.split(':', expand=True)

In [786]:
MOAD_leaders_valid_df_rf = MOAD_leaders_valid_df[["EC", "pdb_id", "lig_resname", "lig_chain", "lig_resnum"]].drop_duplicates()

In [787]:
MOAD_subset_df = MOAD_leaders_valid_df_rf.query('pdb_id in @MOAD_subset_pdbs').copy().reset_index(drop = True)

In [788]:
print(len(MOAD_subset_df))
print(len(MOAD_subset_df.EC.unique()))
print(len(MOAD_subset_df.pdb_id.unique()))
print(len(MOAD_subset_df.drop_duplicates(["EC", "pdb_id"])))

17246
1348
6650
6650


In [789]:
MOAD_subset_df.pdb_id = MOAD_subset_df.pdb_id.str.lower()

In [790]:
MOAD_lig_tuples_dict = create_MOAD_lig_tuples_dict(MOAD_subset_df)

In [791]:
MOAD_subset_df.head()

Unnamed: 0,EC,pdb_id,lig_resname,lig_chain,lig_resnum
0,5.3.1.-,2pa7,TYD,A,141
1,5.3.1.-,2pa7,TYD,B,140
2,5.3.1.-,2hk1,FUD,A,1300
3,5.3.1.-,2hk1,FUD,B,1301
4,5.3.1.-,2hk1,FUD,C,1302


In [792]:
MOAD_leaders_dir = os.path.join(DATASETS_dir, "BindingMOAD_2020")

In [793]:
moad_strucs = os.listdir(MOAD_leaders_dir)

In [794]:
len(moad_strucs)

17068

In [795]:
MOAD_subset_pdbs = MOAD_subset_df.pdb_id.unique().tolist()

In [796]:
len(MOAD_subset_pdbs)

6650

In [797]:
strucs_dict = {}
for subset_pdb in MOAD_subset_pdbs:
    strucs_dict[subset_pdb] = []
    for moad_struc in moad_strucs:
        if subset_pdb in moad_struc:
           strucs_dict[subset_pdb].append(moad_struc)
strucs_dict = {k: sorted(v) for k, v in strucs_dict.items()}

In [798]:
len(strucs_dict)

6650

In [799]:
strucs_dict_filt = {}
for k, v in strucs_dict.items():
    if v == []:
        continue
    else:
        strucs_dict_filt[k] = v[0]

In [800]:
struc_files = list(strucs_dict_filt.values())

In [801]:
[el for el in struc_files if len(el) != 9]

[]

In [802]:
for _, row in MOAD_subset_df.iterrows():
    if len(row.lig_resname) > 4:
        print(row)
        break

EC                            4.2.2.10
pdb_id                            3njv
lig_resname    GTR RAM GTR RAM GTR RAM
lig_chain                            B
lig_resnum                           1
Name: 17, dtype: object


In [803]:
MOAD_subset_data_df = extract_dataset_info(
    ds_dir = os.path.join(DATASETS_dir, "BindingMOAD_2020"), ds_name = "bMOAD_filt",
    struc_files = struc_files, lig_tuples_dict = MOAD_lig_tuples_dict,
    pdb_fmt_filt = False, threshold = 5
)

2qun.bio1 ('FUD', 'C', '291')
LIGAND ATOMS NOT FOUND
2qun.bio1 ('FUD', 'D', '291')
LIGAND ATOMS NOT FOUND
3njv.bio1 ('GTR RAM GTR RAM GTR RAM', 'B', '1')
LIGAND ATOMS NOT FOUND
CAREFUL: NO INTERACTIONS AT ALL FOR 3njv.bio1
4rjk.bio1 ('PYR', 'F', '609')
LIGAND ATOMS NOT FOUND
4rjk.bio1 ('PYR', 'F', '610')
LIGAND ATOMS NOT FOUND
4rjk.bio1 ('PYR', 'G', '607')
LIGAND ATOMS NOT FOUND
4rjk.bio1 ('PYR', 'H', '605')
LIGAND ATOMS NOT FOUND
4rjk.bio1 ('PYR', 'G', '608')
LIGAND ATOMS NOT FOUND
4rjk.bio1 ('TPP', 'F', '601')
LIGAND ATOMS NOT FOUND
4rjk.bio1 ('TPP', 'E', '601')
LIGAND ATOMS NOT FOUND
4rjk.bio1 ('TPP', 'G', '601')
LIGAND ATOMS NOT FOUND
4rjk.bio1 ('TDL', 'H', '601')
LIGAND ATOMS NOT FOUND
3h8c.bio1 ('NSZ', 'B', '400')
LIGAND ATOMS NOT FOUND
1xon.bio1 ('PIL', 'B', '502')
LIGAND ATOMS NOT FOUND
6c7d.bio1 ('EOJ', 'D', '2001')
LIGAND ATOMS NOT FOUND
6c7d.bio1 ('EOJ', 'C', '2001')
LIGAND ATOMS NOT FOUND
6c7d.bio1 ('EOJ', 'B', '2001')
LIGAND ATOMS NOT FOUND
5c2h.bio1 ('4PX', 'A', '903')
LI

In [804]:
missing_lig_pdbs = MOAD_subset_data_df.query('n_ligs < 1').ID.unique().tolist()

In [805]:
for k in missing_lig_pdbs:
    print(k, MOAD_lig_tuples_dict[k])

3njv [('GTR RAM GTR RAM GTR RAM', 'B', '1')]
5yjw [('SMA', 'A', '625'), ('SMA', 'A', '626'), ('FAD', 'A', '601')]
6o7b [('A   A   A   A  ', 'D', '1'), ('A   A   A   A  ', 'C', '1')]
6tug [('AF2 AF2 AF2 AF2 AF2 AF2', 'A', '502'), ('AF2 AF2 AF2 AF2 AF2 AF2', 'E', '502'), ('AF2 AF2 AF2 AF2 AF2 AF2', 'C', '502'), ('AF2 AF2 AF2 AF2 AF2 AF2', 'G', '502')]
6n6a [('G   G  ', 'D', '603')]
2djh [('3PD UM3', 'A', '117')]
6n6j [('A   A  ', 'C', '603'), ('A   A  ', 'D', '603')]
6w01 [('CIT', 'A', '412'), ('CIT', 'B', '408')]
6rcl [('A   A  ', 'C', '1')]
1brn [('DC  DG  DA  DC', 'A', '1'), ('DC  DG  DA  DC', 'B', '1')]
5aci [('BGC BGC BGC BGC BGC BGC', 'B', '1')]
5awv [('GHP 3MY 3FG GHP GHP OMY 3FG NAG MAN', 'I', '1'), ('GHP 3MY 3FG GHP GHP OMY 3FG NAG MAN', 'J', '1'), ('GHP 3MY 3FG GHP GHP OMY 3FG', 'M', '1'), ('GHP 3MY 3FG GHP GHP OMY 3FG', 'J', '1'), ('GHP 3MY 3FG GHP GHP OMY 3FG', 'L', '1'), ('GHP 3MY 3FG GHP GHP OMY 3FG', 'I', '1'), ('GHP 3MY 3FG GHP GHP OMY 3FG', 'N', '1'), ('GHP 3MY 3FG GHP G

In [806]:
MOAD_subset_data_df.query('ID in @missing_lig_pdbs') # weird sugars or peptide ligands

Unnamed: 0,dataset,ID,chains,n_chains,n_ress,ligs,n_ligs,chain_lens
6,bMOAD_filt,3njv,[],[],[],[],0,{}
54,bMOAD_filt,5yjw,[],[],[],[],0,{}
84,bMOAD_filt,6o7b,[],[],[],[],0,{}
87,bMOAD_filt,6tug,[],[],[],[],0,{}
89,bMOAD_filt,6n6a,[],[],[],[],0,{}
...,...,...,...,...,...,...,...,...
6596,bMOAD_filt,4odt,[],[],[],[],0,{}
6605,bMOAD_filt,2wn2,[],[],[],[],0,{}
6606,bMOAD_filt,6haj,[],[],[],[],0,{}
6608,bMOAD_filt,2obt,[],[],[],[],0,{}


In [807]:
MOAD_subset_data_df_filt = MOAD_subset_data_df.query('n_ligs > 0').copy().reset_index(drop = True)

In [808]:
MOAD_subset_data_df_filt.head(5)

Unnamed: 0,dataset,ID,chains,n_chains,n_ress,ligs,n_ligs,chain_lens
0,bMOAD_filt,2pa7,"[{B, A}, {B, A}]","[2, 2]","[15, 14]","[TYD, TYD]",2,"{'A': 135, 'B': 134}"
1,bMOAD_filt,2hk1,"[{A}, {B}, {C}, {D}]","[1, 1, 1, 1]","[17, 17, 16, 18]","[FUD, FUD, FUD, FUD]",4,"{'A': 283, 'B': 283, 'C': 283, 'D': 283}"
2,bMOAD_filt,4q0p,[{A}],[1],[18],[0MK],1,{'A': 249}
3,bMOAD_filt,2qun,"[{A}, {B}]","[1, 1]","[20, 20]","[FUD, FUD]",2,"{'A': 290, 'B': 290}"
4,bMOAD_filt,2nxw,"[{B, A}, {B, A}]","[2, 2]","[29, 29]","[TPP, TPP]",2,"{'A': 537, 'B': 529}"


In [809]:
MOAD_subset_data_df_filt.shape

(5899, 8)

In [810]:
MOAD_subset_data_df_filt.to_pickle("./results/DATASETS/bMOAD_filt_df.pkl")