In [21]:
import json
import os
import random

from collections import Counter
from pdbbind_metadata_processor import PDBBindMetadataProcessor

In [22]:
# Fix the state of the global `random` generator so the shuffles performed
# later in the notebook are repeatable from a fresh kernel.
random.seed(a=42)

In [2]:
# Parse the JSON document stored in codes.json into `codes`.
# Only len(codes) is used further down, to size the train/val cutoffs.
# NOTE(review): presumably a list of PDB codes -- confirm against the file.
code_file = 'codes.json'
with open(code_file) as codes_handle:
    codes = json.loads(codes_handle.read())

In [7]:
# Parse protein_clusters.json into `clusters`; it is fed to Counter below,
# so it is treated as an iterable of cluster ids.
# NOTE(review): presumably one cluster id per PDB entry -- confirm.
cluster_file = 'protein_clusters.json'
with open(cluster_file) as clusters_handle:
    clusters = json.loads(clusters_handle.read())

In [8]:
# Parse pdb_clusters.json into `pdb_to_cluster`, a mapping whose items are
# iterated below as (PDB code, cluster id) pairs.
pdb_to_cluster_file = 'pdb_clusters.json'
with open(pdb_to_cluster_file) as mapping_handle:
    pdb_to_cluster = json.loads(mapping_handle.read())

In [9]:
# Tally how many times each cluster id occurs in `clusters`.
counter = Counter()
counter.update(clusters)

In [10]:
# The ten most populated clusters as (cluster_id, count) pairs, largest first.
# most_common(10) selects the top ten directly instead of sorting every
# entry and slicing; the result (including tie order) is the same.
top10clusters = counter.most_common(10)

In [12]:
# Display the ten (cluster_id, count) pairs with the highest counts.
top10clusters

[(2160, 436),
 (1158, 424),
 (966, 344),
 (1890, 239),
 (1317, 194),
 (2094, 190),
 (1433, 181),
 (1299, 178),
 (1879, 160),
 (204, 112)]

In [15]:
# Build the PDBBind metadata table used below to look up protein names.
# The code below indexes it with the 'PDB code' and 'protein name' columns.
# NOTE(review): presumably a pandas DataFrame with one row per complex -- confirm
# against PDBBindMetadataProcessor.get_master_dataframe.
pp = PDBBindMetadataProcessor()
table = pp.get_master_dataframe()

In [26]:
# For each of the ten largest clusters, print how often each protein name
# occurs among the table rows whose PDB code belongs to that cluster.
for big_cluster_id, count in top10clusters:
    # Collect every PDB code mapped to this cluster id.
    most_common_codes = []
    for code, cluster_id in pdb_to_cluster.items():
        if cluster_id == big_cluster_id:
            most_common_codes.append(code)

    # Keep only the metadata rows for those codes.
    in_cluster_mask = table['PDB code'].isin(most_common_codes)
    filtered_table = table[in_cluster_mask]

    name_counts = filtered_table['protein name'].value_counts()
    print(name_counts)

HIV-1 PROTEASE                 369
PROTEASE NL4-3                  14
PROTEASE RETROPEPSIN            11
POL POLYPROTEIN                  7
NL4-3 PROTEASE                   3
GAG-POL POLYPROTEIN              2
PROTEASE                         2
MDR 769 HIV-1 PROTEASE           2
HIV-1 PROTEASE E35D-DRV          1
[D-ALA51/51']HIV-1 PROTEASE      1
HIV-1 PROTEASE E35D-SQV          1
HIV-1 PROTEASE E35D-APV          1
PROTEASE PR5-DRV                 1
[AIB51/51']HIV-1 PROTEASE        1
[L-ALA51/51']HIV-1 PROTEASE      1
PROTEASE PR5-SQV                 1
Name: protein name, dtype: int64
CARBONIC ANHYDRASE 2           419
CARBONIC ANHYDRASE IX-MIMIC      3
Name: protein name, dtype: int64
BETA-SECRETASE 1    330
BETA-SECRETASE-1      1
Name: protein name, dtype: int64
CELL DIVISION PROTEIN KINASE 2    152
CYCLIN-DEPENDENT KINASE 2          87
Name: protein name, dtype: int64
TRYPSIN             110
TRYPSIN BETA         56
CATIONIC TRYPSIN     11
TRYPSINOGEN           6
Name: protein name

In [13]:
# Target fractions for the train and validation subsets; whatever is left
# over goes to the test subset.
frac_train = 0.8
frac_val = 0.1

# Cumulative size limits, expressed as a number of entries in `codes`:
# train may hold up to train_cutoff entries, train + val up to val_cutoff.
n_codes = len(codes)
train_cutoff = int(frac_train * n_codes)
val_cutoff = int((frac_train + frac_val) * n_codes)

# Empty index lists (not referenced by the cells visible in this notebook).
train_inds, val_inds, test_inds = [], [], []

In [14]:
# Distinct cluster ids, in the order they were first counted.
# This list is shuffled in place when the splits are generated.
unique_cluster_ids = list(counter)

In [19]:
# Directory that receives the train/val/test split files.
data_dir_path = 'data/'
protein_similarity_splits_dir_name = 'protein_similarity_splits'
protein_similarity_splits_dir_path = os.path.join(data_dir_path, protein_similarity_splits_dir_name)

# makedirs also creates the parent 'data/' directory when it is missing
# (os.mkdir would raise FileNotFoundError), and exist_ok=True makes the
# cell safe to re-run without a separate existence check.
os.makedirs(protein_similarity_splits_dir_path, exist_ok=True)

In [24]:
# Generate five cluster-level train/val/test splits and write each subset to
# '<subset>_pdb_protein_similarity_split_<i>.txt' (one PDB code per line).
# All PDB codes of a cluster always land in the same subset.

# Invert pdb -> cluster once, so the members of a cluster are a dict lookup
# instead of a full scan of pdb_to_cluster for every cluster of every split.
# Member order within a cluster matches the iteration order of pdb_to_cluster.
cluster_to_pdbs = {}
for pdb_id, cluster_id in pdb_to_cluster.items():
    cluster_to_pdbs.setdefault(cluster_id, []).append(pdb_id)


def _write_pdb_list(file_path, pdb_ids):
    """Write one PDB code per line to file_path, overwriting any existing file."""
    with open(file_path, 'w') as f:
        for pdb_id in pdb_ids:
            f.write(pdb_id)
            f.write('\n')


for i in range(5):

    # Shuffles in place using the global `random` state, so the resulting
    # splits depend on that state and on the list order at entry; re-running
    # this cell without re-seeding produces different splits.
    random.shuffle(unique_cluster_ids)

    train_pdbs = []
    val_pdbs = []
    test_pdbs = []

    # Greedy fill: a cluster goes to train if it still fits under
    # train_cutoff, otherwise to val if train + val + cluster stays within
    # val_cutoff, otherwise to test.
    # NOTE(review): the val check uses the current (possibly not yet full)
    # train size, so a large cluster seen early can make val exceed frac_val
    # for some shuffles -- confirm this is acceptable.
    for current_cluster_id in unique_cluster_ids:
        pdbs = cluster_to_pdbs.get(current_cluster_id, [])
        if len(train_pdbs) + len(pdbs) > train_cutoff:
            if len(train_pdbs) + len(val_pdbs) + len(pdbs) > val_cutoff:
                test_pdbs += pdbs
            else:
                val_pdbs += pdbs
        else:
            train_pdbs += pdbs

    for subset_name, subset_pdbs in (('train', train_pdbs),
                                     ('val', val_pdbs),
                                     ('test', test_pdbs)):
        split_file_name = f'{subset_name}_pdb_protein_similarity_split_{i}.txt'
        _write_pdb_list(os.path.join(protein_similarity_splits_dir_path, split_file_name),
                        subset_pdbs)

In [25]:
# Number of PDB codes in the train subset of the last generated split (i == 4).
len(train_pdbs)

13859

In [27]:
# Number of PDB codes in the validation subset of the last generated split (i == 4).
len(val_pdbs)

1732

In [26]:
# Number of PDB codes in the test subset of the last generated split (i == 4).
len(test_pdbs)

1733