In [1]:
from JSU_lib import *

In [2]:
# --- Input locations (relative to the notebook's working directory) ---
POCKETFINDER_dir = "./../DATA/pocketfinder_preds"

rep_chains_dir = "./../DATA/clean_rep_chains"

# --- File-name endings used to tell the prediction outputs apart ---
prot_pdb_ter = "_residue.pdb"
pocket_pdb_ter = "_pocket.pdb"
scores_ter = ".scores"
pymol_ter = ".pml"  # not used by the cells shown in this notebook
dx_ter = ".dx"      # not used by the cells shown in this notebook

# Split the directory listing by ending. Each list is sorted on its own,
# and the extraction cell later pairs pocket and score files by position.
POCKETFINDER_files = os.listdir(POCKETFINDER_dir)
pocket_files = sorted(fname for fname in POCKETFINDER_files if fname.endswith(pocket_pdb_ter))
score_files = sorted(fname for fname in POCKETFINDER_files if fname.endswith(scores_ter))
protein_files = sorted(fname for fname in POCKETFINDER_files if fname.endswith(prot_pdb_ter))

# The three counts are expected to be identical (one file of each kind per chain).
print(len(pocket_files), len(score_files), len(protein_files), sep="\n")

3448
3448
3448


## Generating .ds file for PRANK re-scoring

In [11]:
# Header text for the PRANK dataset (.ds) file.
# NOTE(review): `header` is only assigned here; no cell shown in this notebook
# prints it or writes it to disk — confirm how the .ds file was actually produced.
header = """
# Dataset for rescoring ConCavity predictions

PARAM.PREDICTION_METHOD=concavity

HEADER: prediction protein

"""
# Disabled: prints one dataset line per pocket file, pairing the prediction
# path with the path of the matching clean representative-chain PDB.
# for i, file in enumerate(pocket_files):
#     rep_chain = file.split(".")[0]
#     print(f'./../../DATA/pocketfinder_preds/{file} ./../../DATA/clean_rep_chains/{rep_chain}.clean.pdb')

# Running PRANK on it took 1 minute 20.327 seconds with -threads 8.

## Extracting pocket data

In [3]:
# Build one record per predicted pocket (collected in `rows`) and keep the
# parsed residue scores of every chain in `scores_master_dict`.
scores_master_dict = {}  # rep_chain -> output of parse_concavity_residue_scores
rows = []

# pocket_files and score_files are sorted independently and paired by position,
# so a missing or extra file would silently attach scores to the wrong chain.
if len(pocket_files) != len(score_files):
    raise ValueError(
        f"Cannot pair files by position: {len(pocket_files)} pocket files "
        f"vs {len(score_files)} score files"
    )

for i, (pocket_file, score_file) in enumerate(zip(pocket_files, score_files)):

    if i % 100 == 0:
        print(i)  # coarse progress indicator
    rep_chain = pocket_file.split(".")[0]  # chain identifier = text before the first "."

    if not score_file.startswith(rep_chain):
        raise ValueError(
            f"Pocket file {pocket_file!r} and score file {score_file!r} "
            f"do not belong to the same chain"
        )

    score_path = os.path.join(POCKETFINDER_dir, score_file)
    pocket_path = os.path.join(POCKETFINDER_dir, pocket_file)
    clean_pdb_path = os.path.join(rep_chains_dir, f'{rep_chain}.clean.pdb')

    clean_df = PDBXreader(inputfile=clean_pdb_path).atoms(format_type="pdb", excluded=())
    # Hydrogens are kept because the pocket points are stored as H HETATMs.
    pocket_df = PDBXreader(inputfile=pocket_path).atoms(format_type="pdb", excluded=(), remove_hydrogens = False)

    scores_dict = parse_concavity_residue_scores(score_path)
    scores_master_dict[rep_chain] = scores_dict

    coords_dict = create_concavity_coordinates_dict(pocket_df) # dict pocket ID: pocket grid points

    centroids_dict = calculate_concavity_centroids(coords_dict) # dict pocket ID: pocket centroids

    pocket_score_dict = calculate_concavity_pocket_score(pocket_df) # dict pocket ID: pocket score (sum of squares of grid point scores)

    pocket_ress_dict = get_concavity_pocket_ress(coords_dict, clean_df) # dict pocket ID: pocket residues

    for k in centroids_dict.keys():
        pocket_centroid = tuple([round(el, 4) for el in centroids_dict[k]])
        pocket_score = pocket_score_dict[k]
        pocket_ress = pocket_ress_dict[k]
        n_ress = len(pocket_ress)
        n_grid_points = len(coords_dict[k])

        # Radius of gyration of the pocket residues, computed on their CA atoms:
        # root-mean-square distance of the CA coordinates to their mean position.
        pocket_ress_df = clean_df.query('label_seq_id_full in @pocket_ress')
        ress_CAs = pocket_ress_df.query('label_atom_id == "CA"')
        CAs_coords = np.array(list((zip(ress_CAs.Cartn_x, ress_CAs.Cartn_y, ress_CAs.Cartn_z))))
        center_of_mass = np.mean(CAs_coords, axis=0)
        distances_squared = np.sum((CAs_coords - center_of_mass)**2, axis=1)
        radius_of_gyration = round(np.sqrt(np.mean(distances_squared)), 2)

        # The stored pocket ID is int(k) + 1 (presumably turning 0-based keys into 1-based IDs — TODO confirm).
        d_row = [rep_chain, int(k)+1, n_grid_points, n_ress, radius_of_gyration, pocket_centroid, pocket_ress, pocket_score]
        rows.append(d_row)

0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400


In [4]:
# One row per predicted pocket; the column order mirrors the record layout
# assembled in the extraction loop.
POCKETFINDER_pockets = pd.DataFrame(
    data=rows,
    columns=[
        "rep_chain",
        "ID",
        "n_points",
        "n_aas",
        "RoG",
        "centre",
        "aas",
        "score",
    ],
)

In [5]:
# Preview the per-pocket table.
POCKETFINDER_pockets

Unnamed: 0,rep_chain,ID,n_points,n_aas,RoG,centre,aas,score
0,1a52_A,1,247,33,9.55,"(106.4963, 14.0131, 96.383)","[46, 47, 49, 50, 51, 52, 53, 54, 55, 57, 87, 8...",145.0046
1,1a52_A,2,102,14,6.89,"(78.2059, 17.5699, 96.6319)","[75, 77, 78, 81, 164, 165, 166, 167, 168, 169,...",48.5404
2,1a52_A,3,106,12,7.74,"(124.243, 13.8322, 100.9459)","[35, 37, 39, 40, 41, 111, 112, 113, 114, 115, ...",47.9504
3,1a52_A,4,206,30,9.53,"(108.2262, 29.4779, 103.2746)","[24, 27, 28, 29, 30, 31, 57, 58, 60, 61, 64, 9...",93.4443
4,1a52_A,5,138,14,6.90,"(88.8159, 33.3727, 109.8651)","[10, 11, 14, 18, 22, 65, 66, 67, 68, 69, 70, 7...",63.9008
...,...,...,...,...,...,...,...,...
10499,8y6b_F,4,123,14,7.89,"(-36.8073, -40.3212, 75.3179)","[308, 309, 310, 311, 353, 355, 361, 363, 364, ...",56.4942
10500,8y6b_F,5,112,11,6.64,"(-10.1344, -15.8899, 82.3769)","[454, 500, 501, 502, 503, 504, 518, 519, 520, ...",51.0544
10501,8y6b_F,6,178,16,7.68,"(-15.1356, 5.5597, 100.5795)","[126, 128, 129, 130, 131, 132, 133, 134, 152, ...",82.3000
10502,8y6o_H,1,184,13,7.80,"(270.4442, 310.1543, 240.5725)","[32, 35, 63, 64, 67, 70, 71, 74, 75, 76, 77, 7...",80.6517


In [6]:
# Total number of predicted pockets (rows of the table).
print(POCKETFINDER_pockets.shape[0])

10504


## Mapping PDB residues to UniProt (UP) residues

In [7]:
# Translate every pocket's residues through the per-chain mapping pickles.
up_aas = {}  # (rep_chain, pocket ID) -> list of mapped residues

errors = []  # rep_chain of each pocket with at least one unmapped residue (one entry per pocket)

mapping_cache = {}  # rep_chain -> mapping dict, so each pickle is read once per chain instead of once per pocket

for _, row in POCKETFINDER_pockets.iterrows():
    rep_chain = row.rep_chain
    site_ress = row.aas
    ID = row["ID"]
    if rep_chain not in mapping_cache:
        mapping_cache[rep_chain] = read_from_pickle(f'./../DATA/mappings_label_full/{rep_chain}.pkl')
    mapping_dict = mapping_cache[rep_chain]
    try:
        # Keys are looked up as strings. Author's note: residues are thought to be
        # plain integers here (no AltLoc-style identifiers present).
        site_up_aas = [mapping_dict[str(res)] for res in site_ress]
    except KeyError:
        # A single unmapped residue invalidates the whole pocket. Only KeyError is
        # caught so that unrelated failures are not silently counted as mapping issues.
        errors.append(rep_chain)
        continue
    up_aas[(rep_chain, ID)] = site_up_aas

print(len(up_aas)) #10227
print(len(errors)) #277

errors = sorted(set(errors)) # 242 chains have issues with SIFTS mapping; sorted for a deterministic pickle
print(len(errors)) #242

save_to_pickle(errors, "./results/POCKETFINDER_no_sifts.pkl")

# Drop every pocket of a chain with at least one failed pocket, then attach the mapped residues.
POCKETFINDER_pockets_w_sifts = POCKETFINDER_pockets.query('rep_chain not in @errors').copy().reset_index(drop = True)
POCKETFINDER_pockets_w_sifts["up_aas"] = POCKETFINDER_pockets_w_sifts.set_index(['rep_chain', 'ID']).index.map(up_aas)

# Sanity check: every pocket keeps the same number of residues after mapping.
n_up_aas = POCKETFINDER_pockets_w_sifts["up_aas"].apply(len)
assert POCKETFINDER_pockets_w_sifts.n_aas.equals(n_up_aas), "mapped residue count differs from n_aas"

POCKETFINDER_pockets_w_sifts

10227
277
242


Unnamed: 0,rep_chain,ID,n_points,n_aas,RoG,centre,aas,score,up_aas
0,1a52_A,1,247,33,9.55,"(106.4963, 14.0131, 96.383)","[46, 47, 49, 50, 51, 52, 53, 54, 55, 57, 87, 8...",145.0046,"[342, 343, 345, 346, 347, 348, 349, 350, 351, ..."
1,1a52_A,2,102,14,6.89,"(78.2059, 17.5699, 96.6319)","[75, 77, 78, 81, 164, 165, 166, 167, 168, 169,...",48.5404,"[371, 373, 374, 377, 460, 461, 462, 463, 464, ..."
2,1a52_A,3,106,12,7.74,"(124.243, 13.8322, 100.9459)","[35, 37, 39, 40, 41, 111, 112, 113, 114, 115, ...",47.9504,"[331, 333, 335, 336, 337, 407, 408, 409, 410, ..."
3,1a52_A,4,206,30,9.53,"(108.2262, 29.4779, 103.2746)","[24, 27, 28, 29, 30, 31, 57, 58, 60, 61, 64, 9...",93.4443,"[320, 323, 324, 325, 326, 327, 353, 354, 356, ..."
4,1a52_A,5,138,14,6.90,"(88.8159, 33.3727, 109.8651)","[10, 11, 14, 18, 22, 65, 66, 67, 68, 69, 70, 7...",63.9008,"[306, 307, 310, 314, 318, 361, 362, 363, 364, ..."
...,...,...,...,...,...,...,...,...,...
9942,8y6b_F,4,123,14,7.89,"(-36.8073, -40.3212, 75.3179)","[308, 309, 310, 311, 353, 355, 361, 363, 364, ...",56.4942,"[339, 340, 341, 342, 384, 386, 392, 394, 395, ..."
9943,8y6b_F,5,112,11,6.64,"(-10.1344, -15.8899, 82.3769)","[454, 500, 501, 502, 503, 504, 518, 519, 520, ...",51.0544,"[485, 531, 532, 533, 534, 535, 549, 550, 551, ..."
9944,8y6b_F,6,178,16,7.68,"(-15.1356, 5.5597, 100.5795)","[126, 128, 129, 130, 131, 132, 133, 134, 152, ...",82.3000,"[157, 159, 160, 161, 162, 163, 164, 165, 183, ..."
9945,8y6o_H,1,184,13,7.80,"(270.4442, 310.1543, 240.5725)","[32, 35, 63, 64, 67, 70, 71, 74, 75, 76, 77, 7...",80.6517,"[32, 35, 63, 64, 67, 70, 71, 74, 75, 76, 77, 7..."


## Add pocket surfaces and volumes

In [13]:
# Surface: combine the pocket table with the pre-computed SASA dictionary.
master_SASA_dict = read_from_pickle("./results/master_SASA_dict.pkl")
POCKETFINDER_pockets_w_sifts_SASA = calculate_total_sasa(
    POCKETFINDER_pockets_w_sifts,
    master_SASA_dict,
)

# Volume: looked up per pocket with (rep_chain, ID) as the dictionary key.
vols_dict = read_from_pickle("./results/POCKETFINDER_volumes_dict.pkl")
POCKETFINDER_pockets_w_sifts_SASA["VOL"] = (
    POCKETFINDER_pockets_w_sifts_SASA
    .set_index(['rep_chain', 'ID'])
    .index
    .map(vols_dict)
)

# Number of pockets left without a volume (VOL is NaN).
print(int(POCKETFINDER_pockets_w_sifts_SASA["VOL"].isna().sum()))

# Row-wise apply_rotation result, stored as `centre_mat`
# (presumably the transformed pocket centre — TODO confirm against JSU_lib).
POCKETFINDER_pockets_w_sifts_SASA['centre_mat'] = POCKETFINDER_pockets_w_sifts_SASA.apply(
    apply_rotation,
    axis=1,
)

80


### Default RANK is unranked (RANK = ID), as PocketFinder neither assigned scores to its pockets nor ranked them

The score reported here was calculated by me, Javier Sánchez Utgés, in the same manner as in P2Rank and, similarly, GrASP: as the sum of squares of the grid point scores.

In [14]:
# Default ranking: RANK is a plain copy of the pocket ID.
POCKETFINDER_pockets_w_sifts_SASA["RANK"] = POCKETFINDER_pockets_w_sifts_SASA["ID"]

In [15]:
# Inspect the final table before it is saved.
POCKETFINDER_pockets_w_sifts_SASA

Unnamed: 0,rep_chain,ID,n_points,n_aas,RoG,centre,aas,score,up_aas,SASA,centre_mat,RANK,VOL
0,1a52_A,1,247,33,9.55,"(106.4963, 14.0131, 96.383)","[46, 47, 49, 50, 51, 52, 53, 54, 55, 57, 87, 8...",145.0046,"[342, 343, 345, 346, 347, 348, 349, 350, 351, ...",1002.52,"(0.596, -10.246, -1.702)",1,810.0
1,1a52_A,2,102,14,6.89,"(78.2059, 17.5699, 96.6319)","[75, 77, 78, 81, 164, 165, 166, 167, 168, 169,...",48.5404,"[371, 373, 374, 377, 460, 461, 462, 463, 464, ...",986.00,"(-22.231, 6.663, -4.167)",2,174.0
2,1a52_A,3,106,12,7.74,"(124.243, 13.8322, 100.9459)","[35, 37, 39, 40, 41, 111, 112, 113, 114, 115, ...",47.9504,"[331, 333, 335, 336, 337, 407, 408, 409, 410, ...",763.94,"(14.979, -19.541, 4.818)",3,268.0
3,1a52_A,4,206,30,9.53,"(108.2262, 29.4779, 103.2746)","[24, 27, 28, 29, 30, 31, 57, 58, 60, 61, 64, 9...",93.4443,"[320, 323, 324, 325, 326, 327, 353, 354, 356, ...",1141.78,"(8.006, 1.628, 7.98)",4,1136.0
4,1a52_A,5,138,14,6.90,"(88.8159, 33.3727, 109.8651)","[10, 11, 14, 18, 22, 65, 66, 67, 68, 69, 70, 7...",63.9008,"[306, 307, 310, 314, 318, 361, 362, 363, 364, ...",778.53,"(-8.191, 13.859, 12.819)",5,321.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9942,8y6b_F,4,123,14,7.89,"(-36.8073, -40.3212, 75.3179)","[308, 309, 310, 311, 353, 355, 361, 363, 364, ...",56.4942,"[339, 340, 341, 342, 384, 386, 392, 394, 395, ...",1055.19,"(7.25, 27.373, 18.042)",4,679.0
9943,8y6b_F,5,112,11,6.64,"(-10.1344, -15.8899, 82.3769)","[454, 500, 501, 502, 503, 504, 518, 519, 520, ...",51.0544,"[485, 531, 532, 533, 534, 535, 549, 550, 551, ...",858.19,"(-6.029, -4.212, 4.469)",5,610.0
9944,8y6b_F,6,178,16,7.68,"(-15.1356, 5.5597, 100.5795)","[126, 128, 129, 130, 131, 132, 133, 134, 152, ...",82.3000,"[157, 159, 160, 161, 162, 163, 164, 165, 183, ...",992.43,"(4.098, -29.091, 14.21)",6,412.0
9945,8y6o_H,1,184,13,7.80,"(270.4442, 310.1543, 240.5725)","[32, 35, 63, 64, 67, 70, 71, 74, 75, 76, 77, 7...",80.6517,"[32, 35, 63, 64, 67, 70, 71, 74, 75, 76, 77, 7...",878.13,"(-7.081, 13.986, -8.547)",1,176.0


In [None]:
# Disabled alternative: rank pockets within each rep_chain by descending score
# (tied scores share the lowest rank of the tie).
#POCKETFINDER_pockets_w_sifts_SASA['RANK'] = POCKETFINDER_pockets_w_sifts_SASA.groupby('rep_chain')['score'].rank(ascending=False, method='min').astype(int)

In [16]:
# Persist the final table. RANK here is the default (no ranking, i.e., RANK = ID).
POCKETFINDER_pockets_w_sifts_SASA.to_pickle("./results/POCKETFINDER_pockets_DEF_TRANS_UNRANKED.pkl")