In [1]:
from JSU_lib import *

In [2]:
# --- Input directories ------------------------------------------------------
LIGSITE_dir = "./../DATA/ligsite_preds"
rep_chains_dir = "./../DATA/clean_rep_chains"

# --- File-name endings of the per-chain prediction outputs -------------------
prot_pdb_ter = "_residue.pdb"
pocket_pdb_ter = "_pocket.pdb"
scores_ter = ".scores"
pymol_ter = ".pml"
dx_ter = ".dx"

# List the prediction directory once, then split it into three alphabetically
# sorted groups (pockets, scores, proteins) by file-name ending.
LIGSITE_files = os.listdir(LIGSITE_dir)
pocket_files, score_files, protein_files = (
    sorted(fname for fname in LIGSITE_files if fname.endswith(suffix))
    for suffix in (pocket_pdb_ter, scores_ter, prot_pdb_ter)
)

# The three counts are expected to match (one file of each kind per chain).
print(len(pocket_files), len(score_files), len(protein_files), sep="\n")

3448
3448
3448


## Generating the .ds dataset file for PRANK re-scoring

In [43]:
# Header block of the PRANK dataset (.ds) file, written line by line.
# `header` is only defined here; nothing in this cell writes it out.
# NOTE(review): the text names ConCavity (`PARAM.PREDICTION_METHOD=concavity`)
# although this notebook processes LIGSITE predictions — presumably the LIGSITE
# output is read as ConCavity-formatted; confirm before regenerating the file.
header = (
    "\n"
    "# Dataset for rescoring ConCavity predictions\n"
    "\n"
    "PARAM.PREDICTION_METHOD=concavity\n"
    "\n"
    "HEADER: prediction protein\n"
    "\n"
)

# Disabled: prints one "<prediction> <protein>" dataset line per pocket file.
# for i, file in enumerate(pocket_files):
#     rep_chain = file.split(".")[0]
#     print(f'./../../DATA/ligsite_preds/{file} ./../../DATA/clean_rep_chains/{rep_chain}.clean.pdb')

# Running PRANK on the resulting dataset took 1 minute 21.719 seconds with -threads 8.

## Extracting pocket data

In [14]:
# Build one record per predicted pocket (grid-point count, residue list, radius
# of gyration, centroid and score) and keep the parsed residue scores per chain.
scores_master_dict = {}  # rep_chain -> output of parse_concavity_residue_scores
rows = []  # one list per pocket, in the column order used to build the table

# pocket_files and score_files were sorted independently, so pairing them by
# position is only valid when both lists cover the same chains in the same
# order. Fail loudly instead of silently pairing a pocket with foreign scores.
if len(pocket_files) != len(score_files):
    raise ValueError(
        f"{len(pocket_files)} pocket files but {len(score_files)} score files in {LIGSITE_dir}"
    )

for i, (pocket_file, score_file) in enumerate(zip(pocket_files, score_files)):

    if i % 100 == 0:
        print(i)  # coarse progress indicator

    # Chain identifier = everything before the first "." of the file name.
    rep_chain = pocket_file.split(".")[0]
    if not score_file.startswith(rep_chain + "."):
        raise ValueError(
            f"Pocket file {pocket_file!r} is paired with score file {score_file!r}, "
            f"which does not belong to chain {rep_chain!r}"
        )

    score_path = os.path.join(LIGSITE_dir, score_file)
    pocket_path = os.path.join(LIGSITE_dir, pocket_file)
    # Use the configured directory rather than a second hard-coded copy of it.
    clean_pdb_path = os.path.join(rep_chains_dir, f'{rep_chain}.clean.pdb')

    clean_df = PDBXreader(inputfile=clean_pdb_path).atoms(format_type="pdb", excluded=())
    # Pocket grid points are stored as H HETATMs, so hydrogens must be kept.
    pocket_df = PDBXreader(inputfile=pocket_path).atoms(format_type="pdb", excluded=(), remove_hydrogens = False)

    scores_dict = parse_concavity_residue_scores(score_path)
    scores_master_dict[rep_chain] = scores_dict

    coords_dict = create_concavity_coordinates_dict(pocket_df) # dict pocket ID: pocket grid points

    centroids_dict = calculate_concavity_centroids(coords_dict) # dict pocket ID: pocket centroids

    pocket_score_dict = calculate_concavity_pocket_score(pocket_df) # dict pocket ID: pocket score (SS of grid point scores)

    pocket_ress_dict = get_concavity_pocket_ress(coords_dict, clean_df) # dict pocket ID: pocket residues

    for k in centroids_dict:
        pocket_centroid = tuple(round(el, 4) for el in centroids_dict[k])
        pocket_score = pocket_score_dict[k]
        pocket_ress = pocket_ress_dict[k]
        n_ress = len(pocket_ress)
        n_grid_points = len(coords_dict[k])

        # Radius of gyration over the CA atoms of the pocket residues:
        # root-mean-square distance of the CAs from their mean position.
        pocket_ress_df = clean_df.query('label_seq_id_full in @pocket_ress')
        ress_CAs = pocket_ress_df.query('label_atom_id == "CA"')
        CAs_coords = ress_CAs[["Cartn_x", "Cartn_y", "Cartn_z"]].to_numpy()  # shape (n_CA, 3)
        if len(CAs_coords) == 0:
            # No CA atom found for this pocket: the radius is undefined. Record NaN
            # instead of failing on an empty array.
            radius_of_gyration = float("nan")
        else:
            center_of_mass = CAs_coords.mean(axis=0)
            distances_squared = np.sum((CAs_coords - center_of_mass)**2, axis=1)
            radius_of_gyration = round(np.sqrt(np.mean(distances_squared)), 2)

        # Pocket IDs are reported 1-based (dictionary keys are shifted by +1).
        d_row = [rep_chain, int(k)+1, n_grid_points, n_ress, radius_of_gyration, pocket_centroid, pocket_ress, pocket_score]
        rows.append(d_row)

0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400


In [15]:
# One row per predicted pocket; the column order mirrors the records in `rows`.
LIGSITE_pockets = pd.DataFrame(
    data=rows,
    columns=["rep_chain", "ID", "n_points", "n_aas", "RoG", "centre", "aas", "score"],
)

In [16]:
# Display the per-pocket table (one row per rep_chain / pocket ID pair).
LIGSITE_pockets

Unnamed: 0,rep_chain,ID,n_points,n_aas,RoG,centre,aas,score
0,1a52_A,1,445,46,10.07,"(105.8655, 14.1064, 96.9598)","[46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 5...",9672.3621
1,1a52_A,2,267,37,9.12,"(104.8636, 23.8273, 103.9751)","[27, 28, 29, 30, 31, 32, 33, 53, 54, 56, 57, 5...",4729.9550
2,1a5h_B,1,316,39,9.36,"(21.5664, 53.3357, 12.1273)","[122, 123, 124, 125, 126, 127, 128, 129, 130, ...",4718.5873
3,1a5h_B,2,340,51,10.19,"(25.0297, 56.0108, 26.3255)","[2, 47, 93, 131, 136, 144, 158, 161, 162, 164,...",5227.3023
4,1a9w_E,1,406,42,10.32,"(4.5327, 34.3028, 47.2489)","[28, 31, 32, 35, 37, 38, 39, 40, 41, 42, 63, 6...",8387.0897
...,...,...,...,...,...,...,...,...
8223,8y14_A,2,1312,80,13.50,"(-19.7134, -15.9791, 38.5796)","[217, 248, 249, 250, 251, 252, 253, 254, 280, ...",13741.3574
8224,8y6b_F,1,1415,65,12.33,"(-20.6654, -30.8571, 69.0335)","[206, 207, 208, 209, 210, 211, 212, 217, 218, ...",27575.9767
8225,8y6b_F,2,158,16,8.12,"(-23.5955, -8.2414, 75.3798)","[189, 191, 192, 193, 455, 456, 457, 458, 472, ...",2400.4619
8226,8y6o_H,1,324,28,9.25,"(265.5678, 300.1079, 251.2892)","[73, 74, 75, 76, 77, 78, 79, 80, 94, 95, 96, 9...",3420.3469


In [17]:
# Total number of predicted pockets across all chains.
print(LIGSITE_pockets.shape[0])

8228


## Mapping PDB residues to UniProt (UP) residues

In [18]:
# Translate each pocket's residue numbers through the per-chain mapping pickle.
# Chains with at least one unmappable residue are recorded, saved and removed.
up_aas = {}  # (rep_chain, pocket ID) -> mapped residue numbers, same order as `aas`

errors = []  # one rep_chain entry per pocket that could not be fully mapped

mapping_cache = {}  # rep_chain -> mapping dict, so each pickle is read only once

for _, row in LIGSITE_pockets.iterrows():
    rep_chain = row["rep_chain"]
    site_ress = row["aas"]
    ID = row["ID"]

    if rep_chain not in mapping_cache:
        mapping_cache[rep_chain] = read_from_pickle(f'./../DATA/mappings_label_full/{rep_chain}.pkl')
    mapping_dict = mapping_cache[rep_chain]

    try:
        # Mapping keys are strings. The residue IDs here appear to be plain
        # integers (no AltLoc variants present), hence the str() conversion.
        site_up_aas = [mapping_dict[str(res)] for res in site_ress]
    except KeyError:
        # A residue is absent from the mapping: skip the whole pocket. Only the
        # missing-key case is handled; any other exception now propagates
        # instead of being silently swallowed.
        errors.append(rep_chain)
        continue
    up_aas[(rep_chain, ID)] = site_up_aas

print(len(up_aas)) #7914
print(len(errors)) #314

# De-duplicate to chain level; sorted so the saved pickle is reproducible.
errors = sorted(set(errors)) # 278 chains have issues with SIFTS mapping
print(len(errors)) #278

save_to_pickle(errors, "./results/LIGSITE_no_sifts.pkl")

# Drop every pocket of an affected chain, not only the pockets that failed.
LIGSITE_pockets_w_sifts = LIGSITE_pockets.query('rep_chain not in @errors').copy().reset_index(drop = True)
LIGSITE_pockets_w_sifts["up_aas"] = LIGSITE_pockets_w_sifts.set_index(['rep_chain', 'ID']).index.map(up_aas)

# Sanity check: the mapping is one-to-one, so each pocket keeps its residue count.
n_up_aas = LIGSITE_pockets_w_sifts["up_aas"].apply(len)
assert (LIGSITE_pockets_w_sifts["n_aas"] == n_up_aas).all(), "mapped residue count differs from n_aas"

LIGSITE_pockets_w_sifts

7914
314
278


Unnamed: 0,rep_chain,ID,n_points,n_aas,RoG,centre,aas,score,up_aas
0,1a52_A,1,445,46,10.07,"(105.8655, 14.1064, 96.9598)","[46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 5...",9672.3621,"[342, 343, 344, 345, 346, 347, 348, 349, 350, ..."
1,1a52_A,2,267,37,9.12,"(104.8636, 23.8273, 103.9751)","[27, 28, 29, 30, 31, 32, 33, 53, 54, 56, 57, 5...",4729.9550,"[323, 324, 325, 326, 327, 328, 329, 349, 350, ..."
2,1a5h_B,1,316,39,9.36,"(21.5664, 53.3357, 12.1273)","[122, 123, 124, 125, 126, 127, 128, 129, 130, ...",4718.5873,"[432, 433, 434, 435, 436, 437, 438, 439, 440, ..."
3,1a5h_B,2,340,51,10.19,"(25.0297, 56.0108, 26.3255)","[2, 47, 93, 131, 136, 144, 158, 161, 162, 164,...",5227.3023,"[312, 357, 403, 441, 446, 454, 468, 471, 472, ..."
4,1a9w_E,1,406,42,10.32,"(4.5327, 34.3028, 47.2489)","[28, 31, 32, 35, 37, 38, 39, 40, 41, 42, 63, 6...",8387.0897,"[29, 32, 33, 36, 38, 39, 40, 41, 42, 43, 64, 6..."
...,...,...,...,...,...,...,...,...,...
7738,8y14_A,2,1312,80,13.50,"(-19.7134, -15.9791, 38.5796)","[217, 248, 249, 250, 251, 252, 253, 254, 280, ...",13741.3574,"[354, 385, 386, 387, 388, 389, 390, 391, 417, ..."
7739,8y6b_F,1,1415,65,12.33,"(-20.6654, -30.8571, 69.0335)","[206, 207, 208, 209, 210, 211, 212, 217, 218, ...",27575.9767,"[237, 238, 239, 240, 241, 242, 243, 248, 249, ..."
7740,8y6b_F,2,158,16,8.12,"(-23.5955, -8.2414, 75.3798)","[189, 191, 192, 193, 455, 456, 457, 458, 472, ...",2400.4619,"[220, 222, 223, 224, 486, 487, 488, 489, 503, ..."
7741,8y6o_H,1,324,28,9.25,"(265.5678, 300.1079, 251.2892)","[73, 74, 75, 76, 77, 78, 79, 80, 94, 95, 96, 9...",3420.3469,"[73, 74, 75, 76, 77, 78, 79, 80, 94, 95, 96, 9..."


## Add pocket surfaces and volumes

In [26]:
# Attach surface area: the pickled SASA lookup is handed to calculate_total_sasa
# together with the mapped pocket table (presumably adding the SASA column).
master_SASA_dict = read_from_pickle("./results/master_SASA_dict.pkl")

LIGSITE_pockets_w_sifts_SASA = calculate_total_sasa(LIGSITE_pockets_w_sifts, master_SASA_dict)

# Attach volumes, looked up by (rep_chain, ID); pockets absent from the
# dictionary get NaN.
vols_dict = read_from_pickle("./results/LIGSITE_volumes_dict.pkl")
LIGSITE_pockets_w_sifts_SASA["VOL"] = (
    LIGSITE_pockets_w_sifts_SASA
    .set_index(['rep_chain', 'ID'])
    .index
    .map(vols_dict)
)

# Number of pockets left without a volume.
print(int(LIGSITE_pockets_w_sifts_SASA["VOL"].isna().sum()))

# Row-wise apply_rotation result stored as `centre_mat`
# (presumably the pocket centre in a transformed frame — confirm in JSU_lib).
LIGSITE_pockets_w_sifts_SASA['centre_mat'] = LIGSITE_pockets_w_sifts_SASA.apply(apply_rotation, axis=1)

72


### Default RANK is unranked (RANK = ID), as LIGSITE neither scores nor ranks its pockets

In [27]:
# No ranking is applied: the default rank of a pocket is simply its ID.
LIGSITE_pockets_w_sifts_SASA["RANK"] = LIGSITE_pockets_w_sifts_SASA["ID"]

In [28]:
# Display the final pocket table, including the SASA, VOL, centre_mat and RANK columns.
LIGSITE_pockets_w_sifts_SASA

Unnamed: 0,rep_chain,ID,n_points,n_aas,RoG,centre,aas,score,up_aas,SASA,centre_mat,RANK,VOL
0,1a52_A,1,445,46,10.07,"(105.8655, 14.1064, 96.9598)","[46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 5...",9672.3621,"[342, 343, 344, 345, 346, 347, 348, 349, 350, ...",1369.28,"(-0.015, -9.913, -1.197)",1,919.0
1,1a52_A,2,267,37,9.12,"(104.8636, 23.8273, 103.9751)","[27, 28, 29, 30, 31, 32, 33, 53, 54, 56, 57, 5...",4729.9550,"[323, 324, 325, 326, 327, 328, 329, 349, 350, ...",1309.35,"(2.33, -1.69, 7.264)",2,1033.0
2,1a5h_B,1,316,39,9.36,"(21.5664, 53.3357, 12.1273)","[122, 123, 124, 125, 126, 127, 128, 129, 130, ...",4718.5873,"[432, 433, 434, 435, 436, 437, 438, 439, 440, ...",1724.09,"(-9.845, 12.983, 3.672)",1,568.0
3,1a5h_B,2,340,51,10.19,"(25.0297, 56.0108, 26.3255)","[2, 47, 93, 131, 136, 144, 158, 161, 162, 164,...",5227.3023,"[312, 357, 403, 441, 446, 454, 468, 471, 472, ...",1897.66,"(2.741, 13.688, -4.192)",2,1412.0
4,1a9w_E,1,406,42,10.32,"(4.5327, 34.3028, 47.2489)","[28, 31, 32, 35, 37, 38, 39, 40, 41, 42, 63, 6...",8387.0897,"[29, 32, 33, 36, 38, 39, 40, 41, 42, 43, 64, 6...",1922.00,"(-4.049, 5.115, 1.142)",1,999.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7738,8y14_A,2,1312,80,13.50,"(-19.7134, -15.9791, 38.5796)","[217, 248, 249, 250, 251, 252, 253, 254, 280, ...",13741.3574,"[354, 385, 386, 387, 388, 389, 390, 391, 417, ...",2299.26,"(3.124, 18.318, -1.097)",2,6737.0
7739,8y6b_F,1,1415,65,12.33,"(-20.6654, -30.8571, 69.0335)","[206, 207, 208, 209, 210, 211, 212, 217, 218, ...",27575.9767,"[237, 238, 239, 240, 241, 242, 243, 248, 249, ...",1675.15,"(-0.893, 17.786, 2.83)",1,3246.0
7740,8y6b_F,2,158,16,8.12,"(-23.5955, -8.2414, 75.3798)","[189, 191, 192, 193, 455, 456, 457, 458, 472, ...",2400.4619,"[220, 222, 223, 224, 486, 487, 488, 489, 503, ...",1154.90,"(10.614, -2.846, 1.334)",2,1911.0
7741,8y6o_H,1,324,28,9.25,"(265.5678, 300.1079, 251.2892)","[73, 74, 75, 76, 77, 78, 79, 80, 94, 95, 96, 9...",3420.3469,"[73, 74, 75, 76, 77, 78, 79, 80, 94, 95, 96, 9...",1824.15,"(-10.825, -1.032, -8.461)",1,1089.0


In [111]:
# Disabled alternative: rank pockets within each rep_chain by descending `score`
# (ties share the lowest rank). Left commented out so RANK stays equal to ID.
#LIGSITE_pockets_w_sifts_SASA['RANK'] = LIGSITE_pockets_w_sifts_SASA.groupby('rep_chain')['score'].rank(ascending=False, method='min').astype(int)

In [29]:
# Persist the final pocket table for downstream use.
LIGSITE_pockets_w_sifts_SASA.to_pickle("./results/LIGSITE_pockets_DEF_TRANS_UNRANKED.pkl") # RANK here is the default (no ranking, i.e., RANK = ID)