In [1]:
from JSU_lib import *

In [2]:
# Input directories: SURFNET prediction files and cleaned representative chains.
SURFNET_dir = "./../DATA/surfnet_preds"
rep_chains_dir = "./../DATA/clean_rep_chains"

# File-name endings used to tell the prediction output files apart.
prot_pdb_ter = "_residue.pdb"
pocket_pdb_ter = "_pocket.pdb"
scores_ter = ".scores"
pymol_ter = ".pml"
dx_ter = ".dx"

# List the directory once, then split the names by ending (each group sorted by name).
SURFNET_files = os.listdir(SURFNET_dir)
pocket_files = sorted(name for name in SURFNET_files if name.endswith(pocket_pdb_ter))
score_files = sorted(name for name in SURFNET_files if name.endswith(scores_ter))
protein_files = sorted(name for name in SURFNET_files if name.endswith(prot_pdb_ter))

# One count per line, in the order: pocket files, score files, protein files.
print(len(pocket_files), len(score_files), len(protein_files), sep="\n")

3448
3448
3448


## Generating .ds file for PRANK re-scoring

In [9]:
# Header text for the PRANK dataset (.ds) file. Nothing in this cell prints or
# writes it: the loop that emitted the dataset rows is kept commented out below.
# NOTE(review): the header refers to ConCavity although this notebook processes
# SURFNET predictions — confirm the wording and PREDICTION_METHOD are intended.
header = """
# Dataset for rescoring ConCavity predictions

PARAM.PREDICTION_METHOD=concavity

HEADER: prediction protein

"""
# Disabled: prints one "<prediction file> <protein file>" line per pocket file,
# i.e. the two columns announced by "HEADER: prediction protein" above.
# for i, file in enumerate(pocket_files):
#     rep_chain = file.split(".")[0]
#     print(f'./../../DATA/surfnet_preds/{file} ./../../DATA/clean_rep_chains/{rep_chain}.clean.pdb')

# running PRANK on it took 1 minute 11.745 seconds with -threads 8

## Extracting pocket data

In [7]:
scores_master_dict = {}  # rep_chain -> parsed residue scores
rows = []                # one record per pocket, turned into a DataFrame in a later cell

# The pocket and score lists were sorted independently, so make sure they
# really line up before pairing them position by position.
assert len(pocket_files) == len(score_files), "pocket/score file counts differ"

for i, (pocket_file, score_file) in enumerate(zip(pocket_files, score_files)):

    if i % 100 == 0:
        print(i)  # coarse progress indicator

    rep_chain = pocket_file.split(".")[0]
    assert score_file.startswith(rep_chain), f"{score_file} does not belong to {rep_chain}"

    score_path = os.path.join(SURFNET_dir, score_file)
    pocket_path = os.path.join(SURFNET_dir, pocket_file)
    clean_pdb_path = os.path.join(rep_chains_dir, f'{rep_chain}.clean.pdb')

    clean_df = PDBXreader(inputfile=clean_pdb_path).atoms(format_type="pdb", excluded=())
    # grid points are stored as H HETATMs, so hydrogens must not be removed here
    pocket_df = PDBXreader(inputfile=pocket_path).atoms(format_type="pdb", excluded=(), remove_hydrogens = False)

    scores_dict = parse_concavity_residue_scores(score_path)
    scores_master_dict[rep_chain] = scores_dict

    coords_dict = create_concavity_coordinates_dict(pocket_df) # dict pocket ID: pocket grid points

    centroids_dict = calculate_concavity_centroids(coords_dict) # dict pocket ID: pocket centroids

    pocket_score_dict = calculate_concavity_pocket_score(pocket_df) # dict pocket ID: pocket score (SS of grid point scores)

    pocket_ress_dict = get_concavity_pocket_ress(coords_dict, clean_df)

    for k in centroids_dict.keys():
        pocket_centroid = tuple([round(el, 4) for el in centroids_dict[k]])
        pocket_score = pocket_score_dict[k]
        pocket_ress = pocket_ress_dict[k]
        n_ress = len(pocket_ress)
        n_grid_points = len(coords_dict[k])

        # Radius of gyration of the pocket, computed on the CA atoms of its residues.
        pocket_ress_df = clean_df.query('label_seq_id_full in @pocket_ress')
        ress_CAs = pocket_ress_df.query('label_atom_id == "CA"')
        CAs_coords = ress_CAs[["Cartn_x", "Cartn_y", "Cartn_z"]].to_numpy()
        if len(CAs_coords) == 0:
            # no CA atoms matched: the radius is undefined, record NaN instead of failing
            radius_of_gyration = np.nan
        else:
            center_of_mass = np.mean(CAs_coords, axis=0)
            distances_squared = np.sum((CAs_coords - center_of_mass)**2, axis=1)
            radius_of_gyration = round(np.sqrt(np.mean(distances_squared)), 2)

        # pocket IDs are shifted by one when stored (int(k) + 1)
        d_row = [rep_chain, int(k)+1, n_grid_points, n_ress, radius_of_gyration, pocket_centroid, pocket_ress, pocket_score]
        rows.append(d_row)

0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400


In [8]:
# Column names, in the same order as the values of each d_row built in the extraction loop.
pocket_columns = [
    "rep_chain",
    "ID",
    "n_points",
    "n_aas",
    "RoG",
    "centre",
    "aas",
    "score",
]
SURFNET_pockets = pd.DataFrame(rows, columns=pocket_columns)

In [9]:
# Preview the per-pocket table (one row per predicted pocket).
SURFNET_pockets

Unnamed: 0,rep_chain,ID,n_points,n_aas,RoG,centre,aas,score
0,1a52_A,1,104,16,7.54,"(93.9565, 27.4173, 87.3074)","[89, 155, 156, 159, 160, 163, 183, 211, 212, 2...",733981.0
1,1a52_A,2,304,37,9.71,"(106.4199, 14.214, 96.4682)","[46, 47, 49, 50, 51, 52, 53, 54, 55, 57, 87, 8...",6062830.0
2,1a52_A,3,102,16,6.96,"(77.709, 18.6999, 96.6904)","[75, 77, 78, 81, 164, 165, 166, 167, 168, 169,...",901960.0
3,1a52_A,4,203,31,8.60,"(103.9476, 24.4897, 103.4454)","[26, 27, 28, 29, 30, 31, 53, 54, 56, 57, 58, 5...",3879614.0
4,1a5h_B,1,211,28,8.69,"(21.4455, 54.4027, 10.1283)","[122, 123, 124, 125, 126, 127, 128, 129, 130, ...",3214078.0
...,...,...,...,...,...,...,...,...
10599,8y6b_F,4,400,23,9.97,"(-27.0089, -10.9062, 76.7475)","[186, 189, 191, 192, 193, 408, 409, 410, 411, ...",2645115.0
10600,8y6b_F,5,165,13,7.49,"(-12.0191, -15.301, 82.5421)","[454, 499, 500, 501, 502, 503, 504, 518, 519, ...",1593794.0
10601,8y6b_F,6,182,13,7.05,"(-7.2889, -30.3494, 83.4135)","[212, 214, 215, 216, 217, 218, 233, 234, 235, ...",1558760.0
10602,8y6b_F,7,144,16,7.27,"(-14.6807, 8.6334, 103.2561)","[110, 128, 129, 130, 131, 132, 133, 134, 135, ...",1910498.0


In [10]:
# Total number of predicted pockets (rows of the table).
print(SURFNET_pockets.shape[0])

10604


## Mapping PDB residues to UniProt (UP) residues

In [11]:
up_aas = {}         # (rep_chain, pocket ID) -> mapped residue numbers of the pocket residues
errors = []         # rep_chain of every pocket with at least one unmappable residue
mapping_cache = {}  # rep_chain -> mapping dict, so each pickle is read only once

for row in SURFNET_pockets.itertuples(index=False):
    rep_chain = row.rep_chain
    if rep_chain not in mapping_cache:
        mapping_cache[rep_chain] = read_from_pickle(f'./../DATA/mappings_label_full/{rep_chain}.pkl')
    mapping_dict = mapping_cache[rep_chain]

    try:
        # The mapping is looked up with string keys. Pocket residues appear to be
        # plain integers (no AltLoc-style identifiers seem to be present).
        site_up_aas = [mapping_dict[str(res)] for res in row.aas]
    except KeyError:
        # A residue is missing from the mapping: discard this pocket and flag its chain.
        errors.append(rep_chain)
        continue
    up_aas[(rep_chain, row.ID)] = site_up_aas

print(len(up_aas)) #10256
print(len(errors)) #348

errors = sorted(set(errors)) # 315 chains have issues with SIFTS mapping (sorted for a reproducible pickle)
print(len(errors)) #315

save_to_pickle(errors, "./results/SURFNET_no_sifts.pkl")

# Drop every pocket of a flagged chain, then attach the mapped residues.
SURFNET_pockets_w_sifts = SURFNET_pockets.query('rep_chain not in @errors').copy().reset_index(drop = True)
SURFNET_pockets_w_sifts["up_aas"] = SURFNET_pockets_w_sifts.set_index(['rep_chain', 'ID']).index.map(up_aas)

# Every pocket residue must have been mapped: both lists have the same length.
assert SURFNET_pockets_w_sifts.n_aas.equals(SURFNET_pockets_w_sifts["up_aas"].map(len))

SURFNET_pockets_w_sifts

10256
348
315


Unnamed: 0,rep_chain,ID,n_points,n_aas,RoG,centre,aas,score,up_aas
0,1a52_A,1,104,16,7.54,"(93.9565, 27.4173, 87.3074)","[89, 155, 156, 159, 160, 163, 183, 211, 212, 2...",733981.0,"[385, 451, 452, 455, 456, 459, 479, 507, 508, ..."
1,1a52_A,2,304,37,9.71,"(106.4199, 14.214, 96.4682)","[46, 47, 49, 50, 51, 52, 53, 54, 55, 57, 87, 8...",6062830.0,"[342, 343, 345, 346, 347, 348, 349, 350, 351, ..."
2,1a52_A,3,102,16,6.96,"(77.709, 18.6999, 96.6904)","[75, 77, 78, 81, 164, 165, 166, 167, 168, 169,...",901960.0,"[371, 373, 374, 377, 460, 461, 462, 463, 464, ..."
3,1a52_A,4,203,31,8.60,"(103.9476, 24.4897, 103.4454)","[26, 27, 28, 29, 30, 31, 53, 54, 56, 57, 58, 5...",3879614.0,"[322, 323, 324, 325, 326, 327, 349, 350, 352, ..."
4,1a5h_B,1,211,28,8.69,"(21.4455, 54.4027, 10.1283)","[122, 123, 124, 125, 126, 127, 128, 129, 130, ...",3214078.0,"[432, 433, 434, 435, 436, 437, 438, 439, 440, ..."
...,...,...,...,...,...,...,...,...,...
9928,8y6b_F,4,400,23,9.97,"(-27.0089, -10.9062, 76.7475)","[186, 189, 191, 192, 193, 408, 409, 410, 411, ...",2645115.0,"[217, 220, 222, 223, 224, 439, 440, 441, 442, ..."
9929,8y6b_F,5,165,13,7.49,"(-12.0191, -15.301, 82.5421)","[454, 499, 500, 501, 502, 503, 504, 518, 519, ...",1593794.0,"[485, 530, 531, 532, 533, 534, 535, 549, 550, ..."
9930,8y6b_F,6,182,13,7.05,"(-7.2889, -30.3494, 83.4135)","[212, 214, 215, 216, 217, 218, 233, 234, 235, ...",1558760.0,"[243, 245, 246, 247, 248, 249, 264, 265, 266, ..."
9931,8y6b_F,7,144,16,7.27,"(-14.6807, 8.6334, 103.2561)","[110, 128, 129, 130, 131, 132, 133, 134, 135, ...",1910498.0,"[141, 159, 160, 161, 162, 163, 164, 165, 166, ..."


## Add pocket surfaces and volumes

In [16]:
master_SASA_dict = read_from_pickle("./results/master_SASA_dict.pkl")

# Pass a copy so that the frame produced by the mapping step is never modified
# from here on; this keeps the cell idempotent when it is re-run.
SURFNET_pockets_w_sifts_SASA = calculate_total_sasa(SURFNET_pockets_w_sifts.copy(), master_SASA_dict)

# Pocket volumes are looked up by (rep_chain, ID); pockets without an entry get NaN.
vols_dict = read_from_pickle("./results/SURFNET_volumes_dict.pkl")
SURFNET_pockets_w_sifts_SASA["VOL"] = SURFNET_pockets_w_sifts_SASA.set_index(['rep_chain', 'ID']).index.map(vols_dict)
print(SURFNET_pockets_w_sifts_SASA["VOL"].isna().sum())  # number of pockets without a volume

# Row-wise call of apply_rotation; its result is stored as the 'centre_mat' column.
SURFNET_pockets_w_sifts_SASA['centre_mat'] = SURFNET_pockets_w_sifts_SASA.apply(apply_rotation, axis=1)

81


### Default RANK is unranked (RANK = ID), as SURFNET neither scores nor ranks its pockets

In [17]:
# Default "ranking": the rank of a pocket is simply its ID.
SURFNET_pockets_w_sifts_SASA["RANK"] = SURFNET_pockets_w_sifts_SASA["ID"]

In [18]:
# Preview the final table before it is saved.
SURFNET_pockets_w_sifts_SASA

Unnamed: 0,rep_chain,ID,n_points,n_aas,RoG,centre,aas,score,up_aas,SASA,centre_mat,RANK,VOL
0,1a52_A,1,104,16,7.54,"(93.9565, 27.4173, 87.3074)","[89, 155, 156, 159, 160, 163, 183, 211, 212, 2...",733981.0,"[385, 451, 452, 455, 456, 459, 479, 507, 508, ...",675.43,"(-2.263, 8.398, -9.67)",1,531.0
1,1a52_A,2,304,37,9.71,"(106.4199, 14.214, 96.4682)","[46, 47, 49, 50, 51, 52, 53, 54, 55, 57, 87, 8...",6062830.0,"[342, 343, 345, 346, 347, 348, 349, 350, 351, ...",1167.12,"(0.607, -10.043, -1.592)",2,827.0
2,1a52_A,3,102,16,6.96,"(77.709, 18.6999, 96.6904)","[75, 77, 78, 81, 164, 165, 166, 167, 168, 169,...",901960.0,"[371, 373, 374, 377, 460, 461, 462, 463, 464, ...",1164.77,"(-22.143, 7.88, -3.968)",3,174.0
3,1a52_A,4,203,31,8.60,"(103.9476, 24.4897, 103.4454)","[26, 27, 28, 29, 30, 31, 53, 54, 56, 57, 58, 5...",3879614.0,"[322, 323, 324, 325, 326, 327, 349, 350, 352, ...",1006.48,"(1.949, -0.616, 6.756)",4,846.0
4,1a5h_B,1,211,28,8.69,"(21.4455, 54.4027, 10.1283)","[122, 123, 124, 125, 126, 127, 128, 129, 130, ...",3214078.0,"[432, 433, 434, 435, 436, 437, 438, 439, 440, ...",1304.26,"(-10.555, 12.844, 5.823)",1,403.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9928,8y6b_F,4,400,23,9.97,"(-27.0089, -10.9062, 76.7475)","[186, 189, 191, 192, 193, 408, 409, 410, 411, ...",2645115.0,"[217, 220, 222, 223, 224, 439, 440, 441, 442, ...",1856.81,"(12.022, -0.298, 4.819)",4,2454.0
9929,8y6b_F,5,165,13,7.49,"(-12.0191, -15.301, 82.5421)","[454, 499, 500, 501, 502, 503, 504, 518, 519, ...",1593794.0,"[485, 530, 531, 532, 533, 534, 535, 549, 550, ...",965.62,"(-4.174, -4.192, 5.165)",5,1136.0
9930,8y6b_F,6,182,13,7.05,"(-7.2889, -30.3494, 83.4135)","[212, 214, 215, 216, 217, 218, 233, 234, 235, ...",1558760.0,"[243, 245, 246, 247, 248, 249, 264, 265, 266, ...",967.13,"(-15.35, 6.173, 9.319)",6,391.0
9931,8y6b_F,7,144,16,7.27,"(-14.6807, 8.6334, 103.2561)","[110, 128, 129, 130, 131, 132, 133, 134, 135, ...",1910498.0,"[141, 159, 160, 161, 162, 163, 164, 165, 166, ...",936.44,"(4.526, -33.051, 15.187)",7,379.0


In [None]:
# Disabled alternative: rank the pockets of each chain by descending score
# (ties share the lowest rank). Not applied; the saved table keeps RANK = ID.
#SURFNET_pockets_w_sifts_SASA['RANK'] = SURFNET_pockets_w_sifts_SASA.groupby('rep_chain')['score'].rank(ascending=False, method='min').astype(int)

In [19]:
# Persist the table; rank here is default (no ranking, i.e., RANK = ID).
unranked_pockets_path = "./results/SURFNET_pockets_DEF_TRANS_UNRANKED.pkl"
SURFNET_pockets_w_sifts_SASA.to_pickle(unranked_pockets_path)