## Imports

In [1]:
from JSU_lib import *

## Functions

## Reading input data

In [2]:
# Load the GrASP pocket definitions and the per-pocket scores from pickled results.
# Judging by how they are indexed when the table is built, both look like nested dicts
# keyed by representative chain and then pocket ID — TODO confirm against the producer.
grasp_pockets = read_from_pickle("./results/GrASP_pockets_dict_RIGHT_CLUSTERING.pkl")
grasp_pockets_scores = read_from_pickle("./results/GrASP_pocket_scores_dict_RIGHT_CLUSTERING.pkl")

In [3]:
# Number of top-level entries (representative chains) in the pocket dictionary.
len(grasp_pockets)

4030

In [5]:
# Load the LIGYSIS reference chains and count how many of them have GrASP pockets.
LIGYSIS_reps = read_from_pickle("./results/LIGYSIS_3448_chains.pkl")
len(set(LIGYSIS_reps) & set(grasp_pockets))

3441

In [6]:
# Flatten the nested {rep_chain: {ID: residues}} dictionary into one row per pocket,
# attaching each pocket's score and its within-chain rank (1 = highest score).
rows = []
for rep_chain, ids_sites in grasp_pockets.items():
    chain_scores = grasp_pockets_scores.get(rep_chain, {})
    # The ranking depends only on the chain, so it is built once per chain rather than
    # once per pocket. sorted() is stable, so tied scores keep their dictionary order.
    ranked = sorted(chain_scores.items(), key=lambda item: item[1], reverse=True)
    rankings = {pocket_id: rank for rank, (pocket_id, _score) in enumerate(ranked, start=1)}
    for ID, sites in ids_sites.items():
        if ID not in chain_scores:
            # A pocket without a score cannot be ranked: report it and leave it out.
            print(ids_sites)
            print(chain_scores)
            print(f'ERROR with {rep_chain} {ID}')
            continue
        rows.append({'rep_chain': rep_chain, 'ID': ID, 'site_ress': sites, 'score': chain_scores[ID], 'RANK': rankings[ID]})

# Creating the DataFrame. Columns are named explicitly so that an empty `rows`
# still yields a frame with the expected schema instead of failing below.
grasp_pockets_df = pd.DataFrame(rows, columns=['rep_chain', 'ID', 'site_ress', 'score', 'RANK'])
grasp_pockets_df["n_ress"] = grasp_pockets_df['site_ress'].apply(len)  # residues per pocket
grasp_pockets_df = grasp_pockets_df.sort_values(by=["rep_chain", "RANK"]).reset_index(drop = True)

In [7]:
# Preview the first rows of the pocket table.
grasp_pockets_df.head(3)

Unnamed: 0,rep_chain,ID,site_ress,score,RANK,n_ress
0,1a52_A,0,"[108, 125, 128, 129, 132, 225, 228, 229, 47, 5...",45.484,1,22
1,1a5h_B,0,"[174, 175, 197, 198, 199, 200, 201, 203, 221, ...",47.888,1,25
2,1a9w_E,0,"[102, 103, 106, 110, 137, 138, 141, 28, 31, 38...",40.447,1,23


In [6]:
# Print score and comma-separated residue selection for every pocket of chain 5emn_B.
pockets_5emn_B = grasp_pockets_df[grasp_pockets_df["rep_chain"] == "5emn_B"]
for _, pocket in pockets_5emn_B.iterrows():
    print(f'{pocket.score}\tsel : {",".join(pocket.site_ress)}')

109.711	sel : 114,115,116,119,120,121,122,123,149,153,260,27,28,29,30,31,319,320,365,393,394,395,396,397,398,413,414,415,417,419,427,428,429,430,431,432,455,479,571,615,616,617,618,79,80,81,82,83,84,87
35.354	sel : 239,241,474,475,476,477,506,507,508,513,535,537,538,543,545,546,547,572,576,579


## Calculate RoG of GrASP pockets

In [9]:
# Directory holding the per-chain structure files used for the RoG calculation
# (files are read from it as '<rep_chain>.trans.pdb').
target_dir = './../DATA/trans_rep_chains'

# Distinct representative chains present in the pocket table.
rep_chains_sites = grasp_pockets_df.rep_chain.unique().tolist()

In [10]:
# Number of distinct representative chains with at least one pocket row.
len(rep_chains_sites)

4030

In [11]:
# Columns kept from the atom table returned by PDBXreader when reading each structure.
# Presumably residue number, residue name, element, atom name and x/y/z coordinates
# (names follow mmCIF conventions) — TODO confirm against PDBXreader.
cc = [
    'label_seq_id_full', 'label_comp_id', 'type_symbol',
    'label_atom_id', 'Cartn_x', 'Cartn_y', 'Cartn_z'
]

In [12]:
# For every pocket, compute the radius of gyration (RoG) and the centroid of the CA atoms
# of its residues, using the '.trans.pdb' structure of its representative chain.
# Both result dictionaries are keyed by (rep_chain, RANK).
T0 = time.time()
RoG_dict = {}
centres_dict = {}
# Split the pocket table by chain once, instead of re-scanning the whole table per chain.
pockets_by_chain = {
    chain: chain_df
    for chain, chain_df in grasp_pockets_df.groupby('rep_chain', sort=False)
}
for i, rep_chain in enumerate(rep_chains_sites):
    if i % 100 == 0:
        print(i)  # coarse progress indicator
    rep_chain_df = pockets_by_chain.get(rep_chain)
    if rep_chain_df is None:
        continue  # no pockets for this chain: nothing to compute, skip reading the file
    pdb_path = os.path.join(target_dir, f'{rep_chain}.trans.pdb') # calculating RoG in TRANS structures
    # Keep only CA atoms: one representative point per residue.
    df = PDBXreader(inputfile=pdb_path).atoms(format_type="pdb", excluded=())[cc].query('label_atom_id == "CA"').reset_index(drop = True)
    for _, row in rep_chain_df.iterrows():
        site_ress = row.site_ress
        pocket_key = (rep_chain, row["RANK"])
        ress_CAs = df.query('label_seq_id_full in @site_ress')
        if ress_CAs.empty:
            # Without coordinates the array below would be empty and np.sum(axis=1)
            # would raise, aborting the whole run. Leave the pocket out instead;
            # it then has no entry in either dictionary.
            print(f'WARNING: no CA atoms found for {rep_chain} RANK {row["RANK"]}, skipping')
            continue
        CAs_coords = np.array(list((zip(ress_CAs.Cartn_x, ress_CAs.Cartn_y, ress_CAs.Cartn_z))))
        center_of_mass = np.mean(CAs_coords, axis=0)  # unweighted centroid of the CA atoms
        distances_squared = np.sum((CAs_coords - center_of_mass)**2, axis=1)
        radius_of_gyration = round(np.sqrt(np.mean(distances_squared)), 2)
        RoG_dict[pocket_key] = radius_of_gyration
        centres_dict[pocket_key] = tuple([round(v, 3) for v in center_of_mass])
TF = time.time()

0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400
3500
3600
3700
3800
3900
4000


In [13]:
# Timing summary of the RoG loop: pocket count, total minutes, seconds per pocket.
dTs = TF - T0            # elapsed wall-clock seconds
dTm = dTs / 60           # elapsed minutes
N_RoG = len(RoG_dict)    # number of pockets with a computed RoG

for summary_value in (N_RoG, round(dTm, 2), round(dTs/N_RoG, 5)):
    print(summary_value)

6642
5.8
0.05236


In [14]:
# Total elapsed minutes. N_RoG cancels out, so this equals dTs / 60 up to float rounding.
round((N_RoG*(dTs/N_RoG))/60, 5) # 6 minutes

5.79664

In [15]:
# Persist the (rep_chain, RANK) -> RoG dictionary.
save_to_pickle(RoG_dict, "./results/GrASP_RoG_TRANS_RIGHT_CLUSTERING.pkl")

In [16]:
# Number of pocket rows in the table.
len(grasp_pockets_df) # an earlier run gave 8051

6642

In [17]:
# Attach the RoG to each row by looking up its (rep_chain, RANK) pair in RoG_dict.
# Pairs that are absent from the dictionary come out as NaN.
grasp_pockets_df["RoG"] = grasp_pockets_df.set_index(['rep_chain', 'RANK']).index.map(RoG_dict)

In [18]:
# Collect the centroid of every pocket row, in row order, from centres_dict.
# A plain dictionary lookup with a default replaces the former bare `except:`, which
# would also have hidden unrelated errors. Rows without a centroid get NaN.
pocket_keys = list(zip(grasp_pockets_df["rep_chain"], grasp_pockets_df["RANK"]))
centres = [centres_dict.get(key, np.nan) for key in pocket_keys]
# Number of rows for which no centroid was found.
errs = sum(key not in centres_dict for key in pocket_keys)
print(errs)

0


In [19]:
# Store the per-row centroids (same order as the table rows) as a new column.
grasp_pockets_df["centre"] = centres

In [20]:
# Sanity check: RoG and centroid were computed for exactly the same pockets.
set(RoG_dict.keys()) == set(centres_dict.keys())

True

In [21]:
# Count the LIGYSIS reference chains that appear in the pocket table.
len(set(LIGYSIS_reps) & set(grasp_pockets_df.rep_chain.unique()))

3441

In [22]:
# Preview the table now that the RoG and centre columns have been added.
grasp_pockets_df.head(3)

Unnamed: 0,rep_chain,ID,site_ress,score,RANK,n_ress,RoG,centre
0,1a52_A,0,"[108, 125, 128, 129, 132, 225, 228, 229, 47, 5...",45.484,1,22,8.86,"(1.145, -8.457, -0.277)"
1,1a5h_B,0,"[174, 175, 197, 198, 199, 200, 201, 203, 221, ...",47.888,1,25,8.59,"(-0.198, 7.944, -7.532)"
2,1a9w_E,0,"[102, 103, 106, 110, 137, 138, 141, 28, 31, 38...",40.447,1,23,9.34,"(-4.465, 4.61, 0.882)"


In [27]:
# Rows whose centre is missing: NaN is the only value that compares unequal to itself.
grasp_pockets_df.query('centre != centre')

Unnamed: 0,rep_chain,ID,site_ress,score,RANK,n_ress,RoG,centre


In [30]:
#grasp_pockets_df.to_pickle("./results/GRASP_pockets_RoG_V2.pkl")

## Mapping PDB residues to UP residues

In [23]:
# Translate the residues of each pocket to UniProt residues with the per-chain mapping
# pickles. up_aas is keyed by (rep_chain, RANK); `errors` collects the chain of every
# pocket that has at least one residue without a mapping (one entry per failed pocket).
up_aas = {}

errors = []

loaded_chain = None
mapping_dict = None

for _, row in grasp_pockets_df.iterrows():
    rep_chain = row.rep_chain
    site_ress = row.site_ress
    rank = row["RANK"]
    # The mapping depends only on the chain, so reload it only when the chain changes
    # instead of unpickling the same file again for every pocket.
    if rep_chain != loaded_chain:
        mapping_dict = read_from_pickle(f'./../DATA/mappings_label_full/{rep_chain}.pkl')
        loaded_chain = rep_chain
    try:
        # The lookup key is the string form of the residue identifier.
        site_up_aas = [mapping_dict[str(res)] for res in site_ress]
    except KeyError:
        # At least one residue has no mapping: record the chain and skip this pocket.
        errors.append(rep_chain)
        continue
    up_aas[(rep_chain, rank)] = site_up_aas

In [24]:
# Number of pockets whose residues were all mapped.
len(up_aas) # an earlier run gave 7948

6545

In [25]:
# Number of pockets with at least one unmapped residue (one list entry per failed pocket).
len(errors) # an earlier run gave 103

97

In [26]:
# Collapse the per-pocket failures into the distinct chains affected.
# These chains have issues with the SIFTS mapping, e.g. 1nav_A, where for some reason
# the last 40 residues lack a residue mapping to UniProt.
errors = list(set(errors))
print(len(errors)) # 93 chains

93


In [27]:
# Persist the list of chains that have at least one pocket with unmapped residues.
save_to_pickle(errors, "./results/GRaSP_no_sifts_TRANS_RIGHT_CLUSTERING.pkl")

In [28]:
# Drop every pocket of a chain listed in `errors` (not only the pockets that failed),
# then attach the mapped residue lists by (rep_chain, RANK).
grasp_pockets_df_w_sifts = grasp_pockets_df.query('rep_chain not in @errors').copy().reset_index(drop = True)
grasp_pockets_df_w_sifts["up_aas"] = grasp_pockets_df_w_sifts.set_index(['rep_chain', 'RANK']).index.map(up_aas)

In [29]:
# Consistency check: every pocket must have as many mapped residues as original residues.
# The counts are compared directly, so no temporary column is left in the table.
n_up_aas = grasp_pockets_df_w_sifts['up_aas'].apply(len)
assert grasp_pockets_df_w_sifts.n_ress.equals(n_up_aas)

In [30]:
# Rename the residue columns in place: site_ress -> aas, n_ress -> n_aas.
# NOTE: this mutates the frame, so cells that still use the old names fail if re-run afterwards.
grasp_pockets_df_w_sifts.rename(columns = {"site_ress": "aas", "n_ress": "n_aas"}, inplace = True)

In [31]:
# Preview the mapped table.
grasp_pockets_df_w_sifts.head(5)

Unnamed: 0,rep_chain,ID,aas,score,RANK,n_aas,RoG,centre,up_aas
0,1a52_A,0,"[108, 125, 128, 129, 132, 225, 228, 229, 47, 5...",45.484,1,22,8.86,"(1.145, -8.457, -0.277)","[404, 421, 424, 425, 428, 521, 524, 525, 343, ..."
1,1a5h_B,0,"[174, 175, 197, 198, 199, 200, 201, 203, 221, ...",47.888,1,25,8.59,"(-0.198, 7.944, -7.532)","[484, 485, 507, 508, 509, 510, 511, 513, 531, ..."
2,1a9w_E,0,"[102, 103, 106, 110, 137, 138, 141, 28, 31, 38...",40.447,1,23,9.34,"(-4.465, 4.61, 0.882)","[103, 104, 107, 111, 138, 139, 142, 29, 32, 39..."
3,1ags_B,0,"[11, 162, 17, 18, 19, 20, 21, 22, 54, 7]",2.058,1,10,8.09,"(2.823, -4.357, -2.352)","[12, 163, 18, 19, 20, 21, 22, 23, 55, 8]"
4,1aii_A,2,"[120, 121, 122, 124, 164, 206, 279, 282, 4, 5,...",11.187,1,13,9.14,"(-2.977, -0.043, 7.244)","[120, 121, 122, 124, 164, 206, 279, 282, 4, 5,..."


In [32]:
# Number of distinct chains left after removing those with mapping failures.
len(grasp_pockets_df_w_sifts.rep_chain.unique()) # 3937

3937

In [33]:
# Count the LIGYSIS reference chains still present after the mapping filter (3353).
len(set(LIGYSIS_reps) & set(grasp_pockets_df_w_sifts.rep_chain.unique()))

3353

In [34]:
# Persist the mapped pocket table.
grasp_pockets_df_w_sifts.to_pickle("./results/GRASP_pockets_sifted_RIGHT_CLUSTERING.pkl")

## Add pocket surfaces and volumes

In [30]:
#grasp_pockets_df_w_sifts = pd.read_pickle("./results/GRASP_pockets_sifted_V2.pkl")

In [35]:
# Load the precomputed SASA lookup that is passed to calculate_total_sasa.
master_SASA_dict = read_from_pickle("./results/master_SASA_dict.pkl")

In [36]:
# Compute per-pocket SASA with a helper defined outside this notebook.
# Presumably it returns the table with an added 'SASA' column (that column is queried
# right afterwards) — TODO confirm whether it copies or mutates its input.
grasp_pockets_df_w_sifts_SASA = calculate_total_sasa(grasp_pockets_df_w_sifts, master_SASA_dict)

In [37]:
# Rows with a missing SASA value (NaN != NaN).
grasp_pockets_df_w_sifts_SASA.query('SASA != SASA')

Unnamed: 0,rep_chain,ID,aas,score,RANK,n_aas,RoG,centre,up_aas,SASA


In [38]:
# Load the precomputed pocket volumes; used below as a lookup keyed by (rep_chain, ID).
vols_dict = read_from_pickle("./results/GrASP_volumes_dict_RIGHT_CLUSTERING.pkl")

In [39]:
# Attach the pocket volume by (rep_chain, ID). Note that the second key component is
# the pocket ID here, not RANK. Pairs missing from vols_dict come out as NaN.
grasp_pockets_df_w_sifts_SASA["VOL"] = grasp_pockets_df_w_sifts_SASA.set_index(['rep_chain', 'ID']).index.map(vols_dict)

In [40]:
# Display the final table (pandas truncates the rendering for long frames).
grasp_pockets_df_w_sifts_SASA

Unnamed: 0,rep_chain,ID,aas,score,RANK,n_aas,RoG,centre,up_aas,SASA,VOL
0,1a52_A,0,"[108, 125, 128, 129, 132, 225, 228, 229, 47, 5...",45.484,1,22,8.86,"(1.145, -8.457, -0.277)","[404, 421, 424, 425, 428, 521, 524, 525, 343, ...",579.34,506.0
1,1a5h_B,0,"[174, 175, 197, 198, 199, 200, 201, 203, 221, ...",47.888,1,25,8.59,"(-0.198, 7.944, -7.532)","[484, 485, 507, 508, 509, 510, 511, 513, 531, ...",879.85,725.0
2,1a9w_E,0,"[102, 103, 106, 110, 137, 138, 141, 28, 31, 38...",40.447,1,23,9.34,"(-4.465, 4.61, 0.882)","[103, 104, 107, 111, 138, 139, 142, 29, 32, 39...",822.66,788.0
3,1ags_B,0,"[11, 162, 17, 18, 19, 20, 21, 22, 54, 7]",2.058,1,10,8.09,"(2.823, -4.357, -2.352)","[12, 163, 18, 19, 20, 21, 22, 23, 55, 8]",560.25,
4,1aii_A,2,"[120, 121, 122, 124, 164, 206, 279, 282, 4, 5,...",11.187,1,13,9.14,"(-2.977, -0.043, 7.244)","[120, 121, 122, 124, 164, 206, 279, 282, 4, 5,...",401.72,1469.0
...,...,...,...,...,...,...,...,...,...,...,...
6501,8y6b_F,1,"[206, 207, 208, 209, 253, 254, 271, 302, 303, ...",27.773,1,24,8.80,"(-1.192, 20.648, -5.917)","[237, 238, 239, 240, 284, 285, 302, 333, 334, ...",624.74,679.0
6502,8y6b_F,0,"[30, 48, 50, 51, 67, 72, 75, 96]",7.431,2,8,6.73,"(-5.597, -38.736, -7.57)","[61, 79, 81, 82, 98, 103, 106, 127]",175.10,57.0
6503,8y6o_H,1,"[102, 139, 140, 141, 144, 96]",3.365,1,6,6.41,"(-12.024, -3.024, -5.587)","[102, 139, 140, 141, 144, 96]",435.96,393.0
6504,8y6o_H,0,[72],0.268,2,1,0.00,"(2.941, 8.14, -12.706)",[72],152.58,0.0


In [41]:
# Persist the final pocket table (residues, score, rank, RoG, centre, UniProt residues, SASA, VOL).
grasp_pockets_df_w_sifts_SASA.to_pickle("./results/GRASP_pockets_DEF_TRANS_RIGHT_CLUSTERING.pkl")

In [44]:
# Load a list of chains that is used below to exclude rows from the missing-volume check.
# Presumably chains known to be problematic from an earlier run — TODO confirm provenance.
bad_reps = read_from_pickle("./results/OLD/other_errrors_reps.pkl")

In [45]:
# Rows with a missing volume (NaN != NaN) whose chain is not in bad_reps.
grasp_pockets_df_w_sifts_SASA.query('VOL != VOL & rep_chain not in @bad_reps')

Unnamed: 0,rep_chain,ID,aas,score,RANK,n_aas,RoG,centre,up_aas,SASA,VOL


In [50]:
# List the pockets whose RoG lies strictly between 25 and 30, with their residues joined by '+'.
# NOTE(review): combined_SASA_df is not defined in any visible cell of this notebook, so
# this cell fails on a fresh kernel. Confirm which frame was intended and define or load it.
for _i, row in combined_SASA_df.iterrows():
    if not (25 < row.RoG < 30):
        continue
    print(f"{row.rep_chain}_{row.ID}: {row.RoG} --> {'+'.join(row.aas)}")

3k2s_B_1: 26.15 --> 111+122+14+17+18+21+33+36+37+5+53+57+6+60+61+62+64
5hdt_B_1: 27.52 --> 1030+1079+1080+1082+388+393+396+397+398+399+402+403+404+405+428+431+432+434+445+446+448+449+450+451+452+453+457+483+487+527+536+540+567+568+571+592+595+596+597+600+601+602+603+715+758+761+762+780+784+785+789+818+819+820+821+824+825+826+827+835+837+840+841+843+844+845+872+876+884+886+974+978
6l53_A_0: 28.5 --> 1109+1193+1194+1347+1348+1349+1433+1508+1526+1529+1598+1857+1858+1951+2099+2183+2185+2186+2191+2311+2315+2316+2351+733+734+737+738+741+742+745+812+815+816+819+870+873+915+916+932+933+934+987
7e7o_A_1: 25.84 --> 1174+1176+1256+1257+1258+1264+1287+1372+1720+1721+1724+1809+1998+1999+2000+2050+2094+2187+2189+2220+2228+2238+2255+2256+2257+2258+2259+2260+2261+697
7ktp_C_1: 27.58 --> 100+101+104+105+108+109+110+112+159+161+162+164+186+208+231+232+233+236+315+355+356+357+430+432+440+442+444+447+448+452+453+460+465+466+470+471+473+86+90+96+97
7o7l_B_1: 26.88 --> 130+131+132+133+136+137+151+155+217+23