## Imports

In [1]:
from JSU_lib import *

## Functions

## Input data

In [2]:
cc = [
    'label_seq_id_full', 'label_comp_id', 'type_symbol',
    'label_atom_id', 'Cartn_x', 'Cartn_y', 'Cartn_z'
]

In [3]:
segments_to_compare = pd.read_pickle("./results/PDB_rep_chains_df.pkl")

In [4]:
segments_to_compare.SEG = segments_to_compare.SEG.astype(str)

In [5]:
segments_to_compare.head(3)

Unnamed: 0,ACC,SEG,CLUST,pdb_id,auth_asym_id,struct_asym_id,SEG_ID,SEG_COUNT
0,A0A0B4J272,1,1,6cph,D,A,A0A0B4J272_1,1
1,A0AVT1,1,1,7sol,C,C,A0AVT1_1,1
2,A0FGR8,1,1,4npj,A,A,A0FGR8_1,2


In [6]:
len(segments_to_compare)

4037

## Transform dictionary to DF and format

In [7]:
pdb_sites_df = pd.read_pickle("./../DATA/LIGYSIS_human_sites_APRIL_2024.pkl")

In [8]:
pdb_sites_df.head(3)

Unnamed: 0,ACC,SEG,ID,up_aas,n_up_aas,SEG_ID,SITE_NAME
0,P41182,1,0,"[11, 13, 14, 17, 18, 21, 22, 24, 25, 28, 30, 4...",33,P41182_1,P41182_1_0
1,P41182,1,1,"[7, 8, 9, 10, 11, 13, 120, 123, 124, 127]",10,P41182_1,P41182_1_1
2,P41182,1,2,"[56, 60, 66, 69, 71, 72, 73, 74, 75, 76, 111]",11,P41182_1,P41182_1_2


In [9]:
print(pdb_sites_df.shape[0])

10689


In [10]:
len(set(segments_to_compare.SEG_ID.tolist()))

4037

In [11]:
len(set(pdb_sites_df.SEG_ID.tolist()))

4083

In [12]:
merged_df = pdb_sites_df.merge(segments_to_compare[["SEG_ID", "pdb_id", "auth_asym_id", "struct_asym_id"]], on=["SEG_ID"])
merged_df["rep_chain"] = merged_df.pdb_id + "_" + merged_df.auth_asym_id
print(merged_df.shape[0])

10602


In [60]:
merged_df.head(3)

Unnamed: 0,ACC,SEG,ID,up_aas,n_up_aas,SEG_ID,SITE_NAME,pdb_id,auth_asym_id,struct_asym_id,rep_chain,RoG,aas,centre
0,P41182,1,0,"[11, 13, 14, 17, 18, 21, 22, 24, 25, 28, 30, 4...",33,P41182_1,P41182_1_0,6xmx,H,H,6xmx_H,12.95,"[51, 53, 54, 57, 58, 61, 62, 64, 65, 68, 70, 8...","(-1.716, -2.743, -0.583)"
1,P41182,1,1,"[7, 8, 9, 10, 11, 13, 120, 123, 124, 127]",10,P41182_1,P41182_1_1,6xmx,H,H,6xmx_H,17.11,"[47, 48, 49, 50, 51, 53, 160, 163, 164, 167]","(-16.009, -8.151, -5.015)"
2,P41182,1,2,"[56, 60, 66, 69, 71, 72, 73, 74, 75, 76, 111]",11,P41182_1,P41182_1_2,6xmx,H,H,6xmx_H,7.92,"[96, 100, 106, 109, 111, 112, 113, 114, 115, 1...","(11.392, 3.784, -3.561)"


In [61]:
one_clust_segs = read_from_pickle("./results/1_clust_segs.pkl")

In [14]:
len(merged_df.ACC.unique())

3513

In [15]:
rep_chains_sites = merged_df.rep_chain.unique().tolist()
print(len(rep_chains_sites))

4037


In [16]:
target_dir = './../DATA/trans_rep_chains' # this needs to be trans so it is aligned

## Calcualte RoG of PDB sites

In [18]:
T0 = time.time()
errors = []
RoG_dict = {}
centres_dict = {}
pdb_ress_dict = {}
for i, rep_chain in enumerate(rep_chains_sites):
    if i % 100 == 0:
        print(i)
    pdb_path = os.path.join(target_dir, f'{rep_chain}.trans.pdb')
    df = PDBXreader(inputfile=pdb_path).atoms(format_type="pdb", excluded=())[cc].query('label_atom_id == "CA"').reset_index(drop = True)
    rep_chain_df = merged_df.query('rep_chain == @rep_chain')
    mapping_dict = read_from_pickle(f'./../DATA/mappings_label_full/{rep_chain}.pkl') # only will contain structure, chain and ACC of interest. Artificial N/C termini will have no mapping
    try:
        reversed_dict = {value: key for key, value in mapping_dict.items()}
    except Exception as e:
        print(f'ERRROR with {rep_chain}. Cannot map all chain residues') # should not happen
        errors.append(rep_chain)
        continue
    for _, row in rep_chain_df.iterrows():
        b = 0
        site_ress = row.up_aas
        try:
            pdb_ress = [reversed_dict[res] for res in site_ress]
        except Exception as e:
            print(f'ERRROR. LIGYSIS site residue not on {rep_chain}') # this will happen when LIGYSIS site residues are not present on the structure.
            errors.append(rep_chain)
            b = 1
            #raise
            break

        assert len(pdb_ress) == len(site_ress)
            
        ress_CAs = df.query('label_seq_id_full in @pdb_ress')
        CAs_coords = np.array(list((zip(ress_CAs.Cartn_x, ress_CAs.Cartn_y, ress_CAs.Cartn_z))))
        center_of_mass = np.mean(CAs_coords, axis=0)
        distances_squared = np.sum((CAs_coords - center_of_mass)**2, axis=1)
        radius_of_gyration = round(np.sqrt(np.mean(distances_squared)), 2)
        RoG_dict[(rep_chain, row["SITE_NAME"])] = radius_of_gyration
        centres_dict[(rep_chain, row["SITE_NAME"])] = tuple([round(v, 3) for v in center_of_mass])
        pdb_ress_dict[(rep_chain, row["SITE_NAME"])] = pdb_ress
    
TF = time.time()

0
ERRROR. LIGYSIS site residue not on 7sol_C
ERRROR. LIGYSIS site residue not on 4npj_A
ERRROR. LIGYSIS site residue not on 3al5_C
ERRROR. LIGYSIS site residue not on 7pue_A
ERRROR. LIGYSIS site residue not on 2j1l_A
ERRROR. LIGYSIS site residue not on 8cm8_B
ERRROR. LIGYSIS site residue not on 7fbq_A
ERRROR. LIGYSIS site residue not on 7l3l_C
ERRROR. LIGYSIS site residue not on 4xbm_A
ERRROR. LIGYSIS site residue not on 6cz5_A
ERRROR. LIGYSIS site residue not on 2jhh_C
ERRROR. LIGYSIS site residue not on 5k55_A
ERRROR. LIGYSIS site residue not on 7of0_w
ERRROR. LIGYSIS site residue not on 6yg7_B
ERRROR. LIGYSIS site residue not on 6uxy_A
100
ERRROR. LIGYSIS site residue not on 8c5g_B
ERRROR. LIGYSIS site residue not on 6owv_A
ERRROR. LIGYSIS site residue not on 2bmc_E
ERRROR. LIGYSIS site residue not on 4y8d_A
ERRROR. LIGYSIS site residue not on 2xxz_B
ERRROR. LIGYSIS site residue not on 7kx8_B
ERRROR. LIGYSIS site residue not on 7b9x_A
ERRROR. LIGYSIS site residue not on 7yiy_E
ERRRO

In [None]:
8719
6.04
0.04156

In [19]:
dTs = TF-T0

dTm = dTs/60

N_RoG = len(RoG_dict)

print(N_RoG)

print(round(dTm,2))

print(round(dTs/N_RoG, 5))

8719
5.89
0.04054


In [20]:
round((N_RoG*(dTs/N_RoG))/60, 2) # 5.9 minutes

5.89

In [21]:
len(errors) # losing 589 representative chains as not all site UniProt residues present in chain

589

In [22]:
len(set(errors)) # this corresponds to 589 single chains

589

In [23]:
len(centres_dict) #8719

8719

In [24]:
ks = list(centres_dict.keys())
idx = merged_df.set_index(['rep_chain', 'SITE_NAME']).index.tolist()
[el for el in ks if el not in idx]

[]

In [48]:
save_to_pickle(RoG_dict, "./results/LIGYSIS_RoG_TRANS.pkl")

In [26]:
save_to_pickle(RoG_dict, "./results/LIGYSIS_centres_TRANS.pkl")

In [27]:
merged_df["RoG"] = merged_df.set_index(['rep_chain', 'SITE_NAME']).index.map(RoG_dict)

In [28]:
pdb_ress_dict

{('6xmx_H', 'P41182_1_0'): ['51',
  '53',
  '54',
  '57',
  '58',
  '61',
  '62',
  '64',
  '65',
  '68',
  '70',
  '81',
  '88',
  '91',
  '92',
  '93',
  '94',
  '95',
  '98',
  '99',
  '120',
  '121',
  '124',
  '125',
  '129',
  '149',
  '150',
  '153',
  '154',
  '155',
  '156',
  '157',
  '158'],
 ('6xmx_H', 'P41182_1_1'): ['47',
  '48',
  '49',
  '50',
  '51',
  '53',
  '160',
  '163',
  '164',
  '167'],
 ('6xmx_H', 'P41182_1_2'): ['96',
  '100',
  '106',
  '109',
  '111',
  '112',
  '113',
  '114',
  '115',
  '116',
  '151'],
 ('2yrm_A', 'P41182_2_0'): ['13', '15', '16', '29', '30', '33', '34'],
 ('2lce_A', 'P41182_3_0'): ['20', '22', '23', '25', '36', '40'],
 ('2lce_A', 'P41182_3_1'): ['48', '50', '51', '64', '67', '68'],
 ('2en2_A', 'P41182_4_0'): ['14',
  '16',
  '17',
  '18',
  '19',
  '30',
  '31',
  '33',
  '34'],
 ('2eos_A', 'P41182_5_0'): ['14', '16', '17', '18', '19', '30', '34'],
 ('6cph_D', 'A0A0B4J272_1_0'): ['47', '48', '49', '57'],
 ('6cph_D', 'A0A0B4J272_1_1'): [

In [28]:
merged_df["aas"] = merged_df.set_index(['rep_chain', 'SITE_NAME']).index.map(pdb_ress_dict)

In [29]:
merged_df["centre"] = np.nan
centres = []
errs = 0
err_chains = []
for i, row in merged_df.iterrows():
    try:
        centre = centres_dict[(row.rep_chain, row.SITE_NAME)]
        centres.append(centre)
    except:
        errs += 1
        centres.append(np.nan)
        err_chains.append(row.rep_chain)
print(errs) # 1883

1883


In [30]:
set(err_chains) == set(errors)

True

In [31]:
len(centres) #10602

10602

In [32]:
merged_df["centre"] = centres

In [33]:
merged_df.head(5)

Unnamed: 0,ACC,SEG,ID,up_aas,n_up_aas,SEG_ID,SITE_NAME,pdb_id,auth_asym_id,struct_asym_id,rep_chain,RoG,aas,centre
0,P41182,1,0,"[11, 13, 14, 17, 18, 21, 22, 24, 25, 28, 30, 4...",33,P41182_1,P41182_1_0,6xmx,H,H,6xmx_H,12.95,"[51, 53, 54, 57, 58, 61, 62, 64, 65, 68, 70, 8...","(-1.716, -2.743, -0.583)"
1,P41182,1,1,"[7, 8, 9, 10, 11, 13, 120, 123, 124, 127]",10,P41182_1,P41182_1_1,6xmx,H,H,6xmx_H,17.11,"[47, 48, 49, 50, 51, 53, 160, 163, 164, 167]","(-16.009, -8.151, -5.015)"
2,P41182,1,2,"[56, 60, 66, 69, 71, 72, 73, 74, 75, 76, 111]",11,P41182_1,P41182_1_2,6xmx,H,H,6xmx_H,7.92,"[96, 100, 106, 109, 111, 112, 113, 114, 115, 1...","(11.392, 3.784, -3.561)"
3,P41182,2,0,"[520, 522, 523, 536, 537, 540, 541]",7,P41182_2,P41182_2_0,2yrm,A,A,2yrm_A,5.01,"[13, 15, 16, 29, 30, 33, 34]","(1.62, -4.334, -1.631)"
4,P41182,3,0,"[548, 550, 551, 553, 564, 568]",6,P41182_3,P41182_3_0,2lce,A,A,2lce_A,5.48,"[20, 22, 23, 25, 36, 40]","(2.633, 10.007, 4.638)"


In [34]:
merged_df_w_RoG = merged_df.query('rep_chain not in @errors').copy().reset_index(drop = True)

In [35]:
len(merged_df_w_RoG.ACC.unique().tolist()) #3048

3048

In [36]:
len(merged_df_w_RoG.rep_chain.unique().tolist()) #3448

3448

In [37]:
len(merged_df_w_RoG) #8244

8244

In [38]:
merged_df_w_RoG.rename(columns ={'n_up_aas': 'n_aas'}, inplace = True)

In [39]:
merged_df_w_RoG.head(5)

Unnamed: 0,ACC,SEG,ID,up_aas,n_aas,SEG_ID,SITE_NAME,pdb_id,auth_asym_id,struct_asym_id,rep_chain,RoG,aas,centre
0,P41182,1,0,"[11, 13, 14, 17, 18, 21, 22, 24, 25, 28, 30, 4...",33,P41182_1,P41182_1_0,6xmx,H,H,6xmx_H,12.95,"[51, 53, 54, 57, 58, 61, 62, 64, 65, 68, 70, 8...","(-1.716, -2.743, -0.583)"
1,P41182,1,1,"[7, 8, 9, 10, 11, 13, 120, 123, 124, 127]",10,P41182_1,P41182_1_1,6xmx,H,H,6xmx_H,17.11,"[47, 48, 49, 50, 51, 53, 160, 163, 164, 167]","(-16.009, -8.151, -5.015)"
2,P41182,1,2,"[56, 60, 66, 69, 71, 72, 73, 74, 75, 76, 111]",11,P41182_1,P41182_1_2,6xmx,H,H,6xmx_H,7.92,"[96, 100, 106, 109, 111, 112, 113, 114, 115, 1...","(11.392, 3.784, -3.561)"
3,P41182,2,0,"[520, 522, 523, 536, 537, 540, 541]",7,P41182_2,P41182_2_0,2yrm,A,A,2yrm_A,5.01,"[13, 15, 16, 29, 30, 33, 34]","(1.62, -4.334, -1.631)"
4,P41182,3,0,"[548, 550, 551, 553, 564, 568]",6,P41182_3,P41182_3_0,2lce,A,A,2lce_A,5.48,"[20, 22, 23, 25, 36, 40]","(2.633, 10.007, 4.638)"


In [40]:
merged_df_w_RoG = merged_df_w_RoG.sort_values(["rep_chain", "ID"]).copy().reset_index(drop = True)

In [41]:
merged_df_w_RoG.to_pickle("./results/LIGYSIS_sites_sifted_TRANS.pkl")

In [42]:
merged_df_w_RoG.query('rep_chain == "1a52_A" & ID == 2').aas

2    [30, 31, 32, 34, 56]
Name: aas, dtype: object

In [20]:
#merged_df_w_RoG = pd.read_pickle("./results/LIGYSI_sites_sifted_trans.pkl")

In [44]:
len(merged_df_w_RoG.rep_chain.unique().tolist()) #3448

3448

In [45]:
merged_df_w_RoG.head(5)

Unnamed: 0,ACC,SEG,ID,up_aas,n_aas,SEG_ID,SITE_NAME,pdb_id,auth_asym_id,struct_asym_id,rep_chain,RoG,aas,centre
0,P03372,2,0,"[339, 341, 342, 343, 344, 345, 346, 347, 349, ...",61,P03372_2,P03372_2_0,1a52,A,A,1a52_A,14.2,"[43, 45, 46, 47, 48, 49, 50, 51, 53, 54, 55, 5...","(-0.966, -16.192, -2.54)"
1,P03372,2,1,"[329, 330, 331, 332, 333, 335, 336, 337, 338, ...",18,P03372_2,P03372_2_1,1a52,A,A,1a52_A,7.67,"[33, 34, 35, 36, 37, 39, 40, 41, 42, 45, 46, 4...","(7.883, -18.704, 5.375)"
2,P03372,2,2,"[326, 327, 328, 330, 352]",5,P03372_2,P03372_2_2,1a52,A,A,1a52_A,5.46,"[30, 31, 32, 34, 56]","(2.537, -9.125, 11.7)"
3,P00750,3,0,"[342, 357, 358, 401, 402, 403, 482, 484, 485, ...",27,P00750_3,P00750_3_0,1a5h,B,D,1a5h_B,8.98,"[32, 47, 48, 91, 92, 93, 172, 174, 175, 197, 1...","(0.467, 7.085, -7.455)"
4,P02100,1,0,"[32, 39, 42, 43, 46, 60, 64, 67, 68, 71, 72, 8...",22,P02100_1,P02100_1_0,1a9w,E,B,1a9w_E,9.81,"[31, 38, 41, 42, 45, 59, 63, 66, 67, 70, 71, 8...","(-4.39, 6.075, 1.831)"


## Add pocket surfaces and volumes

In [49]:
#merged_df_w_RoG = pd.read_pickle("./results/LIGYSIS_sites_sifted_trans.pkl")

master_SASA_dict = read_from_pickle("./results/master_SASA_dict.pkl")

merged_df_w_RoG_SASA = calculate_total_sasa(merged_df_w_RoG, master_SASA_dict)

vols_dict = read_from_pickle("./results/LIGYSIS_volumes_dict.pkl")

merged_df_w_RoG_SASA["VOL"] = merged_df_w_RoG_SASA.set_index(['rep_chain', 'ID']).index.map(vols_dict)

merged_df_w_RoG_SASA.query('VOL != VOL')

Unnamed: 0,ACC,SEG,ID,up_aas,n_aas,SEG_ID,SITE_NAME,pdb_id,auth_asym_id,struct_asym_id,rep_chain,RoG,aas,centre,SASA,VOL
757,P27918,2,8,"[59, 61, 270, 271, 272, 273, 310, 311, 312, 313]",10,P27918_2,P27918_2_8,1w0s,C,C,1w0s_C,95.9,"[32, 34, 243, 244, 245, 246, 283, 284, 285, 286]","(34.788, 24.53, -3.41)",727.44,
2603,P02647,1,14,"[118, 121, 122, 125, 190, 194]",6,P02647_1,P02647_1_14,3k2s,B,B,3k2s_B,43.06,"[94, 97, 98, 101, 166, 170]","(-18.682, 6.988, -0.352)",709.59,
2616,P02647,1,27,"[66, 226, 227, 230, 231, 234, 235, 237, 238, 242]",10,P02647_1,P02647_1_27,3k2s,B,B,3k2s_B,45.16,"[42, 202, 203, 206, 207, 210, 211, 213, 214, 218]","(-9.157, -58.961, 13.036)",960.22,
3315,Q9BYX4,2,1,"[307, 308, 309, 312, 329, 330, 331, 332, 333, ...",21,Q9BYX4_2,Q9BYX4_2_1,4gl2,A,A,4gl2_A,9.33,"[7, 8, 9, 12, 29, 30, 31, 32, 33, 34, 35, 36, ...","(-9.464, 19.881, -4.636)",918.19,
4717,Q8TEQ6,1,0,"[383, 474, 475, 503, 540, 541, 580, 581, 582, ...",15,Q8TEQ6_1,Q8TEQ6_1_0,5tee,A,A,5tee_A,8.09,"[402, 493, 494, 522, 559, 560, 599, 600, 601, ...","(-4.041, -18.855, -7.357)",556.16,
4874,Q9BVG8,1,0,"[451, 453, 454, 456, 529, 530, 531, 532, 533, ...",14,Q9BVG8_1,Q9BVG8_1_0,5wde,A,A,5wde_A,6.88,"[11, 13, 14, 16, 89, 90, 91, 92, 93, 94, 95, 9...","(-0.064, -10.536, 6.744)",533.65,


In [56]:
save_to_pickle(merged_df_w_RoG_SASA.query('VOL != VOL').rep_chain.unique().tolist(), "./results/LIGYSIS_no_vol_reps_TRANS.pkl")

In [52]:
merged_df_w_RoG_SASA.to_pickle("./results/LIGYSIS_sites_DEF_TRANS.pkl")

In [53]:
#combined_SASA_df = pd.read_pickle("./results/LIGYSIS_sites_DEF.pkl")

In [47]:
combined_SASA_df.head(5)

Unnamed: 0,ACC,SEG,ID,up_aas,n_aas,SEG_ID,SITE_NAME,pdb_id,auth_asym_id,struct_asym_id,rep_chain,RoG,aas,centre,SASA,VOL
0,P03372,2,0,"[339, 341, 342, 343, 344, 345, 346, 347, 349, ...",61,P03372_2,P03372_2_0,1a52,A,A,1a52_A,14.2,"[43, 45, 46, 47, 48, 49, 50, 51, 53, 54, 55, 5...","(107.959, 7.984, 96.444)",3266.99,5651.0
1,P03372,2,1,"[329, 330, 331, 332, 333, 335, 336, 337, 338, ...",18,P03372_2,P03372_2_1,1a52,A,A,1a52_A,7.67,"[33, 34, 35, 36, 37, 39, 40, 41, 42, 45, 46, 4...","(117.768, 11.344, 102.749)",1034.84,451.0
2,P03372,2,2,"[326, 327, 328, 330, 352]",5,P03372_2,P03372_2_2,1a52,A,A,1a52_A,5.46,"[30, 31, 32, 34, 56]","(109.206, 18.269, 109.0)",460.17,30.0
3,P00750,3,0,"[342, 357, 358, 401, 402, 403, 482, 484, 485, ...",27,P00750_3,P00750_3_0,1a5h,B,D,1a5h_B,8.98,"[32, 47, 48, 91, 92, 93, 172, 174, 175, 197, 1...","(17.763, 54.011, 27.939)",968.54,951.0
4,P02100,1,0,"[32, 39, 42, 43, 46, 60, 64, 67, 68, 71, 72, 8...",22,P02100_1,P02100_1_0,1a9w,E,B,1a9w_E,9.81,"[31, 38, 41, 42, 45, 59, 63, 66, 67, 70, 71, 8...","(3.533, 33.886, 46.667)",895.32,913.0


In [58]:
len(merged_df_w_RoG_SASA.rep_chain.unique().tolist())

3448

In [59]:
save_to_pickle(merged_df_w_RoG_SASA.rep_chain.unique().tolist(), "./results/ligysis_3448_chains.pkl")

In [66]:
one_clust_ligysis_reps = merged_df_w_RoG_SASA.query('SEG_ID in @one_clust_segs').rep_chain.unique().tolist()

In [68]:
save_to_pickle(one_clust_ligysis_reps, "./results/1_clust_ligysis_reps.pkl")

In [67]:
len(one_clust_ligysis_reps)

963