In [1]:
import glob
import os
import pickle
import random
import urllib
import urllib.request

from biopandas.pdb import PandasPdb
from tqdm.notebook import tqdm

random.seed(52)

from protein_tools import *

### initialise PyRosetta
# All Rosetta options are passed as one command-line style string.
# NOTE(review): the -holes:dalphaball value is an absolute, machine-specific path; it must
# exist on the host running this notebook, otherwise adjust it before executing.
# NOTE(review): `pr` is not imported by name in this notebook — presumably it is provided by
# `from protein_tools import *`; confirm.
pr.init('-ignore_unrecognized_res -ignore_zero_occupancy -mute all -holes:dalphaball /work/lpdi/bin/rosetta_06_2021/main/source/external/DAlpahBall/DAlphaBall.gcc -corrections::beta_nov16 true -relax:default_repeats 1')

PyRosetta-4 2023 [Rosetta PyRosetta4.conda.linux.cxx11thread.serialization.CentOS.python310.Release 2023.35+release.23439d33534e3f106f2ad301c8c3c56013ca8471 2023-08-30T15:39:05] retrieved from: http://www.pyrosetta.org
(C) Copyright Rosetta Commons Member Institutions. Created in JHU by Sergey Lyskov and PyRosetta Team.


# Sample natural PPIs from CATH

In [None]:
# Native PDB codes are listed in cath-domain-list-S100.txt; every data line has this format:
#1oaiA00     1    10     8    10     1     1     1     1     1    59 1.000
# The PDB code is the first 4 characters of the line.

N_NATIVE_PDBS = 1029  # size of the random subsample that gets downloaded
NATIVE_PDB_DIR = 'native_domain_database'

# Read all PDB codes. A set removes duplicates (several lines can share a PDB code).
# Blank lines and '#' comment lines are skipped so they cannot produce bogus "codes".
unique_pdb_codes = set()
with open('cath-domain-list-S100.txt') as f:
    for line in f:
        entry = line.strip()
        if entry and not entry.startswith('#'):
            unique_pdb_codes.add(entry[:4])

# Sort before shuffling: the iteration order of a set of strings changes between
# interpreter sessions (hash randomisation), so shuffling the raw set would not be
# reproducible even though the random seed is fixed.
native_pdb_codes = sorted(unique_pdb_codes)

# Subselect a random sample of N_NATIVE_PDBS entries.
random.shuffle(native_pdb_codes)
native_pdb_codes = native_pdb_codes[:N_NATIVE_PDBS]

# Make sure the download target exists; otherwise every download would fail.
os.makedirs(NATIVE_PDB_DIR, exist_ok=True)

for pdb_code in native_pdb_codes:
    url = 'https://files.rcsb.org/download/' + pdb_code + '.pdb'
    filename = NATIVE_PDB_DIR + '/' + pdb_code + '.pdb'
    if os.path.isfile(filename):
        # Already downloaded in a previous run.
        continue
    print('Downloading ' + pdb_code + ' from ' + url + ' to ' + filename)

    # Download to a temporary name and rename on success, so an interrupted transfer
    # never leaves a truncated file that later runs would treat as complete.
    partial_filename = filename + '.part'
    try:
        urllib.request.urlretrieve(url, partial_filename)
        os.replace(partial_filename, filename)
    except Exception as err:
        # Best effort: report the failure (with its cause) and carry on with the next entry.
        print('Error downloading ' + pdb_code + ' from ' + url + ': ' + str(err))
        if os.path.exists(partial_filename):
            os.remove(partial_filename)

# Calculate the native PPI interface scores

In [3]:
NATIVE_SCORE_DIR = 'native_PPI_interface_scores'

# Create the output directory up front; without it every structure would fail on the
# final write and be reported as a scoring error.
os.makedirs(NATIVE_SCORE_DIR, exist_ok=True)

for pdb_file in tqdm(glob.glob('native_binder_database_clean/*.pdb')):
    # File names look like <pdb id>_<chain A>_<chain B>.pdb (e.g. 1GCQ_A_B.pdb).
    name = os.path.splitext(os.path.basename(pdb_file))[0]
    print(name)

    name_fields = name.split('_')
    chain_A = name_fields[1]
    chain_B = name_fields[2]

    # Randomly select whether to use chain A or chain B. Chain A is only eligible when
    # its identifier is a single character; in every other case chain B is used.
    if random.uniform(0, 1) < 0.5 and len(chain_A) == 1:
        chain = chain_A
    else:
        chain = chain_B

    try:
        # Only the score dictionary of score_interface is used here.
        interface_scores, _interface_AA, _interface_residue_ids = score_interface(pdb_file, chain)
        total_count, helix_count, sheet_count, loop_count = obtain_sse_content_interface(pdb_file, chain)

        pose = pr.pose_from_pdb(pdb_file)
        interface_scores['interface_bb_hbonds'] = count_interface_bb_hbonds(pose)

        # NOTE(review): hydrophobicity is always computed on the first residue set returned
        # by get_interface_residues, even when `chain` was set to chain_B above — confirm
        # this is intended.
        chain1_interface_residues, _chain2_interface_residues = get_interface_residues(pose, chain_A, chain_B)
        interface_scores['interface_hydrophobicity'] = calculate_hydrophobicity(pose, chain1_interface_residues)

        # Secondary-structure composition as percentages of total_count.
        interface_scores['loop_perc'] = (loop_count/total_count)*100
        interface_scores['helix_perc'] = (helix_count/total_count)*100
        interface_scores['beta_perc'] = (sheet_count/total_count)*100

        with open(os.path.join(NATIVE_SCORE_DIR, '{name}.pickle'.format(name=name)), 'wb') as handle:
            pickle.dump(interface_scores, handle, protocol=pickle.HIGHEST_PROTOCOL)
    except Exception as err:
        # Best-effort batch run: a structure that cannot be scored is skipped, but the
        # cause is reported so failures can be diagnosed. KeyboardInterrupt is not
        # caught, so the loop can still be stopped manually.
        print('error with:', name, 'skipping...', '({}: {})'.format(type(err).__name__, err))

  0%|          | 0/926 [00:00<?, ?it/s]

1GCQ_A_B
1L4D_A_B
3I2B_A_B
3QWQ_A_B
2HVY_A_B
2ABZ_A_B
2FP7_A_B
1TM3_A_B
1AVX_A_B
3QC8_A_B
3WDG_A_B
3EPZ_A_B
3BCP_A_B
1T0F_A_B
3K4W_A_B
2YVJ_A_B
3VPJ_A_B
3F74_A_B
3Q87_A_B
1WDX_A_B
2P04_A_B
3HRD_A_B
1X1X_A_B
2GJV_A_B
5D1K_A_B
4TQ0_A_B
3DQQ_A_B
3NRJ_A_B
3LQV_A_B
1MBY_A_B
2FTM_A_B
2ZXW_A_B
2IDO_A_B
1X1Y_A_B
3GRW_A_B
3IAS_A_B
3I84_A_B
1FGL_A_B
4AYI_A_B
1XWD_A_B
1JBU_A_B
3LM1_A_B
1VGO_A_B
3E38_A_B
2OS7_A_B
3GQH_A_B
3RDZ_A_B
1VS3_A_B
1DJS_A_B
1UP6_A_B
3HO5_A_B
3BIW_A_B
3QQ8_A_B
1W1I_A_B
2DP4_A_B
1B27_A_B
2D10_A_B
1DML_A_B
2HZM_A_B
2ZSU_A_B
5F3X_A_B
2A6P_A_B
2DG0_A_B
4AFZ_A_B
3KW5_A_B
2OUL_A_B
4U30_A_B
3JVZ_A_B
3TG9_A_B
1KL8_A_B
3CJX_A_B
3GNJ_A_B
1EPT_A_B
3B01_A_B
1H1V_A_B
3LRJ_A_B
3RBQ_A_B
3MAL_A_B
4B1Y_A_B
3PPE_A_B
5B77_A_B
1VHJ_A_B
1ZH8_A_B
3QWN_A_B
2J12_A_B
3DSN_A_B
1Z3G_A_B
2FPE_A_B
3D1E_A_B
2P35_A_B
2A74_A_B
4CMM_A_B
1SUW_A_B
1GO4_A_B
1ZR0_A_B
1JKG_A_B
5B78_A_B
3FK9_A_B
4UDM_A_B
1GT7_A_B
3H35_A_B
3UI2_A_B
2JJT_A_B
4ZQU_A_B
1U20_A_B
2GD4_A_B
2VJF_A_B
1EJA_A_B
3CGY_A_B
2WQZ_A_B
1TM7_A_B
1



error with: 1UK4_A_B skipping...
3BGL_A_B
2I0B_A_B
4QZV_A_B
2NU1_A_B
2HJ1_A_B
3BRD_A_B
4K1R_A_B
1QB3_A_B
1S4C_A_B
4IOP_A_B
2W1T_A_B
3O34_A_B
4NZL_A_B



ERROR: Error in core::scoring::methods::RamaPreProEnergy::residue_pair_energy(): The RamaPrePro term is incompatible with cyclic dipeptides (as is most of the rest of Rosetta).
ERROR:: Exit from: /home/benchmark/rosetta/source/src/core/energy_methods/RamaPreProEnergy.cc line: 127


error with: 4NZL_A_B skipping...
1GXD_A_B
4KRL_A_B
3S8V_A_B
2Y9X_A_B
1NU9_A_B
3V3K_A_B
1OMO_A_B
1Y33_A_B
4WEN_A_B
2OVI_A_B
1B2S_A_B




error with: 1B2S_A_B skipping...
2GQS_A_B
1C8N_A_B
2PQ2_A_B
2ZCK_A_B
2UUY_A_B




error with: 2UUY_A_B skipping...
2CH8_A_B
4V0O_A_B
4NSO_A_B
1OSM_A_B
1R0R_A_B
1AK4_A_B
2GEF_A_B
2QBW_A_B
1F7Z_A_B
4YEB_A_B
3HHJ_A_B
2IQH_A_B
1VGC_A_B
3B6P_A_B
2Z2M_A_B
1RZP_A_B
1AHS_A_B
2AQX_A_B
2F2F_A_B
4K24_A_B
3ENT_A_B
1L0A_A_B
2EVV_A_B
2X89_A_B
3JRQ_A_B
3AHS_A_B
3S9C_A_B
3MJ9_A_B
2QLP_A_B
3TIW_A_B
3GMW_A_B
3B9I_A_B
1FLE_A_B
2IO1_A_B
3QDZ_A_B
1B2U_A_B
4I6L_A_B
3HN6_A_B
3E05_A_B
2DSP_A_B
2X53_A_B
1SMO_A_B
1QOL_A_B
2QKI_A_B
3AJY_A_B
3B08_A_B
1AGQ_A_B
3GZR_A_B
2V0R_A_B
1DEV_A_B
3P92_A_B
1X1W_A_B
1KAC_A_B
1KXP_A_B
1YOX_A_B
3QFM_A_B
2FB8_A_B
2CCL_A_B




error with: 2CCL_A_B skipping...
3KZH_A_B
3LU9_A_B
2ANE_A_B
3BIK_A_B
2TGP_A_B
1W4R_A_B
2WO3_A_B
3QNA_A_B
3BWU_A_B
1D6R_A_B
1L4I_A_B
1A99_A_B
1OYV_A_B
3M85_A_B
2W0C_A_B
1PXV_A_B
2CE8_A_B
3TQY_A_B
3HM8_A_B
2Q7N_A_B
1KCA_A_B
2QYI_A_B
1I9C_A_B
1LDT_A_B
1UM2_A_B
2OGJ_A_B
3QJ7_A_B
2P46_A_B
2Y32_A_B
3BFW_A_B
2A0S_A_B
3QHY_A_B
3Q9U_A_B
2QC1_A_B
3O2X_A_B
1FFV_A_B
3DCA_A_B
2OIN_A_B
3VYR_A_B
1IJX_A_B
3F75_A_B
3PRP_A_B
4PJ2_A_B
1UDI_A_B
3FPU_A_B
2NUU_A_B
1HAA_A_B
1C9P_A_B
2HEK_A_B
1JXQ_A_B
1P69_A_B
2H3N_A_B
4FZA_A_B
2EP5_A_B
2Z0P_A_B
1YL7_A_B
4CDK_A_B
2XTJ_A_B
4EIG_A_B
1TM5_A_B
2JJS_A_B
4BQD_A_B
2Z8M_A_B
1J3R_A_B
1TAW_A_B
3Q7H_A_B
1H6D_A_B
2HAX_A_B
3DGP_A_B
3O9L_A_B
1Q9U_A_B
2BBA_A_B
3FD4_A_B
3OSL_A_B
5B75_A_B
2P4Z_A_B
4KFZ_A_B
3TND_A_B
3E9M_A_B
1NR7_A_B
2XFG_A_B
2FDB_A_B
2VPM_A_B
3LMS_A_B
4GI3_A_B
2QLC_A_B
5D3I_A_B
2SNI_A_B
1QI1_A_B
3E2K_A_B
1L2W_A_B
1PBI_A_B
3THT_A_B
3DJP_A_B
2I04_A_B
2A2L_A_B
3TDM_A_B
2XCE_A_B
3ISM_A_B
1INN_A_B
2GBK_A_B
2MCN_A_B



ERROR: Assertion `symmetric || pose.num_chains() < 4` failed. MSG:ERROR: use_ddG_style not compatible with symmetry or poses with > 3 chains
ERROR:: Exit from: /home/benchmark/rosetta/source/src/protocols/simple_filters/BuriedUnsatHbondFilter.cc line: 530


error with: 2MCN_A_B skipping...
3BT1_A_B
2P49_A_B
5B76_A_B
3UZP_A_B
1Q1L_A_B
3GFU_A_B
3FCG_A_B
1B3T_A_B
3QPB_A_B
3L33_A_B
2WC4_A_B




error with: 2WC4_A_B skipping...
3KY8_A_B
2Z0E_A_B
4AOR_A_B
2DUP_A_B
1Y3B_A_B




error with: 1Y3B_A_B skipping...
2JOD_A_B
2YYS_A_B




error with: 2YYS_A_B skipping...
2GS7_A_B
1Y43_A_B
2JI1_A_B
3P71_A_B
3L9J_A_B
4U97_A_B
1I07_A_B
1HYR_A_B
1TM1_A_B
2CJR_A_B
1ZLI_A_B
3P8B_A_B
2GEC_A_B
1ZJD_A_B
2YVL_A_B
1XUA_A_B
3UZV_A_B
1IGU_A_B
2P6B_A_B
4G6U_A_B
3OZB_A_B
1F9S_A_B
3JUY_A_B
1F37_A_B
3KWV_A_B
3H8D_A_B
3L2H_A_B
1UHE_A_B
1YUK_A_B
5F4E_A_B
1ZCP_A_B
1I4O_A_B
2IWO_A_B
1SOT_A_B
1TE1_A_B
4KR0_A_B
2V3B_A_B
1POI_A_B
3NFG_A_B
1Y07_A_B
2SIC_A_B
2PQS_A_B
2GHV_A_B
3H6S_A_B
1IYJ_A_B
1MR1_A_B
4DOQ_A_B
1MAS_A_B
2L0F_A_B



ERROR: Assertion `symmetric || pose.num_chains() < 4` failed. MSG:ERROR: use_ddG_style not compatible with symmetry or poses with > 3 chains
ERROR:: Exit from: /home/benchmark/rosetta/source/src/protocols/simple_filters/BuriedUnsatHbondFilter.cc line: 530


error with: 2L0F_A_B skipping...
2WAM_A_B
1XFS_A_B
2DOI_A_B
3H3B_A_B
3CAM_A_B
4B1V_A_B
2PNH_A_B
3FPV_A_B
1HBT_A_B
1A79_A_B
2KWJ_A_B



ERROR: Assertion `symmetric || pose.num_chains() < 4` failed. MSG:ERROR: use_ddG_style not compatible with symmetry or poses with > 3 chains
ERROR:: Exit from: /home/benchmark/rosetta/source/src/protocols/simple_filters/BuriedUnsatHbondFilter.cc line: 530


error with: 2KWJ_A_B skipping...
1UE7_A_B
1E5Q_A_B
3AFF_A_B
1XV2_A_B
3EN0_A_B
1JIW_A_B
3PS4_A_B
3ECY_A_B
3D4G_A_B
2P43_A_B
1Q8M_A_B
3MQW_A_B
1NQL_A_B
1MZW_A_B
3BAL_A_B
3T3A_A_B
3HMK_A_B
1YVB_A_B
3FJS_A_B
3B5U_A_B
3H9G_A_B
5JLV_A_B
1XT9_A_B



ERROR: Assertion `jump_num_ <= pose.num_jump()` failed.
ERROR:: Exit from: /home/benchmark/rosetta/source/src/protocols/simple_filters/BuriedUnsatHbondFilter.cc line: 531


error with: 1XT9_A_B skipping...
3FUY_A_B
1FU5_A_B
3F5N_A_B
4XXB_A_B
3E2U_A_B
1Y48_A_B
2XJZ_A_B
2VIF_A_B




error with: 2VIF_A_B skipping...
1CL7_A_B
4YWC_A_B
3FLP_A_B
1CQ3_A_B
1H9R_A_B
5DJT_A_B
2ZG6_A_B
4NZW_A_B
3OEU_A_B
1P9U_A_B
1OX9_A_B
3OLM_A_B
1XDT_A_B
2D1P_A_B
1R0K_A_B
5IOH_A_B
3BTV_A_B
3FG8_A_B
3U4J_A_B
1EM8_A_B
4XL5_A_B
1ICF_A_B
2W2N_A_B
4J2Y_A_B
3B76_A_B
2GKW_A_B
1ID5_A_B
3U1O_A_B
3FPR_A_B
2WG4_A_B
2PZD_A_B
1DN2_A_B
1YBG_A_B
3K25_A_B
2Q81_A_B
1NR9_A_B
3BN3_A_B
2DPF_A_B
3DCL_A_B
3EHU_A_B
4QT8_A_B
2P42_A_B
2Z7X_A_B
2HDP_A_B



ERROR: Assertion `symmetric || pose.num_chains() < 4` failed. MSG:ERROR: use_ddG_style not compatible with symmetry or poses with > 3 chains
ERROR:: Exit from: /home/benchmark/rosetta/source/src/protocols/simple_filters/BuriedUnsatHbondFilter.cc line: 530


error with: 2HDP_A_B skipping...
3E1Z_A_B
2G2W_A_B
3NCT_A_B
1EWY_A_B
1JYI_A_B
2FJU_A_B
2BMA_A_B
3FHC_A_B
1O9Y_A_B
2A5Z_A_B
2O8Q_A_B
3TL8_A_B
1T8U_A_B
4KBB_A_B
1DFJ_A_B
3KLQ_A_B
3GBU_A_B
3CE9_A_B
5DMJ_A_B
2YCH_A_B
3U02_A_B
4AFQ_A_B
2G2U_A_B
4AN7_A_B
1G60_A_B
1R7A_A_B
3BRC_A_B
4ILW_A_B
3ME4_A_B
4A94_A_B
4GH7_A_B
1GL1_A_B
4HDO_A_B
2PUY_A_B
3B93_A_B
2O9Q_A_B
1UWG_A_B
3D5N_A_B
2PMV_A_B
2NZ1_A_B
2R0K_A_B
1VG9_A_B
1LQM_A_B
2IWP_A_B
2NXM_A_B
3KMT_A_B
2R5O_A_B
2G6V_A_B
3HTR_A_B
1UNN_A_B
3GZE_A_B
2W80_A_B




error with: 2W80_A_B skipping...
1NQ9_A_B
3I5V_A_B
4LQW_A_B
2WQ4_A_B
4F0A_A_B
1VH4_A_B
2C1W_A_B




error with: 2C1W_A_B skipping...
3HPN_A_B
2HTB_A_B
1F2U_A_B
4BWQ_A_B
1JZO_A_B
1NP6_A_B
2ZNV_A_B


In [2]:
DE_NOVO_SCORE_DIR = 'de_novo_PPI_interface_scores'

# Create the output directory up front so the first write cannot fail on a fresh checkout.
os.makedirs(DE_NOVO_SCORE_DIR, exist_ok=True)

for pdb_file in tqdm(glob.glob('de_novo_binder_database_clean/*.pdb')):
    # File names look like <pdb id>_<chain A>_<chain B>.pdb (e.g. 7zrv_A_B.pdb).
    name = os.path.splitext(os.path.basename(pdb_file))[0]
    print(name)

    name_fields = name.split('_')
    chain_A = name_fields[1]
    chain_B = name_fields[2]

    # Chain B is always the one passed to the scoring helpers for this set.
    chain = chain_B

    # No try/except here: any failure in this small set stops the loop with a traceback.
    # Only the score dictionary of score_interface is used.
    interface_scores, _interface_AA, _interface_residue_ids = score_interface(pdb_file, chain)
    total_count, helix_count, sheet_count, loop_count = obtain_sse_content_interface(pdb_file, chain)

    pose = pr.pose_from_pdb(pdb_file)
    interface_scores['interface_bb_hbonds'] = count_interface_bb_hbonds(pose)

    # NOTE(review): hydrophobicity uses the first residue set returned by
    # get_interface_residues although `chain` is chain_B — confirm this is intended.
    chain1_interface_residues, _chain2_interface_residues = get_interface_residues(pose, chain_A, chain_B)
    interface_scores['interface_hydrophobicity'] = calculate_hydrophobicity(pose, chain1_interface_residues)

    # Secondary-structure composition as percentages of total_count.
    interface_scores['loop_perc'] = (loop_count/total_count)*100
    interface_scores['helix_perc'] = (helix_count/total_count)*100
    interface_scores['beta_perc'] = (sheet_count/total_count)*100

    with open(os.path.join(DE_NOVO_SCORE_DIR, '{name}.pickle'.format(name=name)), 'wb') as handle:
        pickle.dump(interface_scores, handle, protocol=pickle.HIGHEST_PROTOCOL)

  0%|          | 0/17 [00:00<?, ?it/s]

7zrv_A_B




7tyd_A_B




5vmr_A_B




5vli_A_B




7sh3_A_B




7xge_A_B




5vid_A_B




6wrv_A_B




6wrx_A_B




7n3t_A_B




8sk7_A_B




7opb_A_B




7zsd_A_B




7xyq_A_B




7n1j_A_B




7xad_A_B




7rdh_A_B


