In [None]:
import glob
import os
import pickle
import random
import urllib
import urllib.request

from biopandas.pdb import PandasPdb
from tqdm.notebook import tqdm
# Seed Python's global RNG; the shuffling and chain selection further down draw from it.
random.seed(52)

from protein_tools import *

### initialise PyRosetta
# The option string is written as adjacent literals (one flag per line); Python
# concatenates them into a single string at compile time.
# NOTE(review): `pr` is not imported explicitly in this notebook - presumably it is
# provided by the star import above; confirm.
# NOTE(review): the -holes:dalphaball value is a hard-coded absolute path.
pr.init(
    '-ignore_unrecognized_res '
    '-ignore_zero_occupancy '
    '-mute all '
    '-holes:dalphaball /work/lpdi/bin/rosetta_06_2021/main/source/external/DAlpahBall/DAlphaBall.gcc '
    '-corrections::beta_nov16 true '
    '-relax:default_repeats 1'
)

# Sample natural PPIs from CATH

In [None]:
# Native PDB codes are listed in cath-domain-list-S100.txt; every line has this format:
#1oaiA00     1    10     8    10     1     1     1     1     1    59 1.000
# The PDB code is the first 4 characters of the line.

N_NATIVE_PDBS = 1029  # how many randomly chosen PDB entries to keep
NATIVE_PDB_DIR = 'native_domain_database'  # download destination

# Read all PDB codes from the file; a set removes duplicates on the fly.
native_pdb_codes = set()
with open('cath-domain-list-S100.txt') as f:
    for line in f:
        # Comment and blank lines would otherwise be sliced into bogus 4-character codes.
        if line.startswith('#') or not line.strip():
            continue
        native_pdb_codes.add(line[:4])

# Sort before shuffling: the iteration order of a set of strings changes between
# interpreter runs (hash randomisation), so shuffling it directly is not
# reproducible even with a fixed random seed.
native_pdb_codes = sorted(native_pdb_codes)

# Subselect a random sample of N_NATIVE_PDBS codes
random.shuffle(native_pdb_codes)
native_pdb_codes = native_pdb_codes[:N_NATIVE_PDBS]

# Make sure the destination directory exists before downloading into it.
os.makedirs(NATIVE_PDB_DIR, exist_ok=True)

for pdb_code in native_pdb_codes:
    url = 'https://files.rcsb.org/download/' + pdb_code + '.pdb'
    filename = os.path.join(NATIVE_PDB_DIR, pdb_code + '.pdb')
    # Already downloaded on a previous run
    if os.path.isfile(filename):
        continue
    print('Downloading ' + pdb_code + ' from ' + url + ' to ' + filename)
    try:
        urllib.request.urlretrieve(url, filename)
    except Exception as exc:
        # Best effort: report the failure and carry on with the next entry.
        # KeyboardInterrupt is deliberately not caught so the cell can be stopped.
        print('Error downloading ' + pdb_code + ' from ' + url + ': ' + repr(exc))
        # Remove a partially written file so a re-run retries this entry
        # instead of skipping it.
        if os.path.isfile(filename):
            os.remove(filename)

# Calculate the native PPI interface scores

In [None]:
# Score the interface of every structure in native_binder_database_clean/ and
# write one pickle of metrics per structure to native_PPI_interface_scores/.
# The two chain identifiers are read from the 2nd and 3rd '_'-separated fields
# of the file name.
for pdb in tqdm(glob.glob('native_binder_database_clean/*.pdb')):
    basename = pdb.split('/')[-1]
    name = basename.replace('.pdb','')
    print(name)
    pdb_file = pdb

    fields = basename.split('_')
    chain_A = fields[1]
    chain_B = fields[2].replace('.pdb', '')

    # Randomly select whether chain A or chain B is passed to the scoring helpers.
    # Chain A is only eligible when its identifier is a single character;
    # in every other case chain B is used.
    random_number = random.uniform(0, 1)
    if random_number < 0.5 and len(chain_A) == 1:
        chain = chain_A
    else:
        chain = chain_B

    try:
        interface_scores, interface_AA, interface_residues_pdb_ids_str = score_interface(pdb_file, chain)
        total_count, helix_count, sheet_count, loop_count = obtain_sse_content_interface(pdb_file, chain)

        pose = pr.pose_from_pdb(pdb_file)
        interface_scores['interface_bb_hbonds'] = count_interface_bb_hbonds(pose)
        chain1_interface_residues, chain2_interface_residues = get_interface_residues(pose, chain_A, chain_B)
        # NOTE(review): hydrophobicity is always computed on chain 1's interface
        # residues, even when `chain` was set to chain_B above - confirm this is intended.
        interface_scores['interface_hydrophobicity'] = calculate_hydrophobicity(pose, chain1_interface_residues)

        # Secondary-structure composition of the interface, in percent.
        # A total_count of 0 raises ZeroDivisionError and is reported below.
        interface_scores['loop_perc'] = (loop_count/total_count)*100
        interface_scores['helix_perc'] = (helix_count/total_count)*100
        interface_scores['beta_perc'] = (sheet_count/total_count)*100

        with open('native_PPI_interface_scores/{name}.pickle'.format(name=name), 'wb') as handle:
            pickle.dump(interface_scores, handle, protocol=pickle.HIGHEST_PROTOCOL)
    except Exception as exc:
        # Best effort: skip structures that fail, but show why they failed.
        # KeyboardInterrupt is not caught, so the loop can still be interrupted.
        print('error with:', name, 'skipping...', repr(exc))

In [None]:
# Score the interface of every structure in de_novo_binder_database_clean/ and
# write one pickle of metrics per structure to de_novo_PPI_interface_scores/.
# The two chain identifiers are read from the 2nd and 3rd '_'-separated fields
# of the file name.
failed_de_novo = []  # names of structures that could not be scored
for pdb in tqdm(glob.glob('de_novo_binder_database_clean/*.pdb')):
    basename = pdb.split('/')[-1]
    name = basename.replace('.pdb','')
    print(name)
    pdb_file = pdb

    fields = basename.split('_')
    chain_A = fields[1]
    chain_B = fields[2].replace('.pdb', '')

    # Unlike the native set, chain B is always the chain passed to the scoring helpers.
    chain = chain_B

    # Same per-structure error handling as the native loop: one failing file
    # (or a total_count of 0) no longer aborts the whole batch.
    try:
        interface_scores, interface_AA, interface_residues_pdb_ids_str = score_interface(pdb_file, chain)
        total_count, helix_count, sheet_count, loop_count = obtain_sse_content_interface(pdb_file, chain)

        pose = pr.pose_from_pdb(pdb_file)
        interface_scores['interface_bb_hbonds'] = count_interface_bb_hbonds(pose)
        chain1_interface_residues, chain2_interface_residues = get_interface_residues(pose, chain_A, chain_B)
        # NOTE(review): hydrophobicity is computed on chain 1's interface residues
        # although `chain` is chain_B - confirm this is intended.
        interface_scores['interface_hydrophobicity'] = calculate_hydrophobicity(pose, chain1_interface_residues)

        # Secondary-structure composition of the interface, in percent.
        interface_scores['loop_perc'] = (loop_count/total_count)*100
        interface_scores['helix_perc'] = (helix_count/total_count)*100
        interface_scores['beta_perc'] = (sheet_count/total_count)*100

        with open('de_novo_PPI_interface_scores/{name}.pickle'.format(name=name), 'wb') as handle:
            pickle.dump(interface_scores, handle, protocol=pickle.HIGHEST_PROTOCOL)
    except Exception as exc:
        # KeyboardInterrupt is not caught, so the loop can still be interrupted.
        failed_de_novo.append(name)
        print('error with:', name, 'skipping...', repr(exc))

# Make skipped structures visible so an incomplete score set is not mistaken for a full one.
if failed_de_novo:
    print('failed to score', len(failed_de_novo), 'de novo structures:', failed_de_novo)