In [2]:
import os
import subprocess
import pandas as pd
import re

In [3]:
def split_on_empty_lines(s):
    """Split text into the chunks that are separated by blank lines.

    Surrounding whitespace of ``s`` is stripped first. Any run of two or
    more consecutive line breaks (``\\n`` or ``\\r\\n``) is treated as a
    single separator; the separators themselves are not returned.

    Returns a list of strings. Text without a separator (including the
    empty string) comes back as a one-element list.
    """
    # {2,} is greedy, so several blank lines in a row collapse into one cut.
    separator = re.compile(r"(?:\r?\n){2,}")
    trimmed = s.strip()
    return separator.split(trimmed)

In [4]:
def _parse_pocket_block(block, pocket_number):
    """Convert one blank-line-delimited pocket section into a flat record dict."""
    record = {}
    # The first line of a section is its header and carries no value.
    # Every following row is expected to look like "\t<name> : \t<value>".
    for line in block.split('\n')[1:]:
        if not line.strip():
            continue
        fields = line.split('\t')
        record[fields[1].replace(' : ', '')] = fields[2]
    record['pocket_number'] = pocket_number
    return record


def parse_pocket_scores(proteins, out_path, threshold, directory=None):
    """Collect per-pocket descriptors for several proteins into one CSV.

    For each entry of ``proteins`` the sub-directory ``directory/<entry>`` is
    searched for a file ending in ``_info.txt``. That file is split into
    sections on blank lines; each section becomes one row whose columns are
    the descriptor names found in the section, plus ``pocket_number``
    (1-based position of the section in the file) and ``protein``.

    Parameters
    ----------
    proteins : iterable of str
        Names of the per-protein output directories inside ``directory``.
        A trailing ``_out`` is removed to form the ``protein`` column.
    out_path : str
        Destination of the CSV (written with the DataFrame index).
    threshold : float or False
        When truthy, only rows whose ``Druggability Score`` is
        ``>= threshold`` are written. Any falsy value (``False``, ``0``,
        ``None``) disables the filter.
    directory : str, optional
        Directory holding the per-protein folders. When omitted, the
        notebook-level variable ``directory`` is used, which is how the
        existing cells call this function.

    Raises
    ------
    NameError
        If ``directory`` is neither passed nor defined at notebook level.
    FileNotFoundError
        If a protein directory contains no ``*_info.txt`` file.
    """
    if directory is None:
        try:
            directory = globals()['directory']
        except KeyError:
            raise NameError(
                "name 'directory' is not defined; pass directory=... "
                "or set it before calling parse_pocket_scores"
            ) from None

    frames = []
    for protein in proteins:
        protein_dir = os.path.join(directory, protein)
        # os.listdir() of the parent also returns stray plain files
        # (e.g. .DS_Store); those are not protein folders.
        if os.path.isfile(protein_dir):
            continue
        print(protein)

        # Sorted so the choice is deterministic if several files match.
        info_files = sorted(
            name for name in os.listdir(protein_dir) if name.endswith('_info.txt')
        )
        if not info_files:
            raise FileNotFoundError(
                "no '*_info.txt' file found in {!r}".format(protein_dir)
            )

        # Context manager: the handle is closed even if parsing fails.
        with open(os.path.join(protein_dir, info_files[0]), 'r') as handle:
            blocks = split_on_empty_lines(handle.read())

        # Build plain records and create the frame once; this replaces
        # Series.append, which no longer exists in pandas >= 2.0.
        records = [
            _parse_pocket_block(block, number)
            for number, block in enumerate(blocks, start=1)
            if block.strip()
        ]
        if not records:
            continue

        frame = pd.DataFrame(records)
        # Only strip the suffix when it is really there.
        frame['protein'] = protein[:-4] if protein.endswith('_out') else protein
        frames.append(frame)

    # pd.concat([]) raises, so an empty input yields an empty table instead.
    result = pd.concat(frames) if frames else pd.DataFrame()

    if threshold and not result.empty:
        # Values are read as text, hence the numeric conversion.
        result = result.loc[pd.to_numeric(result['Druggability Score']) >= threshold]

    result.to_csv(out_path)

In [None]:
# Feig Lab models: write the full pocket table, then the subset with
# Druggability Score >= 0.5.
directory = 'pockets/FeigLab/'
proteins = os.listdir(directory)
parse_pocket_scores(proteins, 'pockets/feig_lab_score.csv', False)
parse_pocket_scores(proteins, 'pockets/feig_lab_druggable.csv', 0.5)

In [None]:
# DeepMind structures: write the full pocket table, then the subset with
# Druggability Score >= 0.5.
directory = 'pockets/deepmind_structures/'
proteins = os.listdir(directory)
parse_pocket_scores(proteins, 'pockets/deepmind_sctructures_score.csv', False)
parse_pocket_scores(proteins, 'pockets/deepmind_sctructures_druggable.csv', 0.5)

In [None]:
# Refined AlphaFold models: write the full pocket table, then the subset
# with Druggability Score >= 0.5.
directory = 'pockets/AlphaFold_refined/'
proteins = os.listdir(directory)
parse_pocket_scores(proteins, 'pockets/alphafold_refined_score.csv', False)
parse_pocket_scores(proteins, 'pockets/alphafold_refined_druggable.csv', 0.5)

In [None]:
# Korkin Lab individual models: write the full pocket table, then the
# subset with Druggability Score >= 0.5.
directory = 'pockets/korkin_lab/IndividualModels/'
proteins = os.listdir(directory)
parse_pocket_scores(proteins, 'pockets/korkin_individual_model_scores.csv', False)
parse_pocket_scores(proteins, 'pockets/korkin_individual_model_druggable.csv', 0.5)

In [None]:
# Korkin Lab intraviral complexes: write the full pocket table, then the
# subset with Druggability Score >= 0.5.
directory = 'pockets/korkin_lab/IntraViralComplexes/'
proteins = os.listdir(directory)
parse_pocket_scores(proteins, 'pockets/korkin_intraviral_scores.csv', False)
parse_pocket_scores(proteins, 'pockets/korkin_intraviral_druggable.csv', 0.5)

In [None]:
# Korkin Lab viral-human complexes: write the full pocket table, then the
# subset with Druggability Score >= 0.5.
directory = 'pockets/korkin_lab/ViralHumanComplexes/'
proteins = os.listdir(directory)
parse_pocket_scores(proteins, 'pockets/korkin_viral_human_scores.csv', False)
parse_pocket_scores(proteins, 'pockets/korkin_viral_human_druggable.csv', 0.5)

In [5]:
# Experimental structures: write the full pocket table, then the subset
# with Druggability Score >= 0.5.
directory = 'pockets/experimental_structures/'
proteins = os.listdir(directory)
parse_pocket_scores(proteins, 'pockets/experimental_structure_scores.csv', False)
parse_pocket_scores(proteins, 'pockets/experimental_structure_druggable.csv', 0.5)

5rgi_out
5ret_out
6w4b_out
5rfb_out
5re4_out
6vxs_out
5rfz_out
5rfe_out
5rfx_out
6w01_out
6w6y_out
5rfn_out
5ree_out
6y84_out
5reb_out
6vyb_out
5rfq_out
6w02_out
5rfw_out
5rf8_out
5r84_out
7btf_out
5rfc_out
5rgl_out
5re8_out
5rg0_out
6yb7_out
5rep_out
5rfs_out
6w75_out
5rg1_out
5rea_out
6w61_out
5rf0_out
5rey_out
6w9c_out
6m2n_out
5reh_out
5r82_out
5reg_out
6m3m_out
6m71_out
6w4h_out
6m03_out
5rgg_out
5rgh_out
5rgp_out
5rfa_out
5rex_out
6vsb_out
5reu_out
6vyo_out
5rer_out
5rgk_out
6yla_out
5rfd_out
6w9q_out
6m2q_out
5r81_out
5rfj_out
5re5_out
5rf2_out
5rec_out
5rf6_out
5rfl_out
5r7y_out
5rf4_out
6y2g_out
5rfo_out
5rfi_out
5rg3_out
5re6_out
6lvn_out
5rej_out
6m0j_out
5re7_out
6vxx_out
5rfg_out
5r8t_out
5rgr_out
6yi3_out
5rek_out
5rf9_out
6lzg_out
5r83_out
5reo_out
5rfu_out
5r80_out
6w41_out
6wcf_out
5rfr_out
5rei_out
5ren_out
5rev_out
5rgn_out
5rf1_out
5re9_out
5rgq_out
5rgo_out
6lxt_out
6vww_out
5rez_out
5rgm_out
5rfv_out
5rfp_out
6y2e_out
5rff_out
5rf5_out
5red_out
6vw1_out
5rew_out
5

In [6]:
# RaptorX models: write the full pocket table, then the subset with
# Druggability Score >= 0.5.
directory = 'pockets/RaptorX/'
proteins = os.listdir(directory)
parse_pocket_scores(proteins, 'pockets/raptorx_scores.csv', False)
parse_pocket_scores(proteins, 'pockets/raptorx_druggable.csv', 0.5)

ORF10_out
ORF7b_out
nsp6_out
nsp4_out
M_protein_out
ORF6_out
nsp2_out
ORF3a_out
PL-PRO_out
ORF8_out
ORF10_out
ORF7b_out
nsp6_out
nsp4_out
M_protein_out
ORF6_out
nsp2_out
ORF3a_out
PL-PRO_out
ORF8_out


In [8]:
# CSGID structures: write the full pocket table, then the subset with
# Druggability Score >= 0.5.
directory = 'pockets/csgid/'
proteins = os.listdir(directory)
parse_pocket_scores(proteins, 'pockets/csgid_scores.csv', False)
parse_pocket_scores(proteins, 'pockets/csgid_druggable.csv', 0.5)

6w4b_out
6vxs_out
7bv1_out
6w01_out
6w6y_out
6y84_out
6vyb_out
6u7h_out
6w02_out
7btf_out
6wji_out
7bqy_out
6yb7_out
6w75_out
6w61_out
6wiq_out
6w9c_out
6m2n_out
6m3m_out
6m71_out
6w4h_out
6m03_out
6vsb_out
6vyo_out
6yla_out
6w9q_out
6m2q_out
6y2g_out
6lvn_out
6m0j_out
6vxx_out
6yi3_out
6lzg_out
6wjt_out
6w41_out
6wcf_out
6lxt_out
6vww_out
6y2e_out
6vw1_out
6lu7_out
6y2f_out
6m17_out
7bv2_out
6wen_out
6w63_out
6w4b_out
6vxs_out
7bv1_out
6w01_out
6w6y_out
6y84_out
6vyb_out
6u7h_out
6w02_out
7btf_out
6wji_out
7bqy_out
6yb7_out
6w75_out
6w61_out
6wiq_out
6w9c_out
6m2n_out
6m3m_out
6m71_out
6w4h_out
6m03_out
6vsb_out
6vyo_out
6yla_out
6w9q_out
6m2q_out
6y2g_out
6lvn_out
6m0j_out
6vxx_out
6yi3_out
6lzg_out
6wjt_out
6w41_out
6wcf_out
6lxt_out
6vww_out
6y2e_out
6vw1_out
6lu7_out
6y2f_out
6m17_out
7bv2_out
6wen_out
6w63_out


In [9]:
# Modeller structures: write the full pocket table, then the subset with
# Druggability Score >= 0.5.
directory = 'pockets/modeller/'
proteins = os.listdir(directory)
parse_pocket_scores(proteins, 'pockets/modeller_scores.csv', False)
parse_pocket_scores(proteins, 'pockets/modeller_druggable.csv', 0.5)

ORF10_out
helicase_out
ORF7a_out
nsp6_out
Guanine-N7_out
nsp1_out
nsp4_out
ORF6_out
nsp2_out
envelope_membrane_out
papin1_out
Nucleoprotein_out
ROF3a_out
ORF10_out
helicase_out
ORF7a_out
nsp6_out
Guanine-N7_out
nsp1_out
nsp4_out
ORF6_out
nsp2_out
envelope_membrane_out
papin1_out
Nucleoprotein_out
ROF3a_out


In [11]:
# Write the unfiltered pocket table for every structure source.
# One row per source: (directory of per-protein folders, destination CSV).
# Keeping the pairs in a single table avoids the copy-paste drift that
# hand-duplicated stanzas invite.
score_jobs = [
    ('pockets/FeigLab/', 'pockets/feig_lab_score.csv'),                                   # Feig Lab
    ('pockets/deepmind_structures/', 'pockets/deepmind_sctructures_score.csv'),           # DeepMind
    ('pockets/AlphaFold_refined/', 'pockets/alphafold_refined_score.csv'),                # refined AlphaFold
    ('pockets/korkin_lab/IndividualModels/', 'pockets/korkin_individual_model_scores.csv'),  # Korkin individual
    ('pockets/korkin_lab/IntraViralComplexes/', 'pockets/korkin_intraviral_scores.csv'),     # Korkin intraviral
    ('pockets/korkin_lab/ViralHumanComplexes/', 'pockets/korkin_viral_human_scores.csv'),    # Korkin viral-human
    ('pockets/experimental_structures/', 'pockets/experimental_structure_scores.csv'),    # experimental
    ('pockets/RaptorX/', 'pockets/raptorx_scores.csv'),                                   # RaptorX
]

# The loop variable is deliberately called `directory`: parse_pocket_scores
# looks that name up in the notebook namespace when it is not passed in.
for directory, out_path in score_jobs:
    proteins = os.listdir(directory)
    parse_pocket_scores(proteins, out_path=out_path, threshold=False)


ORF10_out
ORF7b_out
nsp6_out
nsp4_out
M_protein_out
ORF6_out
nsp2_out
ORF3a_out
PL-PRO_out
ORF8_out
nsp6_out
Protein_3a_out
nsp4_out
M_protein_out
nsp2_out
PL-PRO_C_terminal_out
nsp6_out
Protein_3a_out
nsp4_out
M_protein_out
nsp2_out
PL-PRO_C_terminal_out
wNsp8-6nurA_out
wNsp3_domain6-2k87A_out
wS-5xlr_C-6acj_C_out
wNsp4-3vc8A_out
wNsp3_domain1-2griA_out
wORF7a_1yo4A_out
wNsp16-2xyqA_out
wNsp7-1ysyA_out
wNsp12-6nurA_out
wNsp15-2h85A_out
wE-5x29A_out
wNsp3_domain3-2wctA_out
wNsp3_domain5-3e9sA_out
wNsp9-3ee7_out
wN-Nterminal_domain-1ssk_4ud1A_out
wNsp3_domain4-2kafA_out
wNsp10-2g9tA_out
wNsp13-6jytA_out
wN-Cterminal_domain-2jw8A_out
wNsp3_domain2-2acfA_out
wNsp1-2hsxA_out
wNsp14-5c8uB_out
wN-Nterminal_domain_homopentamer-4UD1_out
wNsp5_homodimer-2gt7_out
wNsp13_homodimer-6jyt_out
wN-Cterminal_domain_homodimer-2JW8_out
wNsp10_dodecamer-2g9t_out
wNsp7-wNsp8-wNsp12_heterotetramer-6nur_out
wE_homopentamer-5X29_out
wNsp10-wNsp16_heterodimer-2xyq_out
wS_homotrimer-Conf2-6CS1_out
wNsp3-domain3

In [12]:
# Produce the list of druggable pockets (Druggability Score >= 0.5) for
# every structure source.
# One row per source: (directory of per-protein folders, destination CSV).
druggable_jobs = [
    ('pockets/FeigLab/', 'pockets/feig_lab_druggable.csv'),                                   # Feig Lab
    ('pockets/deepmind_structures/', 'pockets/deepmind_sctructures_druggable.csv'),           # DeepMind
    ('pockets/AlphaFold_refined/', 'pockets/alphafold_refined_druggable.csv'),                # refined AlphaFold
    ('pockets/korkin_lab/IndividualModels/', 'pockets/korkin_individual_model_druggable.csv'),  # Korkin individual
    ('pockets/korkin_lab/IntraViralComplexes/', 'pockets/korkin_intraviral_druggable.csv'),     # Korkin intraviral
    ('pockets/korkin_lab/ViralHumanComplexes/', 'pockets/korkin_viral_human_druggable.csv'),    # Korkin viral-human
    ('pockets/experimental_structures/', 'pockets/experimental_structure_druggable.csv'),    # experimental
    # Fixed: this entry previously read 'pockets/experimental_structures/',
    # so raptorx_druggable.csv was filled with the experimental pockets.
    ('pockets/RaptorX/', 'pockets/raptorx_druggable.csv'),                                    # RaptorX
]

# The loop variable is deliberately called `directory`: parse_pocket_scores
# looks that name up in the notebook namespace when it is not passed in.
for directory, out_path in druggable_jobs:
    proteins = os.listdir(directory)
    parse_pocket_scores(proteins, out_path=out_path, threshold=0.5)

ORF10_out
ORF7b_out
nsp6_out
nsp4_out
M_protein_out
ORF6_out
nsp2_out
ORF3a_out
PL-PRO_out
ORF8_out
nsp6_out
Protein_3a_out
nsp4_out
M_protein_out
nsp2_out
PL-PRO_C_terminal_out
nsp6_out
Protein_3a_out
nsp4_out
M_protein_out
nsp2_out
PL-PRO_C_terminal_out
wNsp8-6nurA_out
wNsp3_domain6-2k87A_out
wS-5xlr_C-6acj_C_out
wNsp4-3vc8A_out
wNsp3_domain1-2griA_out
wORF7a_1yo4A_out
wNsp16-2xyqA_out
wNsp7-1ysyA_out
wNsp12-6nurA_out
wNsp15-2h85A_out
wE-5x29A_out
wNsp3_domain3-2wctA_out
wNsp3_domain5-3e9sA_out
wNsp9-3ee7_out
wN-Nterminal_domain-1ssk_4ud1A_out
wNsp3_domain4-2kafA_out
wNsp10-2g9tA_out
wNsp13-6jytA_out
wN-Cterminal_domain-2jw8A_out
wNsp3_domain2-2acfA_out
wNsp1-2hsxA_out
wNsp14-5c8uB_out
wN-Nterminal_domain_homopentamer-4UD1_out
wNsp5_homodimer-2gt7_out
wNsp13_homodimer-6jyt_out
wN-Cterminal_domain_homodimer-2JW8_out
wNsp10_dodecamer-2g9t_out
wNsp7-wNsp8-wNsp12_heterotetramer-6nur_out
wE_homopentamer-5X29_out
wNsp10-wNsp16_heterodimer-2xyq_out
wS_homotrimer-Conf2-6CS1_out
wNsp3-domain3