A notebook for collecting the mean predicted aligned errors (PAEs) of trimmed proteins.

In [1]:
import pandas as pd
import project_pipeline.scripts.utils as utils
import numpy as np
import os

In [2]:
def mean_paes(df, path, affix, suffix, cluster=False):
    """Add mean PAE columns (region 1 vs 1, 1 vs 2, 2 vs 2) to every row of ``df``.

    For each row the PAE matrix is loaded with ``utils.pae_from_json`` and the
    three means are computed with ``utils.calculate_pae_mean`` after the region
    bounds have been corrected for trimming by ``utils.adjust_trimmed_bounds``.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns 'uniprot', 'region_1' and 'region_2', plus 'cluster'
        when ``cluster`` is true. It is modified in place.
    path : str
        Directory holding the PAE json files. In cluster mode each file is
        looked up in the sub-directory ``<path>/<uniprot>``.
    affix : str
        Prefix of the file name in non-cluster mode. Not used in cluster mode,
        where the file name is ``<uniprot>_<cluster>_<suffix>``.
    suffix : str
        Trailing part of the file name.
    cluster : bool, default False
        Whether the files are organised per uniprot and cluster.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the columns 'mean_pae_1_1', 'mean_pae_1_2'
        and 'mean_pae_2_2', each rounded to 3 decimals. Rows whose PAE matrix
        contains NaN get 0 in all three columns.
    """
    print('Calculating mean pae...')

    # Keep the mode flag under its own name: the per-row cluster id must never
    # overwrite it, otherwise a falsy id would switch modes for later rows.
    use_clusters = bool(cluster)

    # Iterate over the actual index labels so frames without a default
    # 0..n-1 index work as well.
    for i in df.index:
        uniprot = df.loc[i, 'uniprot']
        if use_clusters:
            cluster_id = df.loc[i, 'cluster']
            # str() lets numeric cluster ids be used in the file name too.
            fn = uniprot + '_' + str(cluster_id) + '_' + suffix
        else:
            fn = affix + uniprot + suffix

        # Region bounds are in the format [start, end] for each region. Regions
        # with multiple sections look like [[start, end], [start, end], ...]
        reg1_array = np.array(utils.region_bounds(df.loc[i, 'region_1']))
        reg2_array = np.array(utils.region_bounds(df.loc[i, 'region_2']))

        # Because the proteins were trimmed, the region bounds have to be
        # corrected: the minimum boundary - 1 is subtracted from all values so
        # that the minimum boundary becomes 1.
        reg1_array, reg2_array = utils.adjust_trimmed_bounds(reg1_array, reg2_array)

        # Read in json file
        if use_clusters:
            prot_array = utils.pae_from_json(os.path.join(path, uniprot), fn)
        else:
            print(fn)
            prot_array = utils.pae_from_json(path, fn)

        # We want means of reg1 compared against reg1, reg1 compared against
        # reg2, and reg2 compared against reg2.
        # NaN never compares equal to anything, so `x == np.nan` is always
        # False; missing values must be detected with an isna-style check.
        if pd.isna(prot_array).any():
            mean11 = mean12 = mean22 = 0
        else:
            mean11 = utils.calculate_pae_mean(prot_array, reg1_array, reg1_array)
            mean12 = utils.calculate_pae_mean(prot_array, reg1_array, reg2_array)
            mean22 = utils.calculate_pae_mean(prot_array, reg2_array, reg2_array)

        df.loc[i, 'mean_pae_1_1'] = round(mean11, 3)
        df.loc[i, 'mean_pae_1_2'] = round(mean12, 3)
        df.loc[i, 'mean_pae_2_2'] = round(mean22, 3)

    return df

In [3]:
# Location of the trimmed ColabFold outputs. With cluster=True, mean_paes looks
# for each PAE file in a per-uniprot sub-directory of this path.
ai_path = 'project_pipeline/data/input/Colabfold_cif/trimmed/'

# File-name pieces handed to mean_paes.
affix = ''
suffix = 'predicted_aligned_error_v1.json'

# Load the cluster table and keep a single row per unique
# (uniprot, cluster, region_1, region_2) combination -- nothing else is needed.
ai = (
    pd.read_csv('./project_pipeline/data/af2t_ai_pdb_clusters.tsv', sep='\t')
    .loc[:, ['uniprot', 'cluster', 'region_1', 'region_2']]
    .drop_duplicates()
    .reset_index(drop=True)
)

# Mean PAE for each cluster; mean_paes returns the frame it was given with the
# three mean_pae_* columns attached.
ai_pae = mean_paes(ai, ai_path, affix, suffix, cluster=True)

# Write the result next to the input table.
ai_pae.to_csv('./project_pipeline/data/af2t_ai_cluster_pae.tsv', sep='\t', index=False)

# Disabled counterpart for the md dataset (same steps, different files):
# md_path = 'data/input/Colabfold_cif/multi_domain/'
# md = pd.read_csv('./data/md_pdb_clusters.tsv', sep='\t')
# md = md[['uniprot', 'cluster', 'region_1', 'region_2']].drop_duplicates().reset_index(drop=True)
# md_pae = mean_paes(md, md_path, affix, suffix, cluster=True)
# md_pae.to_csv('./data/md_cluster_pae.tsv', sep='\t', index=False)

Calculating mean pae...
