This notebook trims the experimental and predicted structure files to a shared set of atoms so they can be given to Jen for a few PCA plots.

In [141]:
from pdbecif.mmcif_io import CifFileReader, CifFileWriter
from biopandas.pdb import PandasPdb
import pandas as pd
import numpy as np
import project_pipeline.scripts.utils as utils
import os

In [142]:
def assert_equal_size(gt_trim_df, pred_trim_df):
    """Report whether two trimmed atom tables have the same number of rows.

    When the lengths differ, the rows that do not appear in both tables
    (compared on every column except the coordinate/bookkeeping columns
    dropped below) are written to
    ``./project_pipeline/data/AssertionError.tsv`` and printed so the
    mismatch can be inspected.

    Parameters
    ----------
    gt_trim_df : pandas.DataFrame
        Trimmed ground-truth atoms, using mmCIF ``_atom_site`` column names.
    pred_trim_df : pandas.DataFrame
        Trimmed predicted atoms, using the same column names.

    Returns
    -------
    bool
        ``True`` when both tables have the same length, ``False`` otherwise.

    Raises
    ------
    KeyError
        Only on a length mismatch, if either table lacks one of the columns
        that are dropped before the comparison.
    """
    # Compare explicitly instead of using assert/except: assert statements are
    # removed when Python runs with -O, which would make this check always pass.
    if len(pred_trim_df) == len(gt_trim_df):
        return True

    # Columns that legitimately differ between two structures of the same atoms.
    ignored_columns = ['id', 'Cartn_x', 'Cartn_y', 'Cartn_z', 'occupancy',
                       'B_iso_or_equiv', 'type_symbol', 'pdbx_formal_charge']
    gt_sim = gt_trim_df.drop(ignored_columns, axis=1)
    pred_sim = pred_trim_df.drop(ignored_columns, axis=1)

    # keep=False removes every row that occurs more than once in the
    # concatenation, leaving only the rows that are not matched.
    diff = pd.concat([gt_sim, pred_sim]).drop_duplicates(keep=False)
    diff.to_csv('./project_pipeline/data/AssertionError.tsv', sep='\t')
    print(diff)
    print('AssertionError! Check file')
    return False

def drop_unshared_atoms(pdb1, pdb2):
    """Trim two mmCIF atom tables down to the atoms present in both.

    Each table is first filtered to drop hydrogens (by ``label_atom_id``),
    atoms whose ``label_alt_id`` is 'B', 'C', 'D' or 'E', and atoms whose
    ``pdbx_PDB_model_num`` is not the string '1'. The remaining atoms are
    matched on (``label_atom_id``, ``label_comp_id``, ``label_seq_id``) and
    only atoms whose key occurs in both tables are kept.

    Parameters
    ----------
    pdb1, pdb2 : pandas.DataFrame
        Atom tables with at least the columns ``label_atom_id``,
        ``label_alt_id``, ``pdbx_PDB_model_num``, ``label_comp_id`` and
        ``label_seq_id``. The model number is compared against the string '1'.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(pdb1_trim, pdb2_trim)``: the filtered tables restricted to shared
        atoms, each with a fresh default index and its original columns.
    """
    key_columns = ['label_atom_id', 'label_comp_id', 'label_seq_id']

    # Define atom_names for hydrogens
    hydrogens = ['HA', 'HB1', 'HB2', 'HB3', 'H', 'HA2', 'HA3', 'HG2', 'HG3', 'HD2', 'HD3', 'HE1', 'HE2',
                'HE3', 'HB', 'HG11', 'HG12', 'HG13', 'HG21', 'HG22', 'HG23', 'HE', 'HH11', 'HH12', 'HH21',
                'HH22', 'HE21', 'HE22', 'HD1', 'HZ', 'HH', 'HG1', 'HD21', 'HD22', 'HG', 'HD11', 'HD12', 
                'HD13', 'HD23', 'HZ1', 'HZ2', 'HZ3', 'HH2']
    # Define possible alternate conformations
    alt_locations = ['B', 'C', 'D', 'E']

    def _filter_atoms(atoms):
        # Keep non-hydrogen atoms outside the listed alternate locations, model '1' only.
        keep = (
            ~atoms['label_atom_id'].isin(hydrogens)
            & ~atoms['label_alt_id'].isin(alt_locations)
            & (atoms['pdbx_PDB_model_num'] == '1')
        )
        return atoms[keep].reset_index(drop=True)

    pdb1 = _filter_atoms(pdb1)
    pdb2 = _filter_atoms(pdb2)

    # De-duplicate the shared keys: a key repeated in either input would
    # otherwise appear several times here and multiply rows in the merges below.
    pdb_shared = pd.merge(pdb1[key_columns], pdb2[key_columns],
                          on=key_columns, how='inner').drop_duplicates()

    pdb1_trim = pd.merge(pdb1, pdb_shared, on=key_columns, how='inner')
    pdb2_trim = pd.merge(pdb2, pdb_shared, on=key_columns, how='inner')

    return pdb1_trim, pdb2_trim

def trim_pdbs(df, gt_path_in, trim_path_out):
    """Trim every experimental mmCIF file of a protein to the atoms all of them share.

    For each unique value in ``df['uniprot']`` the function reads
    ``<gt_path_in>/<uniprot>/<pdb>.cif`` for every unique ``pdb`` listed for
    that protein, keeps the atoms of the first chain listed for that pdb
    (``label_asym_id``), and intersects the atom sets on
    (``label_atom_id``, ``label_comp_id``, ``label_seq_id``). The trimmed
    ``_atom_site`` tables are written to ``<trim_path_out>/<uniprot>/<pdb>.cif``.

    If the trimmed tables of one protein do not all have the same length,
    a message is printed and processing stops (later proteins are skipped).

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``uniprot``, ``pdb`` and ``chain``.
    gt_path_in : str
        Directory holding the input files, one sub-directory per uniprot.
    trim_path_out : str
        Directory receiving the trimmed files, one sub-directory per uniprot.

    Returns
    -------
    None
    """
    key_cols = ['label_atom_id', 'label_comp_id', 'label_seq_id']

    for uniprot in df['uniprot'].unique():

        # Make sure the uniprot directory exists
        utils.uniprot_dirs(trim_path_out, uniprot=uniprot)

        print(f'Trying {uniprot}...')

        protein_rows = df[df['uniprot'] == uniprot].reset_index(drop=True)
        uniprot_path = f'{uniprot}/'

        # Collected per file, in the same order: id, chain atoms, full cif dictionary
        pdb_ids = []
        chain_frames = []
        cif_objs = []

        for pdb in protein_rows['pdb'].unique():

            # First chain listed for this pdb
            chain_id = protein_rows.loc[protein_rows['pdb'] == pdb, 'chain'].unique()[0]
            cif_path = os.path.join(gt_path_in, uniprot_path, f'{pdb}.cif')

            # Read the file and keep only the atoms in the desired chain
            cif_obj = CifFileReader().read(cif_path, output='cif_dictionary', ignore=['_struct_conn'])
            all_atoms = pd.DataFrame.from_dict(cif_obj[pdb.upper()]['_atom_site'])
            chain_atoms = all_atoms[all_atoms['label_asym_id'] == chain_id].reset_index(drop=True)

            chain_frames.append(chain_atoms)
            pdb_ids.append(pdb)
            cif_objs.append(cif_obj)

        # Find common atoms between files
        print(f'Comparing files for {uniprot}...')

        print([len(frame) for frame in chain_frames])
        print(pdb_ids)

        # Successively intersect the atom keys of every file with the first one
        shared_keys = chain_frames[0][key_cols]
        for position in range(1, len(chain_frames)):
            shared_keys = shared_keys.merge(
                chain_frames[position][key_cols], on=key_cols, how='inner'
            ).reset_index(drop=True)
            print(len(shared_keys))
            print(position)
            print(pdb_ids[position])

        # Restrict each file to the shared keys
        trimmed_frames = [
            frame.merge(shared_keys, on=key_cols, how='inner').reset_index(drop=True)
            for frame in chain_frames
        ]

        atom_counts = {len(frame) for frame in trimmed_frames}
        if len(atom_counts) != 1:
            print('Lengths of pdb_dfs are not equal')
            break

        print(f'Lengths of pdb_dfs are equal and have {len(trimmed_frames[0])} atoms')
        for pdb_id, frame, cif_obj in zip(pdb_ids, trimmed_frames, cif_objs):

            # Convert back to mmCIF-like dictionary and replace the atom table
            cif_obj[pdb_id.upper()]['_atom_site'] = frame.to_dict(orient='list')

            out_path = os.path.join(trim_path_out, uniprot_path, f'{pdb_id}.cif')
            CifFileWriter(out_path).write(cif_obj)

def trim_cf_pdb(df, gt_path_in, pred_path_in, pred_path_out, 
              gt_format='{uniprot}/{pdb}.cif', pred_format='{uniprot}/{filename}'):
    """Trim each ColabFold prediction (PDB format) to the atoms it shares with its experimental structure.

    For every row of ``df`` the experimental mmCIF file and the predicted PDB
    file are read, the predicted ATOM table is renamed to mmCIF column names,
    both tables are passed to ``utils.compare_atoms`` /
    ``utils.drop_unshared_atoms``, and the trimmed prediction is written to
    ``<pred_path_out>/<uniprot>/<uniprot>_<cluster>.pdb``. Only the prediction
    is written; the experimental file is not modified.

    Processing stops at the first row for which ``utils.assert_equal_size``
    reports a length mismatch.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``uniprot``, ``pdb``, ``cluster``,
        ``cf_filename`` and ``chain``. Rows are processed in order; the index
        is not used.
    gt_path_in : str
        Base directory of the experimental mmCIF files.
    pred_path_in : str
        Base directory of the predicted PDB files.
    pred_path_out : str
        Base directory for the trimmed predictions.
    gt_format : str
        Template for the experimental file path relative to ``gt_path_in``;
        formatted with ``uniprot`` and ``pdb``.
    pred_format : str
        Template for the predicted file path relative to ``pred_path_in``;
        formatted with ``uniprot`` and ``filename``.

    Returns
    -------
    None
    """
    # Have to convert pdb column names to cif column names.
    # Built once: the mapping does not depend on the row being processed.
    # 'alt_loc' is the biopandas name of the alternate-location column
    # (the previous key 'alc_loc' matched no column and was silently ignored).
    mapper = {'record_name': 'group_PDB',
              'atom_number': 'id',
              'element_symbol': 'type_symbol',
              'atom_name': 'label_atom_id',
              'alt_loc': 'label_alt_id',
              'residue_name': 'label_comp_id',
              'chain_id': 'label_asym_id',
              'residue_number': 'label_seq_id',
              'x_coord': 'Cartn_x',
              'y_coord': 'Cartn_y',
              'z_coord': 'Cartn_z',
              'occupancy': 'occupancy',
              'b_factor': 'B_iso_or_equiv',
              'charge': 'pdbx_formal_charge'}

    reverse_mapper = {v: k for k, v in mapper.items()}

    # Column types to restore on the trimmed prediction before writing it
    restore_dtypes = {'atom_number': int, 'residue_number': int, 'line_idx': int,
                      'x_coord': float, 'y_coord': float, 'z_coord': float,
                      'occupancy': float, 'b_factor': float, 'charge': float}

    # Iterate by position so the function does not depend on df having a default index
    row_columns = ['uniprot', 'pdb', 'cluster', 'cf_filename', 'chain']
    for uniprot, pdb, cluster, filename, chain in df[row_columns].itertuples(index=False, name=None):

        # Generate file paths using format templates
        gt_fn = os.path.join(gt_path_in, gt_format.format(uniprot=uniprot, pdb=pdb))
        pred_fn = os.path.join(pred_path_in, pred_format.format(uniprot=uniprot, filename=filename))
        pred_fn_out = os.path.join(pred_path_out, f'{uniprot}/{uniprot}_{cluster}.pdb')

        # Make sure the uniprot directory exists
        utils.uniprot_dirs(pred_path_out, uniprot=uniprot)

        print(f'Trying {pdb} - {cluster} for {uniprot}...')

        # Initiate reader objects (a fresh PandasPdb per file: read_pdb's
        # result is later modified and written back out)
        cfr = CifFileReader()
        ppdb = PandasPdb()

        # Create dataframe with gt atoms in desired chain
        gt_obj = cfr.read(gt_fn, output='cif_dictionary', ignore=['_struct_conn'])
        gt_all_chains = pd.DataFrame.from_dict(gt_obj[pdb.upper()]['_atom_site'])
        gt = gt_all_chains[gt_all_chains['label_asym_id'] == chain].reset_index(drop=True)

        # Create dataframe with pred atoms. ColabFold files are only in pdb format
        pred_obj = ppdb.read_pdb(pred_fn)
        pred = pred_obj.df['ATOM'].rename(columns=mapper)
        # Must convert all columns to string
        pred = pred.astype(str)

        print('Length of gt: ' + str(len(gt)) + ', Length of pred:' + str(len(pred)))

        # Find common atoms between files
        print(f'Comparing files for {pdb} - {cluster}...')
        atoms_pred, extra_atoms_gt = utils.compare_atoms(gt, pred)

        # Trim the files
        gt_trim, pred_trim = utils.drop_unshared_atoms(gt, pred, atoms_pred, extra_atoms_gt)

        print('Length of gt_trim: ' + str(len(gt_trim)) + ', Length of pred_trim: ' + str(len(pred_trim)))

        # Don't need to convert to dictionary, can just pass back as df.
        # Rename to the pdb column names and convert column types back to original.
        pred_trim = pred_trim.rename(columns=reverse_mapper).astype(restore_dtypes)
        pred_obj.df['ATOM'] = pred_trim

        # Check whether the trimmed files are the same length
        # NOTE(review): pred_trim carries pdb column names at this point; confirm
        # utils.assert_equal_size does not need mmCIF names on a mismatch.
        if not utils.assert_equal_size(gt_trim, pred_trim):
            break
        print('Trimmed files are the same length')

        if len(gt_trim) == 0:
            # Nothing is written for this row; df itself is not modified here.
            print(f'No common atoms found for {pdb}. Removing from dataframe...')
        else:
            print(f'Success! Creating trimmed files for {pdb} - {cluster}...')
            # Write trimmed files
            pred_obj.to_pdb(path=pred_fn_out, records=None)

def trim_cifs(df, gt_path_in, gt_path_out, pred_path_in, pred_path_out):
    """Trim experimental and predicted mmCIF files to their shared atoms and collect trim statistics.

    For every row of ``df`` the experimental file
    ``<gt_path_in>/<uniprot>/<gt_fn>`` and the predicted file
    ``<pred_path_in>/<af_filename>`` are read. The experimental atoms are
    restricted to the row's chain; the predicted atoms are taken from the data
    block ``AF-<uniprot>-F1``. Both tables are trimmed with
    ``utils.compare_atoms`` / ``utils.drop_unshared_atoms`` and, when atoms
    remain, written to ``<gt_path_out>/<uniprot>/<gt_fn>`` and
    ``<pred_path_out>/<uniprot>/<uniprot>_full_depth.cif``.

    Processing stops at the first row for which ``utils.assert_equal_size``
    reports a length mismatch; that row and all later rows get no statistics
    and are therefore absent from the returned frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``uniprot``, ``pdb``, ``chain``, ``gt_fn``
        and ``af_filename``. Rows are processed in order; the index is not used.
    gt_path_in, gt_path_out : str
        Base directories for reading / writing the experimental files.
    pred_path_in, pred_path_out : str
        Base directories for reading / writing the predicted files.

    Returns
    -------
    tuple
        ``(trim_values, df)``. ``trim_values`` is the list of dictionaries
        returned by ``utils.trim_stats``, one per processed row. ``df`` is the
        input merged with those statistics on ``pdb`` and ``uniprot``, without
        rows whose ``gt_trim_len`` is 0. When no row was processed, ``df`` is
        an empty frame with the input's columns (no statistics columns).
    """
    trim_values = []

    # Iterate by position so the function does not depend on df having a default index
    row_columns = ['uniprot', 'pdb', 'chain', 'gt_fn', 'af_filename']
    for uniprot, pdb, chain, gt_fn, pred_fn in df[row_columns].itertuples(index=False, name=None):

        uniprot_path = f'{uniprot}/'
        # Name of the data block holding the atoms in the predicted file
        pred_block = f'AF-{uniprot}-F1'

        # Generate file paths using format templates
        gt_fp = os.path.join(gt_path_in, uniprot_path, gt_fn)
        gt_fp_out = os.path.join(gt_path_out, uniprot_path, gt_fn)
        pred_fp = os.path.join(pred_path_in, pred_fn)
        pred_fp_out = os.path.join(pred_path_out, uniprot_path, f'{uniprot}_full_depth.cif')

        # Make sure the uniprot directory exists
        utils.uniprot_dirs(gt_path_out, pred_path_out, uniprot=uniprot)

        print(f'Trying {pdb} for {uniprot}...')

        # Initiate reader object
        cfr = CifFileReader()

        # Create dataframe with gt atoms in desired chain
        gt_obj = cfr.read(gt_fp, output='cif_dictionary', ignore=['_struct_conn'])
        gt_all_chains = pd.DataFrame.from_dict(gt_obj[pdb.upper()]['_atom_site'])
        gt = gt_all_chains[gt_all_chains['label_asym_id'] == chain].reset_index(drop=True)

        # Create dataframe with pred atoms (pred file only contains our desired chain)
        pred_obj = cfr.read(pred_fp, output='cif_dictionary')
        pred = pd.DataFrame.from_dict(pred_obj[pred_block]['_atom_site'])

        print('Length of gt: ' + str(len(gt)) + ', Length of pred:' + str(len(pred)))

        # Find common atoms between files
        print(f'Comparing files for {pdb}...')
        atoms_pred, extra_atoms_gt = utils.compare_atoms(gt, pred)

        # Trim the files
        gt_trim, pred_trim = utils.drop_unshared_atoms(gt, pred, atoms_pred, extra_atoms_gt)

        print('Length of gt_trim: ' + str(len(gt_trim)) + ', Length of pred_trim: ' + str(len(pred_trim)))

        # Convert back to mmCIF-like dictionary
        gt_obj[pdb.upper()]['_atom_site'] = gt_trim.to_dict(orient='list')
        pred_obj[pred_block]['_atom_site'] = pred_trim.to_dict(orient='list')

        # Check whether the trimmed files are the same length
        if not utils.assert_equal_size(gt_trim, pred_trim):
            break
        print('Trimmed files are the same length')

        if len(gt_trim) == 0:
            # Nothing is written; the row is filtered out of df after the loop.
            print(f'No common atoms found for {pdb}. Removing from dataframe...')
        else:
            print(f'Success! Creating trimmed files for {pdb}...')
            # Write trimmed files
            CifFileWriter(gt_fp_out).write(gt_obj)
            CifFileWriter(pred_fp_out).write(pred_obj)

        # Compile some information on the trimmed files
        trim_values.append(utils.trim_stats(uniprot, pdb, gt, gt_trim, pred, pred_trim))

    # No row was processed (empty input, or the very first size check failed):
    # a statistics frame built from [] has no 'pdb'/'uniprot' columns, so the
    # merge below would raise KeyError. Return an empty result instead.
    if not trim_values:
        return trim_values, df.iloc[0:0].reset_index(drop=True)

    # Add trim values to dataframe
    df_trim = pd.DataFrame(trim_values)
    df = df.merge(df_trim, on=['pdb', 'uniprot'])

    # Drop any files that have no common atoms.
    df = df[df['gt_trim_len'] != 0].reset_index(drop=True)

    return trim_values, df

In [143]:
# Our proteins of interest (matched against the 'uniprot' column of the input tables)
prots = ['P35520']

# Directories. All paths are relative to the directory the notebook is run from.
# Experimental mmCIF files, passed to trim_pdbs as its input directory.
gt_in_path = './project_pipeline/data/input/RCSB_cif_best/'
# ColabFold predictions, passed to trim_cf_pdb, which reads them as PDB-format files.
pred_in_path = './project_pipeline/data/input/Colabfold_cif/autoinhibited/'
# Predicted mmCIF files passed to trim_cifs (read from the 'AF-<uniprot>-F1' data block).
fd_in_path = './project_pipeline/data/input/Alphafold_cif/'
# Output directory for every trimmed file; also re-used as the experimental
# input directory once trim_pdbs has written the trimmed experimental files.
trim_path = './project_pipeline/data/output/jen_pca/'

In [144]:
# Load the table of experimental structures (df) and the table of predicted
# clusters per structure (df2). No selection by conformational difference is
# done here; every structure of the proteins of interest is used.
df = pd.read_csv('./project_pipeline/data/classified_files.tsv', sep='\t')
df2 = pd.read_csv('./project_pipeline/data/ai_pdb_clusters.tsv', sep='\t')

# Create a dataframe to work from: only rows for our proteins of interest
difs = df[df['uniprot'].isin(prots)].reset_index(drop=True)
# difs = difs[(difs['percent_region_1'] >= 99) & (difs['percent_region_2'] >= 99)] # Select only the best structures

# Trim all experimental files of each protein to the atoms they have in common
trim_pdbs(difs, gt_in_path, trim_path)

# Trim the predicted files to match only the first trimmed experimental file for each protein
for prot in prots:
    # First pdb listed for this protein; it was trimmed by trim_pdbs above
    pdb1 = difs[difs['uniprot'] == prot]['pdb'].unique()[0]
    # Rows of the cluster table that refer to that pdb
    colabfold_df = df2[df2['pdb'] == pdb1].reset_index(drop=True)
    # One row per distinct (uniprot, pdb, af_filename, chain) for the trim_cifs step
    af_df = colabfold_df[['uniprot', 'pdb', 'af_filename', 'chain']].drop_duplicates().reset_index(drop=True)
    af_df['gt_fn'] = af_df['pdb'] + '.cif'

    # The experimental file is read from and written back to trim_path.
    # NOTE: this rebinds `df` (the classified_files table loaded above) to the
    # frame returned by trim_cifs.
    trim_values, df = trim_cifs(af_df, trim_path, trim_path, fd_in_path, trim_path)

    # Trim each ColabFold cluster prediction against the trimmed experimental file
    trim_cf_pdb(colabfold_df, trim_path, pred_in_path, trim_path,
                                        gt_format = '{uniprot}/{pdb}.cif', pred_format = '{uniprot}/{filename}')

Trying P35520...
Comparing files for P35520...
[3626, 3631, 3795, 3557, 3689, 3921, 3921, 3921, 3921]
['4coo', '4l0d', '4l28', '4pcu', '7qgt', '8s5h', '8s5i', '8s5j', '8s5k']
3593
1
4l0d
3581
2
4l28
3524
3
4pcu
3524
4
7qgt
3515
5
8s5h
3515
6
8s5i
3515
7
8s5j
3515
8
8s5k
Lengths of pdb_dfs are equal and have 3515 atoms
Trying 4coo for P35520...
Length of gt: 3515, Length of pred:4252
Comparing files for 4coo...
Length of gt_trim: 3515, Length of pred_trim: 3515
Trimmed files are the same length
Success! Creating trimmed files for 4coo...
Trying 4coo - 121 for P35520...
Length of gt: 3515, Length of pred:4251
Comparing files for 4coo - 121...
Length of gt_trim: 3515, Length of pred_trim: 3515
Trimmed files are the same length
Success! Creating trimmed files for 4coo - 121...
Trying 4coo - 229 for P35520...
Length of gt: 3515, Length of pred:4251
Comparing files for 4coo - 229...
Length of gt_trim: 3515, Length of pred_trim: 3515
Trimmed files are the same length
Success! Creating trimmed

In [145]:
# uniprot = 'P22681'

# df3 = pd.read_csv('./project_pipeline/data/ai_cluster_pae.tsv', sep='\t').rename(columns={'region_1': 'im', 'region_2': 'fd',
#                                                                                           'mean_pae_1_1': 'mean_pae_im_im', 'mean_pae_1_2': 'mean_pae_im_fd',
#                                                                                           'mean_pae_2_2': 'mean_pae_fd_fd'})
# df3 = df3[df3['uniprot'] == uniprot]
# df3.to_csv(f'../for_jen/{uniprot.lower()}_pae.csv', index=False)