In [1]:
from pathlib import Path

import pandas as pd
from pandarallel import pandarallel
from tqdm import tqdm

from ppiref.utils.ppi import PPI
from ppiref.split import write_split, read_split, read_fold
from ppiref.comparison import IDist
from ppiref.definitions import PPIREF_DATA_DIR

# Register tqdm's pandas integration (adds `progress_apply` & co. to pandas objects).
tqdm.pandas()

# All putative PPIs (at least one distance contact) in the PDB

In [2]:
# Directory holding the extracted PPI files
# ('6A' presumably refers to the 6 Angstrom extraction radius — TODO confirm).
dir_ppi = PPIREF_DATA_DIR / 'ppiref/ppi_6A'

In [3]:
# Walk the directory tree a single time and derive both lists from that one
# traversal, so they are guaranteed to be aligned entry-by-entry.
paths_ppi = list(dir_ppi.rglob('*.pdb'))
# First '_'-separated token of each file stem (presumably the PDB ID — TODO confirm).
ids_ppi = [p.stem.split('_')[0] for p in paths_ppi]

In [5]:
# Store the raw split: every .pdb file found under dir_ppi, in a single fold named 'whole'.
write_split('ppiref_6A_raw', dir_ppi, {'whole': paths_ppi})

# Proper PPIs

In [5]:
def ppi_file_to_df_row(path):
    """Build one row of the PPI statistics table from a single PPI file.

    Args:
        path: Location of the PPI file; a `pathlib.Path` or anything `Path` accepts.

    Returns:
        dict: ``{'PATH': path, **ppi.stats}`` for a regular PPI file, or an empty
        dict when the file has the ``.noppi`` suffix (presumably a placeholder
        for an entry without PPIs — TODO confirm).
    """
    path = Path(path)
    # Check the suffix before constructing the PPI so `.noppi` files are never parsed.
    if path.suffix == '.noppi':
        return {}
    return {
        'PATH': path,
        **PPI(path).stats
    }


df_path = PPIREF_DATA_DIR / 'ppiref/ppi_6A_stats/raw_stats.csv'

# Computing the statistics processes every PPI file, so the table is cached on
# disk and only computed when the cache file is missing.
if not df_path.exists():
    n_workers = 32
    pandarallel.initialize(progress_bar=True, use_memory_fs=False, nb_workers=n_workers)
    rows = pd.Series(paths_ppi).parallel_apply(ppi_file_to_df_row)
    df_path.parent.mkdir(parents=True, exist_ok=True)
    pd.DataFrame(list(rows)).to_csv(df_path, index=False)

# Always load from the CSV so the column dtypes are the same whether the table
# was just computed or was already cached.
df = pd.read_csv(
    df_path,
    dtype={'PATH': str, 'KIND': str, 'STRUCTURE METHOD': str}
)
# Drop rows with no values at all (ppi_file_to_df_row returns {} for `.noppi` files).
df = df.dropna(how='all')
df

Unnamed: 0,PATH,KIND,EXTRACTION RADIUS,EXPANSION RADIUS,RESOLUTION,STRUCTURE METHOD,DEPOSITION DATE,RELEASE DATE,BSA
0,/scratch/project/open-26-23/antonb/PPIRef/ppir...,heavy,6.0,0.0,2.60,x-ray diffraction,2019-08-07,2020-06-17,659.734682
1,/scratch/project/open-26-23/antonb/PPIRef/ppir...,heavy,6.0,0.0,2.13,x-ray diffraction,2010-12-20,2011-01-19,50.450693
2,/scratch/project/open-26-23/antonb/PPIRef/ppir...,heavy,6.0,0.0,2.88,electron microscopy,2023-08-02,2023-08-30,3079.544686
3,/scratch/project/open-26-23/antonb/PPIRef/ppir...,heavy,6.0,0.0,2.20,x-ray diffraction,2014-04-09,2014-07-23,462.436532
4,/scratch/project/open-26-23/antonb/PPIRef/ppir...,heavy,6.0,0.0,2.20,x-ray diffraction,2019-08-07,2020-01-22,689.826471
...,...,...,...,...,...,...,...,...,...
765134,/scratch/project/open-26-23/antonb/PPIRef/ppir...,heavy,6.0,0.0,4.70,x-ray diffraction,2015-01-16,2015-02-04,281.223175
765135,/scratch/project/open-26-23/antonb/PPIRef/ppir...,heavy,6.0,0.0,2.10,x-ray diffraction,2004-01-08,2004-03-30,887.002861
765136,/scratch/project/open-26-23/antonb/PPIRef/ppir...,heavy,6.0,0.0,3.20,x-ray diffraction,2011-05-16,2011-08-10,1807.687696
765137,/scratch/project/open-26-23/antonb/PPIRef/ppir...,heavy,6.0,0.0,3.30,x-ray diffraction,2011-05-16,2011-08-17,1591.343698


In [6]:
# Flag a PPI as passing when all three criteria hold:
#   * the structure was solved by X-ray diffraction or electron microscopy,
#   * RESOLUTION is at most 3.5,
#   * BSA is at least 500.
# Missing values compare as False, so such rows do not pass.
df['PASSES FILTERS'] = (
    df['STRUCTURE METHOD'].isin(['x-ray diffraction', 'electron microscopy'])
    & df['RESOLUTION'].le(3.5)
    & df['BSA'].ge(500)
)

In [7]:
# Keep only the rows flagged in the 'PASSES FILTERS' column.
df_filtered = df.loc[df['PASSES FILTERS']]
df_filtered

Unnamed: 0,PATH,KIND,EXTRACTION RADIUS,EXPANSION RADIUS,RESOLUTION,STRUCTURE METHOD,DEPOSITION DATE,RELEASE DATE,BSA,PASSES FILTERS
0,/scratch/project/open-26-23/antonb/PPIRef/ppir...,heavy,6.0,0.0,2.60,x-ray diffraction,2019-08-07,2020-06-17,659.734682,True
2,/scratch/project/open-26-23/antonb/PPIRef/ppir...,heavy,6.0,0.0,2.88,electron microscopy,2023-08-02,2023-08-30,3079.544686,True
4,/scratch/project/open-26-23/antonb/PPIRef/ppir...,heavy,6.0,0.0,2.20,x-ray diffraction,2019-08-07,2020-01-22,689.826471,True
10,/scratch/project/open-26-23/antonb/PPIRef/ppir...,heavy,6.0,0.0,3.20,x-ray diffraction,2003-07-25,2003-09-09,1620.549480,True
14,/scratch/project/open-26-23/antonb/PPIRef/ppir...,heavy,6.0,0.0,3.00,electron microscopy,2021-10-22,2022-02-02,1211.471481,True
...,...,...,...,...,...,...,...,...,...,...
765132,/scratch/project/open-26-23/antonb/PPIRef/ppir...,heavy,6.0,0.0,3.20,x-ray diffraction,2011-05-16,2011-08-10,2089.356343,True
765135,/scratch/project/open-26-23/antonb/PPIRef/ppir...,heavy,6.0,0.0,2.10,x-ray diffraction,2004-01-08,2004-03-30,887.002861,True
765136,/scratch/project/open-26-23/antonb/PPIRef/ppir...,heavy,6.0,0.0,3.20,x-ray diffraction,2011-05-16,2011-08-10,1807.687696,True
765137,/scratch/project/open-26-23/antonb/PPIRef/ppir...,heavy,6.0,0.0,3.30,x-ray diffraction,2011-05-16,2011-08-17,1591.343698,True


In [15]:
# PATH was read from the CSV as strings; convert each entry back to a Path object.
paths_ppi_filtered = [Path(p) for p in df_filtered['PATH']]
# Store the filtered split as a single fold named 'whole'.
write_split('ppiref_6A_filtered', dir_ppi, {'whole': paths_ppi_filtered})



# Deduplicated (clustered) proper PPIs

In [10]:
# Load the precomputed iDist embeddings from CSV.
# NOTE(review): the recorded run loaded 349459 embeddings, slightly fewer than the
# 349685 PPIs in the filtered split — presumably the effect of dropna=True; confirm.
idist = IDist()
idist.read_embeddings(PPIREF_DATA_DIR / 'ppiref/ppi_6A_stats/idist_emb.csv', dropna=True)  # Embeddings for all filtered PPIs
len(idist.embeddings)

349459

In [13]:
# Long-running step: the recorded run took ~43 minutes (911 adjacency chunks)
# and reduced idist.embeddings from 349459 to 51755 entries.
idist.deduplicate_embeddings()

Processing adjacency chunks: 100%|██████████| 911/911 [42:51<00:00,  2.82s/it]


In [14]:
# Number of embeddings remaining after deduplication.
len(idist.embeddings)

51755

In [16]:
# Store the deduplicated split: the keys of idist.embeddings form the single fold 'whole'.
# NOTE(review): the '_04' suffix presumably encodes the deduplication threshold — confirm.
write_split('ppiref_6A_filtered_clustered_04', dir_ppi, {'whole': list(idist.embeddings.keys())})



# Test results

In [8]:
# Sanity check: every stored split can be read back and each of its entries exists on disk.
names = ['ppiref_6A_raw', 'ppiref_6A_filtered', 'ppiref_6A_filtered_clustered_04']
for name in names:
    ppis = read_fold(name, 'whole')
    print(name, len(ppis))
    for ppi in tqdm(ppis, leave=False):
        # Name the split and the offending file so a failure is actionable.
        assert ppi.exists(), f'{name}: missing PPI file {ppi}'

                                                                      

ppiref_6A_raw 765139


                                                                      

ppiref_6A_filtered 349685


                                                                      

ppiref_6A_filtered_clustered_04 51755


                                                        

# Test EquiDock split

In [24]:
# Report how many entries of the 'dips_equidock' split are missing on disk,
# followed by the total number of entries in the split.
ppis = read_fold('dips_equidock', 'whole')
not_existing = [entry for entry in ppis if not entry.exists()]
print(f'{len(not_existing)} {len(ppis)}')

                                                                      

0 40143


