# Aggregate feature subsets from HPC runs

In [1]:
# Notebook setup: reload edited modules automatically before each cell runs
# and render matplotlib figures inline.
%load_ext autoreload
%autoreload 2
%matplotlib inline

# NOTE(review): this silences every warning for the whole session, including
# pandas parsing/dtype warnings raised while reading the feature files.
import warnings
warnings.filterwarnings('ignore')

import os
import sys
from pathlib import Path
from time import time
import numpy as np
import pandas as pd
from glob import glob
import matplotlib
import matplotlib.pyplot as plt
# Show the working directory; relative data paths (e.g. FEA_MAIN_DIR) resolve against it.
print(Path.cwd())

/vol/ml/apartin/projects/covid-19/mol-features/nbs


In [2]:
def sizeof(data, verbose=True):
    """Return the size of ``data`` in gigabytes (bytes / 1e9).

    The byte count comes from ``sys.getsizeof``, so it is whatever the
    object's own ``__sizeof__`` reports (plus any interpreter overhead that
    ``sys.getsizeof`` adds).

    Args:
        data: Any Python object.
        verbose: When true, also print the size as ``Size in GB: <value>``.

    Returns:
        float: The size in GB.
    """
    size_gb = sys.getsizeof(data) / 1e9
    if verbose:
        print(f'Size in GB: {size_gb}')
    return size_gb

In [4]:
# Root folder of the feature subsets and the feature type to aggregate.
FEA_MAIN_DIR = Path('../data/raw/fea-subsets-hpc/')
FEA_TYPE = 'descriptors'
FEA_DIR = FEA_MAIN_DIR / FEA_TYPE

# Collect the per-chunk CSV files; sorting Path objects orders them by path,
# not by the numeric range embedded in the file name.
fea_files = list(FEA_DIR.glob('OZD-*.csv'))
fea_files.sort()
print(len(fea_files))

668


# Load descriptors

In [4]:
# Prefix and separator used to build the descriptor column names (e.g. 'dd_ABC').
dd_prfx, dd_sep = 'dd', '_'

In [5]:
# Only the header of dd_fea_names.csv is used: its column labels become the
# list of feature names.
dd_fea_names = list(pd.read_csv(FEA_MAIN_DIR / 'dd_fea_names.csv').columns)
dd_fea_names = list(map(str.strip, dd_fea_names))  # clean col names
print(dd_fea_names[:3])
print(len(dd_fea_names))

['ABC', 'ABCGG', 'nAcid']
1826


In [6]:
# NOTE: dd_fea_names is rebuilt from itself, so re-running this cell without
# re-running the cell that loads the names adds the prefix a second time.
dd_fea_names = [f'{dd_prfx}{dd_sep}{c}' for c in dd_fea_names]  # prefix fea cols
# Column layout passed to read_csv for the feature files: three metadata columns, then the features.
cols = ['CAT', 'TITLE', 'SMILES', *dd_fea_names]

# Load a single feature DataFrame

In [7]:
# Read the first chunk with explicit column names (names=cols) and use it to
# extrapolate the in-memory size of the full set of files.
dd0 = pd.read_csv(fea_files[0], names=cols)
print(dd0.shape)
est_total_gb = sizeof(dd0, verbose=False) * len(fea_files)
print('Expected size when including drugs', est_total_gb)
display(dd0.head(2))

(10000, 1829)
Expected size when including drugs 99.221657888


Unnamed: 0,CAT,TITLE,SMILES,dd_ABC,dd_ABCGG,dd_nAcid,dd_nBase,dd_SpAbs_A,dd_SpMax_A,dd_SpDiam_A,...,dd_SRW10,dd_TSRW10,dd_MW,dd_AMW,dd_WPath,dd_WPol,dd_Zagreb1,dd_Zagreb2,dd_mZagreb1,dd_mZagreb2
0,OZD,ZINC000095370606,Cc1ccc(cc1)[C@@H]2C[N@@H+]([C@@H]3[C@H]2N4CCC3...,22.494165,16.342575,0.0,3.0,37.88776,2.578878,5.044375,...,10.458378,78.30884,377.282,5.988603,2056.0,46.0,156.0,189.0,7.111111,6.083334
1,OZD,ZINC000040149497,c1ccc(cc1)N2CC[NH+](CC2)C/C(=[NH+]/OCC(=O)NCc3...,22.20804,15.742774,0.0,3.0,37.409843,2.317886,4.635773,...,9.928814,63.701084,417.19205,7.319159,3030.0,38.0,142.0,158.0,8.527778,6.527778


# Aggregate all feature files into a single file

In [8]:
# NOTE(review): scratch code left commented out; nothing in this cell executes.
# It appears to sketch a two-file concat and a rough size estimate for ~600 files.
# Beware if re-enabled: the `df = ...` line would store the shape tuple, not the frame.
# dfs = []
# dd0 = pd.read_csv( Path(fea_files[0]), names=cols )
# dd1 = pd.read_csv( Path(fea_files[1]), names=cols )
# dfs.append(dd0)
# dfs.append(dd1)
# df = pd.concat(dfs, axis=0).shape
# sizeof(dfs[0])*600

In [9]:
# Read every feature file into memory, keeping each chunk and its row count.
df = None  # placeholder; not used inside this cell
dfs, lens = [], []
t0 = time()

for i, f in enumerate(fea_files):
    # Progress line on every 50th file.
    if (i + 1) % 50 == 0:
        print(f'Load {i+1} ... {f.name}')
    dd = pd.read_csv(f, names=cols)
    dfs.append(dd)
    lens.append(len(dd))

runtime = time() - t0
print(f'\nRuntime: {runtime / 60:.2f} mins')

Load 50 ... OZD-1420000-1430000.csv
Load 100 ... OZD-1880000-1890000.csv
Load 150 ... OZD-2320000-2330000.csv
Load 200 ... OZD-2780000-2790000.csv
Load 250 ... OZD-3220000-3230000.csv
Load 300 ... OZD-3680000-3690000.csv
Load 350 ... OZD-4120000-4130000.csv
Load 400 ... OZD-4580000-4590000.csv
Load 450 ... OZD-5020000-5030000.csv
Load 500 ... OZD-5480000-5490000.csv
Load 550 ... OZD-5930000-5940000.csv
Load 600 ... OZD-6380000-6390000.csv
Load 650 ... OZD-820000-830000.csv

Runtime: 29.06 mins


In [10]:
# Stack all chunks row-wise. ignore_index=True assigns the fresh 0..n-1 index
# during the concat itself, so no separate reset_index(drop=True) pass (and
# the additional full-size frame it builds) is needed.
fea_df = pd.concat(dfs, axis=0, ignore_index=True)
print(fea_df.shape)
# sizeof() already prints the size; wrapping it in print() echoed the value twice.
sizeof(fea_df);

(6678493, 1829)
Size in GB: 99.1504175
99.1504175


In [11]:
# Preview the first two rows of the aggregated frame.
fea_df.head(2)

Unnamed: 0,CAT,TITLE,SMILES,dd_ABC,dd_ABCGG,dd_nAcid,dd_nBase,dd_SpAbs_A,dd_SpMax_A,dd_SpDiam_A,...,dd_SRW10,dd_TSRW10,dd_MW,dd_AMW,dd_WPath,dd_WPol,dd_Zagreb1,dd_Zagreb2,dd_mZagreb1,dd_mZagreb2
0,OZD,ZINC000095370606,Cc1ccc(cc1)[C@@H]2C[N@@H+]([C@@H]3[C@H]2N4CCC3...,22.494165,16.342575,0.0,3.0,37.88776,2.578878,5.044375,...,10.458378,78.30884,377.282,5.988603,2056.0,46.0,156.0,189.0,7.111111,6.083334
1,OZD,ZINC000040149497,c1ccc(cc1)N2CC[NH+](CC2)C/C(=[NH+]/OCC(=O)NCc3...,22.20804,15.742774,0.0,3.0,37.409843,2.317886,4.635773,...,9.928814,63.701084,417.19205,7.319159,3030.0,38.0,142.0,158.0,8.527778,6.527778


In [12]:
# Keep the first row per TITLE and renumber the rows; the shapes printed
# before and after show how many duplicates were removed.
print(fea_df.shape)
fea_df = (
    fea_df
    .drop_duplicates(subset=['TITLE'])
    .reset_index(drop=True)
)
print(fea_df.shape)

(6678493, 1829)
(6678493, 1829)


# Load docking scores

In [13]:
# Docking scores. The directory is given relative to the notebook's working
# directory, like FEA_MAIN_DIR, instead of a user-specific absolute path, so
# the notebook is not tied to one machine. From the working directory printed
# by the first cell this resolves to the same location as before.
# meta_path = Path('/vol/ml/apartin/projects/covid-19/mol-features/nbs/OZD.May29.all.csv')
dock_main_dir = Path('../data/raw/dock-2020-06-01/OZD')
dock = pd.read_csv(dock_main_dir/'3CLPro_7BQY_A_1_F.Orderable_zinc_db_enaHLL.sorted.4col.csv')
print(dock.shape)
display(dock[:2])

(6109329, 4)


Unnamed: 0,Inchi-key,SMILES,TITLE,Chemgauss4
0,MQQAVNHQRUNOBV-SDHOMARFSA-P,Cc1ccc(cc1)[C@@H]2C[N@@H+]([C@@H]3[C@H]2N4CCC3...,ZINC000095370606,-15.27285
1,DGLCKPUNCXSIDP-UHFFFAOYSA-P,c1ccc(cc1)N2CC[NH+](CC2)C/C(=[NH+]/OCC(=O)NCc3...,ZINC000040149497,-14.439569


In [14]:
# Inner-join docking scores with the features: only rows whose TITLE and
# SMILES appear in both tables are kept, so the result can have fewer rows
# than `dock`.
dd_trg = pd.merge(dock, fea_df, how='inner', on=['TITLE', 'SMILES'])
print(dd_trg.shape)
# sizeof() already prints the size; wrapping it in print() echoed the value twice.
sizeof(dd_trg)
display(dd_trg[:2])

(6109328, 1831)
Size in GB: 91.308132886
91.308132886


Unnamed: 0,Inchi-key,SMILES,TITLE,Chemgauss4,CAT,dd_ABC,dd_ABCGG,dd_nAcid,dd_nBase,dd_SpAbs_A,...,dd_SRW10,dd_TSRW10,dd_MW,dd_AMW,dd_WPath,dd_WPol,dd_Zagreb1,dd_Zagreb2,dd_mZagreb1,dd_mZagreb2
0,MQQAVNHQRUNOBV-SDHOMARFSA-P,Cc1ccc(cc1)[C@@H]2C[N@@H+]([C@@H]3[C@H]2N4CCC3...,ZINC000095370606,-15.27285,OZD,22.494165,16.342575,0.0,3.0,37.88776,...,10.458378,78.30884,377.282,5.988603,2056.0,46.0,156.0,189.0,7.111111,6.083334
1,DGLCKPUNCXSIDP-UHFFFAOYSA-P,c1ccc(cc1)N2CC[NH+](CC2)C/C(=[NH+]/OCC(=O)NCc3...,ZINC000040149497,-14.439569,OZD,22.20804,15.742774,0.0,3.0,37.409843,...,9.928814,63.701084,417.19205,7.319159,3030.0,38.0,142.0,158.0,8.527778,6.527778


In [16]:
# Reorder the columns so the metadata/target columns come first, followed by
# the feature columns.
meta_cols = ['CAT', 'Inchi-key', 'SMILES', 'TITLE', 'Chemgauss4']
dd_trg = dd_trg[ meta_cols + dd_fea_names ]
print(dd_trg.shape)
# sizeof() already prints the size; wrapping it in print() echoed the value twice.
sizeof(dd_trg)
display(dd_trg[:2])

(6109328, 1831)
Size in GB: 91.308132886
91.308132886


Unnamed: 0,CAT,Inchi-key,SMILES,TITLE,Chemgauss4,dd_ABC,dd_ABCGG,dd_nAcid,dd_nBase,dd_SpAbs_A,...,dd_SRW10,dd_TSRW10,dd_MW,dd_AMW,dd_WPath,dd_WPol,dd_Zagreb1,dd_Zagreb2,dd_mZagreb1,dd_mZagreb2
0,OZD,MQQAVNHQRUNOBV-SDHOMARFSA-P,Cc1ccc(cc1)[C@@H]2C[N@@H+]([C@@H]3[C@H]2N4CCC3...,ZINC000095370606,-15.27285,22.494165,16.342575,0.0,3.0,37.88776,...,10.458378,78.30884,377.282,5.988603,2056.0,46.0,156.0,189.0,7.111111,6.083334
1,OZD,DGLCKPUNCXSIDP-UHFFFAOYSA-P,c1ccc(cc1)N2CC[NH+](CC2)C/C(=[NH+]/OCC(=O)NCc3...,ZINC000040149497,-14.439569,22.20804,15.742774,0.0,3.0,37.409843,...,9.928814,63.701084,417.19205,7.319159,3030.0,38.0,142.0,158.0,8.527778,6.527778
