In [1]:
import os
import copy
import sys
import pyarrow as pa
import math
import pandas as pd
import pyarrow.feather as feather
from rdkit.Chem import rdMolDescriptors
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit import DataStructs
import mols2grid






In [2]:
# Location of the feather (Arrow IPC) file to scan.
# NOTE(review): absolute, machine-specific path — adjust for your environment.
filename = '/data/newdockop/dockop/code/mod_code_base/test0.feather'

# How many record batches are pulled in per chunk.
batch_size = 5

# Open an Arrow IPC file reader on the feather file.
reader = pa.ipc.open_file(filename)

# Number of record batches stored in the file, and the number of
# `batch_size`-sized chunks needed to cover them (the last may be partial).
total_record_batches = reader.num_record_batches
chunks = math.ceil(total_record_batches/batch_size)

print(f'The feather has a total of {total_record_batches} record batches.')
print(f'The feather will be read in {chunks} chunks.')



The feather has a total of 299 record batches.
The feather will be read in 60 chunks.


In [3]:
# Read the feather file chunk by chunk so that only `batch_size` record batches
# are materialised at a time, keeping the best-scoring rows from each chunk.
# This is a demo of an iterative read on a larger-than-memory dataset; the
# selection could be more selective (e.g. a score cutoff instead of a top-N).
batch_count = []        # number of record batches read in each chunk
top_scores = []         # one DataFrame of top-scoring rows per chunk
top_n_per_chunk = 500   # rows kept from every chunk

for chunk_index in range(chunks):
    start = chunk_index * batch_size
    if start >= total_record_batches:
        # Nothing left to read (guards against `chunks` overshooting the file).
        break
    # Clamp the end index so the final, possibly shorter, chunk never asks the
    # reader for a batch past the end of the file.
    stop = min(start + batch_size, total_record_batches)

    # Use get_batch to fetch the record batches belonging to this chunk.
    batches = [reader.get_batch(i) for i in range(start, stop)]

    # Combine the record batches into a single pyarrow table, then hand it to pandas.
    table = pa.Table.from_batches(batches)
    df = table.to_pandas()

    # Keep only the highest-scoring rows of this chunk.
    df1 = df.sort_values('pred_list', ascending=False).head(top_n_per_chunk)

    # Collect the per-chunk results in a list of DataFrames. These can be
    # combined afterwards and saved to a new feather or parquet file.
    top_scores.append(df1)
    batch_count.append(len(batches))

    remaining = total_record_batches - stop
    if remaining:
        print(f'got {len(batches)} batches, batches remaining: {remaining}')
    else:
        print(f'got {len(batches)} batches, all batches read')

got 5 batches, batches remaining: 294
got 5 batches, batches remaining: 289
got 5 batches, batches remaining: 284
got 5 batches, batches remaining: 279
got 5 batches, batches remaining: 274
got 5 batches, batches remaining: 269
got 5 batches, batches remaining: 264
got 5 batches, batches remaining: 259
got 5 batches, batches remaining: 254
got 5 batches, batches remaining: 249
got 5 batches, batches remaining: 244
got 5 batches, batches remaining: 239
got 5 batches, batches remaining: 234
got 5 batches, batches remaining: 229
got 5 batches, batches remaining: 224
got 5 batches, batches remaining: 219
got 5 batches, batches remaining: 214
got 5 batches, batches remaining: 209
got 5 batches, batches remaining: 204
got 5 batches, batches remaining: 199
got 5 batches, batches remaining: 194
got 5 batches, batches remaining: 189
got 5 batches, batches remaining: 184
got 5 batches, batches remaining: 179
got 5 batches, batches remaining: 174
got 5 batches, batches remaining: 169
got 5 batche

In [4]:
# Stack the per-chunk top-scoring frames into one DataFrame with a fresh
# 0..n-1 index; column order is left as-is (no sorting of columns).
bigdata = pd.concat(
    objs=top_scores,
    ignore_index=True,
    sort=False,
)

In [6]:
# Rank the combined per-chunk hits by predicted score, best first,
# and display the leading 500 rows.
sorted_hits = bigdata.sort_values(by='pred_list', ascending=False)
sorted_hits.head(n=500)

Unnamed: 0,pred_list,smiles,scores,names
4000,0.999318,CCCc1nc(=NC(=O)Nc2cccc(CC(=O)O)c2)s[nH]1,-60.81,ZINC000515809359
20000,0.998493,CNS(=O)(=O)c1cc(C(=O)N(CC(=O)O)C(C)C)co1,-57.61,ZINC000567863307
15000,0.997563,CNS(=O)(=O)c1ccc(C(=O)NCc2cc(C(=O)O)c(C)o2)o1,-69.21,ZINC000362578108
17500,0.997123,CCCC[C@H](NC(=O)c1coc(S(=O)(=O)NC)c1)C(=O)O,-73.54,ZINC000263427499
16000,0.997123,CCCC[C@@H](NC(=O)c1coc(S(=O)(=O)NC)c1)C(=O)O,-58.9,ZINC000263427487
...,...,...,...,...
505,0.937052,CN(Cc1nc2ccsc2c(=O)[nH]1)C(=O)c1cc(C(=O)O)co1,-61.99,ZINC000183764830
12507,0.936906,O=C(O)c1nc(I)c(Cl)cc1F,-62.95,ZINC000095611713
16017,0.936772,Cc1oc(CNC(=O)C2=NN(C)C(=O)CC2)cc1C(=O)O,-63.41,ZINC000038059852
27014,0.936763,Cn1c(O)ncc(C(=O)NCCCCCCCC(=O)O)c1=O,-57.02,ZINC000262129765
