### Imports

In [1]:
from spiq.streamer.data_streamer import DataStreamer
from spiq.utils.fingerprints import FingerprintCalculator
from spiq.utils.helper_functions import save_chunk

### FINGERPRINT MODULE

A simple example of how the API works.

In [2]:
# Example input: a handful of molecules written as SMILES strings
smiles_list = ["CCO", "C1CCCCC1", "O=C=O"]

# Keyword arguments forwarded to the fingerprint computation
params = {
    'fpSize': 2048,
    'radius': 2,
}

# A single calculator instance; the cells further down reuse it
calculator = FingerprintCalculator()

# Request 'morgan' fingerprints for the whole list in one call
fingerprints = calculator.FingerprintFromSmiles(
    smiles_list,
    'morgan',
    **params,
)

# Report the shape of the result, then the entry stored at index 1
print(f"Fingerprints shape: {fingerprints.shape}")
print("Fingerprint 1", fingerprints[1])

Fingerprints shape: (3, 2048)
Fingerprint 1 [0 0 1 ... 0 0 0]


How to use the API to load SMILES in chunks and compute the fingerprints of each chunk

In [5]:
# Create the streamer object used to read the input file chunk by chunk
ds = DataStreamer()

chunksize = 1_230
smiles = ds.parse_input(
    input_path='../data/data_lite.txt',
    chunksize=chunksize,
)
# parse_input hands back a generator, so the chunks only become available
# once we iterate over it
print(type(smiles))

# Running total of SMILES processed so far, shown as a one-line progress counter
count = 0
for smiles_chunk in smiles:
    count += len(smiles_chunk)
    # The result is discarded here; this loop only demonstrates the streaming pattern
    calculator.FingerprintFromSmiles(smiles_chunk, 'morgan', **params)
    print(f"\r Fingerprints calculated: {count:,}", end='', flush=True)

<class 'generator'>
 Fingerprints calculated: 10,000

If we want to save each chunk as a separate file (ideal for large chunks that we may want to reuse later), the `save_chunk` function from `helper_functions.py` is provided.

In [7]:
# Build a fresh generator: the one created in the previous cell was already
# consumed by its loop. `ds`, `chunksize`, `calculator` and `params` are the
# objects defined in the cells above.
smiles = ds.parse_input(input_path='../data/data_lite.txt', chunksize=chunksize)

# Running total of SMILES processed so far, shown as a one-line progress counter
count = 0
for idx, smiles_chunk in enumerate(smiles):
    count += len(smiles_chunk)
    fp_chunk = calculator.FingerprintFromSmiles(smiles_chunk, 'morgan', **params)
    # Persist this chunk on its own; the enumerate index is passed as chunk_index
    save_chunk(fp_chunk, output_dir='../data/', chunk_index=idx, file_format='npy')
    print(f"\r Fingerprints calculated: {count:,}", end='', flush=True)

 Fingerprints calculated: 10,000