In [2]:
#Use bloom filters to search for a molecule from a list of molecules

import mmh3
from bitarray import bitarray

class BloomFilter:
    
    def __init__(self, size, hash_count):
        self.size = size
        self.hash_count = hash_count
        self.bit_array = bitarray(size)
        self.bit_array.setall(0)

    def add(self, element):
        for seed in range(self.hash_count):
            result = mmh3.hash(element, seed) % self.size
            self.bit_array[result] = 1

    def __contains__(self, element):
        for seed in range(self.hash_count):
            result = mmh3.hash(element, seed) % self.size
            if self.bit_array[result] == 0:
                return False
        return True

In [5]:
import pandas as pd

df=pd.read_csv("lipo.csv")
smiles=df['smiles']

In [9]:
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import DataStructs

m=[Chem.MolFromSmiles(i) for i in smiles] 


for mol in m:
    AllChem.Compute2DCoords(mol)
X = []
for mol in m:
    arr = np.zeros((0,))
    fp = AllChem.GetMorganFingerprintAsBitVect(mol, 2)
    DataStructs.ConvertToNumpyArray(fp, arr)
    X.append(arr)

In [28]:
X=[list(i) for i in X]

In [36]:
# Create a bloom filter with size 2048 and 100 hash functions
bf = BloomFilter(2048, 100)

# Add some elements to the bloom filter
array_list = X[3:]   #Let's start the array from the third element
for array in array_list:
    bf.add(str(array))

In [37]:
# Check if an array exists in the bloom filter
array_to_check = X[60]
if str(array_to_check) in bf:
    # Perform a full search to confirm if the array is in the list
    if array_to_check in array_list:
        print("The array is in the list of arrays.")
    else:
        print("The array is not in the list of arrays.")
else:
    print("The array is definitely not in the list of arrays.")

The array is in the list of arrays.


In [38]:
# Check if an array exists in the bloom filter
array_to_check = X[1]
if str(array_to_check) in bf:
    # Perform a full search to confirm if the array is in the list
    if array_to_check in array_list:
        print("The array is in the list of arrays.")
    else:
        print("The array is not in the list of arrays.")
else:
    print("The array is definitely not in the list of arrays.")

The array is not in the list of arrays.
