In [1]:
from pathlib import Path
from rdkit.Chem import MolFromSmiles

from camcann.data.io import DataReader, Datasets
from camcann.data.featurise import ECFPCountFeaturiser, SMILESHashes

# All artefacts are read from / written to the directory the notebook is run in
# (a relative "." path, so it follows the kernel's current working directory).
HERE = Path(".")
# CSV holding the persisted SMILES-hash table reused between runs.
HASH_PATH = HERE.joinpath("full_hash.csv")
# CSV the labelled feature table is written to.
FEATURES_PATH = HERE.joinpath("features_df.csv")

In [2]:
# Read the dataset selected by Datasets.QIN_AND_NIST_ANIONICS and keep its
# tabular form (the reader's `.df` attribute).
dataset_reader = DataReader(Datasets.QIN_AND_NIST_ANIONICS)
all_data = dataset_reader.df
# Last expression of the cell: show summary statistics of the numeric columns.
all_data.describe()

Unnamed: 0,log CMC
count,306.0
mean,3.424285
std,1.278517
min,-0.79588
25%,2.567742
50%,3.518514
75%,4.249067
max,6.414973


In [3]:
# Parse every SMILES string in the dataset into an RDKit molecule.
all_molecules = [MolFromSmiles(smile) for smile in all_data.SMILES]

# RDKit's MolFromSmiles returns None (it does not raise) when a SMILES string
# cannot be parsed. Fail here with the offending inputs rather than letting
# None entries reach the featuriser.
unparsed_smiles = [
    smile for smile, mol in zip(all_data.SMILES, all_molecules) if mol is None
]
if unparsed_smiles:
    raise ValueError(
        f"RDKit could not parse {len(unparsed_smiles)} SMILES string(s): "
        f"{unparsed_smiles}"
    )

In [4]:
# Build the featuriser. When a hash table was saved by an earlier run, load it
# and hand it to the featuriser; otherwise start from the featuriser's default
# state. (Presumably this keeps group hashes stable between runs -- confirm
# against ECFPCountFeaturiser.)
if not HASH_PATH.exists():
    featuriser = ECFPCountFeaturiser()
else:
    smiles_hashes = SMILESHashes.load(HASH_PATH)
    featuriser = ECFPCountFeaturiser(smiles_hashes)

In [5]:
# Featurise every molecule. add_new_hashes=True is passed, so presumably groups
# not yet in the hash table are registered during this call -- confirm against
# ECFPCountFeaturiser.featurise_molecules.
all_features = featuriser.featurise_molecules(
    all_molecules,
    2,  # NOTE(review): presumably the ECFP radius -- TODO confirm and name it
    add_new_hashes=True,
)

# Persist the hash table so the next run can reload it from HASH_PATH.
group_hashes = featuriser.smiles_hashes
group_hashes.save(HASH_PATH)
print(f"Number of unique groups: {len(featuriser.smiles_hashes)}.")

Number of unique groups: 624.


In [6]:
# Attach labels to the raw feature output, passing the dataset's SMILES column
# alongside it, then write the labelled table to FEATURES_PATH as CSV.
compound_smiles = all_data.SMILES
features_df = featuriser.label_features(all_features, compound_smiles)
features_df.to_csv(FEATURES_PATH)

In [7]:
# Boolean mask: True wherever a feature value is strictly positive.
count_nonzero = features_df.gt(0)
# Column-wise total of the mask, i.e. how many rows have a positive value in
# each column.
nnz = count_nonzero.sum()
# Number of columns that are positive in more than one row.
num_shared = nnz.gt(1).sum()
print(f"Number of groups that occur in multiple compounds: {num_shared}")

Number of groups that occur in multiple compounds: 416
