# ECFP6 NN Data preprocessing

This ipynb is intended to be used in tandem with the 'ecfp_nn.ipynb' file. Please run this file first to construct the pickled fingerprint and label data ('fps_and_labels.pickle').

In [5]:
import pandas as pd
import pickle
from rdkit import Chem
from rdkit.Chem import AllChem
import numpy as np

## Data preprocessing and Loading

In [6]:
def string_to_bitstr(smiles, radius=3, n_bits=2048):
    """Convert a SMILES string into a Morgan (ECFP-style) fingerprint array.

    Parameters
    ----------
    smiles : str
        SMILES representation of the molecule.
    radius : int, optional
        Morgan fingerprint radius. The default of 3 gives the ~ECFP6
        fingerprint this notebook is built around.
    n_bits : int, optional
        Number of bits in the fingerprint (2048 by default).

    Returns
    -------
    numpy.ndarray
        1-D array of dtype ``"u1"`` with one element per fingerprint bit.

    Raises
    ------
    ValueError
        If RDKit cannot parse ``smiles`` into a molecule.
    """
    mol = Chem.MolFromSmiles(smiles)  # convert smiles to molecule
    if mol is None:
        # MolFromSmiles signals a parse failure by returning None; fail here
        # with a clear message instead of an opaque error inside AddHs.
        raise ValueError(f"could not parse SMILES string: {smiles!r}")
    mol = Chem.AddHs(mol)  # include hydrogens in the ECFP
    # next line converts the molecule into a ~ECFP6 fingerprint bit vector
    fprint = AllChem.GetMorganFingerprintAsBitVect(
        mol=mol, radius=radius, nBits=n_bits
    )
    # convert the bit string into a numpy array, one element per bit
    return np.fromiter(fprint.ToBitString(), "u1")

In [3]:
def load_B3DB(path="B3DB/B3DB/B3DB_classification.tsv"):
    """Load the B3DB classification table and featurise it.

    Reads the tab-separated file at ``path``, converts every entry of the
    "SMILES" column with ``string_to_bitstr`` and maps the "BBB+/BBB-"
    column to integer labels (1 for "BBB+", 0 for anything else).

    Returns
    -------
    tuple
        ``(fingerprints, labels)`` as two numpy arrays, in row order.
    """
    table = pd.read_csv(path, sep="\t")

    # One fingerprint per molecule, in the order the rows appear in the file.
    fingerprints = np.array(
        [string_to_bitstr(smiles) for smiles in table["SMILES"]]
    )

    # Binary target: 1 for BBB+, 0 for BBB-.
    labels = np.array(
        [1 if flag == 'BBB+' else 0 for flag in table['BBB+/BBB-']]
    )

    return (fingerprints, labels)

def pickle_database(obj, name='fps_and_labels.pickle'):
    """Serialise ``obj`` to disk with pickle.

    Parameters
    ----------
    obj : object
        Any picklable object (here, the ``(fingerprints, labels)`` tuple).
    name : str, optional
        Path of the file to write. Defaults to 'fps_and_labels.pickle'
        in the current working directory.
    """
    # Write to the caller-supplied file name; previously the default name was
    # hard-coded in the open() call and `name` was silently ignored.
    with open(name, 'wb') as f:
        pickle.dump(obj, f)
    return

### Build and pickle the processed data

To build your pickled ECFP data object, all that is necessary is to provide the proper path to the "B3DB_classification.tsv" file. It can be entered below; if it is not entered, an attempt will be made to access the file at the default location, "B3DB/B3DB/B3DB_classification.tsv", relative to the working directory.

In [2]:
# To set a path, uncomment the line below and type in the appropriate string.
# path = ""

In [10]:
# This pickling was done because rdkit is unusable in google colab
# `path` only exists if the assignment in the cell above was uncommented, so
# look it up defensively and fall back to load_B3DB's default location when it
# is missing or left empty (avoids a NameError on a fresh run).
dataset_path = globals().get("path")
if dataset_path:
    fingerprints, labels = load_B3DB(dataset_path)
else:
    fingerprints, labels = load_B3DB()
pickle_database((fingerprints, labels))