In [24]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
import deepchem as dc
import rdkit
from rdkit import Chem


In [25]:
def convert_to_categorical(dataframe, cols=None):
    """
    Converts non-numerical categorical values to integers.  This function is most useful for ordinal variables.

    Every selected column is label-encoded with sklearn's ``LabelEncoder`` and the
    integer codes are written to a new ``<col>_encoded`` column, so the original
    column is preserved.  The dataframe is modified in place and also returned.

    Parameters
    ----------
    - dataframe: featurized dataframe
    - cols: (str or List[str]) categorical column(s) to convert to integers;
      when None the dataframe is returned untouched

    Returns
    -------
    - modified dataframe with user selected columns with categorical string values converted to integer values
    """
    # nothing selected -> nothing to encode
    if cols is None:
        return dataframe

    # allow a single column to be given as a plain string
    if isinstance(cols, str):
        cols = [cols]

    for col in cols:
        # a fresh encoder per column keeps the integer codes of different columns independent
        encoder = preprocessing.LabelEncoder()
        # create a new column and preserve the original one
        dataframe.loc[:, col + '_encoded'] = encoder.fit_transform(dataframe[col])

    return dataframe


In [26]:
def convert_to_dataset(dataframe, 
                       X_col: str = 'X', 
                       y_col: str = 'y', 
                       w_col: str = None ,
                       id_col: str = None, 
                       return_csv = False):
    """
    Converts dataframe into a deepchem dataset object.
    
    Parameters
    ----------
    - dataframe: featurized dataframe
    - X_col: (str or List[str]) name(s) of the column(s) containing the X array.
    - y_col: (str or List[str]) name(s) of the column(s) containing the y array.
    - w_col: (str or List[str]) name(s) of the column(s) containing the w array.
    - id_col: (str) name of the column containing the ids.
    - return_csv: (True/False) whether a viewable csv of the data will be returned with the dataset object.
      Only the literal value True enables the csv.

    Returns
    -------
    - data_objs: tuple containing deepchem dataset object at index [0] and csv at index[1], or just deepchem dataset object
    """
    n_rows = len(dataframe)

    # define x
    X = dataframe[X_col].values
    # Peek at the innermost value of the first row to decide how to flatten.
    # Plain scalar cells are not subscriptable, so fall back to the cell itself.
    try:
        first_cell = X[0][0]
    except (TypeError, IndexError):
        first_cell = X[0]

    stacked = np.stack(X)
    if isinstance(first_cell, np.ndarray) or stacked.size != n_rows:
        # array-valued cells, or several values per row: one flat feature row per sample
        X = stacked.reshape(n_rows, -1)
    else:
        # exactly one value per row: keep a 1-D array of samples
        X = stacked.reshape(n_rows)
        
    # define y
    y = dataframe[y_col].values
    
    # define weight and ids; None lets NumpyDataset fall back to its own defaults
    w = dataframe[w_col].values if w_col is not None else None
    ids = dataframe[id_col].values if id_col is not None else None
    
    # create deepchem dataset object
    dataset = dc.data.NumpyDataset(X, y, w, ids)
    
    # return dataset and csv of current dataframe if return_csv equals True
    # return only dataset if return_csv equals False or unspecified
    if return_csv is True:
        csv = dataframe.to_csv(index = True, header=True)
        print('returned value contains both dataset object and csv file')
        return (dataset, csv)

    return dataset



In [27]:
def data_transformation(dataset, 
                        transformations: list = None, 
                        to_transform: list = None,  
                        **kwargs):
    """
    Passes a deepchem dataset through a sequence of deepchem transformers.

    Each transformer is constructed with the dataset as it stands at that point
    (i.e. already transformed by the preceding transformers) and then applied to it.

    Parameters
    ----------
    - dataset: deepchem dataset to be transformed
    - transformations: (List[str]) list of transformation methods (class names in dc.trans)
      to pass dataset through; defaults to ['NormalizationTransformer'].  A single name
      may be given as a plain string.
    - to_transform: (list[str]) list of elements to transform, and can include 'X', 'y', or 'w'.
      Every listed element is forwarded as transform_<element>=True; None or an empty
      list forwards no transform flag.
    - **kwargs: keyword arguments to be passed to the selected transformer

    Returns
    -------
    - transformed dataset
    - list of transformer objects
    """
    if transformations is None:
        transformations = ['NormalizationTransformer']
    elif isinstance(transformations, str):
        transformations = [transformations]

    # translate the requested targets into the transformer's boolean flags
    targets = to_transform if to_transform is not None else ()
    flags = {'transform_' + name: True for name in ('X', 'y', 'w') if name in targets}

    # make a list to store transformer object, which can later be used to untransform data
    transformer_list = []

    # feed dataset into list of transformers sequentially, returning a single transformed dataset
    for transformation in transformations:
        transformer_cls = getattr(dc.trans, transformation)
        transformer = transformer_cls(dataset=dataset, **flags, **kwargs)
        transformer_list.append(transformer)
        dataset = transformer.transform(dataset)
    
    return dataset, transformer_list

In [28]:
def data_splitting(dataset, 
                   splitter: str = 'RandomSplitter', 
                   split_type: str = 'train_valid_test_split', 
                   **kwargs):
    """
    Splits a deepchem dataset with the chosen deepchem splitter.

    Parameters
    ----------
    - dataset: deepchem dataset to be split
    - splitter: (str) class of deepchem split method (a class name in dc.splits)
    - split_type: (str) type of split: 'k_fold_split' (alias 'k'), 'train_test_split' (alias 'tt'),
      'train_valid_test_split' (alias 'tvt') or 'generate_scaffolds'
    - **kwargs: keyword arguments to be passed to the selected splitter.  They are first
      offered to the splitter constructor; if the constructor rejects them with a
      TypeError it is built without arguments.  The same kwargs are always passed
      to the split method.

    Returns
    -------
    Split dataset

    Raises
    ------
    - ValueError: if split_type is not one of the recognized names/aliases
    - AttributeError: if split_type is 'generate_scaffolds' but the splitter has no such method
    """
    # the plain 'split' method is deliberately not offered: it seems to do the same thing as
    # 'train_valid_test_split' but returns non-dataset objects
    split_methods = {
        'k_fold_split': 'k_fold_split',
        'k': 'k_fold_split',
        'train_test_split': 'train_test_split',
        'tt': 'train_test_split',
        'train_valid_test_split': 'train_valid_test_split',
        'tvt': 'train_valid_test_split',
        'generate_scaffolds': 'generate_scaffolds',
    }
    method_name = split_methods.get(split_type) if isinstance(split_type, str) else None
    if method_name is None:
        # fail before any work is done instead of printing and crashing on the return
        raise ValueError('split_type string is not a recognized split: ' + repr(split_type))

    data_splitter = getattr(dc.splits, splitter)
    
    try:
        split = data_splitter(**kwargs)
    except TypeError:
        # kwargs were meant for the split method only
        split = data_splitter()

    if method_name == 'generate_scaffolds' and not hasattr(split, 'generate_scaffolds'):
        raise AttributeError ('split_type may only be set as generate_scaffolds if splitter set as ScaffoldSplitter')

    return getattr(split, method_name)(dataset=dataset, **kwargs)

In [29]:
def dataset_prep (dataframe, 
                  transformations: list = None, 
                  to_transform: list = None, 
                  input_features = None,
                  label = None, 
                  weights = None, 
                  id_col = None, 
                  splitter = None, 
                  splitter_type = None, 
                  return_csv: bool = False, 
                  **kwargs):
    """
    Wrapping functions for convert_to_dataset, data_transformation, and data_splitting
    
    Parameters
    ----------
    - dataframe: dataframe to be converted to dataset object
    - transformations: (List[str]) list of transformation methods to pass dataset through;
      None skips the transformation step
    - to_transform: (list[str]) list of elements to transform, and can include 'X', 'y', or 'w'
    - input_features: (str or List[str]) name(s) of the column(s) containing the X array;
      None uses the default of convert_to_dataset
    - label: (str or List[str]) name(s) of the column(s) containing the y array;
      None uses the default of convert_to_dataset
    - weights: (str or List[str]) name(s) of the column(s) containing the w array.
    - id_col: (str) name of the column containing the ids.
    - splitter: (str) class of deepchem split method; None uses the default of data_splitting
    - splitter_type: (str) type of split (k_fold_split/train_test_split/train_valid_test_split);
      None uses the default of data_splitting
    - return_csv: (True/False) whether a viewable csv of the data will be returned with the dataset object.
    - **kwargs: forwarded to BOTH data_transformation and data_splitting
    
    Returns
    -------
    - split_data: tuple or list containing dataset splits
    - transformer_list: list of transformer objects used
    - csv: csv file of input dataframe which can be saved for future use (None unless return_csv is True)
    """
    # NOTE(review): the same **kwargs reach the transformer constructors and the splitter,
    # so a splitter-only argument may be rejected by a transformer - confirm intended usage.

    # only override the column defaults of convert_to_dataset when a name was supplied
    convert_args = {'dataframe': dataframe, 'w_col': weights, 'id_col': id_col, 'return_csv': return_csv}
    if input_features is not None:
        convert_args['X_col'] = input_features
    if label is not None:
        convert_args['y_col'] = label

    # convert dataframe to dataset object
    data = convert_to_dataset(**convert_args)
    
    if return_csv is True:
        dataset, csv = data[0], data[1]
        print('csv file created')
    else:
        dataset = data
        csv = None
        print('no csv file created')
    
    # transform data
    if transformations is not None:
        transformed_dataset, transformer_list = data_transformation(dataset = dataset, transformations = transformations, to_transform = to_transform,  **kwargs)
    else:
        transformed_dataset, transformer_list = dataset, []
        print('no transformations performed')
        
    # split data; leave out unset options so data_splitting falls back to its own defaults
    split_args = {}
    if splitter is not None:
        split_args['splitter'] = splitter
    if splitter_type is not None:
        split_args['split_type'] = splitter_type
    split_data = data_splitting(dataset = transformed_dataset, **split_args, **kwargs)
    
    return split_data, transformer_list, csv
    
    
    

## Test CSV - ChEMBL

In [3]:
"""
Input/Output module for loading data and generating output files.
"""

import pandas as pd
import os


def load_data(csv_file, col_id=None):
    """
    Reads a csv file into a dataframe of raw data.
    Parameters
    ----------
    csv_file: path of the csv file to be loaded
    col_id: column label (or labels) used to detect duplicates; rows that
        repeat an earlier value in this column are dropped, default=None
        (no de-duplication)
    Returns
    -------
    DataFrame holding the csv contents with a fresh 0..n-1 index; the
    index the rows had before the reset is kept as a column named 'index'
    """
    # fail early when the path does not point at an existing file
    assert os.path.exists(csv_file), 'File name does not exist'
    raw = pd.read_csv(csv_file)
    # drop repeated entries only when the caller named a column to compare on
    deduplicated = raw if col_id is None else raw.drop_duplicates(subset=col_id)
    return deduplicated.reset_index()
"""
Input/Output module for loading data and generating output files.
"""

import pandas as pd
import os


def load_data(csv_file, col_id=None):
    """
    Reads a csv file into a dataframe of raw data.
    Parameters
    ----------
    csv_file: path of the csv file to be loaded
    col_id: column label (or labels) used to detect duplicates; rows that
        repeat an earlier value in this column are dropped, default=None
        (no de-duplication)
    Returns
    -------
    DataFrame holding the csv contents with a fresh 0..n-1 index; the
    index the rows had before the reset is kept as a column named 'index'
    """
    # fail early when the path does not point at an existing file
    assert os.path.exists(csv_file), 'File name does not exist'
    raw = pd.read_csv(csv_file)
    # drop repeated entries only when the caller named a column to compare on
    deduplicated = raw if col_id is None else raw.drop_duplicates(subset=col_id)
    return deduplicated.reset_index()

In [4]:
# Load the full ChEMBL export; col_id is left at None, so no duplicates are removed.
df_orig = load_data('ChemBL_ATCC25922_MIC_2.csv')

In [5]:
# Display the raw dataframe.
df_orig

Unnamed: 0,index,Molecule ChEMBL ID,Molecule Name,Molecule Max Phase,Molecular Weight,#RO5 Violations,AlogP,Smiles,Standard Type,Standard Relation,...,Target Name,Target Organism,Target Type,Document ChEMBL ID,Source ID,Source Description,Document Journal,Document Year,Cell ChEMBL ID,Properties
0,0,CHEMBL4545569,,0,520.56,1,-0.08,C[C@@](CCN1Cc2cc(C#CC#CCOC(=O)N3CCC(O)CC3)cn2C...,MIC,'=',...,Escherichia coli,Escherichia coli,ORGANISM,CHEMBL4321865,1,Scientific Literature,J Med Chem,2020.0,,
1,1,CHEMBL4462541,,0,862.12,2,3.60,CO[C@]1(C)C[C@H](O[C@H]2[C@H](C)[C@@H](O[C@@H]...,MIC,'=',...,Escherichia coli,Escherichia coli,ORGANISM,CHEMBL4334458,1,Scientific Literature,Eur J Med Chem,2019.0,,
2,2,CHEMBL4463662,,0,976.14,2,4.34,CO[C@]1(C)C[C@H](O[C@H]2[C@H](C)[C@@H](O[C@@H]...,MIC,'=',...,Escherichia coli,Escherichia coli,ORGANISM,CHEMBL4334458,1,Scientific Literature,Eur J Med Chem,2019.0,,
3,3,CHEMBL4476233,,0,976.14,2,4.34,CO[C@]1(C)C[C@H](O[C@H]2[C@H](C)[C@@H](O[C@@H]...,MIC,'=',...,Escherichia coli,Escherichia coli,ORGANISM,CHEMBL4334458,1,Scientific Literature,Eur J Med Chem,2019.0,,
4,4,CHEMBL4593661,,0,447.51,0,0.34,C[C@]1(CO)C[C@@H]1C#CC#Cc1cc2n(c1)C(=O)N(CC[C@...,MIC,'=',...,Escherichia coli,Escherichia coli,ORGANISM,CHEMBL4321867,1,Scientific Literature,J Med Chem,2020.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5292,5292,CHEMBL2022927,,0,394.43,0,3.81,C#CCN(C)Cc1cc2cc(OCc3ccccc3)ccc2[nH]1.O=C(O)C(...,MIC,'>',...,Escherichia coli,Escherichia coli,ORGANISM,CHEMBL4513135,40,CO-ADD antimicrobial screening data,,,,
5293,5293,CHEMBL4536413,,0,361.24,0,3.30,CC1CC(=O)Nc2c1c(Br)nc1nc3c(c(N)c21)CCCC3,MIC,'>',...,Escherichia coli,Escherichia coli,ORGANISM,CHEMBL4513135,40,CO-ADD antimicrobial screening data,,,,
5294,5294,CHEMBL4551417,,0,615.76,1,4.03,COc1ccc(C(=N)Nc2cc(C(=O)Nc3nc(C(=O)NCCN4CCOCC4...,MIC,'>',...,Escherichia coli,Escherichia coli,ORGANISM,CHEMBL4513139,40,CO-ADD antimicrobial screening data,,,,
5295,5295,CHEMBL4543834,,0,220.66,0,3.00,Cc1ccc2nc(Cl)c(/C=N/O)cc2c1,MIC,'>',...,Escherichia coli,Escherichia coli,ORGANISM,CHEMBL4513135,40,CO-ADD antimicrobial screening data,,,,


In [6]:
# Reload the same file, this time dropping rows that repeat a 'Molecule ChEMBL ID'.
data = load_data('ChemBL_ATCC25922_MIC_2.csv', col_id='Molecule ChEMBL ID')

In [7]:
# Display the de-duplicated dataframe.
data

Unnamed: 0,index,Molecule ChEMBL ID,Molecule Name,Molecule Max Phase,Molecular Weight,#RO5 Violations,AlogP,Smiles,Standard Type,Standard Relation,...,Target Name,Target Organism,Target Type,Document ChEMBL ID,Source ID,Source Description,Document Journal,Document Year,Cell ChEMBL ID,Properties
0,0,CHEMBL4545569,,0,520.56,1,-0.08,C[C@@](CCN1Cc2cc(C#CC#CCOC(=O)N3CCC(O)CC3)cn2C...,MIC,'=',...,Escherichia coli,Escherichia coli,ORGANISM,CHEMBL4321865,1,Scientific Literature,J Med Chem,2020.0,,
1,1,CHEMBL4462541,,0,862.12,2,3.60,CO[C@]1(C)C[C@H](O[C@H]2[C@H](C)[C@@H](O[C@@H]...,MIC,'=',...,Escherichia coli,Escherichia coli,ORGANISM,CHEMBL4334458,1,Scientific Literature,Eur J Med Chem,2019.0,,
2,2,CHEMBL4463662,,0,976.14,2,4.34,CO[C@]1(C)C[C@H](O[C@H]2[C@H](C)[C@@H](O[C@@H]...,MIC,'=',...,Escherichia coli,Escherichia coli,ORGANISM,CHEMBL4334458,1,Scientific Literature,Eur J Med Chem,2019.0,,
3,3,CHEMBL4476233,,0,976.14,2,4.34,CO[C@]1(C)C[C@H](O[C@H]2[C@H](C)[C@@H](O[C@@H]...,MIC,'=',...,Escherichia coli,Escherichia coli,ORGANISM,CHEMBL4334458,1,Scientific Literature,Eur J Med Chem,2019.0,,
4,4,CHEMBL4593661,,0,447.51,0,0.34,C[C@]1(CO)C[C@@H]1C#CC#Cc1cc2n(c1)C(=O)N(CC[C@...,MIC,'=',...,Escherichia coli,Escherichia coli,ORGANISM,CHEMBL4321867,1,Scientific Literature,J Med Chem,2020.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4601,5292,CHEMBL2022927,,0,394.43,0,3.81,C#CCN(C)Cc1cc2cc(OCc3ccccc3)ccc2[nH]1.O=C(O)C(...,MIC,'>',...,Escherichia coli,Escherichia coli,ORGANISM,CHEMBL4513135,40,CO-ADD antimicrobial screening data,,,,
4602,5293,CHEMBL4536413,,0,361.24,0,3.30,CC1CC(=O)Nc2c1c(Br)nc1nc3c(c(N)c21)CCCC3,MIC,'>',...,Escherichia coli,Escherichia coli,ORGANISM,CHEMBL4513135,40,CO-ADD antimicrobial screening data,,,,
4603,5294,CHEMBL4551417,,0,615.76,1,4.03,COc1ccc(C(=N)Nc2cc(C(=O)Nc3nc(C(=O)NCCN4CCOCC4...,MIC,'>',...,Escherichia coli,Escherichia coli,ORGANISM,CHEMBL4513139,40,CO-ADD antimicrobial screening data,,,,
4604,5295,CHEMBL4543834,,0,220.66,0,3.00,Cc1ccc2nc(Cl)c(/C=N/O)cc2c1,MIC,'>',...,Escherichia coli,Escherichia coli,ORGANISM,CHEMBL4513135,40,CO-ADD antimicrobial screening data,,,,


In [8]:
def molstr_to_Mol(dataframe, strcolumnID='InChI String'):
    """
    Converts DataFrame column containing the string representations of
    molecules and add a corresponding column of Mol objects.

    The string format is inferred from the column label: a label containing
    'inchi' (case-insensitive) is parsed as InChI, otherwise a label
    containing 'smiles' is parsed as SMILES.  The dataframe is modified in
    place and also returned.
    Parameters
    ----------
    dataframe: DataFrame containing a column with string representations
    of molecules
    strcolumnID: label for the column containing the string representations for
    conversion, default='InChI String'- can be changed for the user's
    standard column name
    Returns
    -------
    DataFrame with additional column containing Mol objects for each molecule,
    column label is 'Mol'
    Raises
    ------
    ValueError: if the column label mentions neither 'inchi' nor 'smiles',
    so the string format cannot be determined
    """
    label = strcolumnID.lower()
    # decide on the format first so an unsupported label fails with a clear
    # message instead of a length mismatch when the column is assigned
    if 'inchi' not in label and 'smiles' not in label:
        raise ValueError(
            "strcolumnID must contain 'inchi' or 'smiles' to select a parser, "
            "got " + repr(strcolumnID))

    if 'inchi' in label:
        parse = Chem.MolFromInchi
    else:
        parse = Chem.MolFromSmiles

    dataframe['Mol'] = [parse(mol_string) for mol_string in dataframe[strcolumnID]]
    return dataframe


def add_features(dataframe, MolcolumnID='Mol', method='CircularFingerprint'):
    """
    Featurizes a set of Mol objects using the desired featurization method.
    note: may want to change setup of parameters here
    note: may be better suited as a class
    note: might not need separate featurizer for graph featurization-
    could include here I think
    note: the featurizer could also be given the whole column at once
    instead of one molecule per call
    list of possible featurizers to wrap up here:
    https://deepchem.readthedocs.io/en/latest/api_reference/featurizers.html
    Parameters
    ----------
    dataframe: a DataFrame containing a column with Mol objects-
    may want to play around with how we want to input the set of Mols
    for featurization
    MolcolumnID: label of the column containing the Mol objects, default based
    on function for adding a Mol column
    method: name of the featurizer class in dc.feat, instantiated without
    arguments
    Returns
    -------
    DataFrame containing a column of the featurized representation of the Mol
    object with the featurization method as the column ID.  The dataframe is
    modified in place and also returned.
    """
    mol_column = dataframe[MolcolumnID]
    # Check that set contains Mol objects; positional lookup so a
    # non-contiguous index (e.g. after filtering rows) does not matter
    assert isinstance(mol_column.iloc[0], rdkit.Chem.Mol),\
        'Mol column does not contain Mol object'
    featurizer = getattr(dc.feat, method)()
    # featurize one molecule per call so every row stores the featurizer
    # output for its own molecule
    dataframe[method] = [featurizer.featurize(mol) for mol in mol_column]

    return dataframe


def get_descriptors(dataframe, descriptor_type):
    """
    Placeholder for extracting molecular descriptors into a new column.
    Not implemented yet: the dataframe is handed back unchanged and
    descriptor_type is currently ignored.
    Open questions for a future implementation:
    which descriptors are wanted and how can they be obtained?
    a featurizing function will be needed that can take any descriptors
    extracted here
    """
    # planned: dataframe[descriptor_type] = <list of descriptor values>
    return dataframe

In [9]:
# Keep only rows that have both a SMILES string and a measured value.
# The explicit copy avoids adding new columns to a slice of the loaded frame,
# and the fresh 0..n-1 index keeps label-based lookups of row 0 working even
# if the first loaded row was filtered out.
has_inputs = data['Smiles'].notna() & data['Standard Value'].notna()
data = data[has_inputs].copy().reset_index(drop=True)

# parse the SMILES strings into rdkit Mol objects (adds the 'Mol' column)
data_w_mol = molstr_to_Mol(data, strcolumnID='Smiles')

# featurize with the default method; reset_index returns a separate frame
data_feat = add_features(data_w_mol)
data_feat = data_feat.reset_index(drop=True)

In [10]:
# Display the dataframe with the added 'Mol' and 'CircularFingerprint' columns.
data_feat

Unnamed: 0,index,Molecule ChEMBL ID,Molecule Name,Molecule Max Phase,Molecular Weight,#RO5 Violations,AlogP,Smiles,Standard Type,Standard Relation,...,Target Type,Document ChEMBL ID,Source ID,Source Description,Document Journal,Document Year,Cell ChEMBL ID,Properties,Mol,CircularFingerprint
0,0,CHEMBL4545569,,0,520.56,1,-0.08,C[C@@](CCN1Cc2cc(C#CC#CCOC(=O)N3CCC(O)CC3)cn2C...,MIC,'=',...,ORGANISM,CHEMBL4321865,1,Scientific Literature,J Med Chem,2020.0,,,<rdkit.Chem.rdchem.Mol object at 0x7f841d36f030>,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0,..."
1,1,CHEMBL4462541,,0,862.12,2,3.60,CO[C@]1(C)C[C@H](O[C@H]2[C@H](C)[C@@H](O[C@@H]...,MIC,'=',...,ORGANISM,CHEMBL4334458,1,Scientific Literature,Eur J Med Chem,2019.0,,,<rdkit.Chem.rdchem.Mol object at 0x7f841d36f170>,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
2,2,CHEMBL4463662,,0,976.14,2,4.34,CO[C@]1(C)C[C@H](O[C@H]2[C@H](C)[C@@H](O[C@@H]...,MIC,'=',...,ORGANISM,CHEMBL4334458,1,Scientific Literature,Eur J Med Chem,2019.0,,,<rdkit.Chem.rdchem.Mol object at 0x7f841b2d0210>,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
3,3,CHEMBL4476233,,0,976.14,2,4.34,CO[C@]1(C)C[C@H](O[C@H]2[C@H](C)[C@@H](O[C@@H]...,MIC,'=',...,ORGANISM,CHEMBL4334458,1,Scientific Literature,Eur J Med Chem,2019.0,,,<rdkit.Chem.rdchem.Mol object at 0x7f841b2e2850>,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
4,4,CHEMBL4593661,,0,447.51,0,0.34,C[C@]1(CO)C[C@@H]1C#CC#Cc1cc2n(c1)C(=O)N(CC[C@...,MIC,'=',...,ORGANISM,CHEMBL4321867,1,Scientific Literature,J Med Chem,2020.0,,,<rdkit.Chem.rdchem.Mol object at 0x7f841b2e2210>,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0,..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4578,5292,CHEMBL2022927,,0,394.43,0,3.81,C#CCN(C)Cc1cc2cc(OCc3ccccc3)ccc2[nH]1.O=C(O)C(...,MIC,'>',...,ORGANISM,CHEMBL4513135,40,CO-ADD antimicrobial screening data,,,,,<rdkit.Chem.rdchem.Mol object at 0x7f840af0fb70>,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
4579,5293,CHEMBL4536413,,0,361.24,0,3.30,CC1CC(=O)Nc2c1c(Br)nc1nc3c(c(N)c21)CCCC3,MIC,'>',...,ORGANISM,CHEMBL4513135,40,CO-ADD antimicrobial screening data,,,,,<rdkit.Chem.rdchem.Mol object at 0x7f840af0fbc0>,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
4580,5294,CHEMBL4551417,,0,615.76,1,4.03,COc1ccc(C(=N)Nc2cc(C(=O)Nc3nc(C(=O)NCCN4CCOCC4...,MIC,'>',...,ORGANISM,CHEMBL4513139,40,CO-ADD antimicrobial screening data,,,,,<rdkit.Chem.rdchem.Mol object at 0x7f840af0fc10>,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0,..."
4581,5295,CHEMBL4543834,,0,220.66,0,3.00,Cc1ccc2nc(Cl)c(/C=N/O)cc2c1,MIC,'>',...,ORGANISM,CHEMBL4513135,40,CO-ADD antimicrobial screening data,,,,,<rdkit.Chem.rdchem.Mol object at 0x7f840af0fc60>,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."


In [16]:
# Add a second featurization column; add_features writes it onto data_w_mol itself,
# which already carries the 'CircularFingerprint' column from the earlier call.
data_feat2 = add_features(data_w_mol, method='ConvMolFeaturizer')
data_feat2 = data_feat2.reset_index(drop=True)

In [17]:
# Display the dataframe holding both featurization columns.
data_feat2

Unnamed: 0,index,Molecule ChEMBL ID,Molecule Name,Molecule Max Phase,Molecular Weight,#RO5 Violations,AlogP,Smiles,Standard Type,Standard Relation,...,Document ChEMBL ID,Source ID,Source Description,Document Journal,Document Year,Cell ChEMBL ID,Properties,Mol,CircularFingerprint,ConvMolFeaturizer
0,0,CHEMBL4545569,,0,520.56,1,-0.08,C[C@@](CCN1Cc2cc(C#CC#CCOC(=O)N3CCC(O)CC3)cn2C...,MIC,'=',...,CHEMBL4321865,1,Scientific Literature,J Med Chem,2020.0,,,<rdkit.Chem.rdchem.Mol object at 0x7f841d36f030>,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0,...",[<deepchem.feat.mol_graphs.ConvMol object at 0...
1,1,CHEMBL4462541,,0,862.12,2,3.60,CO[C@]1(C)C[C@H](O[C@H]2[C@H](C)[C@@H](O[C@@H]...,MIC,'=',...,CHEMBL4334458,1,Scientific Literature,Eur J Med Chem,2019.0,,,<rdkit.Chem.rdchem.Mol object at 0x7f841d36f170>,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",[<deepchem.feat.mol_graphs.ConvMol object at 0...
2,2,CHEMBL4463662,,0,976.14,2,4.34,CO[C@]1(C)C[C@H](O[C@H]2[C@H](C)[C@@H](O[C@@H]...,MIC,'=',...,CHEMBL4334458,1,Scientific Literature,Eur J Med Chem,2019.0,,,<rdkit.Chem.rdchem.Mol object at 0x7f841b2d0210>,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",[<deepchem.feat.mol_graphs.ConvMol object at 0...
3,3,CHEMBL4476233,,0,976.14,2,4.34,CO[C@]1(C)C[C@H](O[C@H]2[C@H](C)[C@@H](O[C@@H]...,MIC,'=',...,CHEMBL4334458,1,Scientific Literature,Eur J Med Chem,2019.0,,,<rdkit.Chem.rdchem.Mol object at 0x7f841b2e2850>,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",[<deepchem.feat.mol_graphs.ConvMol object at 0...
4,4,CHEMBL4593661,,0,447.51,0,0.34,C[C@]1(CO)C[C@@H]1C#CC#Cc1cc2n(c1)C(=O)N(CC[C@...,MIC,'=',...,CHEMBL4321867,1,Scientific Literature,J Med Chem,2020.0,,,<rdkit.Chem.rdchem.Mol object at 0x7f841b2e2210>,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0,...",[<deepchem.feat.mol_graphs.ConvMol object at 0...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4578,5292,CHEMBL2022927,,0,394.43,0,3.81,C#CCN(C)Cc1cc2cc(OCc3ccccc3)ccc2[nH]1.O=C(O)C(...,MIC,'>',...,CHEMBL4513135,40,CO-ADD antimicrobial screening data,,,,,<rdkit.Chem.rdchem.Mol object at 0x7f840af0fb70>,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",[<deepchem.feat.mol_graphs.ConvMol object at 0...
4579,5293,CHEMBL4536413,,0,361.24,0,3.30,CC1CC(=O)Nc2c1c(Br)nc1nc3c(c(N)c21)CCCC3,MIC,'>',...,CHEMBL4513135,40,CO-ADD antimicrobial screening data,,,,,<rdkit.Chem.rdchem.Mol object at 0x7f840af0fbc0>,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",[<deepchem.feat.mol_graphs.ConvMol object at 0...
4580,5294,CHEMBL4551417,,0,615.76,1,4.03,COc1ccc(C(=N)Nc2cc(C(=O)Nc3nc(C(=O)NCCN4CCOCC4...,MIC,'>',...,CHEMBL4513139,40,CO-ADD antimicrobial screening data,,,,,<rdkit.Chem.rdchem.Mol object at 0x7f840af0fc10>,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0,...",[<deepchem.feat.mol_graphs.ConvMol object at 0...
4581,5295,CHEMBL4543834,,0,220.66,0,3.00,Cc1ccc2nc(Cl)c(/C=N/O)cc2c1,MIC,'>',...,CHEMBL4513135,40,CO-ADD antimicrobial screening data,,,,,<rdkit.Chem.rdchem.Mol object at 0x7f840af0fc60>,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",[<deepchem.feat.mol_graphs.ConvMol object at 0...


In [21]:
# Draw 5 random rows and dump them as a {column: [values]} dict.
# No random_state is passed, so the sample is not fixed between runs.
sample_dataframe = data_feat2.sample(n=5)
sample_dict = sample_dataframe.to_dict(orient = 'list')
print(sample_dict)

{'index': [4829, 5079, 731, 749, 3488], 'Molecule ChEMBL ID': ['CHEMBL1276627', 'CHEMBL4457967', 'CHEMBL2424894', 'CHEMBL2424889', 'CHEMBL3086842'], 'Molecule Name': [nan, nan, nan, nan, nan], 'Molecule Max Phase': [0, 0, 0, 0, 0], 'Molecular Weight': ['406.53', '880.11', '450.51', '497.58', '232.24'], '#RO5 Violations': ['0', '2', '0', '0', '0'], 'AlogP': ['4.12', '3.74', '2.66', '1.81', '1.95'], 'Smiles': ['COc1ccc(C2c3ccc4ccccc4c3OC3NC(=S)NC(=S)C32)cc1', 'CO[C@]1(C)C[C@H](O[C@H]2[C@H](C)[C@@H](O[C@@H]3O[C@H](C)C[C@H](N(C)C)[C@H]3O)[C@](C)(OC)C[C@@H](C)[C@H](O)[C@H](C)CN(C)[C@H](Cn3cc(-c4ccc(F)cc4)nn3)COC(=O)[C@@H]2C)O[C@@H](C)[C@@H]1O', 'O=c1ccc2ccc(F)c3c2n1CC3CN1CCC(NCc2cc3c(cn2)OCCO3)CC1', 'COc1ccc2nccc([C@H](O)[C@@H](O)[C@H]3CC[C@H](NCc4ccc5c(n4)NC(=O)CS5)CO3)c2n1', 'Cc1cc(=O)cc(Cc2cc(O)cc(O)c2)o1'], 'Standard Type': ['MIC', 'MIC', 'MIC', 'MIC', 'MIC'], 'Standard Relation': ["'='", "'>'", "'='", "'='", "'>'"], 'Standard Value': [500.0, 128.0, 0.001, 0.001, 64.0], 'Standard Units'

In [22]:
# Rebuild a small dataframe from the sampled dict and display it.
testing_df = pd.DataFrame.from_dict(sample_dict)
testing_df

Unnamed: 0,index,Molecule ChEMBL ID,Molecule Name,Molecule Max Phase,Molecular Weight,#RO5 Violations,AlogP,Smiles,Standard Type,Standard Relation,...,Document ChEMBL ID,Source ID,Source Description,Document Journal,Document Year,Cell ChEMBL ID,Properties,Mol,CircularFingerprint,ConvMolFeaturizer
0,4829,CHEMBL1276627,,0,406.53,0,4.12,COc1ccc(C2c3ccc4ccccc4c3OC3NC(=S)NC(=S)C32)cc1,MIC,'=',...,CHEMBL1275291,1,Scientific Literature,Eur. J. Med. Chem.,2010.0,,,<rdkit.Chem.rdchem.Mol object at 0x7f840af07e90>,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",[<deepchem.feat.mol_graphs.ConvMol object at 0...
1,5079,CHEMBL4457967,,0,880.11,2,3.74,CO[C@]1(C)C[C@H](O[C@H]2[C@H](C)[C@@H](O[C@@H]...,MIC,'>',...,CHEMBL4334458,1,Scientific Literature,Eur J Med Chem,2019.0,,,<rdkit.Chem.rdchem.Mol object at 0x7f840af0c210>,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",[<deepchem.feat.mol_graphs.ConvMol object at 0...
2,731,CHEMBL2424894,,0,450.51,0,2.66,O=c1ccc2ccc(F)c3c2n1CC3CN1CCC(NCc2cc3c(cn2)OCC...,MIC,'=',...,CHEMBL2424615,1,Scientific Literature,J. Med. Chem.,2013.0,,,<rdkit.Chem.rdchem.Mol object at 0x7f841b2f6f30>,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",[<deepchem.feat.mol_graphs.ConvMol object at 0...
3,749,CHEMBL2424889,,0,497.58,0,1.81,COc1ccc2nccc([C@H](O)[C@@H](O)[C@H]3CC[C@H](NC...,MIC,'=',...,CHEMBL2424615,1,Scientific Literature,J. Med. Chem.,2013.0,,,<rdkit.Chem.rdchem.Mol object at 0x7f841b2f7530>,"[[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",[<deepchem.feat.mol_graphs.ConvMol object at 0...
4,3488,CHEMBL3086842,,0,232.24,0,1.95,Cc1cc(=O)cc(Cc2cc(O)cc(O)c2)o1,MIC,'>',...,CHEMBL3085737,1,Scientific Literature,J. Nat. Prod.,2013.0,,,<rdkit.Chem.rdchem.Mol object at 0x7f841b326a80>,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",[<deepchem.feat.mol_graphs.ConvMol object at 0...


In [30]:
# Build a deepchem dataset with the circular fingerprints as X and the
# 'Standard Value' column as y; return_csv=True also returns the csv text.
dataset, csv = convert_to_dataset(data_feat2, 
                             X_col= 'CircularFingerprint', 
                             y_col= 'Standard Value', 
                             w_col= None ,
                             id_col = None, 
                             return_csv = True)

returned value contains both dataset object and csv file


In [37]:
# Check that the returned object is a deepchem NumpyDataset.
isinstance(dataset, dc.data.datasets.NumpyDataset)

True

In [42]:
# Number of feature columns per sample in X.
dataset.X.shape[1]

2048

In [63]:
# Transform X only: MinMaxTransformer first, then NormalizationTransformer.
# transformer_list holds the transformer objects in that same order.
transformed_data, transformer_list = data_transformation(dataset, 
                        transformations= ['MinMaxTransformer', 'NormalizationTransformer'], 
                        to_transform= ['X'])

  X = np.nan_to_num((X - self.X_means) / self.X_stds)


In [64]:
# Index 0 is the first transformer requested above (MinMaxTransformer),
# so a check against NormalizationTransformer is expected to be False.
isinstance(transformer_list[0], dc.trans.transformers.NormalizationTransformer)

False

In [97]:
# Random train/test split ('tt' is the alias for train_test_split) with
# frac_train=0.8 forwarded to the splitter.
train, test = data_splitting(dataset = transformed_data, 
                                                splitter = 'RandomSplitter',
                                                split_type = 'tt', 
                                                frac_train = 0.8)

In [101]:
import math

# Sanity check: the train:test row ratio should be close to 4 (within 5 %),
# matching the frac_train=0.8 requested for the split.
a = train.X.shape[0]/test.X.shape[0]
b = 4
math.isclose(a, b, rel_tol=0.05)

True

# Unused Functions
#### These functions are either redundant or incompatible with the rest of the current package code, but the functions themselves are still independently functional and can be used for other purposes.

### Unused categorical converter
This function converts categorical columns to integers using the Pandas get_dummies function (similar to OneHotEncoding).  It was removed from use because part of its functionality (assigning each category its own column) is made redundant by the modeling functions downstream.

In [None]:
def convert_to_categorical(dataframe, cols=None):
    """
    Converts non-numerical categorical values to integers.  This function is most useful for nominal variables.

    Each selected column is replaced by one indicator column per category using
    pandas ``get_dummies``.  The n-th selected column gets the prefix
    ``categorical_column<n>`` (counting from 1), so the new columns are named
    ``categorical_column<n>_<category>``.  The input dataframe is not modified.
    
    Parameters
    ----------
    - dataframe: featurized dataframe
    - cols: (str or List[str]) categorical column(s) to convert to integers;
      when None or empty the dataframe is returned unchanged
    
    Returns
    -------
    - modified dataframe with user selected columns with categorical string values converted to integer values
    - list of list containing new column names separated by originating column
    """
    # nothing selected -> nothing to encode
    if cols is None:
        return dataframe, []

    # allow single columns to be input as strings; normalize before counting columns
    cols = [cols] if isinstance(cols, str) else list(cols)
    if not cols:
        return dataframe, []

    # one prefix per selected column, numbered from 1
    prefix_list = ['categorical_column' + str(position) for position in range(1, len(cols) + 1)]

    # generate binary values using get_dummies
    dataframe = pd.get_dummies(dataframe, prefix=prefix_list, columns=cols)

    # collect the new column names per originating column; matching on
    # '<prefix>_' keeps 'categorical_column1' from also claiming the
    # columns of 'categorical_column10'
    categories_list = []
    for prefix in prefix_list:
        marker = prefix + '_'
        categories_list.append(
            [name for name in dataframe.columns if str(name).startswith(marker)])

    return dataframe, categories_list
