In [3]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
import deepchem as dc

In [4]:
def convert_to_categorical(dataframe, cols=None):
    """
    Converts non-numerical categorical values to integers.  This function is most useful for ordinal variables.
    
    Parameters
    ----------
    - dataframe: featurized dataframe
    - cols: list of categorical columns to convert to integers
    
    Returns
    -------
    - modified dataframe with user selected columns with categorical string values converted to integer values
    """
    
    # generate binary values using get_dummies
    if cols is not None:
        if type(cols) == str: # allow single columns to be input as strings
            cols = [cols]
        else:
            pass
        
        for col in cols:
            if len(df1[col].shape) == 1: # for 1D arrays (usally y, target variables)
                # define label encoder
                encoder = preprocessing.LabelEncoder()
                # create new columns and preserve the original columns
                dataframe.loc[:,col+'_encoded'] = encoder.fit_transform(dataframe[col])
            else: # for 2D arrays (usually X, input features)
                # define ordinal encoder
                encoder = preprocessing.LabelEncoder()
                # create new columns and preserve the original columns
                dataframe.loc[:,col+'_encoded'] = encoder.fit_transform(dataframe[col])
        
    else:
        pass
    
    return dataframe


In [5]:
def convert_to_dataset(dataframe, 
                       X_col: str = 'X', 
                       y_col: str = 'y', 
                       w_col: str = None ,
                       id_col: str = None, 
                       return_csv = False):
    """
    Converts dataframe into a deepchem dataset object.
    
    Parameters
    ----------
    - dataframe: featurized dataframe
    - X_col: (str or List[str]) name(s) of the column(s) containing the X array.
    - y_col: (str or List[str]) name(s) of the column(s) containing the y array.
    - w_col: (str or List[str]) name(s) of the column(s) containing the w array.
    - id_col: (str) name of the column containing the ids.
    - return_csv: (True/False) whether a viewable csv of the data will be returned with the dataset object.

    Returns
    -------
    - deepchem dataset
    - (conditional) csv of dataframe
    """
    # define x
    X = dataframe[X_col]
    
    # define y
    y = dataframe[y_col]
    
    # define weight
    if w_col is not None:
        w = dataframe[w_col]
    else:
        w = None
    
    # define ids
    if id_col is not None:
        ids = dataframe[id_col]
    else:
        ids = None
    
    # create deepchem dataset object
    to_dataset = dc.data.NumpyDataset(X, y, w, ids)
    dataset = to_dataset.from_dataframe(dataframe)
    
    # return dataset and csv of current dataframe if return_csv equals True
    # return only dataset if return_csv equals False or unspecified
    if return_csv is True:
        csv = dataframe.to_csv(index = True, header=True)
        return dataset, csv
    else:
        return dataset


In [50]:
def data_transformation(dataset, 
                        transformations: list = ['NormalizationTransformer'], 
                        to_transform: list = [],  
                        **kwargs):
    """
    Transforms and splits deepchem dataset

    Parameters
    ----------
    - dataset: deepchem dataset to be transformed
    - transformations: (List[str]) list of transformation methods to pass dataset through
    - to_transform: (list[str]) list of elements to transform, and can include 'X', 'y', or 'w'
    - **kwargs: keyword arguments to be passed to the selected transformer

    Returns
    -------
    Transformed dataset
    """

    # feed dataset into list of transformers sequentially, returning a single transformed dataset
    for transformation in transformations:
        if to_transform is not None:
        
            if(all(elem in to_transform for elem in ['X', 'y', 'w'])):
                transformer = getattr(dc.trans, transformation)(transform_X=True, transform_y=True, transform_w=True, dataset=dataset, **kwargs)
            elif(all(elem in to_transform for elem in ['X', 'y'])):
                transformer = getattr(dc.trans, transformation)(transform_X=True, transform_y=True, dataset=dataset, **kwargs)
            elif 'X' in to_transform:
                transformer = getattr(dc.trans, transformation)(transform_X=True, dataset=dataset, **kwargs)
            elif 'y' in to_transform:
                transformer = getattr(dc.trans, transformation)(transform_y=True, dataset=dataset, **kwargs)
            else:
                transformer = getattr(dc.trans, transformation)(dataset=dataset, **kwargs)
        else:
            transformer = getattr(dc.trans, transformation)(dataset=dataset, **kwargs)
            
        dataset = transformer.transform(dataset)
    
    return dataset

In [99]:
def data_splitting(dataset, 
                   splitter: str = 'RandomSplitter', 
                   split_type: str = 'train_valid_test_split', 
                   **kwargs):
    """
    Transforms and splits deepchem dataset

    Parameters
    ----------
    - dataset: deepchem dataset to be split
    - splitter: (str) class of deepchem split method
    - split_type: (str) type of split (k_fold_split/train_test_split/train_valid_test_split)
    - **kwargs: keyword arguments to be passed to the selected splitter

    Returns
    -------
    Split dataset
    """

    # split data
    data_splitter = getattr(dc.splits, splitter)
    
    if hasattr(data_splitter, __init__):
        data_splitter = data_splitter(**kwargs)
    else:
        pass

    # this only allows the following three split_types to be used
    # the 'split' option is excluded since it seems to do the same thing as 'train_valid_test_split' but returns non-dataset objects
    # might want to write code to reset certain defaults depending on which split_type is chosen (i.e. from None to 0.8)
    
    if split_type == 'k_fold_split' or split_type == 'k':
        data_split = data_splitter.k_fold_split(dataset, **kwargs)
    elif split_type == 'train_test_split' or split_type == 'tt':
        data_split = data_splitter.train_test_split(dataset, **kwargs)
    elif split_type == 'train_valid_test_split' or split_type =='tvt':
        data_split = data_splitter.train_valid_test_split(dataset=dataset, **kwargs)
    elif split_type == 'generate_scaffolds':
        if hasattr(data_splitter, 'generate_scaffolds'):
            data_split = data_splitter.generate_scaffolds(dataset, **kwargs) # Unsure about functionality, code may need to be added
        else:
            raise AttributeError ('split_type may only be set as generate_scaffolds if splitter set as ScaffoldSplitter')
    else: 
        print('split_type string is not a recognized split') # should change this to raise an error of some kind
    
    return data_split

In [100]:
"""
This is a version to use which works with the RandomSplitter 
that we can use for testing while I fix the bugs in the correct version.
"""

def data_splitting(dataset,  
                   split_type: str = 'train_valid_test_split', 
                   k: int = None, 
                   frac_train: float = None, 
                   frac_valid: float = None, 
                   frac_test: float = None, 
                   log_every_n: int = None):
    """
    Transforms and splits deepchem dataset

    Parameters
    ----------
    - dataset: deepchem dataset to be split
    - splitter: (str) class of deepchem split method
    - split_type: (str) type of split (k_fold_split/train_test_split/train_valid_test_split)
    - k: int
    - frac_train: float 
    - frac_valid: float 
    - frac_test: float 
    - log_every_n: int

    Returns
    -------
    Split dataset
    """

    # split data
    #data_splitter = getattr(dc.splits, splitter)
    data_splitter = dc.splits.RandomSplitter()

    # this only allows the following three split_types to be used
    # the 'split' option is excluded since it seems to do the same thing as 'train_valid_test_split' but returns non-dataset objects
    
    if split_type == 'k_fold_split' or split_type == 'k':
        data_split = data_splitter.k_fold_split(dataset=dataset, k=k)
    elif split_type == 'train_test_split' or split_type == 'tt':
        data_split = data_splitter.train_test_split(dataset=dataset, frac_train=frac_train)
    elif split_type == 'train_valid_test_split' or split_type =='tvt':
        data_split = data_splitter.train_valid_test_split(dataset=dataset, frac_train=frac_train, frac_valid=frac_valid, frac_test=frac_test)
    elif split_type == 'generate_scaffolds':
        if hasattr(data_splitter, 'generate_scaffolds'):
            data_split = data_splitter.generate_scaffolds(dataset, log_every_n) # Unsure about functionality, code may need to be added
        else:
            raise AttributeError ('split_type may only be set as generate_scaffolds if splitter set as ScaffoldSplitter')
    else: 
        print('split_type string is not a recognized split') # should change this to raise an error of some kind
    
    return data_split

## Test dataframe

### Create new dataframe

In [8]:
cars = {'id': [1, 2, 3, 4, 5, 6, 7, 8],
        'Make': ['Honda','Toyota','Ford','Audi','Subaru','Subaru','Ford','Jeep'],
        'Model': ['Civic','Corolla','Focus','A4','Legacy','Outback','Fiesta','Grand'],
        'Price': [22000,25000,27000,35000,90000,10000,23000,27000],
        'Engine_HP': [150.5, 220.0, 124.5, 190.0, 120.0, 124.5, 185.5, 190.0],
        'Color': ['Blue','Black','Blue','Pink','Orange','Green','Pink','Black']
        }

df1 = pd.DataFrame(cars, columns = ['id', 'Make', 'Model', 'Price', 'Engine_HP', 'Color'])

print (df1)

   id    Make    Model  Price  Engine_HP   Color
0   1   Honda    Civic  22000      150.5    Blue
1   2  Toyota  Corolla  25000      220.0   Black
2   3    Ford    Focus  27000      124.5    Blue
3   4    Audi       A4  35000      190.0    Pink
4   5  Subaru   Legacy  90000      120.0  Orange
5   6  Subaru  Outback  10000      124.5   Green
6   7    Ford   Fiesta  23000      185.5    Pink
7   8    Jeep    Grand  27000      190.0   Black


### Create categorical columns

In [9]:
df2 = convert_to_categorical(dataframe = df1, cols = ['Make', 'Model', 'Color'])

#### Print new dataframe

In [10]:
df2

Unnamed: 0,id,Make,Model,Price,Engine_HP,Color,Make_encoded,Model_encoded,Color_encoded
0,1,Honda,Civic,22000,150.5,Blue,2,1,1
1,2,Toyota,Corolla,25000,220.0,Black,5,2,0
2,3,Ford,Focus,27000,124.5,Blue,1,4,1
3,4,Audi,A4,35000,190.0,Pink,0,0,4
4,5,Subaru,Legacy,90000,120.0,Orange,4,6,3
5,6,Subaru,Outback,10000,124.5,Green,4,7,2
6,7,Ford,Fiesta,23000,185.5,Pink,1,3,4
7,8,Jeep,Grand,27000,190.0,Black,3,5,0


### Convert to deepchem dataset object

In [11]:
dataset_cars = convert_to_dataset(dataframe=df2, X_col='Engine_HP', y_col='Price', w_col=None, id_col='id')

#### Print dataset object

In [12]:
dataset_cars

<NumpyDataset X.shape: (8, 0), y.shape: (8, 0), w.shape: (8, 0), ids: [0 1 2 3 4 5 6 7], task_names: []>

### Transform and split the dataset

In [55]:
transformed_data = data_transformation(dataset_cars, 
                        transformations = ['NormalizationTransformer', 'MinMaxTransformer'], 
                        to_transform = ['X'])
transformed_data

<NumpyDataset X.shape: (8, 0), y.shape: (8, 0), w.shape: (8, 0), ids: [0 1 2 3 4 5 6 7], task_names: []>

### Split dataset
This version of data splitting only works for RandomSplitter

In [85]:
splittest = dc.splits.RandomSplitter()
test_splitting = splittest.train_valid_test_split(dataset = transformed_data, frac_train = 0.5, frac_valid = 0.25, frac_test = 0.25)
test_splitting

(<NumpyDataset X.shape: (4, 0), y.shape: (4, 0), w.shape: (4, 0), ids: [0 5 2 7], task_names: []>,
 <NumpyDataset X.shape: (2, 0), y.shape: (2, 0), w.shape: (2, 0), ids: [3 1], task_names: []>,
 <NumpyDataset X.shape: (2, 0), y.shape: (2, 0), w.shape: (2, 0), ids: [6 4], task_names: []>)

# Unused Fuctions
#### These functions are either redundant or incompatible with the rest of the current package code, but the functions themselves are still independently functional and can be used for other purposes.

### Unused categorical converter
This function converts categorical columns to integers using the Pandas get_dummies function (similar to OneHotEncoding).  It was removed from used given that part of its functionality (assigning each category its own column) is made redundant by the modeling functions downstream.

In [None]:
def convert_to_categorical(dataframe, cols=None):
    """
    Converts non-numerical categorical values to integers.  This function is most useful for nominal variables.
    
    Parameters
    ----------
    - dataframe: featurized dataframe
    - cols: list of categorical columns to convert to integers
    
    Returns
    -------
    - modified dataframe with user selected columns with categorical string values converted to integer values
    - list of list containing new column names separated by originating column
    """
    
    # create list if new column prefixes
    prefix_list = []
    count = 0
    for col in cols:
        count = count + 1
        prefix_list.append('categorical_column'+str(count))
        
    # create dict to link original column names to newly created prefixes
    new_column_dict = dict(zip(cols, prefix_list))
    
    # generate binary values using get_dummies
    if cols is not None:
        if type(cols) == str: # allow single columns to be input as strings
            cols = [cols]
            dataframe = pd.get_dummies(dataframe, prefix=prefix_list, columns=cols)
        elif type(cols) == list:
            dataframe = pd.get_dummies(dataframe, prefix=prefix_list, columns=cols)
        
        # get list of new categorical columns
        categories_list = []
        for col in cols:
            var_list_name = col
            var_list_name = []
            for column_name in list(dataframe):
                if new_column_dict[col] in column_name:
                    var_list_name.append(column_name)
                else:
                    continue
            categories_list.append(var_list_name)
            
    else:
        pass
    
    return dataframe, categories_list
