In [4]:
### Here are some code examples you can modify in order to get started.
### Please activate a fresh jupyter notebook in the uonsol2_env environment,
### then cut and paste the following into the first cell

### GENERAL SCHEME ###
import warnings
warnings.filterwarnings('ignore') # minimize warning messages output for excess memory load etc

import uonsol.gng.gng_core as gng # you will need to git pull the latest version of origin/main
import deepchem as dc # this is needed to make some deepchem models.

db_path = 'test_enabled3.db' # the database file you are writing to (can be non-existent)
dataset_csv = 'delaney-processed-original.csv' # the csv you might want to bring into the database.
seed = 42 # a fixed integer seed to make randomised processes reproducible throughout.
comments = 'test arbitrary new db' # comments about the specific run



sdf = gng.DeepchemLearning(seed, db_path, comments) # an instance of the DeepchemLearning class.
sdf.input_raw_dataset(dataset_csv, 'esol') #this funtion does not need to be run each time.
sdf.prepare_attenuated_dataset(tablename='esol', smiles_field='smiles', attenuation=100) # prepare attenuated dataset
sdf.determine_splits(split_method='simple', # splits the dataset by a scaffold method
        scaffold_method='auto', # splits by the conventional (dodgy) rdkit bemis murcko implementation
        lookup_table=None, # alternatively, if there is a designated lookup table with alternative splits 
        smiles_field=None, # for each smiles string, they can be specified here.
        scaffold_field=None)


folded_df = sdf.get_folded_training_df() # you can check how the folds are distributes in the dataset here
featurizer = dc.feat.ConvMolFeaturizer() # set the appropriate deepchem featurizer with instantiation
label = 'measured_log_solubility_in_mols_per_litre' # set the field containing target vector y
smiles_field = 'smiles' # set where the smiles are
index_field = 'mol_index' # this should be the default
loader = dc.data.InMemoryLoader(tasks=[label], featurizer=featurizer) # set up a deepchem dataset loader

# the following methodology for setting up the deepchem functions is wierd but nescessary.
# I want as many regression functionalities as possible to be testable.
# for different argument types, take care of the positional arguments
# refer to: https://deepchem.readthedocs.io/en/latest/api_reference/models.html#model-cheatsheet
model_classname = 'deepchem.models.GraphConvModel' # here pass a STRING representing the desired dc.model
positional_args = [1] # set a LIST of positional arguments. This 1 is required.
for dense in 128, 256:
    keyword_arguments = {'mode': 'regression',   # regression is not default so this needs to be specified
                        'dense_layer_size': dense, 
                        'number_atom_features': 75}
    sdf.set_deepchem_model_object(model_classname, positional_args, keyword_arguments) # set the model by parameters
    sdf.cross_validate_deepchem_model(featurizer,  
                                      smiles_field, 
                                      label,  
                                      cross_validate=True) # true by default. set as False to run only fold 0
    sdf.kfold_metrics # display the dataframe of prediciton metrics.

split method not recognised
fitting fold: 0
fitting fold: 1
fitting fold: 2
fitting fold: 3
fitting fold: 4
fitting fold: 5
fitting fold: 6


fitting fold: 7
fitting fold: 8
fitting fold: 9
fitting fold: 0


fitting fold: 1
fitting fold: 2
fitting fold: 3
fitting fold: 4
fitting fold: 5
fitting fold: 6
fitting fold: 7


fitting fold: 8
fitting fold: 9


In [14]:
help(sdf.determine_splits)

Help on method determine_splits in module uonsol.gng.gng_core:

determine_splits(split_method='scaffold', scaffold_method='auto', lookup_table=None, smiles_field=None, scaffold_field=None) method of uonsol.gng.gng_core.DeepchemLearning instance
    Splits attenuated dataset according to values and stores outcome of session based 
    on choice of methods. Writes in a split table from the folds attribute of the split method, 
    which is determined through the KFoldSplitter class.
    session seed is applied on all split methods.
    
    split_method:      generates fold attributes in kfold splitter object
        'simple: -> calls KFoldSplitter.simple_split()'
        'straight' -> calls KFoldSplitter.straight_split()
        'scaffold' -> calls KFoldSplitter.scaffold_split()
            scaffold_method  
                'auto' -> calls __main__.scaffold_generator function
                'lookup' -> looks up scaffold from precalculated sql table 
                    'lookup_table' d