In [6]:
#from automol.datasets import Dataset
#from automol.features import FeatureGenerator
#from automol.models import *
from automol.pipeline import Pipeline

In [7]:
# Build the AutoMol pipeline from a YAML configuration file.
# NOTE(review): the bare filename is presumably resolved relative to the
# notebook's working directory — confirm how Pipeline locates the file.
config_yaml = 'qm9_dataset_test_config.yaml'
pipeline = Pipeline(config_yaml)

In [8]:
# Preview the first ten entries of the 'smiles' column of the pipeline's data set.
pipeline.data_set.data['smiles'][0:10]

0    O=C([O-])C12C[NH2+]C1C2
1                CN1CC1C#CCO
2               OCC1COCOC=N1
3              COCCC(C#N)C=O
4            C1CC2OC2C2CN2C1
5           CC12C3OC1C2NC3=O
6                C#CCN(C)C=O
7            CC1OC2CC1C2(C)C
8              O=C1NC2CCC2O1
9          CC1(C(=O)C=O)CCC1
Name: smiles, dtype: object

In [9]:
# Preview the first ten entries of the 'homo' column; the training log at the
# end of the notebook reports this column as the regression target (y = homo).
pipeline.data_set.data['homo'][0:10]

0   -0.1795
1   -0.2284
2   -0.2521
3   -0.2701
4   -0.2201
5   -0.2270
6   -0.2516
7   -0.2310
8   -0.2575
9   -0.2452
Name: homo, dtype: float64

In [19]:
from rdkit.Chem import Descriptors
from rdkit import Chem
from collections import OrderedDict
import inspect
import pandas as pd
import numbers

# Map descriptor name -> descriptor function for every function exposed by
# rdkit.Chem.Descriptors. Names starting with an underscore and
# 'setupAUTOCorrDescriptors' are excluded (the latter is presumably a setup
# routine rather than a per-molecule descriptor — confirm against RDKit).
calc_props = OrderedDict(
    (name, func)
    for name, func in inspect.getmembers(Descriptors, inspect.isfunction)
    if not name.startswith('_') and name != 'setupAUTOCorrDescriptors'
)
smiles = pipeline.data_set.data['smiles']
sanitize = True  # when True, drop every row (axis=0) or column (axis=1) containing a NaN
axis = 0

descriptor_names = list(calc_props.keys())
rows = []
# One (row position, descriptor name, value) entry per non-numeric descriptor
# value, accumulated over ALL molecules. Stays an empty list when every value
# is numeric.
non_number_features = []
# Descriptor vector of the molecule processed most recently; defined up front
# so the name exists even when `smiles` is empty.
features = []

for i, s in enumerate(smiles):
    mol = Chem.MolFromSmiles(s)
    if mol is None:
        # RDKit returns None for SMILES it cannot parse. Record a NaN row so
        # the `sanitize` step below can drop it instead of crashing inside
        # the first descriptor call.
        features = [float('nan')] * len(descriptor_names)
    else:
        features = [func(mol) for func in calc_props.values()]
        non_number_features.extend(
            (i, name, value)
            for name, value in zip(descriptor_names, features)
            if not isinstance(value, numbers.Number)
        )
    rows.append(features)

# Build the frame once from the collected rows rather than assigning row by
# row with .loc; this also lets pandas infer numeric column dtypes instead of
# leaving every column as object.
df = pd.DataFrame(rows, columns=descriptor_names, index=range(len(smiles)))

if sanitize:
    # Rebind instead of mutating in place so re-running the cell is idempotent.
    df = df.dropna(axis=axis, how='any')

non_number_features

[]

In [15]:
# Display `features` as left behind by the descriptor cell: that cell rebinds
# the name once per molecule, so this is the descriptor vector of the last
# molecule processed, not of the whole data set.
# NOTE(review): the execution counts are out of order (this cell is In [15],
# the descriptor cell is In [19]), so the output shown may stem from an
# earlier run — restart the kernel and run all cells to confirm.
features

[2.367,
 2.411,
 -0.743,
 -1.0,
 0.0,
 0.0,
 0.0,
 -0.0,
 -0.065,
 0.093,
 -0.743,
 -1.0,
 2.238,
 0.0,
 0.0,
 0.0,
 0.0,
 -0.065,
 0.093,
 -0.743,
 -1.0,
 0.0,
 0.0,
 1.581,
 0.0,
 -0.0,
 -0.065,
 0.093,
 -0.743,
 -1.0,
 0.0,
 0.0,
 0.0,
 -0.0,
 0.539,
 -0.065,
 0.093,
 -0.743,
 -1.0,
 0.0,
 0.0,
 0.0,
 0.284,
 0.156,
 -0.21,
 0.0,
 -1.083,
 -1.262,
 0.0,
 0.0,
 0.0,
 0.571,
 0.701,
 1.029,
 2.057,
 2.571,
 0.0,
 0.0,
 0.0,
 0.0,
 0.571,
 0.701,
 1.029,
 2.057,
 2.571,
 0.0,
 0.0,
 0.0,
 0.0,
 0.571,
 0.701,
 1.029,
 2.057,
 2.571,
 0.0,
 0.0,
 0.0,
 0.571,
 2.366,
 0.701,
 1.029,
 2.057,
 2.571,
 0.0,
 0.0,
 0.0,
 0.571,
 0.701,
 1.029,
 2.564,
 2.057,
 2.571,
 0.0,
 0.0,
 0.0,
 0.36,
 0.542,
 1.263,
 2.294,
 2.699,
 2.57,
 0.0,
 0.0,
 0.0,
 2.565,
 1.989,
 0.845,
 0.0,
 0.0,
 0.0,
 2.187,
 2.338,
 2.083,
 1.34,
 0.375,
 2.573,
 0.0,
 0.0,
 0.0,
 2.344,
 2.536,
 2.51,
 1.922,
 0.793,
 0.0,
 0.0,
 1.992,
 0.0,
 4.01,
 4.154,
 4.548,
 3.497,
 2.079,
 0.0,
 0.0,
 0.0,
 0.076,
 0.847,
 0

In [5]:
# Request the feature named 'fingerprint' from the pipeline's feature generator;
# the training log at the end of the notebook reports it as the model input (X = fingerprint).
pipeline.feature_generator.get_feature('fingerprint')

array([[0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 1, 1],
       [0, 0, 0, ..., 0, 0, 1],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 0, 0]])

In [6]:
# Start training via the pipeline; the log line below names the model and the
# X / y features it was trained with.
pipeline.train()

Training model RandomForestRegressor(n_estimators=42) with X = fingerprint and y = homo
