Now that necessary data is retrieved and saved, we can import the pickle file straight away. In this notebook, we will only try conventional machine learning methods with basic chemical features for comparison later.  

In [25]:
import numpy as np
import pandas as pd
import os

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier

Import structure data

In [2]:
data = pd.read_pickle(os.path.join(os.getcwd(), 'data.pkl'))
print (data.shape)
data.head(1)

(28263, 19)


Unnamed: 0,smiles,category,mw_freebase,alogp,hba,hbd,psa,rtb,acd_logp,acd_logd,full_mwt,aromatic_rings,heavy_atoms,qed_weighted,mw_monoisotopic,hba_lipinski,hbd_lipinski,mol,agrochemical
0,Cl.O=C(NCc1ccncc1)[C@@H]2CCCN2C(=O)[C@@H]3CCCN3,toxin,302.38,0.44,4,2,74.33,4,1.04,-0.77,338.84,1,22,0.85,302.174,6,2,<rdkit.Chem.rdchem.Mol object at 0x117e3f538>,0


Drop columns that are not required for learning using Gradient Boosting Classifier

In [3]:
data = data.drop(['smiles', 'mol', 'category'], axis=1)

Assign features (X) and class (Y)

In [4]:
X = data.drop(['agrochemical'], axis=1)
Y = data['agrochemical']

Ensure that all columns of X are converted into readable floats

In [6]:
X = X.astype('float')

Split dataset into training set and testing set

In [10]:
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=88)

### Set up conventional machine learning models

1) Gradient Boosting Classifier

In [11]:
gbc = GradientBoostingClassifier(n_estimators=500, learning_rate=1.0, 
                                max_depth=3, random_state=0).fit(x_train, y_train)

In [12]:
gbc.score(x_test, y_test)

0.8048823633468954

2) Support Vector Machines

In [17]:
clf = svm.SVC(gamma=0.01)

Use exhausted search (GridSearchCV) to find the best gamma to use for RBF kernel

In [19]:
parameters = {'gamma':[1, 0.1, 0.01, 0.001]}
gridsearch = GridSearchCV(clf, parameters, cv=5)

In [20]:
gridsearch.fit(x_train, y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.01, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'gamma': [1, 0.1, 0.01, 0.001]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [23]:
print (gridsearch.best_params_, gridsearch.best_score_)
clf = gridsearch.best_estimator_

{'gamma': 0.1} 0.7131800088456435


In [24]:
clf.score(x_test, y_test)

0.7304086325844684

3) Nearest Neighbor Algorithm

In [32]:
nn = KNeighborsClassifier(n_neighbors=3)

In [33]:
parameters = {'n_neighbors': [1, 3, 5, 7],
              'weights': ['uniform', 'distance'],
              'algorithm': ['ball_tree', 'kd_tree', 'brute']
                }
gridsearch = GridSearchCV(nn, parameters, cv=5)

In [34]:
gridsearch = gridsearch.fit(x_train, y_train)

In [35]:
print (gridsearch.best_params_, gridsearch.best_score_)
nn = gridsearch.best_estimator_

{'algorithm': 'brute', 'n_neighbors': 3, 'weights': 'distance'} 0.6969924812030075


In [36]:
nn.score(x_test, y_test)

0.7132496019812489