In [None]:
from rdkit import Chem
from rdkit.Chem.EState import Fingerprinter
from rdkit.Chem import Descriptors
from rdkit.Chem.rdmolops import RDKFingerprint
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn import cross_validation
from keras import regularizers
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from sklearn import cross_validation
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import Ridge, LinearRegression
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.model_selection import GridSearchCV
#from rdkit.Avalon.pyAvalonTools import GetAvalonFP
#from rdkit.Avalon import pyAvalonTools


# Read the solubility training set from CSV. Later cells read its 'smiles'
# and 'solubility' columns.
training_data = pd.read_csv('train_huuskonsen.csv')
# Preview the first rows as a quick sanity check of the parsed columns.
print(training_data.head())

def estate_fingerprint_and_mw(mol):
    """Return the EState fingerprint of ``mol`` with its molecular weight appended.

    Args:
        mol: an RDKit molecule, as accepted by ``Fingerprinter.FingerprintMol``
            and ``Descriptors.MolWt``.

    Returns:
        1-D NumPy array: the first element of the tuple returned by
        ``FingerprintMol`` followed by one extra entry, the molecular weight.
    """
    # Use the Fingerprinter module imported at the top of the notebook so this
    # helper does not depend on a later cell importing FingerprintMol directly.
    estate_part = Fingerprinter.FingerprintMol(mol)[0]
    # The weight must be computed from `mol`; the previous code referenced an
    # undefined name `x` and raised NameError on the first call.
    return np.append(estate_part, Descriptors.MolWt(mol))

# Parse every SMILES string into an RDKit molecule and keep it alongside the raw data.
training_data['Mol'] = [Chem.MolFromSmiles(smiles) for smiles in training_data['smiles']]
num_mols = training_data.shape[0]

# Regression target as a NumPy array (the feature matrix X is built in a later cell).
y = training_data['solubility'].values

Next, we compute several different molecular fingerprints.
To do this, we define a `fingerprint` class that stores the fingerprint's name and the function that computes it, and provides a method (`apply_fp`) that applies the fingerprint to each molecule and converts the output into a NumPy array.

In [None]:
from rdkit.Chem.rdMolDescriptors import GetMorganFingerprint, GetHashedAtomPairFingerprintAsBitVect
from rdkit.Chem.rdMolDescriptors import GetHashedTopologicalTorsionFingerprintAsBitVect
from rdkit.Chem.AtomPairs.Sheridan import GetBPFingerprint
from rdkit.Chem.EState.Fingerprinter import FingerprintMol
#from rdkit.Avalon.pyAvalonTools import GetAvalonFP #GetAvalonCountFP  #int vector version
from rdkit.Chem.AllChem import  GetMorganFingerprintAsBitVect, GetErGFingerprint
from rdkit.DataStructs.cDataStructs import ConvertToNumpyArray
import rdkit.DataStructs.cDataStructs

def ExplicitBitVect_to_NumpyArray(bitvector):
    """Convert a bit vector to a NumPy array of 0/1 integers.

    ``bitvector`` must provide ``ToBitString()``; each character of that
    string becomes one integer entry of the returned array.
    """
    return np.array([int(bit_char) for bit_char in bitvector.ToBitString()])

class fingerprint():
    """A named fingerprint function plus the fingerprints it has produced."""

    def __init__(self, fp_fun, name):
        """Store the fingerprint callable and its display name.

        Args:
            fp_fun: callable taking one molecule and returning its fingerprint.
            name: label used when reporting results for this fingerprint.
        """
        self.fp_fun = fp_fun
        self.name = name
        # One entry per molecule, appended by apply_fp.
        self.x = []

    def apply_fp(self, mols):
        """Compute the fingerprint of every molecule and append it to ``self.x``.

        Outputs are converted to NumPy arrays when they are a tuple (the first
        element is used), an ``ExplicitBitVect`` or an ``IntSparseIntVect``;
        anything else is stored unchanged. Results accumulate across calls, so
        calling this twice on the same object doubles the rows in ``self.x``.

        A single warning is printed if any stored fingerprint is not a NumPy
        array. Previously the check ran once per molecule but only ever
        inspected the first stored entry.

        Args:
            mols: iterable of molecules accepted by ``self.fp_fun``.
        """
        unexpected_type = None

        for mol in mols:
            fp = self.fp_fun(mol)
            if isinstance(fp, tuple):
                fp = np.array(list(fp[0]))
            elif isinstance(fp, rdkit.DataStructs.cDataStructs.ExplicitBitVect):
                fp = ExplicitBitVect_to_NumpyArray(fp)
            elif isinstance(fp, rdkit.DataStructs.cDataStructs.IntSparseIntVect):
                fp = np.array(list(fp))

            # Remember the first output that could not be turned into an array.
            if unexpected_type is None and not isinstance(fp, np.ndarray):
                unexpected_type = type(fp)

            self.x.append(fp)

        if unexpected_type is not None:
            print("WARNING: type for ", self.name, "is ", unexpected_type)

def make_fingerprints(length = 512, verbose=False, mols=None):
    """Build the list of fingerprint objects and apply each one to the molecules.

    Args:
        length: number of bits passed to the hashed / bit-vector fingerprints
            (atom pair, topological torsion, Morgan, RDKit).
        verbose: if True, print the name of each fingerprint before computing it.
        mols: optional iterable of molecules. Defaults to the ``'Mol'`` column
            of the notebook-level ``training_data`` frame, which was the only
            input the function supported before this parameter existed.

    Returns:
        list of ``fingerprint`` objects whose ``x`` attribute holds one
        fingerprint per molecule.
    """
    if mols is None:
        mols = training_data['Mol']
    # Materialise once so generators can be reused for every fingerprint.
    mols = list(mols)

    fp_list = [
         #fingerprint(lambda x : GetBPFingerprint(x, fpfn=AtomPair), 
         #            "Physiochemical properties (1996)"), ##NOTE: takes a long time to compute
         fingerprint(lambda x : GetHashedAtomPairFingerprintAsBitVect(x, nBits = length),
                     "Atom pair (1985)"),
         fingerprint(lambda x : GetHashedTopologicalTorsionFingerprintAsBitVect(x, nBits = length),
                     "Topological torsion (1987)"),
         fingerprint(lambda x : GetMorganFingerprintAsBitVect(x, 2, nBits = length),
                     "Morgan circular "),
         fingerprint(FingerprintMol, "Estate (1995)"),
        # fingerprint(lambda x: GetAvalonFP(x, nBits=length),
        #            "Avalon bit based (2006)"),
        # fingerprint(lambda x: np.append(GetAvalonFP(x, nBits=length), Descriptors.MolWt(x)),
        #            "Avalon+mol. weight"),
         fingerprint(lambda x: GetErGFingerprint(x), "ErG fingerprint (2006)"),
         fingerprint(lambda x : RDKFingerprint(x, fpSize=length),
                     "RDKit fingerprint")
    ]

    for fp in fp_list:
        if verbose:
            print("doing", fp.name)
        fp.apply_fp(mols)

    return fp_list

# Compute every fingerprint for the training molecules with the default length (512).
fp_list = make_fingerprints()


In [None]:
def test_model_cv(model, x, y, cv=20):
    scores = cross_validation.cross_val_score(model, x, y, cv=cv, n_jobs=-1, 
    scoring='neg_mean_absolute_error')

    scores = -1*scores

    return scores.mean()


def test_fingerprints(fp_list, model, y, verbose = True):

    fingerprint_scores = {}

    for fp in fp_list:
        if verbose: print("doing ", fp.name)
        fingerprint_scores[fp.name] = test_model_cv(model, fp.x, y)

    sorted_names = sorted(fingerprint_scores, key=fingerprint_scores.__getitem__, reverse=False)

    print("\\begin{tabular}{c c}")
    print("           name        &  avg abs error in CV (kJ/cc) \\\\")
    print("\\hline")
    for i in range(len(sorted_names)):
        name = sorted_names[i]
        print("%30s & %5.3f \\\\" % (name, fingerprint_scores[name]))
    print("\\end{tabular}")


# Rank the fingerprints by cross-validated error using a Ridge model with
# alpha=1e-9, and print the resulting LaTeX table.
test_fingerprints(fp_list, Ridge(alpha=1e-9), y, verbose=True)

In [None]:
def estate_fingerprint(mol):
    """Return the first component of the tuple produced by ``FingerprintMol(mol)``."""
    fingerprint_parts = FingerprintMol(mol)
    return fingerprint_parts[0]

# Compute the EState fingerprint of every molecule and keep it in the frame.
training_data['Fingerprint'] = training_data['Mol'].apply(estate_fingerprint)

# Stack the per-molecule fingerprints into a 2-D feature matrix, then scale
# each feature to zero mean and unit variance. (The matrix used to be built
# twice; the first copy was immediately overwritten.)
st = StandardScaler()
X = np.array(list(training_data['Fingerprint']))
X = st.fit_transform(X)

In [None]:
# Tune Kernel Ridge regression with 10-fold CV over alpha, gamma and the
# kernel type, scoring by negated mean absolute error.
KRmodel = GridSearchCV(
    estimator=KernelRidge(),
    param_grid={
        "alpha": np.logspace(-10, -5, 10),
        "gamma": np.logspace(-12, -9, 10),
        "kernel": ['laplacian', 'rbf'],
    },
    scoring='neg_mean_absolute_error',
    cv=10,
    n_jobs=-1,
)
KRmodel.fit(X, y)

print("Best Kernel Ridge model")
print(KRmodel.best_params_)
# best_score_ is a negated error; flip the sign to report a positive MAE.
print(-KRmodel.best_score_)
Best_KernelRidge = KRmodel.best_estimator_

In [None]:
# Tune the Ridge regularisation strength with 20-fold CV, scoring by negated
# mean absolute error.
Rmodel = GridSearchCV(
    estimator=Ridge(),
    param_grid={"alpha": np.logspace(-10, -5, 30)},
    scoring='neg_mean_absolute_error',
    cv=20,
    n_jobs=-1,
)
Rmodel.fit(X, y)

print("Best Ridge model")
print(Rmodel.best_params_)
# best_score_ is a negated error; flip the sign to report a positive MAE.
print(-Rmodel.best_score_)
Best_Ridge = Rmodel.best_estimator_

In [None]:
# Tune the Gaussian Process `alpha` parameter with 20-fold CV, scoring by
# negated mean absolute error.
GPmodel = GridSearchCV(
    estimator=GaussianProcessRegressor(normalize_y=True),
    param_grid={"alpha": np.logspace(-15, -10, 30)},
    scoring='neg_mean_absolute_error',
    cv=20,
    n_jobs=-1,
)
GPmodel.fit(X, y)

print("Best Gaussian Process model")
print(GPmodel.best_params_)
# best_score_ is a negated error; flip the sign to report a positive MAE.
print(-GPmodel.best_score_)
Best_GaussianProcessRegressor = GPmodel.best_estimator_

In [None]:
# Tune the number of trees in the random forest with 20-fold CV, scoring by
# negated mean absolute error.
# random_state is fixed so that re-running the notebook selects the same
# n_estimators and yields the same fitted forest; without a seed, differences
# between neighbouring grid points are partly sampling noise.
RFmodel = GridSearchCV(RandomForestRegressor(random_state=0), cv=20,
              param_grid={"n_estimators": np.linspace(50, 150, 25).astype('int')},
              scoring='neg_mean_absolute_error', n_jobs=-1)

RFmodel = RFmodel.fit(X, y)
Best_RandomForestRegressor = RFmodel.best_estimator_
print("Best Random Forest model")
print(RFmodel.best_params_)
# best_score_ is a negated error; flip the sign to report a positive MAE.
print(-1*RFmodel.best_score_)

In [None]:
import pickle
import matplotlib.pyplot as plt
%matplotlib inline  
from sklearn.metrics.pairwise import rbf_kernel

def make_scatter_plot(y_pred_train, y_pred_test, y_train, y_test, title='', figsize=(6,4), fontsize=16):
    """Scatter predicted against experimental values for the train and test sets.

    Args:
        y_pred_train: model predictions for the training samples.
        y_pred_test: model predictions for the test samples.
        y_train: experimental values for the training samples (x axis).
        y_test: experimental values for the test samples (x axis).
        title: figure title; drawn 5 points larger than ``fontsize``.
        figsize: figure size passed to matplotlib.
        fontsize: font size of the axis labels.

    The figure is shown and then closed. The earlier ``plt.clf()`` followed by
    ``plt.figure()`` could leave an extra empty figure behind, because ``clf``
    acts on (and, if necessary, creates) the current figure, and no figure was
    ever closed when this was called in a loop.
    """
    fig, ax = plt.subplots(figsize=figsize)

    ax.scatter(y_train, y_pred_train, label='Train', c='blue')
    ax.scatter(y_test, y_pred_test, c='lightgreen', label='Test', alpha=0.8)

    ax.set_title(title, fontsize=fontsize+5)
    ax.set_xlabel('Experimental Solubility (mol/L)', fontsize=fontsize)
    ax.set_ylabel('Predicted Solubility (mol/L)', fontsize=fontsize)
    ax.legend(loc=4)

    plt.show()
    # Release the figure so repeated calls do not accumulate open figures.
    plt.close(fig)


def test_models_and_plot(x, y, model_dict, plots=True):
    ''' test a bunch of models and print out a sorted list of CV accuracies
        inputs: 
            x: training data features, numpy array or Pandas dataframe
            y: training data labels, numpy array or Pandas dataframe
            model_dict: a dictionary of the form {name : model()}, where 'name' is a string
                        and 'model()' is a sci-kit-learn model object. 
    '''

    mean_scores = {}
    percent_errors = {}

    for (name, model) in model_dict.items():
        #print("running %s" % name)
        scores = cross_validation.cross_val_score(model, x, y, cv=20, n_jobs=-1, scoring='neg_mean_absolute_error')
        scores = -1*scores
        mean_score = scores.mean()
        mean_scores[name] = mean_score

        X_train, X_test, y_train, y_test = cross_validation.train_test_split(x, y, test_size=0.1)

        model.fit(X_train, y_train)
        
        # save RF model in pickle object
        if name == 'Random forest':
            f = open('rf_model.pkl', 'wb')
            pickle.dump(model, f)
            f.close()
            
        y_pred_train = model.predict(X_train)

        y_pred_test  = model.predict(X_test)

        percent_error = np.mean( 100*np.abs(y_test -y_pred_test)/np.abs(y_pred_test))

        percent_errors[name] = percent_error

        fulltitle = name+'\n mean % error: '+str(percent_error)

        if plots:
            make_scatter_plot(y_pred_train, y_pred_test, y_train, y_test, title=fulltitle, figsize = (8,6))

    sorted_names = sorted(percent_errors, key=mean_scores.__getitem__, reverse=False)

    print("\\begin{tabular}{c c c}")
    print("           name     &      % test err   & .    abs error in CV \\\\ ")
    print("\\hline")
    for i in range(len(sorted_names)):
        name = sorted_names[i]
        print("%30s & %5.3f & %5.3f \\\\" % (name, percent_errors[name], mean_scores[name]))
    print("\\end{tabular}")


# np.logspace takes base-10 exponents, not the endpoint values themselves:
# logspace(1e-11, 1e-1, 8) spans roughly 1.0 to 1.26, whereas this grid is
# meant to span 1e-11 to 1e-1.
alpha_grid = {'alpha': np.logspace(-11, -1, 8)}

# Models to compare, keyed by the display name used in plots and in the table.
# Commented-out entries are disabled models kept for reference.
model_dict = {
    "Linear Regression": LinearRegression(),
    # "Kernel Ridge Regression": Best_KernelRidge,
    # "Ridge Regression": Best_Ridge,
    # "Gaussian Process Regressor": Best_GaussianProcessRegressor,
    "Support Vector Regression": SVR(),
    "KNeighborsRegressor": KNeighborsRegressor(),
    # "Neural Network": MLPRegressor(alpha=100, max_iter=8000, hidden_layer_sizes=[8,6], early_stopping=False),
    "Gradient Boosted Trees": GradientBoostingRegressor(n_estimators=100),
    "Random forest": Best_RandomForestRegressor,
}

test_models_and_plot(x=X, y=y, model_dict=model_dict)