In [None]:
import jupyter_black

# Enable jupyter_black for this notebook with a 79 character line length
# and debug-level logging.
jupyter_black.load(
    lab=False,
    line_length=79,
    verbosity="DEBUG",
)

# Predicting solubilities from molecular descriptors

The supporting information to the article [ESOL: Estimating Aqueous Solubility Directly from Molecular Structure](https://pubs.acs.org/doi/10.1021/ci034243x) contains a data set with molecules (as SMILES strings) and their
measured and predicted (by the ESOL model described in the article) aqueous solubilities. We can download this data from [GitHub](https://github.com/dataprofessor):

In [None]:
import pathlib
import requests
import pandas as pd

In [None]:
def download_data_file(url, output_file, timeout=30):
    """Download ``url`` to ``output_file`` unless that file already exists.

    Parameters
    ----------
    url : str
        Address of the file to fetch.
    output_file : str or path-like
        Local destination. An existing file is never overwritten.
    timeout : float, optional
        Seconds to wait for the server before ``requests`` gives up and
        raises, so that an unresponsive server cannot block forever.

    Returns
    -------
    The value of ``output_file`` when the file is available locally
    (already present or freshly downloaded), or ``None`` when the server
    answered with an error status.
    """
    if pathlib.Path(output_file).is_file():
        print(f"File {output_file} exists - skipping download")
        return output_file

    # The request is sent with a browser-like User-Agent header.
    headers = {
        "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:102.0) Gecko/20100101 Firefox/102.0"
    }
    # The context manager releases the pooled connections when done.
    with requests.Session() as session:
        session.headers.update(headers)
        response = session.get(url, allow_redirects=True, timeout=timeout)
        # A response is falsy when its status code signals an error.
        if not response:
            print(f"Could not download file: {response.status_code}")
            return None
        payload = response.content

    # Store the raw bytes: text mode would re-encode the payload with the
    # platform default encoding and may rewrite the line endings.
    with open(output_file, "wb") as output:
        output.write(payload)
    print(f"Downloaded file to: {output_file}")
    return output_file

In [None]:
# Fetch the data set and store it as "esol.csv"; download_data_file skips
# the request when that file is already present.
download_data_file(
    "https://raw.githubusercontent.com/dataprofessor/data/master/delaney.csv",
    "esol.csv",
)

In [None]:
# Load the downloaded CSV file into a DataFrame and display it.
data = pd.read_csv("esol.csv")
data

In [None]:
# Pull the compound names and the measured / ESOL-predicted solubilities
# out of the table as plain arrays.
names, measured, esol = (
    data[column].values
    for column in (
        "Compound ID",
        "measured log(solubility:mol/L)",
        "ESOL predicted log(solubility:mol/L)",
    )
)

## Having a quick look at the raw data

First, we will plot the distributions of the measured and predicted solubilities and calculate the
[coefficient of determination](https://en.wikipedia.org/wiki/Coefficient_of_determination) and the
[mean absolute error](https://en.wikipedia.org/wiki/Root-mean-square_deviation#Mean_absolute_error)
for the ESOL model.

In [None]:
from matplotlib import pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.metrics import r2_score, mean_absolute_error

plt.style.use("seaborn-talk")
%matplotlib inline
sns.set_theme(style="ticks", context="talk", palette="muted")

In [None]:
def add_scatterplot(ax, measured, predicted, model_name=None):
    """Scatter predicted against measured values on ``ax``.

    The legend label of the points reports R² and the mean absolute
    error, preceded by ``model_name`` on its own line when one is given.
    """
    score = r2_score(measured, predicted)
    abs_error = mean_absolute_error(measured, predicted)
    label_lines = [f"R²: {score:.2f}", f"MAE = {abs_error:.2f}"]
    if model_name:
        label_lines.insert(0, f"{model_name}")
    ax.scatter(measured, predicted, label="\n".join(label_lines), alpha=0.8)

In [None]:
fig, (ax1, ax2) = plt.subplots(
    constrained_layout=True, ncols=2, figsize=(12, 5)
)

# Left panel: histogram plus density estimate for both solubility series.
series = (
    ("Measured", "measured log(solubility:mol/L)", measured),
    ("ESOL", "ESOL predicted log(solubility:mol/L)", esol),
)
# Draw both histograms first, then the density curves on top of them.
histograms = [
    ax1.hist(values, density=True, alpha=0.5)[2] for _, _, values in series
]
for (label, column, _), bars in zip(series, histograms):
    sns.kdeplot(
        data=data,
        x=column,
        ax=ax1,
        label=label,
        # Re-use the colour of the matching histogram for its curve.
        color=bars.patches[0].get_facecolor(),
        lw=5,
    )
ax1.legend()
ax1.set(xlabel="log (solubility)", title="Distribution of solubilities")

# Right panel: ESOL prediction against the measurement.
ax2.scatter([], [])  # cycle colors
add_scatterplot(ax2, measured, esol)
ax2.set(
    xlabel="Measured log (solubility)",
    ylabel="Predicted (ESOL)",
    title="Measured vs. predicted",
)
ax2.legend()
sns.despine(fig=fig, offset=10)

That looks reasonable. The model overestimates the solubility between −4.2 and −1.2 and
underestimates it for values < −5 and > 0.

We can also have a look at the molecules in the data set:

In [None]:
from tqdm import tqdm  # add a progress bar
from rdkit import Chem
from rdkit.Chem import (
    AllChem,
    Draw,
    rdCoordGen,
)
from rdkit.Chem.Draw import IPythonConsole
from IPython.display import SVG

# Switch RDKit's IPython integration to SVG output.
IPythonConsole.ipython_useSVG = True

In [None]:
def make_molecules_from_smiles(smiles):
    """Create RDKit molecules, with coordinates, from SMILES strings.

    Parameters
    ----------
    smiles : iterable of str
        The SMILES strings to convert.

    Returns
    -------
    list
        One molecule per input string, in the input order. Coordinates
        have been added to each molecule with ``rdCoordGen.AddCoords``.

    Raises
    ------
    ValueError
        If a SMILES string cannot be parsed.
    """
    molecules = []
    for position, smilei in enumerate(tqdm(smiles)):
        mol = Chem.MolFromSmiles(smilei)
        if mol is None:
            # RDKit reports a parse failure by returning None. Fail here
            # with the offending input instead of further down with an
            # unrelated-looking error; silently skipping the entry would
            # misalign the molecules with the rows of the data table.
            raise ValueError(
                f"Could not parse SMILES at position {position}: {smilei!r}"
            )
        rdCoordGen.AddCoords(mol)
        molecules.append(mol)
    return molecules

In [None]:
# Build one molecule per row of the "SMILES" column.
molecules = make_molecules_from_smiles(data["SMILES"])

Let us show the molecules with the highest and lowest solubility: 

In [None]:
# Positions of the largest and the smallest measured solubility.
idx_max, idx_min = np.argmax(measured), np.argmin(measured)
extremes = (idx_max, idx_min)
mols = [molecules[i] for i in extremes]
legends = [
    f"{names[i]}\nlog solubility = {measured[i]:.3g}" for i in extremes
]

# 600 x 280 canvas split into panels of 300 x 280, one per molecule.
canvas = Draw.rdMolDraw2D.MolDraw2DSVG(600, 280, 300, 280)
canvas_options = canvas.drawOptions()
canvas_options.drawMolsSameScale = False
canvas_options.fixedBondLength = 50
canvas_options.legendFraction = 0.25
canvas.DrawMolecules(mols, legends=legends)
canvas.FinishDrawing()
SVG(canvas.GetDrawingText())

And the 6 molecules with the largest relative errors.
The (logarithmic) solubilities can be zero, so here I will use a
variant of the [relative difference](https://en.wikipedia.org/wiki/Relative_change_and_difference):

In [None]:
# Symmetric relative difference: |a - b| divided by the mean of |a| and |b|.
# When both values are exactly zero the plain ratio would be 0/0 = NaN, and
# argsort places NaN last, i.e. among the "largest" errors. Such a pair
# agrees perfectly, so its error is set to 0 instead.
scale = 0.5 * (np.abs(measured) + np.abs(esol))
error = np.divide(
    np.abs(measured - esol),
    scale,
    out=np.zeros_like(scale, dtype=float),
    where=scale > 0,
)
# Indices of the six largest errors (in ascending order of error).
idx = np.argsort(error)[-6:]

mols = []
legends = []
for i in idx:
    mols.append(molecules[i])
    legends.append(
        f"{names[i]}\nSolubility = {measured[i]:.2g}\nESOL: {esol[i]:.2g}"
    )

# 1000 x 600 canvas with panels of 300 x 300, one per molecule.
drawing = Draw.rdMolDraw2D.MolDraw2DSVG(1000, 600, 300, 300)
options = drawing.drawOptions()
options.drawMolsSameScale = False
options.fixedBondLength = 30
options.legendFraction = 0.25
drawing.DrawMolecules(mols, legends=legends)
drawing.FinishDrawing()
SVG(drawing.GetDrawingText())

## Calculating molecular descriptors

For creating a predictive model, we need some variables. I will here just calculate [all molecular
descriptors available in RDKit](https://www.rdkit.org/docs/GettingStartedInPython.html#list-of-available-descriptors):

In [None]:
from rdkit.Chem import Descriptors, Descriptors3D
from rdkit.ML.Descriptors import MoleculeDescriptors

In [None]:
def calculate_rdkit_descriptors(molecules):
    """Return a DataFrame of RDKit descriptors, one row per molecule.

    Every descriptor listed in ``Descriptors._descList`` is evaluated;
    the columns carry the descriptor names.
    """
    descriptor_names = [entry[0] for entry in Descriptors._descList]
    calculator = MoleculeDescriptors.MolecularDescriptorCalculator(
        descriptor_names
    )
    rows = []
    for mol in tqdm(molecules):
        rows.append(calculator.CalcDescriptors(mol))
    return pd.DataFrame(np.array(rows), columns=descriptor_names)

In [None]:
# Evaluate the descriptors for all molecules and display the table.
rdkit_descriptors = calculate_rdkit_descriptors(molecules)
rdkit_descriptors

Let us do some preprocessing here:

1. Remove columns with nan/inf:

In [None]:
# Coerce every value to a number (entries that cannot be parsed become
# NaN), treat +/-inf as missing too, and drop each column that contains
# a missing value.
columns_before = set(rdkit_descriptors.columns)
rdkit_descriptors = (
    rdkit_descriptors.apply(pd.to_numeric, errors="coerce", axis=1)
    .replace([np.inf, -np.inf], np.nan)
    .dropna(axis=1)
)
diff = columns_before - set(rdkit_descriptors.columns)
if diff:
    print("Removed:", list(diff))

2. Remove variables with low variance:

In [None]:
from sklearn.feature_selection import VarianceThreshold

# Fit a VarianceThreshold with its default settings and keep only the
# columns it selects.
selector = VarianceThreshold().fit(rdkit_descriptors)
kept_columns = list(selector.get_feature_names_out())
dropped_columns = set(rdkit_descriptors.columns) - set(kept_columns)
rdkit_descriptors = rdkit_descriptors[kept_columns]
if dropped_columns:
    print("Removed:", list(dropped_columns))

3. Remove highly correlated columns. Some of the descriptors are essentially
   measuring the same thing, for instance, the different molecular weights:

In [None]:
# Pairwise correlation between the three molecular-weight descriptors.
rdkit_descriptors[["MolWt", "HeavyAtomMolWt", "ExactMolWt"]].corr()

In [None]:
# Absolute correlation above which the later column of a pair is removed.
max_correlation = 0.975

corr = rdkit_descriptors.corr().abs()
# Keep only the strict upper triangle so that every pair is inspected once
# and a column is only compared against the columns that precede it.
upper = corr.where(np.triu(np.ones(corr.shape), k=1).astype(bool))
# NaN entries (the masked lower triangle) compare as False.
to_drop = upper.columns[
    (upper > max_correlation).any().to_numpy()
].tolist()
if to_drop:
    print("Removed:", to_drop)
    # Re-bind instead of dropping in place: the frame may itself be the
    # result of a column selection, and mutating such a frame in place can
    # trigger pandas' chained-assignment warning.
    rdkit_descriptors = rdkit_descriptors.drop(columns=to_drop)
rdkit_descriptors

## Creating a predictive model

We now have some variables and can create a predictive model. For this, I will use [CatBoost](https://catboost.ai/) - it usually gives good results without too much parameter tuning. One could also use [XGBoost](https://xgboost.readthedocs.io/en/stable/), [LightGBM](https://lightgbm.readthedocs.io/), or linear models such as [LASSO](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Lasso.html) or [Elastic net](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.ElasticNet.html).

### Creating the training and test sets

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [None]:
# Feature matrix: all remaining descriptor columns.
variables = rdkit_descriptors.columns  # Just select all variables
X_raw = rdkit_descriptors[variables].to_numpy()
# Target: the measured solubility as a single-column 2D array.
y_raw = data["measured log(solubility:mol/L)"].to_numpy()[:, np.newaxis]

In [None]:
def split_and_scale(X_raw, y_raw):
    """Make a train/test split and standardise both parts.

    A third of the samples is held out for testing (fixed random state).
    The scalers are fitted on the training part only and then applied to
    both parts. Returns ``X_train, X_test, y_train, y_test, scale_x,
    scale_y``, where the last two are the fitted scalers.
    """
    parts = train_test_split(X_raw, y_raw, test_size=0.33, random_state=5)
    X_train, X_test, y_train, y_test = parts

    scale_x = StandardScaler().fit(X_train)
    scale_y = StandardScaler().fit(y_train)

    return (
        scale_x.transform(X_train),
        scale_x.transform(X_test),
        scale_y.transform(y_train),
        scale_y.transform(y_test),
        scale_x,
        scale_y,
    )

In [None]:
# Split and standardise the full descriptor set; the fitted scalers are
# kept so that predictions can be transformed back later.
X_train, X_test, y_train, y_test, scale_x, scale_y = split_and_scale(
    X_raw, y_raw
)

### Training the model
I said above that [CatBoost](https://catboost.ai/) usually gives good results without too much parameter tuning.
So I will do no parameter tuning here.

In [None]:
import catboost as cb

In [None]:
%%time
# Fit a CatBoost regressor on the scaled training data; apart from
# silencing the training log, no parameters are set.
model = cb.CatBoostRegressor(verbose=False)
model.fit(X_train, y_train)

### Assessing the model
To assess the model, I will plot the predicted and measured solubilities:

In [None]:
def plot_test_train_model(model, X_train, y_train, X_test, y_test):
    """Show measured vs. predicted values for the train and test sets.

    Two panels are drawn side by side: the training set on the left and
    the test set on the right. Predictions come from ``model.predict``.
    """
    fig, axes = plt.subplots(
        constrained_layout=True, ncols=2, figsize=(12, 5)
    )
    panels = (
        ("Training set", X_train, y_train),
        ("Test set", X_test, y_test),
    )
    for axi, (title, features, target) in zip(axes, panels):
        add_scatterplot(axi, target, model.predict(features))
        axi.set(xlabel="measured", ylabel="predicted", title=title)
        axi.legend()
    sns.despine(fig=fig, offset=10)

In [None]:
# Measured vs. predicted for the CatBoost model (scaled values).
plot_test_train_model(model, X_train, y_train, X_test, y_test)

That looks promising (**Note:** the `MAE` is here calculated for the scaled data).

Let us compare with the measured solubilities and the ESOL predicted solubilities.
For the comparison, I transform the output from the model back to solubilities:

In [None]:
# Predict for the complete data set and undo the target scaling so that
# the values are comparable with the measured and ESOL solubilities.
X = scale_x.transform(X_raw)
y_pred = model.predict(X)
model_predict = scale_y.inverse_transform(y_pred.reshape(-1, 1)).flatten()
models_table = pd.DataFrame(
    {
        "Measured": measured,
        "ESOL": esol,
        "CatBoost": model_predict,
    }
)

In [None]:
fig, (ax1, ax2) = plt.subplots(
    constrained_layout=True, ncols=2, figsize=(12, 5)
)

# Left panel: one density curve per column of the table.
for key in models_table:
    sns.kdeplot(data=models_table, x=key, ax=ax1, label=key, lw=5)
ax1.legend()
ax1.set(xlabel="Solubility", title="Distribution of solubilities")

# Right panel: every model column plotted against the measurement.
ax2.scatter([], [])  # Just to cycle colors
for key in models_table:
    if key == "Measured":
        continue
    add_scatterplot(
        ax2, models_table["Measured"], models_table[key], model_name=key
    )
ax2.set(
    xlabel="Measured solubility",
    ylabel="Predicted solubility",
    title="Measured vs. predicted",
)
ax2.legend()
sns.despine(fig=fig, offset=10)

The model seems to improve the over/underestimation in ESOL.
But the new model uses **many** variables. Let us inspect it to see
if we can simplify it.

### Inspecting the model and creating a simplified model
To simplify the model, I aim to create a linear model with few (say 4) features. To select these features I
inspect their [importance](https://catboost.ai/en/docs/concepts/fstr#regular-feature-importance) and
[shap values](https://shap.readthedocs.io/en/latest/index.html):

In [None]:
feature_importance = model.get_feature_importance()
# Indices that sort the features by ascending importance.
idx = np.argsort(feature_importance)

# Just show the 10 most important:
n_show = 10
top = idx[-n_show:]
pos = np.arange(len(top))

fig, ax = plt.subplots(constrained_layout=True, figsize=(8, 6))
ax.barh(pos, feature_importance[top])
# Create ticks and labels only for the bars that are actually drawn,
# rather than for every feature.
ax.set_yticks(pos)
ax.set_yticklabels(variables[top])
ax.set(xlabel="Feature importance")
sns.despine(fig=fig, offset=10)

In [None]:
import shap

# Build a SHAP explainer for the fitted model and evaluate it on the
# scaled descriptors of the complete data set.
explainer = shap.Explainer(model, feature_names=variables)
shap_values = explainer(X)

In [None]:
fig, ax = plt.subplots()
# show=False keeps the figure open so that it can be adjusted below.
shap.summary_plot(
    shap_values,
    features=X,
    show=False,
    max_display=10,
)
# NOTE(review): the last axes of the figure is presumably the colour bar
# added by summary_plot - confirm against the shap version in use.
cbar = fig.axes[-1]
cbar.set_aspect("auto")
fig.tight_layout()
cbar.set_box_aspect(25)

Here we see, for instance, that a higher `MolLogP` has a negative
impact on solubility and that a lower molecular weight (`MolWt`)
has a positive impact. This is probably what you could have guessed before
making the model. Here is a closer inspection of the molecule with the highest solubility:

In [None]:
# Per-feature contributions for the molecule at index idx_max.
shap.plots.waterfall(shap_values[idx_max])

and for the lowest solubility:

In [None]:
# Per-feature contributions for the molecule at index idx_min.
shap.plots.waterfall(shap_values[idx_min])

We see here that `MolLogP` has a positive impact on the prediction for the molecule with highest solubility, and a negative impact for the molecule with the lowest solubility.

OK, so we have an idea of the most important variables. Let us pick 4 simple ones (from the first plot
of the feature importance) and make a linear model,
for instance:

1. `MolLogP` (Wildman-Crippen LogP value.)
2. `MolWt` (The molecular weight.)
3. `MinPartialCharge` (Smallest [Gasteiger](https://doi.org/10.1016/0040-4020(80)80168-2) partial charge.)
4. `NOCount` (Number of Nitrogens and Oxygens.)

In [None]:
# Descriptor columns used as inputs of the simplified model.
variables2 = [
    "MolLogP",
    "MolWt",
    "MinPartialCharge",
    "NOCount",
]

In [None]:
# Same split and scaling as for the full model, restricted to the four
# selected descriptors.
X_raw2 = rdkit_descriptors[variables2].to_numpy()

X_train2, X_test2, y_train2, y_test2, scale_x2, scale_y2 = split_and_scale(
    X_raw2, y_raw
)

In [None]:
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV

In [None]:
%%time
# Candidate regularisation strengths: 20 values, log-spaced from 1e-3 to 1.
parameters = {"alpha": np.logspace(-3, 0, 20)}
# No intercept is fitted. Select alpha by 10-fold cross-validation.
grid = GridSearchCV(
    estimator=Lasso(fit_intercept=False, max_iter=10000),
    param_grid=parameters,
    cv=10,
)
model2 = grid.fit(X_train2, y_train2).best_estimator_
model2

In [None]:
# Measured vs. predicted for the simplified model (scaled values).
plot_test_train_model(model2, X_train2, y_train2, X_test2, y_test2)

The simplified model is:

In [None]:
from IPython.display import display, Math

# Format every coefficient with an explicit sign so that consecutive terms
# are separated by "+" or "-" instead of being fused together. The
# coefficients refer to the standardised variables the model was fitted on.
terms = [
    f"{coef:+.2g} \\times (\\text{{{var}}})"
    for coef, var in zip(model2.coef_, variables2)
]
# A leading "+" on the first term is redundant.
equation = "y = " + " ".join(terms).lstrip("+")
display(Math(equation))

This simplified model has a performance similar to the ESOL model.