In [None]:
!conda activate rdkit
!pip install dtreeviz rdkit-pypi PyTDC

In [None]:
import pandas as pd #data table manipulation
from rdkit import Chem # basic cheminformatics
from rdkit.Chem.Descriptors import MolWt, MolLogP, NumAromaticRings, NumHDonors, NumHAcceptors
import math #needed for log10
import os # creating the output directory
import seaborn as sns #plotting
from sklearn.tree import DecisionTreeClassifier, plot_tree # decision trees
from sklearn.model_selection import train_test_split # split a dataset
from tqdm import tqdm # progress bar
from matplotlib import pyplot as plt # plotting
from dtreeviz.trees import * #plotting decision trees
from sklearn.metrics import roc_auc_score, matthews_corrcoef, cohen_kappa_score, plot_roc_curve, plot_confusion_matrix # model stats
from tdc.single_pred import ADME
from rdkit.Chem.MolStandardize import rdMolStandardize
from rdkit.rdBase import BlockLogs



In [None]:
# Give pandas objects the tqdm-backed `progress_apply` methods.
tqdm.pandas()

# Fetch the AqSolDB aqueous-solubility dataset through the TDC ADME loader.
data = ADME(name='Solubility_AqSolDB')

# Label the three returned columns positionally, then put SMILES first.
sol_df = (
    data.get_data()
    .set_axis(["Name", "SMILES", "LogS"], axis=1)
    .loc[:, ['SMILES', 'Name', 'LogS']]
)
sol_df



Before we calculate descriptors, we're going to standardize the molecules. This process removes counterions, neutralizes molecules, and generates a standard tautomer. It also generates a lot of logging output that we don't want to look at, so we create a `BlockLogs` object to temporarily turn off logging. Once we're finished, we delete the variable `block`, and logging is turned back on.


In [None]:
# Standardization step (currently disabled). Logging is suppressed while the
# BlockLogs object exists; deleting it turns logging back on.
# NOTE(review): `standardize` is not defined in the visible part of this notebook,
# which is presumably why the call is commented out -- confirm before re-enabling.
block = BlockLogs()
#sol_df['mol'] = sol_df.SMILES.progress_apply(standardize)
del block

# Cutoff used to label a compound as soluble: log10(200e-6).
# NOTE(review): presumably 200 µM, with LogS being log10 of molar solubility -- confirm units.
LOGS_SOLUBLE_CUTOFF = math.log10(200 * 1e-6)
sol_df['IsSol'] = sol_df.LogS > LOGS_SOLUBLE_CUTOFF

sns.set_context('talk') # Set the fonts the way I like them
# Distribution of LogS, colored by the soluble / insoluble label.
sns.displot(x='LogS',hue="IsSol",data=sol_df)

# to_csv does not create missing directories, so make sure the output folder
# exists; otherwise a fresh checkout fails on the write below.
os.makedirs("data", exist_ok=True)
sol_df.to_csv("data/soldata.csv")