In [None]:
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from mordred import Calculator, descriptors
from rdkit import Chem, RDLogger

In [None]:
import warnings

# Suppress RuntimeWarning and UserWarning messages, then switch off every
# RDKit logger matching 'rdApp.*'.
for ignored_category in (RuntimeWarning, UserWarning):
    warnings.filterwarnings('ignore', category=ignored_category)
RDLogger.DisableLog('rdApp.*')

In [None]:
# Plot styling.
# Matplotlib 3.6 renamed the bundled seaborn style sheets to 'seaborn-v0_8-*'
# and later removed the old names, so use whichever spelling this Matplotlib
# installation actually provides.
style_sheets = [
    name if name in plt.style.available
    else name.replace('seaborn-', 'seaborn-v0_8-', 1)
    for name in ('seaborn-white', 'seaborn-paper')
]
plt.style.use(style_sheets)
plt.rc('font', family='sans-serif')
sns.set_palette('Set1')
sns.set_context('paper', font_scale=1.3)

In [None]:
class CorrelationTreshold:
    """Feature selector that removes highly correlated columns.

    A column is dropped when its absolute Pearson correlation with any
    *later* column exceeds ``threshold``; the last column of each correlated
    group is therefore the one that is kept.

    NOTE(review): the class name is misspelled ("Treshold"). The pickled
    classifier loaded in this notebook presumably refers to the class (and to
    its ``threshold`` / ``mask`` attributes) by these exact names, so they
    are left untouched -- confirm before renaming.

    Parameters
    ----------
    threshold : float, optional
        Absolute correlation above which a column is dropped. ``None``
        (the default) is treated as ``1.0``.

    Attributes
    ----------
    mask : numpy.ndarray of bool
        Set by ``fit``; ``True`` for every column that is kept.
    """

    def __init__(self, threshold=None):
        self.threshold = threshold if threshold is not None else 1.0

    def fit(self, X, y=None):
        """Compute the boolean column mask from the correlations in ``X``.

        ``X`` is a 2-D array-like with one column per feature; ``y`` is
        ignored. Returns ``self``.
        """
        # np.corrcoef collapses to a scalar when X has a single column, and
        # np.triu rejects 0-d input; atleast_2d turns that case into [[1.0]].
        corr = np.atleast_2d(np.abs(np.corrcoef(X, rowvar=False)))
        # Only the strict upper triangle is inspected, so each pair is looked
        # at once and the self-correlation on the diagonal is ignored.
        # NaN correlations compare as False, so such columns are retained.
        correlated_with_later = np.triu(corr, k=1) > self.threshold
        self.mask = ~correlated_with_later.any(axis=1)
        return self

    def transform(self, X, y=None):
        """Return the columns of ``X`` selected by the fitted mask.

        Non-ndarray inputs (lists, DataFrames, ...) are converted with
        ``np.asarray`` first; ndarrays and their subclasses are indexed as-is.
        """
        data = X if isinstance(X, np.ndarray) else np.asarray(X)
        return data[:, self.mask]

    def fit_transform(self, X, y=None):
        """Fit on ``X`` and return the reduced ``X`` in one call."""
        return self.fit(X, y).transform(X, y)

    def get_support(self, indices=False):
        """Return the boolean mask, or the kept column indices if ``indices`` is true."""
        if indices:
            return np.where(self.mask)[0]
        return self.mask

In [None]:
# Load the trained classifier.
# NOTE: unpickling can execute arbitrary code -- only load 'rf.pickle' from a
# trusted source.
with open('rf.pickle', mode='rb') as model_file:
    classifier = pickle.load(model_file)

In [None]:
# Calculate features using Mordred.
# The calculator is built from the imported `descriptors` module with
# ignore_3D=True; the same instance is reused for every featurisation below.
mordred_calculator = Calculator(descriptors, ignore_3D=True)

In [None]:
# Get the original feature labels used during training.
# The descriptor table is computed for the compounds in the CSV, columns of
# dtype object are discarded, and the rest is cast to float32; the remaining
# column names are the feature labels.
compounds = pd.read_csv('../data/compound_smiles.csv')
mols = compounds['SMILES (Canonical)'].apply(Chem.MolFromSmiles)
descriptors_orig = mordred_calculator.pandas(mols)
numeric_descriptors_orig = descriptors_orig.select_dtypes(exclude='object')
features_orig = pd.DataFrame(numeric_descriptors_orig.astype(np.float32))
feature_labels = features_orig.columns

In [None]:
# Read PubChem SMILES data.
# https://chemistry.stackexchange.com/a/122118
# `read_csv(squeeze=True)` was deprecated in pandas 1.4 and removed in 2.0;
# squeezing the single-column frame afterwards yields the same Series.
# Only the first 1000 rows of the file are read (nrows=1000).
compounds = pd.read_csv('../data/CID-SMILES', sep='\t', header=None,
                        names=['SMILES'], index_col=0,
                        nrows=1000).squeeze('columns')
# Chem.MolFromSmiles returns None for SMILES it cannot parse; drop those
# entries so that only valid molecules reach the descriptor calculator.
mols = compounds.apply(Chem.MolFromSmiles).dropna()
# Exclude features not encountered during training.
features = pd.DataFrame(mordred_calculator.pandas(mols)[feature_labels]
                        .astype(np.float32))

In [None]:
# Keep only the second column (index 1) of the predicted class probabilities.
# NOTE(review): presumably index 1 is the "observed on epidermis" class --
# confirm against classifier.classes_.
feature_matrix = features.values
class_probabilities = classifier.predict_proba(feature_matrix)
pred_scores = class_probabilities[:, 1]

In [None]:
# Figure size: fixed width, height derived from the golden ratio.
width = 7
height = width / 1.618    # Golden ratio.
fig, ax = plt.subplots(figsize=(width, height))

# Histogram of the predicted scores with bin edges every 0.05 from 0 to 1,
# drawn explicitly on the axes created above.
score_bins = np.arange(0, 1.01, 0.05)
sns.histplot(pred_scores, bins=score_bins, ax=ax)
ax.set(xlabel='Predicted probability to observe on epidermis')
sns.despine(fig=fig)

# Save to disk, display, then release the figure.
fig.savefig('pubchem_predictions.png', dpi=300, bbox_inches='tight')
plt.show()
plt.close(fig)