In [None]:
import os
import subprocess

import joblib
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
import numpy as np
import pandas as pd
import seaborn as sns
import tqdm.notebook as tqdm
import xml.etree.ElementTree as ET
from mordred import Calculator, descriptors
from rdkit import Chem, RDLogger

from correlation_threshold import CorrelationThreshold

In [None]:
import warnings

# Keep the notebook output readable: ignore runtime and user warnings and
# switch off the RDKit 'rdApp.*' log channels.
for ignored_category in (RuntimeWarning, UserWarning):
    warnings.filterwarnings('ignore', category=ignored_category)
RDLogger.DisableLog('rdApp.*')

In [None]:
# Plot styling.
# Matplotlib 3.6 renamed its bundled seaborn style sheets to
# 'seaborn-v0_8-<name>' and later removed the legacy names, so fall back to
# the new spelling whenever the legacy name is not available.
plt.style.use([style if style in plt.style.available
               else style.replace('seaborn-', 'seaborn-v0_8-', 1)
               for style in ('seaborn-white', 'seaborn-paper')])
plt.rc('font', family='sans-serif')
sns.set_context('paper', font_scale=1.3)
# Custom colour cycle for all figures. (A preceding 'Set1' palette call was
# dropped because it was overridden by this one before any plot was drawn.)
sns.set_palette(['#6da7de', '#9e0059', '#dee000', '#d82222', '#5ea15d',
                 '#943fa6', '#63c5b5', '#ff38ba', '#eb861e', '#ee266d'])

## Parse FDA approved drugs from DrugBank

In [None]:
if not os.path.isfile('../data/processed/fda.csv'):
    # Parse DrugBank XML file.
    ns = '{http://www.drugbank.ca}'
    tree = ET.parse('../data/external/drugbank_517.xml')
    # One row per drug: primary DrugBank ID, name, '|'-joined groups,
    # '|'-joined ATC codes and the calculated SMILES (None when absent).
    rows = [(drug.findtext(ns + 'drugbank-id[@primary="true"]'),
             drug.findtext(ns + 'name'),
             '|'.join([group.text for group in
                       drug.findall(f'{ns}groups/{ns}group')]),
             '|'.join([code.get('code') for code in
                       drug.findall(f'{ns}atc-codes/{ns}atc-code')]),
             drug.findtext(f'{ns}calculated-properties/{ns}'
                           f'property[{ns}kind="SMILES"]/{ns}value'))
            for drug in tree.getroot()]

    approved_drugs = (pd.DataFrame(rows, columns=['drugbank_id', 'name',
                                                  'groups', 'atc_codes',
                                                  'smiles'])
                      .dropna(subset=['smiles']))
    # Filter on FDA approved drugs.
    # Match 'approved' as a complete '|'-separated entry: a plain substring
    # test would also accept drugs whose only matching group is
    # 'vet_approved'. `.copy()` makes the filtered frame independent so the
    # column assignment below does not write into a slice.
    is_approved = approved_drugs['groups'].str.contains(
        r'(?:^|\|)approved(?:\||$)')
    approved_drugs = approved_drugs[is_approved].copy()
    # Only retain drugs with valid and unique SMILES.
    # SMILES are rewritten without stereochemistry (isomericSmiles=False);
    # strings RDKit cannot parse become None and are dropped below.
    smiles = []
    for drug_smiles in approved_drugs['smiles']:
        mol = Chem.MolFromSmiles(drug_smiles)
        smiles.append(Chem.MolToSmiles(mol, isomericSmiles=False)
                      if mol is not None else None)
    approved_drugs['smiles'] = smiles
    approved_drugs = (approved_drugs.dropna(subset=['smiles'])
                      .drop_duplicates('smiles')
                      .reset_index(drop=True))
    # Make sure the cache directory exists before writing into it.
    os.makedirs('../data/processed', exist_ok=True)
    approved_drugs.to_csv('../data/processed/fda.csv', index=False)
else:
    # Reuse the cached result of a previous run.
    approved_drugs = pd.read_csv('../data/processed/fda.csv')

## Epidermis probability for FDA approved drugs

In [None]:
# Load the trained classifier.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code;
# only load 'rf.joblib' when it comes from a trusted source.
classifier = joblib.load('rf.joblib')

In [None]:
# Calculate features using Mordred.
# The calculator is built from the full `descriptors` module with
# `ignore_3D=True`. The same instance is reused for the training compounds,
# the approved drugs and the biotransformation products.
mordred_calculator = Calculator(descriptors, ignore_3D=True)

In [None]:
# Get the original feature labels used during training.
# The descriptors of the training compounds are recomputed and every
# object-typed column is discarded; the remaining column labels define which
# features are selected for new molecules.
compounds = pd.read_csv('../data/compound_smiles.csv')
mols = compounds['smiles'].apply(Chem.MolFromSmiles)
numeric_descriptors = (mordred_calculator.pandas(mols)
                       .select_dtypes(exclude='object'))
features_orig = pd.DataFrame(numeric_descriptors.astype(np.float32))
feature_labels = features_orig.columns

In [None]:
if 'epidermis_prob' in approved_drugs.columns:
    # Predictions were already stored by an earlier run; reload them.
    approved_drugs = pd.read_csv('../data/processed/fda.csv')
else:
    # Generate features for the FDA approved drugs.
    # (Exclude features not encountered during training.)
    mols = approved_drugs['smiles'].apply(Chem.MolFromSmiles)
    drug_descriptors = mordred_calculator.pandas(mols)[feature_labels]
    features = pd.DataFrame(drug_descriptors.astype(np.float32))

    # Column 1 of predict_proba is stored as the epidermis probability.
    drug_probabilities = classifier.predict_proba(features.values)[:, 1]
    approved_drugs['epidermis_prob'] = drug_probabilities
    approved_drugs.to_csv('../data/processed/fda.csv', index=False)

In [None]:
# Ten approved drugs with the highest predicted epidermis probability.
(approved_drugs[['name', 'epidermis_prob']]
 .sort_values('epidermis_prob', ascending=False)
 .head(10))

## Epidermis probability for drug biotransformation products

In [None]:
data_dir = os.path.abspath(os.path.join(
    os.getcwd(), '../data/processed/biotransformer'))
# exist_ok avoids the check-then-create race of a separate isdir() test.
os.makedirs(data_dir, exist_ok=True)
bin_dir = os.path.abspath(os.path.join(os.getcwd(), '../bin/biotransformer'))


# https://bitbucket.org/djoumbou/biotransformer/
def _biotransform(drugbank_id, smiles):
    """Predict human biotransformation products for a single compound.

    Runs BioTransformer (``-b allHuman -k pred``) with ``bin_dir`` as working
    directory and lets it write its CSV output to
    ``<data_dir>/<drugbank_id>.csv``. Nothing is run if that file already
    exists, so interrupted runs can be resumed.

    Parameters
    ----------
    drugbank_id : str
        Identifier used to name the output file.
    smiles : str
        SMILES string passed to BioTransformer via ``-ismi``.

    Raises
    ------
    OSError
        If the ``java`` executable or ``bin_dir`` cannot be found.

    Notes
    -----
    A non-zero exit status of BioTransformer is deliberately ignored (best
    effort); in that case no output file may exist for this compound.
    """
    output_file = os.path.join(data_dir, f'{drugbank_id}.csv')
    if os.path.isfile(output_file):
        return
    # Pass the arguments as a list instead of a shell string: SMILES and
    # paths can contain characters the shell would interpret, and this also
    # removes the dependency on `cd`, `&&` and /dev/null.
    subprocess.run(
        ['java', '-jar', 'biotransformer-2.0.1.jar',
         '-b', 'allHuman',
         '-ismi', smiles,
         '-k', 'pred',
         '-ocsv', output_file],
        cwd=bin_dir,
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL)

In [None]:
if not os.path.isfile('../data/processed/fda_biotransformations.csv'):
    # Generate human biotransformation products.
    joblib.Parallel(n_jobs=-1)(
        joblib.delayed(_biotransform)(drugbank_id, smiles)
        for drugbank_id, smiles in tqdm.tqdm(
            zip(approved_drugs['drugbank_id'], approved_drugs['smiles']),
            desc='Biotransformations predicted', total=len(approved_drugs)))
    # Read all biotransformation files for each drug.
    # Drugs without an output file are skipped.
    biotransform_frames = []
    for drugbank_id in approved_drugs['drugbank_id']:
        filename = os.path.join(data_dir, f'{drugbank_id}.csv')
        if os.path.isfile(filename):
            biotransform = pd.read_csv(filename, usecols=['SMILES'])
            biotransform['drugbank_id'] = drugbank_id
            biotransform_frames.append(biotransform)
    if not biotransform_frames:
        # pd.concat would fail on an empty list with a much less helpful
        # message; keep the exception type but say what is wrong.
        raise ValueError(
            f'No biotransformation output files found in {data_dir}')
    # Only drop rows with a missing SMILES. A bare dropna() would also drop
    # products of drugs without ATC codes, but only when `approved_drugs` was
    # loaded from the cached CSV (empty strings are read back as NaN), which
    # made the result depend on whether the cache existed.
    biotransformations = (pd.merge(pd.concat(biotransform_frames),
                                   approved_drugs[['drugbank_id',
                                                   'atc_codes']],
                                   on='drugbank_id')
                          .dropna(subset=['SMILES'])
                          .rename(columns={'SMILES': 'smiles'}))
    # Only retain biotransformations with valid and unique SMILES.
    # SMILES are rewritten without stereochemistry (isomericSmiles=False);
    # strings RDKit cannot parse become None and are dropped below.
    smiles = []
    for biotransform_smiles in biotransformations['smiles']:
        mol = Chem.MolFromSmiles(biotransform_smiles)
        smiles.append(Chem.MolToSmiles(mol, isomericSmiles=False)
                      if mol is not None else None)
    biotransformations['smiles'] = smiles
    biotransformations = (biotransformations.dropna(subset=['smiles'])
                          .drop_duplicates('smiles')
                          .reset_index(drop=True))
    biotransformations.to_csv('../data/processed/fda_biotransformations.csv',
                              index=False)
else:
    # Reuse the cached result of a previous run.
    biotransformations = pd.read_csv(
        '../data/processed/fda_biotransformations.csv')

In [None]:
if 'epidermis_prob' in biotransformations.columns:
    # Predictions were already stored by an earlier run; reload them.
    biotransformations = pd.read_csv(
        '../data/processed/fda_biotransformations.csv')
else:
    # Generate features for the biotransformations.
    # (Exclude features not encountered during training.)
    mols = biotransformations['smiles'].apply(Chem.MolFromSmiles)
    product_descriptors = mordred_calculator.pandas(mols)[feature_labels]
    features = pd.DataFrame(product_descriptors.astype(np.float32))

    # Column 1 of predict_proba is stored as the epidermis probability.
    product_probabilities = classifier.predict_proba(features.values)[:, 1]
    biotransformations['epidermis_prob'] = product_probabilities
    biotransformations.to_csv('../data/processed/fda_biotransformations.csv',
                              index=False)

## Plotting

In [None]:
# First letter of an ATC code -> name of the corresponding top-level group.
atc_map = {
    'A': 'Alimentary tract and metabolism',
    'B': 'Blood and blood forming organs',
    'C': 'Cardiovascular system',
    'D': 'Dermatologicals',
    'G': 'Genito-urinary system and sex hormones',
    'H': 'Systemic hormonal preparations, excluding sex hormones and insulins',
    'J': 'Antiinfectives for systemic use',
    'L': 'Antineoplastic and immunomodulating agents',
    'M': 'Musculo-skeletal system',
    'N': 'Nervous system',
    'P': 'Antiparasitic products, insecticides and repellents',
    'R': 'Respiratory system',
    'S': 'Sensory organs',
    'V': 'Various',
}

# Label both data sets, stack them, and expand the '|'-separated ATC codes
# so that every (compound, ATC code) pair gets its own row.
approved_drugs['type'] = 'FDA approved drugs'
biotransformations['type'] = 'Human biotransformations'
combined = (
    pd.concat([approved_drugs, biotransformations], ignore_index=True)
    .assign(atc_codes=lambda frame: frame['atc_codes'].str.split('|'))
    .explode('atc_codes'))
# Codes whose first letter is not a key of `atc_map` (including missing
# codes) end up as NaN.
combined['atc_level1'] = combined['atc_codes'].str[:1].map(atc_map)

In [None]:
# Density of the predicted epidermis probability, one curve per compound
# type.
width = 7
height = width / 1.618
fig, ax = plt.subplots(figsize=(width, height))

# common_norm=False is passed so the two curves are not normalised jointly.
sns.kdeplot(x='epidermis_prob', hue='type', data=combined,
            fill=True, common_norm=False, ax=ax)

# Probabilities are shown as percentages on the x axis.
ax.set(xlim=(0, 1.01),
       xlabel='Predicted probability to observe on epidermis')
ax.xaxis.set_major_formatter(mticker.PercentFormatter(xmax=1))

# The legend entries are self-explanatory, so blank out the legend title.
ax.legend_.set_title('')

sns.despine()

plt.savefig('fda_biotransformations_predictions.png',
            dpi=300, bbox_inches='tight')
plt.show()
plt.close()

In [None]:
# Split violin plot of the predicted epidermis probability per ATC group,
# comparing approved drugs with their biotransformation products.
width = 7
height = width / 1.618
fig, ax = plt.subplots(figsize=(width, height * 2))

combined_atc = combined[~combined['atc_level1'].isin(['', 'Dermatologicals'])]
# Order the groups by decreasing median probability.
# Select the numeric column before aggregating: calling median() on the whole
# frame raises a TypeError on pandas >= 2.0 because of the string columns.
order = (combined_atc.groupby('atc_level1')['epidermis_prob'].median()
         .sort_values(ascending=False).index)

sns.violinplot(x='epidermis_prob', y='atc_level1', hue='type',
               data=combined_atc, order=order, split=True, orient='h', ax=ax)

ax.set_xlim(0, 1.01)
ax.xaxis.set_major_formatter(mticker.PercentFormatter(xmax=1))

ax.set_xlabel('Predicted probability to observe on epidermis')
ax.set_ylabel('ATC level 1')

# Place the legend above the axes, with both entries side by side.
ax.legend(loc='upper center', bbox_to_anchor=(0.5, 1.06), ncol=2)

sns.despine()

plt.savefig('fda_biotransformations_predictions_atc.png', dpi=300,
            bbox_inches='tight')
plt.show()
plt.close()