This notebook is an experimental practice run using CYP3A4 inhibition data.

Molecular descriptors are calculated from the SMILES representations and then filtered down to the features that will be needed.

In [3]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from rdkit import Chem
from rdkit.Chem import Draw
from mordred import Calculator, descriptors
from mordred._base import Descriptor

from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import StandardScaler

from tqdm import tqdm

In [4]:
# Read the train / test / validation splits; each file carries the SMILES strings used below
cyp3a4_inhib_train, cyp3a4_inhib_test, cyp3a4_inhib_val = map(
    pd.read_csv,
    (
        'data/cyp3a4_inhibitor_train.csv',
        'data/cyp3a4_inhibitor_test.csv',
        'data/cyp3a4_inhibitor_val.csv',
    ),
)


In [5]:
# Work out which descriptor features to keep, using the first 100 training molecules
smiles_list = cyp3a4_inhib_train['smiles_standarized'][0:100]
# Parse every SMILES and keep only the molecules RDKit managed to build
mols = [mol for mol in map(Chem.MolFromSmiles, smiles_list) if mol is not None]

# Mordred calculator over the full descriptor set, 3D descriptors excluded
calc = Calculator(descriptors, ignore_3D=True)
df_desc = calc.pandas(mols, nproc=4)

# Remove low-variance descriptors: near-constant columns carry minimal information for modeling
selector = VarianceThreshold(threshold=0.01)
selector.fit(df_desc)
df_filtered = pd.DataFrame(
    selector.transform(df_desc),
    columns=df_desc.columns[selector.get_support()],
)

# Standardize the surviving descriptors (useful as the data contain logarithmic values)
scaler = StandardScaler()
scaler.fit(df_filtered)
df_scaled = pd.DataFrame(
    scaler.transform(df_filtered),
    index=df_filtered.index,
    columns=df_filtered.columns,
)

# Drop highly correlated (near-duplicate) descriptors.
# Correlation threshold: 0.85
threshold = 0.85
corr_matrix = df_scaled.corr().abs()
# Only the strict upper triangle is kept, so each descriptor pair is examined once
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1))
# A column is dropped when it correlates above the threshold with any earlier column
to_drop = upper.columns[(upper > threshold).any(axis=0)].tolist()
df_reduced = df_filtered.drop(columns=to_drop)


 58%|█████▊    | 58/100 [00:54<01:17,  1.85s/it]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


100%|██████████| 100/100 [01:12<00:00,  1.39it/s]
  self.variances_ = np.nanvar(X, axis=0)


In [6]:
# Names of the descriptors that survived the filtering above
selected_desc = df_reduced.columns.tolist()
print(len(selected_desc))  # Number of descriptors after filtering

# Run the full Mordred calculator on one small molecule purely to obtain its output column names
calc_all = Calculator(descriptors, ignore_3D=True)
mol = Chem.MolFromSmiles("CCO")
df_all = calc_all.pandas([mol])

# Map each output column name back to the descriptor object that produces it
desc_map = {
    str(desc): desc
    for desc in calc_all.descriptors
    if str(desc) in df_all.columns
}

# Descriptor objects for the selected names; names without a matching object are skipped
selected_desc_objects = [desc_map[name] for name in selected_desc if name in desc_map]



266


100%|██████████| 1/1 [00:00<00:00,  2.77it/s]


In [8]:
def desc_extract_batch(df, batch_size=1000, nproc=4):
    """Compute the selected Mordred descriptors for every SMILES in ``df``, batch by batch.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a ``smiles_standarized`` column holding SMILES strings.
    batch_size : int, default 1000
        Number of rows converted and passed to Mordred per iteration.
    nproc : int, default 4
        Number of worker processes passed to ``Calculator.pandas``.

    Returns
    -------
    pandas.DataFrame
        Exactly one row per row of ``df``, in the same order and carrying the
        same index. Rows whose SMILES is not a string or cannot be parsed by
        RDKit are kept and filled with NaN instead of being dropped, so the
        result stays row-aligned with ``df`` for a column-wise concat.
        An empty ``df`` yields an empty frame.

    Raises
    ------
    ValueError
        If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    # Kept from the original implementation: registers tqdm's progress_* methods on pandas objects.
    tqdm.pandas()
    calc_selected = Calculator(selected_desc_objects, ignore_3D=True)
    smiles = df['smiles_standarized']

    all_desc = []

    for i in range(0, len(smiles), batch_size):
        batch = smiles.iloc[i:i+batch_size]
        print(f"\n▶ Processing batch {i}–{i+len(batch)-1}...")

        # Non-string entries (e.g. NaN from an empty CSV field) are treated as failed conversions.
        mols = [
            Chem.MolFromSmiles(s) if isinstance(s, str) else None
            for s in tqdm(batch, desc="SMILES → Mol")
        ]
        parsed = np.array([mol is not None for mol in mols], dtype=bool)
        valid_mols = [mol for mol in mols if mol is not None]

        if valid_mols:
            df_desc = calc_selected.pandas(valid_mols, nproc=nproc)
        else:
            # Nothing to compute in this batch; build an empty frame with the expected columns.
            # Assumes Mordred names its output columns str(descriptor) — confirm if this path is hit.
            df_desc = pd.DataFrame(columns=[str(d) for d in calc_selected.descriptors])

        if parsed.all():
            df_desc.index = batch.index
        else:
            print(f"⚠ {int((~parsed).sum())} SMILES could not be parsed; their descriptor rows are NaN")
            # Put each computed row back at its position inside the batch, pad the gaps
            # with NaN rows, then restore the input labels. Working by position keeps
            # this correct even when the input index contains duplicate labels.
            df_desc.index = np.flatnonzero(parsed)
            df_desc = df_desc.reindex(range(len(batch)))
            df_desc.index = batch.index

        all_desc.append(df_desc)

    if not all_desc:
        # Empty input: pd.concat would raise on an empty list.
        return pd.DataFrame(columns=[str(d) for d in calc_selected.descriptors])

    # Combine all batches
    return pd.concat(all_desc, axis=0)

In [157]:
# Compute the selected descriptors for the training split (processed in batches of 1000 rows)
train_df_batch = desc_extract_batch(cyp3a4_inhib_train)


▶ Processing batch 0–999...


SMILES → Mol: 100%|██████████| 1000/1000 [00:00<00:00, 12832.70it/s]



▶ Processing batch 1000–1999...


SMILES → Mol: 100%|██████████| 1000/1000 [00:00<00:00, 13579.91it/s]



▶ Processing batch 2000–2999...


SMILES → Mol: 100%|██████████| 1000/1000 [00:00<00:00, 6972.63it/s]



▶ Processing batch 3000–3999...


SMILES → Mol: 100%|██████████| 1000/1000 [00:00<00:00, 14229.85it/s]



▶ Processing batch 4000–4999...


SMILES → Mol: 100%|██████████| 1000/1000 [00:00<00:00, 12898.09it/s]



▶ Processing batch 5000–5999...


SMILES → Mol: 100%|██████████| 1000/1000 [00:00<00:00, 13283.33it/s]



▶ Processing batch 6000–6999...


SMILES → Mol: 100%|██████████| 1000/1000 [00:00<00:00, 13484.38it/s]



▶ Processing batch 7000–7999...


SMILES → Mol: 100%|██████████| 1000/1000 [00:00<00:00, 12914.09it/s]



▶ Processing batch 8000–8999...


SMILES → Mol: 100%|██████████| 1000/1000 [00:00<00:00, 13380.84it/s]



▶ Processing batch 9000–9999...


SMILES → Mol: 100%|██████████| 1000/1000 [00:00<00:00, 13719.29it/s]



▶ Processing batch 10000–10999...


SMILES → Mol: 100%|██████████| 1000/1000 [00:00<00:00, 12089.87it/s]



▶ Processing batch 11000–11999...


SMILES → Mol: 100%|██████████| 1000/1000 [00:00<00:00, 12347.96it/s]



▶ Processing batch 12000–12999...


SMILES → Mol: 100%|██████████| 1000/1000 [00:00<00:00, 14393.04it/s]



▶ Processing batch 13000–13999...


SMILES → Mol: 100%|██████████| 1000/1000 [00:00<00:00, 14192.48it/s]



▶ Processing batch 14000–14999...


SMILES → Mol: 100%|██████████| 1000/1000 [00:00<00:00, 14216.44it/s]



▶ Processing batch 15000–15999...


SMILES → Mol: 100%|██████████| 1000/1000 [00:00<00:00, 14301.85it/s]



▶ Processing batch 16000–16999...


SMILES → Mol: 100%|██████████| 1000/1000 [00:00<00:00, 14732.93it/s]



▶ Processing batch 17000–17999...


SMILES → Mol: 100%|██████████| 1000/1000 [00:00<00:00, 14541.89it/s]



▶ Processing batch 18000–18427...


SMILES → Mol: 100%|██████████| 428/428 [00:00<00:00, 14221.58it/s]


In [162]:
# Re-number the descriptor rows 0..n-1 so they pair up positionally with the raw table
train_df_batch = train_df_batch.reset_index(drop=True)

# A column-wise concat aligns on the index. If the row counts differ, descriptors would be
# attached to the wrong molecules or padded with NaN, so stop before writing a corrupted file.
assert len(train_df_batch) == len(cyp3a4_inhib_train), (
    f"descriptor rows ({len(train_df_batch)}) != input rows ({len(cyp3a4_inhib_train)})"
)
concat_3a4_train = pd.concat(
    [cyp3a4_inhib_train.reset_index(drop=True), train_df_batch], axis=1
)

# DataFrame.info() prints its own report and returns None, so it is not wrapped in print()
train_df_batch.info()
concat_3a4_train.info()

# Save the processed DataFrame to a CSV file
concat_3a4_train.to_csv('cyp3a4_inhibitor_train_desc.csv', index=False)

<class 'mordred._base.pandas_module.MordredDataFrame'>
RangeIndex: 18428 entries, 0 to 18427
Columns: 266 entries, ABC to n9FAHRing
dtypes: float64(76), int64(56), object(134)
memory usage: 37.4+ MB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18428 entries, 0 to 18427
Columns: 269 entries, smiles_standarized to n9FAHRing
dtypes: float64(77), int64(56), object(136)
memory usage: 37.8+ MB
None


In [159]:
# Compute the selected descriptors for the test split (processed in batches of 1000 rows)
test_df_batch = desc_extract_batch(cyp3a4_inhib_test)


▶ Processing batch 0–999...


SMILES → Mol: 100%|██████████| 1000/1000 [00:00<00:00, 13466.98it/s]



▶ Processing batch 1000–1999...


SMILES → Mol: 100%|██████████| 1000/1000 [00:00<00:00, 13845.60it/s]



▶ Processing batch 2000–2306...


SMILES → Mol: 100%|██████████| 307/307 [00:00<00:00, 13823.71it/s]


In [2]:
# NOTE: test_df_batch must already exist in this kernel session (it is produced by
# desc_extract_batch(cyp3a4_inhib_test)); running this cell first raises NameError.

# Re-number the descriptor rows 0..n-1 so they pair up positionally with the raw table
test_df_batch = test_df_batch.reset_index(drop=True)

# A column-wise concat aligns on the index. If the row counts differ, descriptors would be
# attached to the wrong molecules or padded with NaN, so stop before writing a corrupted file.
assert len(test_df_batch) == len(cyp3a4_inhib_test), (
    f"descriptor rows ({len(test_df_batch)}) != input rows ({len(cyp3a4_inhib_test)})"
)
concat_3a4_test = pd.concat(
    [cyp3a4_inhib_test.reset_index(drop=True), test_df_batch], axis=1
)

# DataFrame.info() prints its own report and returns None, so it is not wrapped in print()
test_df_batch.info()
concat_3a4_test.info()

# Save the processed DataFrame to a CSV file
concat_3a4_test.to_csv('cyp3a4_inhibitor_test_desc.csv', index=False)

NameError: name 'test_df_batch' is not defined

In [9]:
# Compute the selected descriptors for the validation split, then re-number the rows 0..n-1
# so they pair up positionally with the raw table
val_df_batch = desc_extract_batch(cyp3a4_inhib_val)
val_df_batch = val_df_batch.reset_index(drop=True)

# A column-wise concat aligns on the index. If the row counts differ, descriptors would be
# attached to the wrong molecules or padded with NaN, so stop before writing a corrupted file.
assert len(val_df_batch) == len(cyp3a4_inhib_val), (
    f"descriptor rows ({len(val_df_batch)}) != input rows ({len(cyp3a4_inhib_val)})"
)
concat_3a4_val = pd.concat(
    [cyp3a4_inhib_val.reset_index(drop=True), val_df_batch], axis=1
)

# DataFrame.info() prints its own report and returns None, so it is not wrapped in print()
val_df_batch.info()
concat_3a4_val.info()

# Save the processed DataFrame to a CSV file
concat_3a4_val.to_csv('cyp3a4_inhibitor_val_desc.csv', index=False)


▶ Processing batch 0–999...


SMILES → Mol: 100%|██████████| 1000/1000 [00:00<00:00, 11253.41it/s]
100%|██████████| 1000/1000 [01:10<00:00, 14.27it/s]



▶ Processing batch 1000–1999...


SMILES → Mol: 100%|██████████| 1000/1000 [00:00<00:00, 13624.73it/s]
100%|██████████| 1000/1000 [00:58<00:00, 17.16it/s]



▶ Processing batch 2000–2304...


SMILES → Mol: 100%|██████████| 305/305 [00:00<00:00, 13400.61it/s]
100%|██████████| 305/305 [00:13<00:00, 22.37it/s]


<class 'mordred._base.pandas_module.MordredDataFrame'>
RangeIndex: 2305 entries, 0 to 2304
Columns: 266 entries, ABC to n9FAHRing
dtypes: float64(113), int64(56), object(97)
memory usage: 4.7+ MB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2305 entries, 0 to 2304
Columns: 269 entries, smiles_standarized to n9FAHRing
dtypes: float64(114), int64(56), object(99)
memory usage: 4.7+ MB
None
