In [25]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import rdMolDescriptors, MACCSkeys
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler
from lazypredict.Supervised import LazyRegressor, LazyClassifier
from rdkit.Chem import PandasTools, AllChem
from rdkit import Chem, DataStructs

### Loading data

In [31]:
df = pd.read_csv('irritation.csv')
df

Unnamed: 0,SMILES,Class
0,CC#CC,1
1,CCC=C,1
2,O=CC=O,1
3,CCC=O,1
4,FCC#N,1
...,...,...
5215,CC1OC(CC(O)C1O)OC2C(O)CC(OC3C(O)CC(OC4CCC5(C)C...,0
5216,CC1OC(CC(O)C1O)OC2C(O)CC(OC3C(O)CC(OC4CCC5(C)C...,0
5217,CN(C(=O)C)c1c(I)c(NC(=O)C)c(I)c(C(=O)NC(C=O)C(...,0
5218,CC(=O)N(CC(O)CO)c1c(I)c(C(=O)NCCO)c(I)c(C(=O)N...,0


In [32]:
X_train, X_test, y_train, y_test = train_test_split(df['SMILES'], df['Class'], test_size=0.2, random_state=10)

### Descriptor generation

In [33]:
class MolecularDescriptorTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, descriptor_type='standard'):
        self.descriptor_type = descriptor_type
        self.descriptor_names = list(rdMolDescriptors.Properties.GetAvailableProperties())
        self.get_descriptors = rdMolDescriptors.Properties(self.descriptor_names)

    def smi_to_descriptors(self, smile):
        mol = Chem.MolFromSmiles(smile)
        if mol:
            descriptors = np.array(self.get_descriptors.ComputeProperties(mol))
            return descriptors
        else:
            return np.zeros(len(self.descriptor_names))

    def smi_to_MACCS(self, smile):
        mol = Chem.MolFromSmiles(smile)
        if mol:
            maccs = np.array(list(MACCSkeys.GenMACCSKeys(mol).ToBitString())).astype(int)
            return maccs
        else:
            return np.zeros(167)
        
    def smi_to_Morgan(self, smile):
        mol = Chem.MolFromSmiles(smile)
        if mol:
            fp = AllChem.GetHashedMorganFingerprint(mol, 2, nBits=2048)
            ar = np.zeros((1,), dtype=np.int8)
            DataStructs.ConvertToNumpyArray(fp, ar)
            return ar
        else:
            return np.zeros(2048)


    def fit(self, X, y=None):
        return self

    def transform(self, X):
        if self.descriptor_type == 'standard':
            return np.vstack(X.apply(self.smi_to_descriptors))
        elif self.descriptor_type == 'MACCS':
            return np.vstack(X.apply(self.smi_to_MACCS))
        elif self.descriptor_type == 'Morgan':
            return np.vstack(X.apply(self.smi_to_Morgan))
        else:
            raise ValueError("Unsupported descriptor type")


### Pipelines


In [34]:
descriptor_pipeline = Pipeline([
    ('descriptors', MolecularDescriptorTransformer(descriptor_type='standard')),
    ('scaler', StandardScaler())
])

X_train_transformed = descriptor_pipeline.fit_transform(X_train)
X_test_transformed = descriptor_pipeline.transform(X_test)

In [35]:
X_train_transformed.shape

(4176, 43)

In [36]:
maccs_pipeline = Pipeline([
    ('MACCS', MolecularDescriptorTransformer(descriptor_type='MACCS')),
    ('scaler', StandardScaler())
])

X_train_maccs = maccs_pipeline.fit_transform(X_train)
X_test_maccs = maccs_pipeline.transform(X_test)


In [37]:
Morgan_pipeline = Pipeline([
    ('Morgan', MolecularDescriptorTransformer(descriptor_type='Morgan')),
    ('scaler', StandardScaler())
])

X_train_morgan = maccs_pipeline.fit_transform(X_train)
X_test_morgan = maccs_pipeline.transform(X_test)


## Training models

### RDKit descriptors

In [38]:
reg = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None, random_state=10)
models, predictions = reg.fit(X_train_transformed, X_test_transformed, y_train, y_test)
print("Results for standard descriptors:")
print(models)

100%|██████████| 29/29 [00:08<00:00,  3.29it/s]

[LightGBM] [Info] Number of positive: 3105, number of negative: 1071
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000566 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5882
[LightGBM] [Info] Number of data points in the train set: 4176, number of used features: 43
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.743534 -> initscore=1.064421
[LightGBM] [Info] Start training from score 1.064421
Results for standard descriptors:
                               Accuracy  Balanced Accuracy  ROC AUC  F1 Score  \
Model                                                                           
LGBMClassifier                     0.94               0.92     0.92      0.94   
XGBClassifier                      0.94               0.91     0.91      0.94   
BaggingClassifier                  0.94               0.91     0.91      0.94   
SGDClassifier                      0.93               0.91     0.91     




### MACCS fingerprints

In [39]:
reg_maccs = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None, random_state=10)
models_maccs, predictions_maccs = reg_maccs.fit(X_train_maccs, X_test_maccs, y_train, y_test)
print("Results for MACCS descriptors:")
print(models_maccs)

  0%|          | 0/29 [00:00<?, ?it/s]

100%|██████████| 29/29 [00:07<00:00,  3.67it/s]

[LightGBM] [Info] Number of positive: 3105, number of negative: 1071
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003887 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 444
[LightGBM] [Info] Number of data points in the train set: 4176, number of used features: 148
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.743534 -> initscore=1.064421
[LightGBM] [Info] Start training from score 1.064421
Results for MACCS descriptors:
                               Accuracy  Balanced Accuracy  ROC AUC  F1 Score  \
Model                                                                           
LGBMClassifier                     0.95               0.93     0.93      0.95   
ExtraTreesClassifier               0.95               0.92     0.92      0.95   
XGBClassifier                      0.95               0.92     0.92      0.94   
RandomFores




### Morgan fingerprints(2048)

In [40]:
reg_morgan = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None, random_state=10)
models_morgan, predictions_morgan = reg_morgan.fit(X_train_morgan, X_test_morgan, y_train, y_test)
print("Results for MACCS descriptors:")
print(models_morgan)

100%|██████████| 29/29 [00:07<00:00,  3.64it/s]

[LightGBM] [Info] Number of positive: 3105, number of negative: 1071
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001299 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 444
[LightGBM] [Info] Number of data points in the train set: 4176, number of used features: 148
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.743534 -> initscore=1.064421
[LightGBM] [Info] Start training from score 1.064421
Results for MACCS descriptors:
                               Accuracy  Balanced Accuracy  ROC AUC  F1 Score  \
Model                                                                           
LGBMClassifier                     0.95               0.93     0.93      0.95   
ExtraTreesClassifier               0.95               0.92     0.92      0.95   
XGBClassifier                      0.95               0.92     0.92      0.94   
RandomFores


