In [46]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import rdMolDescriptors, MACCSkeys
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler
from rdkit.Chem import PandasTools, AllChem
from rdkit import Chem, DataStructs

from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
import xgboost as xgb
from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier
import lightgbm as lgb
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.neural_network import MLPClassifier, MLPRegressor
from sklearn import svm

from sklearn.metrics import f1_score, mean_squared_error


### Loading data

In [273]:
# Read the irritation dataset; later cells use its 'SMILES' and 'Class' columns.
df = pd.read_csv(filepath_or_buffer='irritation.csv')
# Preview the first three rows as the cell output.
df.head(n=3)

Unnamed: 0,SMILES,Class
0,CC#CC,1
1,CCC=C,1
2,O=CC=O,1


In [274]:
# Target column y: use df['logPerm'] for regression or df['Class'] for classification.
# 80/20 hold-out split with a fixed seed so the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df.loc[:, 'SMILES'],
    df.loc[:, 'Class'],
    random_state=10,
    test_size=0.2,
)

### Descriptor generation

In [217]:
class MolecularDescriptorTransformer(BaseEstimator, TransformerMixin):
    """Turn a collection of SMILES strings into a numeric feature matrix.

    Parameters
    ----------
    descriptor_type : {'standard', 'MACCS', 'Morgan'}, default='standard'
        * ``'standard'`` - every property listed by
          ``rdMolDescriptors.Properties.GetAvailableProperties()`` (float64).
        * ``'MACCS'``    - MACCS keys from ``MACCSkeys.GenMACCSKeys`` (167 ints).
        * ``'Morgan'``   - hashed Morgan fingerprint from
          ``AllChem.GetHashedMorganFingerprint`` (int32 vector of length
          ``morgan_n_bits``).
    morgan_radius : int, default=2
        Radius passed to the Morgan fingerprint (only used for ``'Morgan'``).
    morgan_n_bits : int, default=2048
        Length of the hashed Morgan fingerprint (only used for ``'Morgan'``).

    Notes
    -----
    A SMILES that RDKit cannot parse (or a non-string entry such as NaN/None)
    is encoded as an all-zero row rather than raising, so the number of output
    rows always equals the number of inputs.
    """

    # Length of the zero vector used when a SMILES cannot be parsed; it has to
    # match the length of the bit string produced by MACCSkeys.GenMACCSKeys.
    N_MACCS_BITS = 167

    def __init__(self, descriptor_type='standard', morgan_radius=2, morgan_n_bits=2048):
        self.descriptor_type = descriptor_type
        self.morgan_radius = morgan_radius
        self.morgan_n_bits = morgan_n_bits
        self.descriptor_names = list(rdMolDescriptors.Properties.GetAvailableProperties())
        self.get_descriptors = rdMolDescriptors.Properties(self.descriptor_names)

    @staticmethod
    def _parse(smile):
        """Return an RDKit molecule for ``smile`` or None if it is unusable."""
        if not isinstance(smile, str):
            # Missing values (None / NaN) would make MolFromSmiles raise; handle
            # them like any other unparsable entry.
            return None
        return Chem.MolFromSmiles(smile)

    def _feature_spec(self):
        """Return ``(featurizer, n_features)`` for the configured descriptor type.

        Raises
        ------
        ValueError
            If ``descriptor_type`` is not one of the supported names.
        """
        if self.descriptor_type == 'standard':
            return self.smi_to_descriptors, len(self.descriptor_names)
        if self.descriptor_type == 'MACCS':
            return self.smi_to_MACCS, self.N_MACCS_BITS
        if self.descriptor_type == 'Morgan':
            return self.smi_to_Morgan, self.morgan_n_bits
        raise ValueError("Unsupported descriptor type")

    def smi_to_descriptors(self, smile):
        """Compute all RDKit ``Properties`` for one SMILES (zeros if unparsable)."""
        mol = self._parse(smile)
        if mol is None:
            return np.zeros(len(self.descriptor_names))
        return np.array(self.get_descriptors.ComputeProperties(mol))

    def smi_to_MACCS(self, smile):
        """Return the MACCS key vector for one SMILES as ints (zeros if unparsable)."""
        mol = self._parse(smile)
        if mol is None:
            # Same integer dtype as the valid branch so stacked rows keep one dtype.
            return np.zeros(self.N_MACCS_BITS, dtype=int)
        bit_string = MACCSkeys.GenMACCSKeys(mol).ToBitString()
        return np.array(list(bit_string)).astype(int)

    def smi_to_Morgan(self, smile):
        """Return the hashed Morgan fingerprint for one SMILES (zeros if unparsable)."""
        mol = self._parse(smile)
        if mol is None:
            return np.zeros(self.morgan_n_bits, dtype=np.int32)
        fp = AllChem.GetHashedMorganFingerprint(
            mol, self.morgan_radius, nBits=self.morgan_n_bits
        )
        # int32 instead of int8: the fingerprint stores per-position counts, and
        # an 8-bit buffer cannot hold values above 127.
        # ConvertToNumpyArray resizes the destination array in place.
        counts = np.zeros((1,), dtype=np.int32)
        DataStructs.ConvertToNumpyArray(fp, counts)
        return counts

    def fit(self, X, y=None):
        """Stateless fit; only checks that ``descriptor_type`` is supported."""
        self._feature_spec()
        return self

    def transform(self, X):
        """Featurize every SMILES in ``X``.

        Parameters
        ----------
        X : 1-D array-like of str (pandas Series, list, ndarray) or a
            single-column 2-D array-like.

        Returns
        -------
        numpy.ndarray of shape (n_samples, n_features)

        Raises
        ------
        ValueError
            If ``descriptor_type`` is unsupported or ``X`` is not 1-D.
        """
        featurize, n_features = self._feature_spec()

        smiles = np.asarray(X, dtype=object)
        if smiles.ndim == 2 and smiles.shape[1] == 1:
            smiles = smiles[:, 0]
        if smiles.ndim != 1:
            raise ValueError("Expected a 1-D collection of SMILES strings")

        if smiles.size == 0:
            # np.vstack([]) raises; return an empty matrix with the right width.
            return np.empty((0, n_features))
        return np.vstack([featurize(smile) for smile in smiles])


### Pipelines


In [218]:
# RDKit property descriptors followed by standardisation.
descriptor_pipeline = Pipeline(steps=[
    ('descriptors', MolecularDescriptorTransformer(descriptor_type='standard')),
    ('scaler', StandardScaler()),
])

# The pipeline is fitted on the training split only (left element is evaluated
# first); the test split is transformed with the already-fitted steps.
X_train_ds, X_test_ds = (
    descriptor_pipeline.fit_transform(X_train),
    descriptor_pipeline.transform(X_test),
)

In [219]:
# MACCS key fingerprints followed by standardisation.
maccs_pipeline = Pipeline(steps=[
    ('MACCS', MolecularDescriptorTransformer(descriptor_type='MACCS')),
    ('scaler', StandardScaler()),
])

# Fit on the training split only (left element is evaluated first), then reuse
# the fitted steps for the test split.
X_train_maccs, X_test_maccs = (
    maccs_pipeline.fit_transform(X_train),
    maccs_pipeline.transform(X_test),
)


In [220]:
# Hashed Morgan fingerprints followed by standardisation.
Morgan_pipeline = Pipeline(steps=[
    ('Morgan', MolecularDescriptorTransformer(descriptor_type='Morgan')),
    ('scaler', StandardScaler()),
])

# Fit on the training split only (left element is evaluated first), then reuse
# the fitted steps for the test split.
X_train_morgan, X_test_morgan = (
    Morgan_pipeline.fit_transform(X_train),
    Morgan_pipeline.transform(X_test),
)


## Training models

### RDKit descriptors

In [239]:
# Swap in whichever estimator you want to evaluate on the RDKit descriptors.
# The seed is fixed so repeated runs give the same network initialisation.
model = MLPClassifier(random_state=10)
model.fit(X=X_train_ds, y=y_train)

In [240]:
# Default estimator score on the held-out split: R^2 for regressors,
# accuracy for classifiers (depends on the task / model chosen above).
model.score(X_test_ds, y_test)

0.9454022988505747

In [241]:
# Classification metric: F1 on the held-out split for the RDKit-descriptor model.
# Comment this cell out again if `model` is a regressor (f1_score needs class labels).
y_pred = model.predict(X_test_ds)
f1 = f1_score(y_test, y_pred)
f1

0.9634381013470172

In [170]:
# Uncomment if you solve regression task
# (`model` must then be a regressor and y a continuous target).
# NOTE(review): the value displayed under this cell was produced at execution
# count 170, before the current model was fitted (count 239) - presumably a
# leftover from an earlier regression run; re-run or clear it.
#y_pred = model.predict(X_test_ds)
#mse = mean_squared_error(y_test, y_pred)
#mse

5.052186102148048

### MACCS fingerprints

In [261]:
# Swap in whichever estimator you want to evaluate on the MACCS fingerprints.
# The seed is fixed so repeated runs give the same network initialisation.
model2 = MLPClassifier(random_state=10)
model2.fit(X=X_train_maccs, y=y_train)

In [262]:
# Default estimator score on the held-out split (accuracy for a classifier,
# R^2 for a regressor) using the MACCS features.
model2.score(X_test_maccs, y_test)

0.9415708812260536

In [247]:
# Uncomment if you solve regression task
# (`model2` must then be a regressor and y a continuous target).
#y_pred_maccs = model2.predict(X_test_maccs)
#mse = mean_squared_error(y_test, y_pred_maccs)
#mse

In [263]:
# Classification metric: F1 on the held-out split for the MACCS-fingerprint model.
# Comment this cell out again if `model2` is a regressor (f1_score needs class labels).
y_pred_maccs = model2.predict(X_test_maccs)
f1 = f1_score(y_test, y_pred_maccs)
f1

0.9607211848036059

### Morgan fingerprints (2048 bits)

In [270]:
# Swap in whichever estimator you want to evaluate on the Morgan fingerprints.
# The seed is fixed so repeated runs build the same ensemble.
model3 = GradientBoostingClassifier(random_state=10)
model3.fit(X=X_train_morgan, y=y_train)

In [271]:
# Default estimator score on the held-out split (accuracy for a classifier,
# R^2 for a regressor) using the Morgan features.
model3.score(X_test_morgan, y_test)

0.9348659003831418

In [272]:
# Classification metric: F1 on the held-out split for the Morgan-fingerprint model.
# Comment this cell out again if `model3` is a regressor (f1_score needs class labels).
y_pred_morgan = model3.predict(X_test_morgan)
f1_morgan = f1_score(y_test, y_pred_morgan)
f1_morgan

0.9569620253164557

In [214]:
# Uncomment if you solve regression task
# (`model3` must then be a regressor and y a continuous target).
# NOTE(review): the value displayed under this cell was produced at execution
# count 214, before the current model was fitted (count 270) - presumably a
# leftover from an earlier regression run; re-run or clear it.
#y_pred_morgan = model3.predict(X_test_morgan)
#mse = mean_squared_error(y_test, y_pred_morgan)
#mse

7.806883728478475