In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import f1_score
import pickle 
from rdkit import Chem
from rdkit.Chem import MACCSkeys
import lightgbm as lgb
from sklearn.model_selection import train_test_split, GridSearchCV

## Load data

In [2]:
df = pd.read_csv('irritation.csv')
df.head(3)

Unnamed: 0,SMILES,Class
0,CC#CC,1
1,CCC=C,1
2,O=CC=O,1


Converting to MACCS fp

In [3]:
smiles = df['SMILES'].to_list()

In [4]:
mols = [Chem.MolFromSmiles(i) for i in smiles]
MACCS_list = []
header = ['bit' + str(i) for i in range(167)]
for i in range(len(mols)):
    ds = list(MACCSkeys.GenMACCSKeys(mols[i]).ToBitString())
    MACCS_list.append(ds)
df2 = pd.DataFrame(MACCS_list,columns=header)
df2.insert(loc=0, column='smiles', value=smiles)
df2.head(3)

Unnamed: 0,smiles,bit0,bit1,bit2,bit3,bit4,bit5,bit6,bit7,bit8,...,bit157,bit158,bit159,bit160,bit161,bit162,bit163,bit164,bit165,bit166
0,CC#CC,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
1,CCC=C,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2,O=CC=O,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,1,0,0


In [5]:
x = np.array(df2.iloc[:, 1:168])
y = np.array(df['Class'])

In [6]:
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.2, random_state=10)

In [7]:
model = lgb.LGBMClassifier(random_state=10)
model.fit(xtrain, ytrain)

In [8]:
model.score(xtest, ytest)

0.9454022988505747

In [9]:
ypred = model.predict(xtest)

In [10]:
f1 = f1_score(ytest, ypred)
f1

0.9633911368015414

### LGBMClassifier hyperparameter optimization

In [19]:
model = lgb.LGBMClassifier(random_state=10)

gridParams = {
    'learning_rate': [0.005, 0.01],
    'n_estimators': [8,16,24],
    'num_leaves': [6,8,12,16],
    'max_bin':[255, 510],
    'colsample_bytree' : [0.64, 0.65, 0.66],
    'subsample' : [0.7,0.75],
    'reg_alpha' : [1,1.2],
    'reg_lambda' : [1,1.2,1.4],
    }

grid = GridSearchCV(model, gridParams, verbose=1, cv=10, n_jobs=-1)
# Run the grid
grid.fit(xtrain, ytrain)

# Print the best parameters found
print(grid.best_params_)
print(grid.best_score_)

Fitting 10 folds for each of 1728 candidates, totalling 17280 fits
[LightGBM] [Info] Number of positive: 3105, number of negative: 1071
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003233 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 296
[LightGBM] [Info] Number of data points in the train set: 4176, number of used features: 148
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.743534 -> initscore=1.064421
[LightGBM] [Info] Start training from score 1.064421
{'colsample_bytree': 0.64, 'learning_rate': 0.005, 'max_bin': 255, 'n_estimators': 8, 'num_leaves': 6, 'reg_alpha': 1, 'reg_lambda': 1, 'subsample': 0.7}
0.7435343591155783


In [20]:
opimized_model = lgb.LGBMClassifier(colsample_bytree =0.64, 
                                    learning_rate = 0.005, 
                                    max_bin = 255, 
                                    n_estimators = 8, 
                                    num_leaves = 6, 
                                    reg_alpha = 1, 
                                    reg_lambda = 1, 
                                    subsample = 0.7)
opimized_model.fit(xtrain, ytrain)

[LightGBM] [Info] Number of positive: 3105, number of negative: 1071
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001587 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 296
[LightGBM] [Info] Number of data points in the train set: 4176, number of used features: 148
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.743534 -> initscore=1.064421
[LightGBM] [Info] Start training from score 1.064421


In [21]:
opimized_model.score(xtest, ytest)

0.7365900383141762

Defaults hyperparameters show the best results

## Case study
Here we evaluate mean probability of melanin binding of valid molecules obtained by FREEDD++ or updraded FREED++ model

In [35]:
# Paste your csv file obtained after generation
smiles = pd.read_csv('smiles_ML_1000.csv').to_list()
smiles

Unnamed: 0,Smiles
0,O=C(O)Cc1c(Cl)cccc1C1CCN(S(=O)(=O)[NH2+]C2CCC(...
1,CCOC(=O)C(C(=O)O)c1c(Cc2ccccn2)cccc1C1COc2ccc(...
2,CC(S)c1c(CC(=O)O)cccc1C1CC(O)C(C2CS(=O)(=O)CC2...
3,CCC(N)C(=O)c1c(-c2cc(C3CCCCC3)ccc2[N+](=O)[O-]...
4,NC(=S)C(C(=O)O)c1c(C(=S)Nc2cccs2)cccc1-c1ccc2c...
...,...
2324,C=CC1CCCC1c1ccccc1C(OC(=O)Cl)C(=O)O
2325,O=CCc1ccc(C(F)(F)F)cc1C(CC(=O)C(F)(F)F)C(=O)O
2326,O=C(O)Cc1cccc(C(=O)OOC(=O)C2CCCC2)c1C(F)(F)F
2327,O=C(OC(C(=O)O)c1ccccc1)C(=O)C(=O)C1CCCC1


In [37]:
mols = [Chem.MolFromSmiles(i) for i in smiles]
MACCS_list = []
header = ['bit' + str(i) for i in range(167)]
for i in range(len(mols)):
    ds = list(MACCSkeys.GenMACCSKeys(mols[i]).ToBitString())
    MACCS_list.append(ds)
df3 = pd.DataFrame(MACCS_list,columns=header)
df3.insert(loc=0, column='smiles', value=smiles)
df3.head(3)

In [None]:
X = np.array(df3.iloc[:, 1:-1])

In [40]:
irritation = model.predict_proba(X)[:, 1]
irritation

array([0.0178672 , 0.01900226, 0.01392666, ..., 0.2140262 , 0.05431613,
       0.04460562])

In [41]:
irritation_mean = np.mean(irritation)
irritation_mean

0.12116287882111564

In [None]:
std = np.std(irritation_mean)
std

### Save model

In [11]:
pkl_filename = "irritation.pkl"

In [12]:
with open(pkl_filename, 'wb') as file: 
    pickle.dump(model, file)