In [3]:
import matplotlib.pyplot as plt
import torch
import numpy as np
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split
import pickle
from sklearn.metrics import f1_score, accuracy_score
from kan.KAN import KAN
import itertools
from rdkit import Chem
from rdkit.Chem import MACCSkeys

### Preparing data

In [8]:
df = pd.read_csv('MACCS_melanin_classes.csv')


In [9]:
X = np.array(df.iloc[:, 0:166])
y = np.array(df['Class'])

In [10]:
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.2, random_state=10)

### Train KAN

In [12]:
dataset = {} 
dataset['train_input'] = torch.tensor(Xtrain, dtype=torch.int64) 
dataset['train_label'] = torch.tensor(ytrain.reshape(-1, 1), dtype=torch.int64) 
dataset['test_input'] = torch.tensor(Xtest, dtype=torch.int64) 
dataset['test_label'] = torch.tensor(ytest.reshape(-1, 1), dtype=torch.int64) 
 
X = dataset['train_input'] 
y = dataset['train_label']

In [13]:
model = KAN(width=[166,1,1], grid=10, k=5, seed=2024)

def train_acc(): 
    return torch.mean((torch.round(model(dataset['train_input'])[:,0]) == dataset['train_label'][:,0]).float()) 
 
def test_acc(): 
    return torch.mean((torch.round(model(dataset['test_input'])[:,0]) == dataset['test_label'][:,0]).float()) 
 
results = model.train(dataset, opt="LBFGS", steps=10, metrics=(train_acc, test_acc), lamb = 0.1, lamb_entropy = 0.0) 
results['train_acc'][-1], results['test_acc'][-1]

description:   0%|                                                           | 0/10 [00:00<?, ?it/s]

train loss: 3.90e-01 | test loss: 4.25e-01 | reg: 2.60e+00 : 100%|██| 10/10 [00:21<00:00,  2.18s/it]


(0.7820512652397156, 0.7628205418586731)

## Hyperparameter optimization

In [40]:
grid_values = [1, 5, 10]  
k_values = [3, 5, 7] 

best_accuracy = 0
best_params = {}

for grid_val, k_val in itertools.product(grid_values, k_values):
    model = KAN(width=[166,1,1], grid=grid_val, k=k_val, seed=2024)
    results = model.train(dataset, opt="LBFGS", steps=10, metrics=(train_acc, test_acc))
    
    train_accuracy = results['train_acc'][-1]
    test_accuracy = results['test_acc'][-1]
    
    if test_accuracy > best_accuracy:
        best_accuracy = test_accuracy
        best_params = {'grid': grid_val, 'k': k_val}

print("Best parameters found:")
print(best_params)
print("Best test accuracy:", best_accuracy)

train loss: 2.82e-01 | test loss: 4.36e-01 | reg: 6.44e+01 : 100%|██| 10/10 [00:13<00:00,  1.39s/it]
train loss: 2.89e-01 | test loss: 4.26e-01 | reg: 7.76e+01 : 100%|██| 10/10 [00:16<00:00,  1.68s/it]
train loss: 2.92e-01 | test loss: 4.08e-01 | reg: 9.76e+01 : 100%|██| 10/10 [00:21<00:00,  2.15s/it]
train loss: 2.71e-01 | test loss: 4.38e-01 | reg: 4.80e+01 : 100%|██| 10/10 [00:15<00:00,  1.59s/it]
train loss: 2.90e-01 | test loss: 4.47e-01 | reg: 4.31e+01 : 100%|██| 10/10 [00:20<00:00,  2.06s/it]
train loss: 2.57e-01 | test loss: 4.88e-01 | reg: 6.09e+01 : 100%|██| 10/10 [00:26<00:00,  2.66s/it]
train loss: 2.75e-01 | test loss: 5.15e-01 | reg: 2.80e+01 : 100%|██| 10/10 [00:19<00:00,  1.93s/it]
train loss: 2.23e-01 | test loss: 5.25e+00 | reg: 2.78e+01 : 100%|██| 10/10 [00:24<00:00,  2.46s/it]
train loss: 2.12e-01 | test loss: 4.99e-01 | reg: 3.51e+01 : 100%|██| 10/10 [00:30<00:00,  3.09s/it]

Best parameters found:
{'grid': 1, 'k': 7}
Best test accuracy: 0.7564102411270142





In [14]:
test_preds = model(dataset['test_input']).detach().numpy()

In [15]:
test_preds

array([[0.95113873],
       [0.5657139 ],
       [0.73611265],
       [0.8275491 ],
       [0.62381995],
       [0.7098993 ],
       [0.6238558 ],
       [0.7183525 ],
       [0.73597646],
       [0.9105602 ],
       [0.8609364 ],
       [1.0432518 ],
       [0.79464006],
       [0.7719816 ],
       [0.8002501 ],
       [0.8024981 ],
       [0.7598028 ],
       [0.8387475 ],
       [0.89063656],
       [1.0432518 ],
       [0.61337054],
       [0.9940603 ],
       [0.8984593 ],
       [0.78452015],
       [0.7372575 ],
       [0.7932718 ],
       [0.99514437],
       [0.7864554 ],
       [0.82995546],
       [0.7357609 ],
       [0.7572347 ],
       [0.87642276],
       [0.7813022 ],
       [0.65637255],
       [0.8895056 ],
       [0.6791941 ],
       [0.8387133 ],
       [0.75524503],
       [0.7044053 ],
       [0.93268216],
       [0.8594047 ],
       [0.8971331 ],
       [0.7597382 ],
       [0.70162654],
       [0.8816009 ],
       [0.8925695 ],
       [0.88326514],
       [0.796

## Case study
Convering SMILES to RDKit descriptors

In [57]:
smiles = pd.read_csv('smiles_noML_1000.csv')['Smiles'].to_list()

In [58]:
mols = [Chem.MolFromSmiles(i) for i in smiles]
MACCS_list = []
header = ['bit' + str(i) for i in range(167)]
for i in range(len(mols)):
    ds = list(MACCSkeys.GenMACCSKeys(mols[i]).ToBitString())
    MACCS_list.append(ds)
df2 = pd.DataFrame(MACCS_list,columns=header)
df2.insert(loc=0, column='smiles', value=smiles)
df2

Unnamed: 0,smiles,bit0,bit1,bit2,bit3,bit4,bit5,bit6,bit7,bit8,...,bit157,bit158,bit159,bit160,bit161,bit162,bit163,bit164,bit165,bit166
0,O=CC(=O)N(c1ccc(CC(=O)O)cc1)C(O)F,0,0,0,0,0,0,0,0,0,...,1,1,1,0,1,1,1,1,1,0
1,COCCc1cc(Cl)c(CC(=O)O)c(NN)c1C1CC1,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,0
2,C#CC(F)(C#N)Cc1cccc(C(NN)C(=O)O)c1,0,0,0,0,0,0,0,0,0,...,1,1,1,0,1,1,1,1,1,0
3,CC(C)ONC(=S)c1ccc(C(=O)[O-])cc1C(C#N)C(=O)O,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,0
4,C=CCOc1cc(O)cc(C(C(=O)O)c2ccoc2)c1C=C,0,0,0,0,0,0,0,0,0,...,1,0,1,0,0,1,1,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2577,O=C(O)C(Cc1ccccc1)c1ccc(C(F)(F)F)c(C(F)(F)F)c1,0,0,0,0,0,0,0,0,0,...,1,0,1,0,0,1,1,1,1,0
2578,O=C(O)C(Cc1ccccc1)c1cccc(C(F)(F)F)c1C(F)(F)F,0,0,0,0,0,0,0,0,0,...,1,0,1,0,0,1,1,1,1,0
2579,O=C(O)C(c1c(-c2ccco2)ccc(C(F)(F)F)c1C(F)(F)F)C...,0,0,0,0,0,0,0,0,0,...,1,0,1,0,0,1,1,1,1,0
2580,O=C(O)Cc1cc(C(F)(F)F)ccc1C(F)(Cc1ccccc1)C(F)(F)F,0,0,0,0,0,0,0,0,0,...,1,0,1,0,0,1,1,1,1,0


In [59]:
data = df2.iloc[:, 1:167]
data = np.array(data).astype(int)

In [60]:
dataset = {} 
dataset['input'] = torch.tensor(data, dtype=torch.float32) 

### Calculate probability of melanin binding for all generated molecules

In [61]:
predictions = model(dataset['input']).detach().numpy()

In [49]:
predictions 

array([[0.8397027 ],
       [0.5808775 ],
       [0.4220845 ],
       ...,
       [0.69458294],
       [0.63362074],
       [0.588869  ]], dtype=float32)

In [62]:
mean_pred = np.mean(predictions)
mean_pred

0.74928963

### Calculating average probability of melanin binding for molecules generated by FREED++

In [63]:
mel = [0.7648465, 0.75293547, 0.72139764, 0.75496554, 0.74928963]

In [64]:
mean_mel = np.mean(mel)
mean_mel

0.748686956

In [65]:
std = np.std(mel)
std

0.014585412810334325

### Save model

In [46]:
torch.save(model.state_dict(), 'KAN_melanin.pth')