In [1]:
import matplotlib.pyplot as plt
import torch
import numpy as np
import pandas as pd
from rdkit.Chem import MACCSkeys

from sklearn.model_selection import train_test_split

import pickle
from sklearn.metrics import f1_score

import rdkit
from rdkit import Chem
from rdkit.Chem import rdMolDescriptors

### Preparing data

In [2]:
df = pd.read_csv('melanin.csv')
df.head(3)

Unnamed: 0,SMILES,Class
0,CCN(CC)CCNC(=O)c1ccc(cc1)N.Cl,1
1,COCCNC(=O)CN1C2CCC1CC(C2)(c3cccnc3)O,1
2,CC1=NN=C(c2cc3c(cc2C1)OCO3)c4ccc(cc4)N,1


Converting SMILES to RDKit MACCS fp (167 features)

In [3]:
smiles = df['SMILES'].to_list()

In [4]:
mols = [Chem.MolFromSmiles(i) for i in smiles]
MACCS_list = []
header = ['bit' + str(i) for i in range(167)]
for i in range(len(mols)):
    ds = list(MACCSkeys.GenMACCSKeys(mols[i]).ToBitString())
    MACCS_list.append(ds)
df2 = pd.DataFrame(MACCS_list,columns=header)
df2.insert(loc=0, column='smiles', value=smiles)
df2.head(3)

Unnamed: 0,smiles,bit0,bit1,bit2,bit3,bit4,bit5,bit6,bit7,bit8,...,bit157,bit158,bit159,bit160,bit161,bit162,bit163,bit164,bit165,bit166
0,CCN(CC)CCNC(=O)c1ccc(cc1)N.Cl,0,0,0,0,0,0,0,0,0,...,0,1,0,1,1,1,1,1,1,1
1,COCCNC(=O)CN1C2CCC1CC(C2)(c3cccnc3)O,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,0
2,CC1=NN=C(c2cc3c(cc2C1)OCO3)c4ccc(cc4)N,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,0


In [5]:
X = np.array(df2.iloc[:, 1:-1].astype(int))
y = np.array(df['Class'])

In [6]:
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.2, random_state=10)

### Train KAN

In [7]:
from kan.KAN import KAN

In [8]:
dataset = {} 
dataset['train_input'] = torch.tensor(Xtrain, dtype=torch.int64) 
dataset['train_label'] = torch.tensor(ytrain.reshape(-1, 1), dtype=torch.int64) 
dataset['test_input'] = torch.tensor(Xtest, dtype=torch.int64) 
dataset['test_label'] = torch.tensor(ytest.reshape(-1, 1), dtype=torch.int64) 
 
X = dataset['train_input'] 
y = dataset['train_label']


In [38]:
model = KAN(width=[166,1], grid=30, k=5)

def train_acc(): 
    return torch.mean((torch.round(model(dataset['train_input'])[:,0]) == dataset['train_label'][:,0]).float()) 
 
def test_acc(): 
    return torch.mean((torch.round(model(dataset['test_input'])[:,0]) == dataset['test_label'][:,0]).float()) 
 
results = model.train(dataset, opt="LBFGS", steps=10, metrics=(train_acc, test_acc), lamb = 0.1, lamb_entropy = 0.0) 
results['train_acc'][-1], results['test_acc'][-1]

train loss: 3.93e-01 | test loss: 4.41e-01 | reg: 2.45e+00 : 100%|██| 10/10 [00:35<00:00,  3.54s/it]


(0.7948718070983887, 0.75)

## Hyperparameter optimization

In [15]:
import itertools

# Define the ranges for each parameter
grid_values = [1, 5, 10, 20, 30]  
k_values = [3, 5, 7] 

best_accuracy = 0
best_params = {}

# Perform grid search to find the best combination of parameters
for grid_val, k_val in itertools.product(grid_values, k_values):
    model = KAN(width=[166,1,1], grid=grid_val, k=k_val)
    results = model.train(dataset, opt="LBFGS", steps=10, metrics=(train_acc, test_acc))
    
    train_accuracy = results['train_acc'][-1]
    test_accuracy = results['test_acc'][-1]
    
    if test_accuracy > best_accuracy:
        best_accuracy = test_accuracy
        best_params = {'grid': grid_val, 'k': k_val}

print("Best parameters found:")
print(best_params)
print("Best test accuracy:", best_accuracy)


train loss: 2.98e-01 | test loss: 4.12e-01 | reg: 1.48e+02 : 100%|██| 10/10 [00:15<00:00,  1.57s/it]
train loss: 2.82e-01 | test loss: 4.72e-01 | reg: 8.83e+01 : 100%|██| 10/10 [00:17<00:00,  1.71s/it]
train loss: 2.99e-01 | test loss: 4.17e-01 | reg: 1.31e+02 : 100%|██| 10/10 [00:21<00:00,  2.11s/it]
train loss: 2.23e-01 | test loss: 5.50e-01 | reg: 4.57e+01 : 100%|██| 10/10 [00:16<00:00,  1.61s/it]
train loss: 2.55e-01 | test loss: 6.11e-01 | reg: 4.10e+01 : 100%|██| 10/10 [00:19<00:00,  1.96s/it]
train loss: 2.46e-01 | test loss: 5.19e-01 | reg: 8.56e+01 : 100%|██| 10/10 [00:24<00:00,  2.46s/it]
train loss: 2.02e-01 | test loss: 5.23e-01 | reg: 3.59e+01 : 100%|██| 10/10 [00:17<00:00,  1.79s/it]
train loss: 2.00e-01 | test loss: 5.10e-01 | reg: 4.16e+01 : 100%|██| 10/10 [00:23<00:00,  2.31s/it]
train loss: 2.03e-01 | test loss: 5.30e-01 | reg: 3.63e+01 : 100%|██| 10/10 [00:29<00:00,  2.91s/it]
train loss: 1.86e-01 | test loss: 5.08e-01 | reg: 2.86e+01 : 100%|██| 10/10 [00:23<00:00,  

Best parameters found:
{'grid': 30, 'k': 5}
Best test accuracy: 0.7628205418586731





Calculating average values of accuracy on train and test sets (seed=1000, 2000, 2500, 3000, 3500)

In [29]:
model = KAN(width=[166,1], grid=30, k=5, seed=3500)

def train_acc(): 
    return torch.mean((torch.round(model(dataset['train_input'])[:,0]) == dataset['train_label'][:,0]).float()) 
 
def test_acc(): 
    return torch.mean((torch.round(model(dataset['test_input'])[:,0]) == dataset['test_label'][:,0]).float()) 
 
results = model.train(dataset, opt="LBFGS", steps=10, metrics=(train_acc, test_acc), lamb = 0.1, lamb_entropy = 0.0) 
results['train_acc'][-1], results['test_acc'][-1]

train loss: 3.78e-01 | test loss: 4.10e-01 | reg: 1.33e+00 : 100%|██| 10/10 [00:39<00:00,  3.93s/it]


(0.7932692170143127, 0.7884615659713745)

In [30]:
train = [0.7836538553237915, 0.7868589758872986, 0.7756410241127014, 0.7916666865348816, 0.7932692170143127]
test = [0.7179487347602844, 0.7435897588729858, 0.7756410241127014, 0.7692307829856873, 0.7884615659713745]

In [31]:
train_mean = np.mean(train)
train_mean

0.7862179517745972

In [33]:
test_mean = np.mean(test)
test_mean

0.7589743733406067

In [32]:
std_train = np.std(train)
std_train

0.006297077945457357

In [34]:
std_test = np.std(test)
std_test

0.025188311175032084

### Save the model

In [69]:
torch.save(model.state_dict(), 'KAN_melanin.pth')