In [1]:
import pandas as pd
import numpy as np
import sys

import torch
import torch.nn as nn

In [8]:
import argparse
parser = argparse.ArgumentParser()

parser.add_argument('--device', type=str, default='cuda:3')
parser.add_argument('--seed', type=int, default=1234)

args = parser.parse_args([])

In [26]:
sys.path.append('../')
import xgboost # need to install ( pip install xgboost )
from sklearn.metrics import accuracy_score

In [10]:
dataset = 'DCC_NEW'

df = pd.read_csv(f'./chem_space.csv', index_col=0)
df.shape

(3926, 513)

In [21]:
# Split train-test data

traindf = df.sample(frac = 0.8, random_state=args.seed)
testdf = df.drop(traindf.index)

traindf.shape, testdf.shape

Index(['COc1c([N+](=O)[O-])ccc(CNCCCn2ccnc2)c1O',
       'CN1CCC(=C2c3ccccc3C=Cc3ccccc32)CC1',
       'O=C(Cn1nnc(COc2ccccc2)n1)NCc1ccc(N2CCOCC2)c(F)c1',
       'CCC(CC)NC(=O)c1nnc(N(C)Cc2ccccc2O)n1CCC(=O)N(C)C',
       'CNC(CO)(COc1ccc(N(C)C)cc1)C(C)(C)C',
       'COC1CC(N(C)c2ccc3c(c2)CCC3O)C1(C)C', 'NCCc1cnc[nH]1',
       'O=C(CCCc1ccccc1)OCC(COC(=O)CCCc1ccccc1)OC(=O)CCCc1ccccc1',
       'CC(=Cc1ccc(C)o1)C(=O)N1CCOC2(CCN(C(=O)c3ccc(-n4c(C)ccc4C)cc3)C2)C1',
       'O=C(Cc1cnn(-c2ccccc2)n1)Nc1c(Cl)ccc2nonc12',
       ...
       'Cc1cnc(C(=O)NCCc2ccc(S(=O)(=O)NC(=O)NC3CCCCC3)cc2)cn1',
       'CCCc1cc(=O)[nH]c(=S)[nH]1', 'NC(=O)CS(=O)C(c1ccccc1)c1ccccc1',
       'COC(=O)C1C2CC3c4[nH]c5ccccc5c4CCN3CC2CC(OC(=O)c2cc(OC)c(OC)c(OC)c2)C1OC',
       'Cc1ccc(C(C)NC2CC(C)(O)C2)c(O)c1',
       'COc1ccc(N2CCN(C(=O)c3coc4ccccc34)CC2)c(OC)c1',
       'Cc1ccc(N2NC(=O)C(=CC(C)C)C2=O)cc1Cl',
       'CC(C)C1CCCN(C(=O)NCc2ccc(N3CCC(O)CC3)cc2)CC1',
       'c1ccc(CN2C3CCC2CC(NCCCSc2nccs2)C3)cc1', 'NS(=O)(=

In [14]:
# Preprocessing 1

train_data, train_labels = traindf.values[:, :-1],  traindf.values[:, -1]
test_data, test_labels = testdf.values[:, :-1],  testdf.values[:, -1]
train_labels.shape, (train_labels==1).sum() # check number of positives

((3141,), 1579)

In [19]:
# Preprocessing 2

train_data = train_data.astype('float32')
test_data = test_data.astype('float32')

train_labels = np.array(list(map(lambda x : 1 if x else 0, train_labels)))
test_labels = np.array(list(map(lambda x : 1 if x else 0, test_labels)))

In [20]:
test_labels

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

In [22]:
# fitting XGBoost model

model = xgboost.XGBClassifier()
model.fit(train_data, train_labels)

In [28]:
# Predict test data

test_pred = model.predict(test_data)
predictions = [round(value) for value in test_pred]

In [29]:
# Accuracy score

accuracy = accuracy_score(test_labels, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

Accuracy: 86.27%


In [30]:
# Save model

model.save_model("xgboost_model.json")

In [51]:
test_data.shape

(772, 512)