In [1]:
# Built-in
import os
from pathlib import Path
import time

# External
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm

# Local
from adad.utils import drop_redundant_col, maccs2binary, time2str
from adad.torch_utils import evaluate, train_model


In [2]:
PATH_ROOT = Path(os.getcwd()).absolute().parent
print(PATH_ROOT)

file_data = os.path.join(PATH_ROOT, "data", "distinct_maccs", "Ames_dist_MACCS.csv")
file_idx_train = os.path.join(PATH_ROOT, "data", "distinct_cv", "Ames_cv_train.csv")
file_idx_test = os.path.join(PATH_ROOT, "data", "distinct_cv", "Ames_cv_test.csv")

idx_train = pd.read_csv(file_idx_train, dtype=pd.Int64Dtype())
idx_test = pd.read_csv(file_idx_test, dtype=pd.Int64Dtype())

data = pd.read_csv(file_data)
data["y"] = data["y"].astype("category").cat.codes
print("Labels:", data["y"].unique())
print("Shape:", data.shape)

# Checking each CV set
for c in idx_train.columns:
    _i_test = idx_test[c].dropna(axis=0).to_numpy().astype(int)
    _i_train = idx_train[c].dropna(axis=0).to_numpy().astype(int)
    assert not np.all(np.isin(_i_test, _i_train))
    _n_samples = len(_i_train) + len(_i_test)
    assert _n_samples == data.shape[0], f"{_n_samples} != {data.shape[0]}"

# Selecting only 1 CV set
col = idx_train.columns[0]
idx_test = idx_test[col].dropna(axis=0).to_numpy().astype(int)
idx_train = idx_train[col].dropna(axis=0).to_numpy().astype(int)

/home/lukec/workspace/applicabilityDomain
Labels: [0 1]
Shape: (6018, 167)


In [3]:
y = data['y'].to_numpy().astype(int)

# Drop columns with no information for training.
data = drop_redundant_col(data)
X = data.drop(['y'], axis=1).to_numpy()

# Change any value greater than 1 to 1
X = maccs2binary(X)

In [4]:
X_train, X_test = X[idx_train], X[idx_test]
y_train, y_test = y[idx_train], y[idx_test]
print(X_train.shape, X_test.shape)
print(y_train.shape, y_test.shape)

(4814, 154) (1204, 154)
(4814,) (1204,)


In [5]:
class SimpleModel(nn.Module):
    """A simple fullly-connected neural network with 1 hidden-layer"""

    def __init__(self, input_dim, hidden_dim=512, output_dim=2):
        super(SimpleModel, self).__init__()

        self.layer1 = nn.Linear(input_dim, hidden_dim)
        self.layer2 = nn.Linear(hidden_dim, hidden_dim)
        self.layer3 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        x = F.relu(self.layer1(x))
        x = F.relu(self.layer2(x))
        x = self.layer3(x)
        return x

In [6]:
BATCH_SIZE = 128
MAX_EPOCHS = 300

dataset_train = TensorDataset(
    torch.from_numpy(X_train).type(torch.float32),
    torch.from_numpy(y_train).type(torch.int64),
)
dataloader_train = DataLoader(dataset_train, batch_size=BATCH_SIZE, shuffle=True)

dataset_test = TensorDataset(
    torch.from_numpy(X_test).type(torch.float32),
    torch.from_numpy(y_test).type(torch.int64),
)
dataloader_test = DataLoader(dataset_test, batch_size=BATCH_SIZE, shuffle=True)

In [7]:
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
    print('Running on CPU!')
n_features = X_train.shape[1]
model = SimpleModel(n_features).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
loss_fn = nn.CrossEntropyLoss()

In [8]:
# Train the clean model
time_start = time.perf_counter()
train_model(model, dataloader_train, optimizer, loss_fn, device, MAX_EPOCHS)
time_elapsed = time.perf_counter() - time_start
print('Time taken: {}'.format(time2str(time_elapsed)))

Time taken: 00h00m02s


In [9]:
acc_train, loss_train = evaluate(dataloader_train, model, loss_fn, device)
acc_test, loss_test = evaluate(dataloader_test, model, loss_fn, device)
print('Train acc: {:.2f} loss: {:.3f}. Test acc: {:.2f} loss: {:.3f}'.format(
    acc_train * 100, loss_train, acc_test * 100, loss_test,))

Train acc: 97.20 loss: 0.057. Test acc: 77.49 loss: 1.288


## Testing 1D Convolution Model

In [10]:
class Conv1dModel(nn.Module):
    def __init__(self, n_channels):
        super(Conv1dModel, self).__init__()

        self.conv1 = nn.Conv1d(
            n_channels, out_channels=1, kernel_size=3, stride=2, padding=1)
        self.conv2 = nn.Conv1d(
            1, out_channels=1, kernel_size=3, stride=2, padding=1)
        self.flatten = nn.Flatten(1)
        self.fc1 = nn.Linear(39, 128)
        self.fc2 = nn.Linear(128, 2)

    def forward(self, x):
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv1(x))
        x = self.flatten(x)
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x


In [11]:
model2 = Conv1dModel(n_features).to(device)
optimizer = torch.optim.Adam(model2.parameters(), lr=0.001)
loss_fn = nn.CrossEntropyLoss()

In [12]:
time_start = time.perf_counter()
train_model(model, dataloader_train, optimizer, loss_fn, device, MAX_EPOCHS)
time_elapsed = time.perf_counter() - time_start
print('Time taken: {}'.format(time2str(time_elapsed)))

Time taken: 00h00m00s


In [13]:
acc_train, loss_train = evaluate(dataloader_train, model, loss_fn, device)
acc_test, loss_test = evaluate(dataloader_test, model, loss_fn, device)
print('Train acc: {:.2f} loss: {:.3f}. Test acc: {:.2f} loss: {:.3f}'.format(
    acc_train * 100, loss_train, acc_test * 100, loss_test,))

Train acc: 97.20 loss: 0.057. Test acc: 77.49 loss: 1.285
