In [1]:
import csv
import numpy as np
import pandas as pd
import torch
import torch.nn as nn

# Featurizer class definition

class Featurizer():
    """Split a Ki-annotated table into NumPy feature and label arrays."""

    def __call__(self, df):
        """Filter rows by Ki and return ``(fingerprints, labels)``.

        Rows are kept when ``0.01 < Ki <= 1e4``. Every column except the
        first one is returned as a feature.
        NOTE(review): this presumes the first column is not a feature; if
        'Ki' is not the first column it ends up among the features as well --
        confirm against the CSV layout.

        Args:
            df: DataFrame containing a numeric 'Ki' column.

        Returns:
            fingerprints: 2-D array, one row per kept record, holding all
                columns but the first. It stays 2-D (zero rows) when no
                record passes the filter.
            labels: 1-D array with the 'Ki' value of each kept record.
        """
        ki = df['Ki']
        df = df[(ki <= 1e4) & (ki > 0.01)]

        # Slice the full value matrix once instead of looping over rows.
        # Taking the matrix of the whole frame (not of the sliced columns)
        # keeps the common-dtype upcast that row-wise iteration applies,
        # e.g. integer bit columns next to a float column come out as floats.
        values = df.to_numpy()
        # Copy into a fresh C-contiguous array so the result does not alias
        # the DataFrame's internal storage.
        fingerprints = np.array(values[:, 1:], order='C')
        labels = np.array(df['Ki'])
        return fingerprints, labels

In [2]:
# Read the cleaned CSV into a DataFrame, discarding rows with missing values

filename = '../cleaned_datasets/d2_Sub_clean.csv'
df = pd.read_csv(filename).dropna()

# Featurize, then wrap both NumPy arrays as tensors (memory is shared with the arrays)
featurizer = Featurizer()
fp_train, ki_train = (torch.from_numpy(array) for array in featurizer(df))

# Every fingerprint row needs exactly one Ki label
assert fp_train.shape[0] == ki_train.shape[0], 'X_train and y_train rows do not match'

FileNotFoundError: [Errno 2] No such file or directory: './cleaned_datasets/d2_Sub_clean.csv'

In [None]:
# Inspect the dimensions of the fingerprint tensor built in the loading cell
fp_train.shape

In [None]:
# Pair each fingerprint with its Ki value and serve them in shuffled mini-batches

from torch.utils.data import DataLoader, TensorDataset

train = tensor_ds = TensorDataset(fp_train, ki_train)

train_dataloader = DataLoader(dataset=train, shuffle=True, batch_size=64)

In [None]:
# Prefer the GPU when CUDA is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(device)

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable

class Encoder(nn.Module):
    """MLP encoder that maps an input vector to the mean and log-variance of a latent Gaussian.

    Layer widths are input_size -> 128 -> 64 -> 16, followed by two separate
    linear heads of width ``output_size``.
    """

    def __init__(self, input_size, output_size):
        """
        Args:
            input_size: number of input features.
            output_size: dimensionality of the latent code.
        """
        super(Encoder, self).__init__()
        self.fc1 = nn.Linear(input_size, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 16)
        self.fc41 = nn.Linear(16, output_size)  # head producing mu
        self.fc42 = nn.Linear(16, output_size)  # head producing logvar
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()  # not used in forward; kept so the attribute stays available

    def forward(self, x):
        """Return ``(mu, logvar)`` for input ``x``; both have last dimension ``output_size``."""
        h1 = self.relu(self.fc1(x))
        h2 = self.relu(self.fc2(h1))
        h3 = self.relu(self.fc3(h2))
        mu = self.fc41(h3)
        # The log-variance must come from its own head. Reusing fc41 here
        # would force logvar == mu and leave fc42 untrained.
        logvar = self.fc42(h3)
        return mu, logvar

class Decoder(nn.Module):
    """MLP decoder: latent code -> hidden_size -> 64 -> 128 -> output_size, squashed by a sigmoid."""

    def __init__(self, input_size, hidden_size, output_size):
        """
        Args:
            input_size: dimensionality of the latent code fed to the decoder.
            hidden_size: width of the first hidden layer.
            output_size: number of reconstructed features.
        """
        super().__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, 64)
        self.fc3 = nn.Linear(64, 128)
        self.fc4 = nn.Linear(128, output_size)
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Decode ``x`` into values in (0, 1) with last dimension ``output_size``."""
        hidden = x
        # Three ReLU-activated hidden layers, applied in order
        for layer in (self.fc1, self.fc2, self.fc3):
            hidden = self.relu(layer(hidden))
        # The output layer gets a sigmoid instead of a ReLU
        return self.sigmoid(self.fc4(hidden))

class VAE(nn.Module):
    """Variational autoencoder built from this file's Encoder and Decoder."""

    def __init__(self, input_size, latent_size):
        """
        Args:
            input_size: number of features per input vector.
            latent_size: dimensionality of the latent code.
        """
        # NOTE: this changes the process-wide default floating dtype to
        # float64, so it affects tensors and layers created afterwards
        # anywhere in the session, not only this model.
        torch.set_default_dtype(torch.float64)
        super().__init__()
        self.encoder = Encoder(input_size, latent_size)
        # The decoder's first hidden layer is fixed at 128 units
        self.decoder = Decoder(latent_size, 128, input_size)

    def reparameterize(self, mu, logvar):
        """Draw ``mu + sigma * noise`` with ``sigma = exp(logvar / 2)`` and standard-normal noise."""
        sigma = (0.5 * logvar).exp()
        noise = torch.randn_like(sigma)
        return noise * sigma + mu

    def forward(self, x):
        """Return ``(reconstruction, mu, logvar)`` for input ``x``."""
        mu, logvar = self.encoder(x)
        reconstruction = self.decoder(self.reparameterize(mu, logvar))
        return reconstruction, mu, logvar


In [None]:
class VAELoss(nn.Module):
    """Summed binary cross-entropy reconstruction term plus a KL-divergence term."""

    def __init__(self):
        super().__init__()

    def forward(self, recon_x, x, mu, logvar):
        """Return the scalar loss ``BCE + KLD``.

        Args:
            recon_x: reconstruction with values in [0, 1].
            x: target tensor of the same shape as ``recon_x``.
            mu: latent means.
            logvar: latent log-variances.
        """
        # Reconstruction error, summed (not averaged) over all elements
        reconstruction_term = nn.functional.binary_cross_entropy(recon_x, x, reduction='sum')
        # -0.5 * sum(1 + logvar - mu^2 - exp(logvar))
        kl_elements = 1 + logvar - mu.pow(2) - logvar.exp()
        kl_term = -0.5 * kl_elements.sum()
        return reconstruction_term + kl_term

In [None]:
def train_VAE(train_dataloader, fp_len, epochs=40, device=device, code_len=8, lr=0.00001):
    """Train a VAE on the fingerprints served by ``train_dataloader``.

    Args:
        train_dataloader: iterable yielding ``(fingerprints, label)`` batches;
            the label is ignored.
        fp_len: number of features per fingerprint (model input size).
        epochs: number of passes over the data.
        device: ``torch.device`` or device string to train on.
        code_len: dimensionality of the latent code.
        lr: learning rate for the Adam optimizer.

    Returns:
        The trained VAE, moved back to the CPU so it can be applied to CPU
        tensors without further device handling.

    Raises:
        ValueError: if ``train_dataloader`` yields no batches.
    """
    # Normalise to a torch.device and move tensors with .to(); comparing a
    # torch.device against the string 'cuda' does not match, which would
    # silently keep everything on the CPU.
    device = torch.device(device)
    model = VAE(fp_len, code_len).to(device)
    criterion = VAELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    for epoch in range(epochs):
        loss = None
        for (fp, _) in train_dataloader:
            fp = fp.to(device)
            recon, mu, logvar = model(fp)
            loss = criterion(recon, fp, mu, logvar)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        if loss is None:
            raise ValueError('train_dataloader yielded no batches')
        # Reports the loss of the last batch of the epoch
        print(f'Epoch: {epoch+1}, Loss: {loss.item():.4f}')
    return model.cpu()

In [None]:
model = train_VAE(train_dataloader, fp_len=fp_train.size(dim=1))

In [None]:
# encode all data to latent space
encoder = model.encoder
# Run on whichever device the model's weights live on, so inputs and weights
# always match (a string comparison against a torch.device does not).
model_device = next(model.parameters()).device
# No gradients are needed for inference; one batched pass replaces the
# per-sample loop.
with torch.no_grad():
    mu, logvar = encoder(fp_train.to(model_device))
    latent = model.reparameterize(mu, logvar)
# One latent vector per compound, as NumPy arrays on the CPU
encoded = list(latent.cpu().numpy())

In [None]:
# Convert the collected latent vectors into a single NumPy array
encoded = np.array(encoded)

In [None]:
# Flag active compounds: boolean mask that is True where Ki < 10 (no rows are removed)

activity = np.array(ki_train < 10)

In [None]:
# Latent coordinates as columns, plus the activity flag for each compound
vae_results = pd.DataFrame(encoded).assign(activity=activity)

In [None]:
# plot latent space
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

# Project the latent codes onto their first two principal components

pca = PCA(n_components=2)
pca_results = pd.DataFrame(pca.fit_transform(encoded)).assign(activity=activity)


In [None]:
# Embed the latent codes in two dimensions with t-SNE

print('...tSNE...')
tsne = TSNE(perplexity=20, learning_rate=100, verbose=1)
tsne_results = pd.DataFrame(tsne.fit_transform(encoded)).assign(activity=activity)

In [None]:
import seaborn as sns

# Scatter the PCA projection, coloured by the activity flag

sns.set_style('whitegrid')
colors = ['#D3D3D3', '#880808']
sns.scatterplot(data=pca_results, x=0, y=1, hue='activity', palette=colors, marker='.')
plt.gca().set_title('VAE PCA')
plt.xlabel('PC1')
plt.ylabel('PC2')

In [None]:
# Scatter the t-SNE embedding, coloured by the activity flag

sns.scatterplot(data=tsne_results, x=0, y=1, hue='activity', palette=colors, marker='.')
plt.gca().set_title('VAE t-SNE')
plt.xlabel('Ax1')
plt.ylabel('Ax2')