In [125]:
import csv
import numpy as np
import pandas as pd

# featurizer class definition

class Featurizer():
    
    def __init__(self, fp_len, fp_type):
        self.fp_len = fp_len
        self.fp_type = fp_type
    
    def __call__(self, df):
        fingerprints = []
        labels = []
        
        labels = df['Ki']
        fp = []
        for index, row in df.iterrows():
            fp = row[1:self.fp_len+1]
            fingerprints.append(fp)
            
        fingerprints = np.array(fingerprints)
        labels = np.array(labels)
        return fingerprints, labels
        
class KlekFeaturizer(Featurizer):
    def __init__(self, fp_len=4860, fp_type='Klek'):
        super().__init__(fp_len, fp_type)
    
class MACCSFeaturizer(Featurizer):
    def __init__(self, fp_len=166, fp_type='MACCS'):
        super().__init__(fp_len, fp_type)
        
class SubFeaturizer(Featurizer):
    def __init__(self, fp_len=307, fp_type='Sub'):
        super().__init__(fp_len, fp_type)

In [126]:
#file to DataFrame object

filename = 'data/5ht1a_MACCSFP_final_file.csv'
df = pd.read_csv(filename)
df = df.dropna()

featurizer = MACCSFeaturizer()
X_train, y_train = featurizer(df)

X_train = torch.from_numpy(X_train)
y_train = torch.from_numpy(y_train)

assert X_train.shape[0] == y_train.shape[0], 'X_train and y_train rows do not match'

In [127]:
from torch.utils.data import DataLoader, TensorDataset

tensor_ds = TensorDataset(X_train, y_train)
train = tensor_ds

train_dataloader = DataLoader(train, batch_size=64, shuffle=True)
#train_features, train_labels = next(iter(train_dataloader))

In [157]:
fp_len = featurizer.fp_len # Kelk=4860, MACCS=166, Sub=307

class Autoencoder(nn.Module):
    
    torch.set_default_dtype(torch.float64)
    
    def __init__(self, fp_len):
        super().__init__()
        self.encoder = nn.Sequential(
            nn.Linear(fp_len, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 16),
            nn.ReLU(),
            nn.Linear(16, 3),
        )
        
        self.decoder = nn.Sequential(
            nn.Linear(3, 16),
            nn.ReLU(),
            nn.Linear(16, 64),
            nn.ReLU(),
            nn.Linear(64, 128),
            nn.ReLU(),
            nn.Linear(128, fp_len),
            nn.Sigmoid()
        )
        
    def forward(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        result = torch.as_tensor((decoded - 0.5) > 0, dtype=torch.float64)
        print(result)
        return result

In [158]:
model = Autoencoder(featurizer.fp_len)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001, weight_decay=1e-5)

In [162]:
epochs = 50
outputs = []
for epoch in range(epochs):
    for (fp, _) in train_dataloader:
        fp = fp.reshape(-1, fp_len)
        recon = model(fp)
        loss = criterion(recon, fp)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
    print(f"Epoch: {epoch+1}, Loss :{loss.item()}")
    outputs.append((epoch, fp, recon))
print(outputs)

tensor([[1., 1., 1.,  ..., 1., 0., 0.],
        [1., 1., 1.,  ..., 1., 0., 0.],
        [1., 1., 1.,  ..., 1., 0., 0.],
        ...,
        [1., 1., 1.,  ..., 1., 0., 0.],
        [1., 1., 1.,  ..., 1., 0., 0.],
        [1., 1., 1.,  ..., 1., 0., 0.]])


RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn