In [14]:
pip install torch==2.0.0 torchvision==0.15.1

Note: you may need to restart the kernel to use updated packages.


In [None]:
pip install d2l==1.0.3

In [1]:
import pandas as pd

csv_file_path = 'METLIN_CCS_vectorfingerprintsVectorized.csv'

column_name = 'Dimer.1'
value_to_filter = 'Monomer'

original_data = pd.read_csv(csv_file_path, nrows = 20)

data = original_data[original_data['Dimer.1'] == 'Monomer']

# Save the filtered DataFrame back to the CSV file, overwriting the original file
data.to_csv('METLIN_CCS_vectorfingerprintsVectorized_filtered.csv', index=False)

print(data)

                                        Molecule Name Molecular Formula  \
0   3-[3-(2,3-dihydroindol-1-yl)propyl]-1-[(4-fluo...        C19H22FN3O   
1   3-{[1,1'-biphenyl]-4-ylmethylidene}-6-fluoro-2...        C22H15FO3S   
2   1-{[3-(4-methoxyphenyl)-1,2,4-oxadiazol-5-yl]m...        C25H28N4O3   
3   3-[2-oxo-2-(4-phenylmethanesulfonylpiperazin-1...       C22H30N4O5S   
4   2-{[1,1'-biphenyl]-2-ylamino}-1-(morpholin-4-y...        C18H20N2O2   
5   1-(2-{[1,1'-biphenyl]-2-ylamino}acetyl)imidazo...        C17H17N3O2   
6   N-(2,3-dihydro-1,4-benzodioxin-6-yl)-2-{[1-(4-...      C23H21N5O3S2   
7                                              Tm_322                     
8   N-[(1-benzyl-3,5-dimethylpyrazol-4-yl)methyl]-...       C24H25N5O2S   
9   N-{[1,1'-biphenyl]-4-yl}-2-(2-oxoquinoxalin-1-...        C22H17N3O2   
10                                             Tm_322                     
11  N-(4-ethoxyphenyl)-2-{2-[5-(4-methylphenyl)-1,...        C25H24N6O3   
12                       

In [2]:
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
import numpy as np

adducts = data.iloc[:, 10]
adduct_types = ['[M+H]', '[M+Na]', '[2M+H]']
encoder = OrdinalEncoder()
adducts_2d = np.array(adduct_types).reshape(-1, 1)
encoded_adducts = encoder.fit_transform(adducts_2d)
print(encoded_adducts)

adducts_map = {'[M+H]': encoded_adducts[0], '[M+Na]': encoded_adducts[1], '[2M+H]': encoded_adducts[2]}
mapped_adducts = adducts.map(adducts_map)
print(mapped_adducts)

[[1.]
 [2.]
 [0.]]
0     [1.0]
1     [1.0]
2     [1.0]
3     [1.0]
4     [1.0]
5     [1.0]
6     [1.0]
7     [1.0]
8     [1.0]
9     [1.0]
10    [1.0]
11    [1.0]
12    [1.0]
14    [1.0]
15    [1.0]
16    [1.0]
17    [1.0]
18    [1.0]
19    [1.0]
Name: Adduct, dtype: object


In [3]:
import torch

# adduct = torch.tensor(mapped_adducts)
fingerprints = torch.tensor(data.iloc[:, 21:121].to_numpy(dtype=int))
# fingerprints = data.iloc[:, 21:121]
print(fingerprints.shape)
# fingerprints = torch.tensor(data.iloc[:, 21:2235].to_numpy(dtype=int))
mz = torch.tensor(data.iloc[:, 9].to_numpy(dtype=float))
# mz = data.iloc[:, 9]
 
# dataset_tensor = X    
# X = torch.cat((mz.unsqueeze(1), fingerprints, adduct), dim=1) 
X = torch.cat((mz.unsqueeze(1), fingerprints), dim=1)
# fingerprints.insert(0, 'm/z', mz)
# fingerprints['mz'] = mz
# X = fingerprints

# X = data.iloc[:, 21:121]
# mz.unsqueeze convierte el vector mz de dimension [100] en una matriz de dimensiones [100,1], para poder concatenarlo

# target = y
y = torch.tensor(data.iloc[:, 7].to_numpy(dtype=float))
# y = data.iloc[:, 7].to_numpy()

print(X)
print(y.shape)

torch.Size([19, 100])
tensor([[328.1820,   0.0000,   0.0000,  ...,   0.0000,   0.0000,   0.0000],
        [379.0799,   1.0000,   0.0000,  ...,   0.0000,   0.0000,   0.0000],
        [433.2234,   0.0000,   0.0000,  ...,   0.0000,   0.0000,   0.0000],
        ...,
        [509.1676,   0.0000,   0.0000,  ...,   0.0000,   0.0000,   0.0000],
        [311.1754,   1.0000,   0.0000,  ...,   0.0000,   0.0000,   0.0000],
        [363.2067,   1.0000,   0.0000,  ...,   0.0000,   0.0000,   0.0000]],
       dtype=torch.float64)
torch.Size([19])


In [None]:
import os

# Display the name of the active environment
print(f"Active environment: {os.environ['CONDA_DEFAULT_ENV']}")

In [26]:
from models import SAINT
from data_openml import data_prep_openml,task_dset_ids,DataSetCatCon, data_split
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
import torch.optim as optim
from augmentations import embed_data_mask


categorical_indicator = [False] + [True] * 100 # faltan los adducts
cat_dims = [2] * 100 # 2: fingerprints solo pueden tomar dos valores (binario)
cat_dims = np.append(np.array([1]),np.array(cat_dims)).astype(int) #Appending 1 for CLS token, this is later used to generate embeddings.
  
cat_idxs = list(np.where(np.array(categorical_indicator)==True)[0]) # discrete/categorical features
print("cat idxs", cat_idxs)
con_idxs = list(set(range(X.size(1))) - set(cat_idxs)) # continuous/numerical features 

model = SAINT(
    categories = tuple(cat_dims), 
    num_continuous = len(con_idxs),                
    dim = 32, # default                         
    dim_out = 1,                       
    depth = 6, # default                        
    heads = 8, # default                         
    attn_dropout = 0.1, # default              
    ff_dropout = 0.1, # default                  
    mlp_hidden_mults = (4, 2),       
    cont_embeddings = 'MLP', # default 
    attentiontype = 'colrow', # default 
    final_mlp_style = 'sep', # default
    y_dim = 1 # porque es regression 
)



label_map = {"train": 0, "valid": 1, "test": 2}
kk = np.random.choice(["train", "valid", "test"], p = [0.65, 0.15, 0.2], size=(X.shape[0],))
np_set_array_mapped = np.vectorize(label_map.get)(kk)
torch_set_tensor = torch.from_numpy(np_set_array_mapped).long()
torch_set_tensor = torch_set_tensor.unsqueeze(1)  
X_with_set = torch.cat((X, torch_set_tensor), dim=1)

train_mask = X_with_set[:, -1] == 0
train_indices = torch.nonzero(train_mask).squeeze()
print("train indices", train_indices)

valid_mask = X_with_set[:, -1] == 1
valid_indices = torch.nonzero(valid_mask).squeeze()
print("valid indices", valid_indices)

test_mask = X_with_set[:, -1] == 2
test_indices = torch.nonzero(test_mask).squeeze()
print("test indices", test_indices)

nan_mask = torch.isnan(X)
nan_mask_y = torch.isnan(y)


X_train = {
    'data': X[train_indices],
    'mask': X[nan_mask] 
}
# if X_train['data'].shape != X_train['mask'].shape:
#     raise'Shape of data not same as that of nan mask!'
    
X_valid = {
    'data': X[valid_indices],
    'mask': X[nan_mask] 
}
# if X_valid['data'].shape != X_valid['mask'].shape:
#     raise'Shape of data not same as that of nan mask!'

X_test = {
    'data': X[test_indices],
    'mask': X[nan_mask] 
}
# if X_test['data'].shape != X_test['mask'].shape:
#     raise'Shape of data not same as that of nan mask!'
        
y_train = {
    'data': y[train_indices].reshape(-1, 1)
} 

y_valid = {
    'data': y[valid_indices].reshape(-1, 1)
} 

y_test = {
    'data': y[test_indices].reshape(-1, 1)
} 

optimizer = optim.SGD(model.parameters(), lr=0.0001, momentum=0.9, weight_decay=5e-4)

cont = torch.index_select(X_train['data'], 1, torch.tensor(con_idxs))

train_mean = torch.mean(cont, dim=0)
train_std = torch.std(cont, dim=0)

continuous_mean_std = np.array([train_mean,train_std]).astype(np.float32) 

train_ds = DataSetCatCon(X_train, y_train, cat_idxs,'reg',continuous_mean_std)
trainloader = DataLoader(train_ds, batch_size=256, shuffle=True,num_workers=4)

# valid_ds = DataSetCatCon(X_valid, y_valid, cat_idxs,'reg', continuous_mean_std)
# validloader = DataLoader(valid_ds, batch_size=256, shuffle=False,num_workers=4)

# test_ds = DataSetCatCon(X_test, y_test, cat_idxs,'reg', continuous_mean_std)
# testloader = DataLoader(test_ds, batch_size=256, shuffle=False,num_workers=4)


device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f"Device is {device}.")

vision_dset = 'store_true'
scheduler = 'cosine' # default 

# for epoch in range(100):
#     model.train()
#     running_loss = 0.0
#     for i, data in enumerate(trainloader, 0):
#         optimizer.zero_grad()
#         x_categ, x_cont, y_gts, cat_mask, con_mask = data[0].to(device), data[1].to(device),data[2].to(device),data[3].to(device),data[4].to(device)
#         _ , x_categ_enc, x_cont_enc = embed_data_mask(x_categ, x_cont, cat_mask, con_mask,model,vision_dset)           
#         reps = model.transformer(x_categ_enc, x_cont_enc)
#         y_reps = reps[:,0,:]
#         y_outs = model.mlpfory(y_reps)
#         loss = criterion(y_outs,y_gts) 
#         loss.backward()
#         optimizer.step()
#         scheduler.step()
#         running_loss += loss.item()
#     model.eval()
#     with torch.no_grad():
#         valid_rmse = mean_sq_error(model, validloader, device,vision_dset)    
#         test_rmse = mean_sq_error(model, testloader, device,vision_dset)  
#         print('[EPOCH %d] VALID RMSE: %.3f' %
#             (epoch + 1, valid_rmse ))
#         print('[EPOCH %d] TEST RMSE: %.3f' %
#             (epoch + 1, test_rmse ))
#     model.train()

cat idxs [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100]
train indices tensor([ 1,  2,  4,  8, 12, 14, 15, 16])
valid indices tensor([ 3,  6,  7, 11, 13])
test indices tensor([ 0,  5,  9, 10, 17, 18])


  continuous_mean_std = np.array([train_mean,train_std]).astype(np.float32)
  continuous_mean_std = np.array([train_mean,train_std]).astype(np.float32)


AttributeError: 'Tensor' object has no attribute 'copy'

In [12]:
pd.DataFrame(X_train).copy()
# pd.DataFrame(X_train['data']).copy()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,91,92,93,94,95,96,97,98,99,100
0,328.182,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,433.2234,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,296.1394,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,322.0481,-2147484000.0,-2147484000.0,-2147484000.0,-2147484000.0,-2147484000.0,-2147484000.0,-2147484000.0,-2147484000.0,-2147484000.0,...,-2147484000.0,-2147484000.0,-2147484000.0,-2147484000.0,-2147484000.0,-2147484000.0,-2147484000.0,-2147484000.0,-2147484000.0,-2147484000.0
4,448.1802,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,322.0481,-2147484000.0,-2147484000.0,-2147484000.0,-2147484000.0,-2147484000.0,-2147484000.0,-2147484000.0,-2147484000.0,-2147484000.0,...,-2147484000.0,-2147484000.0,-2147484000.0,-2147484000.0,-2147484000.0,-2147484000.0,-2147484000.0,-2147484000.0,-2147484000.0,-2147484000.0
6,509.1676,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,311.1754,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [None]:
from sklearn.model_selection import train_test_split
import torch.optim as optim
import tqdm
import copy
import matplotlib.pyplot as plt

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, shuffle=True)

X_train = torch.tensor(X_train, dtype=torch.float32)
y_train = torch.tensor(y_train, dtype=torch.float32).reshape(-1, 1)
X_test = torch.tensor(X_test, dtype=torch.float32)
y_test = torch.tensor(y_test, dtype=torch.float32).reshape(-1, 1)

model = nn.Sequential(
    nn.Linear(2218, 24),
    nn.ReLU(),
    nn.Linear(24, 12),
    nn.ReLU(),
    nn.Linear(12, 6),
    nn.ReLU(),
    nn.Linear(6, 1)
)

loss_fn = nn.MSELoss()  # mean square error
optimizer = optim.Adam(model.parameters(), lr=0.0001)

n_epochs = 100   # number of epochs to run
batch_size = 10  # size of each batch
batch_start = torch.arange(0, len(X_train), batch_size)

best_mse = np.inf   # init to infinity
best_weights = None
history = []

for epoch in range(n_epochs):
    model.train()
    with tqdm.tqdm(batch_start, unit="batch", mininterval=0, disable=True) as bar:
        bar.set_description(f"Epoch {epoch}")
        for start in bar:
            # take a batch
            X_batch = X_train[start:start+batch_size]
            y_batch = y_train[start:start+batch_size]
            # forward pass
            y_pred = model(X_batch)
            loss = loss_fn(y_pred, y_batch)
            # backward pass
            optimizer.zero_grad()
            loss.backward()
            # update weights
            optimizer.step()
            # print progress
            bar.set_postfix(mse=float(loss))
    # evaluate accuracy at end of each epoch
    model.eval()
    y_pred = model(X_test)
    mse = loss_fn(y_pred, y_test)
    mse = float(mse)
    history.append(mse)
    if mse < best_mse:
        best_mse = mse
        best_weights = copy.deepcopy(model.state_dict())


model.load_state_dict(best_weights)
print("MSE: %.2f" % best_mse)
print("RMSE: %.2f" % np.sqrt(best_mse))
plt.plot(history)
plt.show()

In [None]:
import time
import numpy as np
import torch
from torch import nn
from d2l import torch as d2l
from torch.utils.data import TensorDataset, DataLoader

class MyDataModule(d2l.DataModule):
    def __init__(self, X, y, p_train=0.7): # training data: 70% of dataset
        super().__init__()
        self.save_hyperparameters()
        self.X = X
        self.y = y

    def get_tensorloader(self, tensors, train, indices=slice(0, None)):
        #tensors = tuple(a[indices] for a in tensors)
        dataset = torch.utils.data.TensorDataset(*tensors)
        return torch.utils.data.DataLoader(dataset, shuffle=train)
    
    def get_dataloader(self, train):
        i = slice(0, self.p_train*len(self.X)) if train else slice(self.p_train*len(self.X), None)
        return self.get_tensorloader((self.X, self.y), train, i)


class LinearRegressionScratch(d2l.Module):  #@save
    """The linear regression model implemented from scratch."""
    def __init__(self, num_inputs, lr, sigma=0.01):
        super().__init__()
        self.save_hyperparameters()
        self.w = torch.normal(0, sigma, (num_inputs, 1), requires_grad=True)
        self.b = torch.zeros(1, requires_grad=True)

    def forward(self, X):
        return torch.matmul(X.double(), self.w.double()) + self.b.double()

    def loss(self, y_hat, y):
        l = (y_hat - y) ** 2 / 2
        return l.mean()

    def configure_optimizers(self):
        return SGD([self.w, self.b], self.lr)


class SGD(d2l.HyperParameters):  #@save
    """Minibatch stochastic gradient descent."""
    def __init__(self, params, lr):
        self.save_hyperparameters()

    def step(self):
        for param in self.params:
            param -= self.lr * param.grad

    def zero_grad(self):
        for param in self.params:
            if param.grad is not None:
                param.grad.zero_()


my_data = MyDataModule(X, y)
model = LinearRegressionScratch(X.shape[1], lr=0.001)
trainer = d2l.Trainer(max_epochs=3)
trainer.fit(model, my_data)