In [1]:
##
import pickle

import pandas as pd
from sklearn.preprocessing import OrdinalEncoder

############# Values to be defined at the beginning of the script
INCLUDE_DIMERS = False
USE_N_FGPs = 100 
USE_NROWS = 500
############

with open("features_dict.pkl", "rb") as f:
    features_dict = pickle.load(f)

y = features_dict["ccs"]

X = features_dict["fingerprint"][:, :USE_N_FGPs]
X = pd.DataFrame(X)
X.columns = [f"fgp_{i}" for i in range(1, X.shape[1] + 1)]
X.insert(0, "mz", features_dict["mz"])
X.insert(1, "adduct", features_dict["adduct"])

if not INCLUDE_DIMERS:
    y = y[~X["adduct"].str.startswith("Dimer")]
    X = X[~X["adduct"].str.startswith("Dimer")]

# Substitute adduct by ordinal encoder
adduct_encoder = OrdinalEncoder()
X["adduct"] = (
    adduct_encoder.fit_transform(X["adduct"].values.reshape(-1, 1)).astype("int")
)

n_adducts = len(adduct_encoder.categories_[0])
X = X.iloc[:USE_NROWS]
y = y[:USE_NROWS]

X = X.reset_index(drop=True)

assert X.shape[0] == y.shape[0]
##

categorical_indicator = [False] + [True] * (X.shape[1] - 1)
categorical_columns = X.columns[1:].tolist()
cont_columns = list(set(X.columns.tolist()) - set(categorical_columns))

cat_dims = [n_adducts] + [2] * (X.shape[1] - 2) # 2: fingerprints solo pueden tomar dos valores (binario)

In [2]:
from models import SAINT
from data_openml import data_prep_openml,task_dset_ids,DataSetCatCon, data_split
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
import torch.optim as optim
from augmentations import embed_data_mask
from sklearn.preprocessing import LabelEncoder
import numpy as np


# categorical_indicator = [False] + [True] * 100 # faltan los adducts
# categorical_columns = X.columns[1:101].tolist()
# # categorical_columns = X.columns[list(np.where(np.array(categorical_indicator)==True)[0])].tolist()
# cont_columns = list(set(X.columns.tolist()) - set(categorical_columns))

# cat_dims = [2] * 100 # 2: fingerprints solo pueden tomar dos valores (binario)
cat_dims = np.append(np.array([1]),np.array(cat_dims)).astype(int) #Appending 1 for CLS token, this is later used to generate embeddings.
 
cat_idxs = list(np.where(np.array(categorical_indicator)==True)[0])
con_idxs = list(set(range(len(X.columns))) - set(cat_idxs))

for col in categorical_columns:
    X[col] = X[col].astype("category")

X["Set"] = np.random.choice(["train", "valid", "test"], p = [0.65, 0.15, 0.2], size=(X.shape[0],))

train_indices = X[X.Set=="train"].index
valid_indices = X[X.Set=="valid"].index
test_indices = X[X.Set=="test"].index
# print(train_indices, valid_indices, test_indices)

y = pd.DataFrame(y).values

X = X.drop(columns=['Set'])

nan_mask = X.isna().astype(int)

X_train, y_train = data_split(X,y,nan_mask,train_indices)
X_valid, y_valid = data_split(X,y,nan_mask,valid_indices)
X_test, y_test = data_split(X,y,nan_mask,test_indices)

train_mean, train_std = np.array(X_train['data'][:,con_idxs],dtype=np.float32).mean(0), np.array(X_train['data'][:,con_idxs],dtype=np.float32).std(0)
train_std = np.where(train_std < 1e-6, 1e-6, train_std)


continuous_mean_std = np.array([train_mean,train_std]).astype(np.float32) 
print(continuous_mean_std)

train_ds = DataSetCatCon(X_train, y_train, cat_idxs,'reg',continuous_mean_std)
trainloader = DataLoader(train_ds, batch_size=256, shuffle=True,num_workers=4)

valid_ds = DataSetCatCon(X_valid, y_valid, cat_idxs,'reg', continuous_mean_std)
validloader = DataLoader(valid_ds, batch_size=256, shuffle=False,num_workers=4)
print("validloader: ", validloader)

test_ds = DataSetCatCon(X_test, y_test, cat_idxs,'reg', continuous_mean_std)
testloader = DataLoader(test_ds, batch_size=256, shuffle=False,num_workers=4)

[[419.4919 ]
 [ 62.17139]]
validloader:  <torch.utils.data.dataloader.DataLoader object at 0x000001DC8E8E88D0>


In [3]:
import torch
from torch import nn
from utils import count_parameters, classification_scores, mean_sq_error

model = SAINT(
    categories = tuple(cat_dims), 
    num_continuous = len(con_idxs),                
    dim = 32, # default                         
    dim_out = 1,                       
    depth = 1,                        
    heads = 4,                         
    attn_dropout = 0.8, # default              
    ff_dropout = 0.8, # default                  
    mlp_hidden_mults = (4, 2),       
    cont_embeddings = 'MLP', # default 
    attentiontype = 'colrow', # default 
    final_mlp_style = 'sep', # default
    y_dim = 1 # porque es regression 
)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f"Device is {device}.")

vision_dset = False # porque los datos no son imágenes
# scheduler = 'cosine' # default ?¿?¿?¿?¿?¿?¿?


criterion = nn.MSELoss().to(device)
model.to(device)
optimizer = optim.AdamW(model.parameters(),lr=0.0001)

for epoch in range(100):
    model.train()
    running_loss = 0.0
    for i, data in enumerate(trainloader, 0):
        optimizer.zero_grad()
        x_categ, x_cont, y_gts, cat_mask, con_mask = data[0].to(device), data[1].to(device),data[2].to(device),data[3].to(device),data[4].to(device)
        _ , x_categ_enc, x_cont_enc = embed_data_mask(x_categ, x_cont, cat_mask, con_mask, model, vision_dset)           
        reps = model.transformer(x_categ_enc, x_cont_enc)
        y_reps = reps[:,0,:]
        y_outs = model.mlpfory(y_reps)
        loss = criterion(y_outs,y_gts) 
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        model.eval()
        with torch.no_grad():
            valid_rmse = mean_sq_error(model, validloader, device,vision_dset)    
            test_rmse = mean_sq_error(model, testloader, device,vision_dset)  
            print('[EPOCH %d] VALID RMSE: %.3f' %
                (epoch + 1, valid_rmse ))
            print('[EPOCH %d] TEST RMSE: %.3f' %
                (epoch + 1, test_rmse ))
        model.train()

Device is cpu.
[EPOCH 1] VALID RMSE: 199.006
[EPOCH 1] TEST RMSE: 200.712
[EPOCH 1] VALID RMSE: 197.052
[EPOCH 1] TEST RMSE: 198.756
[EPOCH 2] VALID RMSE: 194.418
[EPOCH 2] TEST RMSE: 196.120
[EPOCH 2] VALID RMSE: 190.903
[EPOCH 2] TEST RMSE: 192.603
[EPOCH 3] VALID RMSE: 186.016
[EPOCH 3] TEST RMSE: 187.712
[EPOCH 3] VALID RMSE: 179.328
[EPOCH 3] TEST RMSE: 181.019
[EPOCH 4] VALID RMSE: 170.266
[EPOCH 4] TEST RMSE: 171.948
[EPOCH 4] VALID RMSE: 158.329
[EPOCH 4] TEST RMSE: 159.999
[EPOCH 5] VALID RMSE: 142.928
[EPOCH 5] TEST RMSE: 144.579
[EPOCH 5] VALID RMSE: 123.450
[EPOCH 5] TEST RMSE: 125.070
[EPOCH 6] VALID RMSE: 98.869
[EPOCH 6] TEST RMSE: 100.431
[EPOCH 6] VALID RMSE: 68.397
[EPOCH 6] TEST RMSE: 69.824
[EPOCH 7] VALID RMSE: 32.356
[EPOCH 7] TEST RMSE: 33.217
[EPOCH 7] VALID RMSE: 23.394
[EPOCH 7] TEST RMSE: 20.885
[EPOCH 8] VALID RMSE: 65.147
[EPOCH 8] TEST RMSE: 62.962
[EPOCH 8] VALID RMSE: 90.770
[EPOCH 8] TEST RMSE: 88.674
[EPOCH 9] VALID RMSE: 88.387
[EPOCH 9] TEST RMSE: 86