In [14]:
pip install torch==2.0.0 torchvision==0.15.1

Note: you may need to restart the kernel to use updated packages.


In [None]:
pip install d2l==1.0.3

In [12]:
import pandas as pd

csv_file_path = 'METLIN_CCS_vectorfingerprintsVectorized.csv'

column_name = 'Dimer.1'
value_to_filter = 'Monomer'

original_data = pd.read_csv(csv_file_path, nrows = 105)

data = original_data[original_data['Dimer.1'] == 'Monomer']

# Reset the index
data.reset_index(drop=True, inplace=True)

# Save the filtered DataFrame back to the CSV file, overwriting the original file
data.to_csv('METLIN_CCS_vectorfingerprintsVectorized_filtered.csv', index=False)

print(data)

                                        Molecule Name Molecular Formula  \
0   3-[3-(2,3-dihydroindol-1-yl)propyl]-1-[(4-fluo...        C19H22FN3O   
1   3-{[1,1'-biphenyl]-4-ylmethylidene}-6-fluoro-2...        C22H15FO3S   
2   1-{[3-(4-methoxyphenyl)-1,2,4-oxadiazol-5-yl]m...        C25H28N4O3   
3   3-[2-oxo-2-(4-phenylmethanesulfonylpiperazin-1...       C22H30N4O5S   
4   2-{[1,1'-biphenyl]-2-ylamino}-1-(morpholin-4-y...        C18H20N2O2   
..                                                ...               ...   
95  1-[3-(3,4-dimethoxyphenyl)-5-(4-ethoxyphenyl)-...        C21H24N2O4   
96  2-(2-methylfuran-3-yl)-5-{[(2-phenyl-1,3-oxazo...       C17H13N3O3S   
97  4-{4H,6H,7H-thieno[3,2-c]pyridine-5-sulfonyl}b...      C14H12N2O2S2   
98  4-{2,4-dioxo-3-azatricyclo[7.3.1.0^{5,13}]trid...        C21H24N2O3   
99  2-{[2-(4-phenylpiperazine-1-carbonyl)phenyl]su...        C24H21N3OS   

    METLIN ID Precursor Adduct    CCS1    CCS2    CCS3  CCS_AVG  % CV  \
0   1181481.0    328.1820[

In [13]:
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
import numpy as np

adducts = data.iloc[:, 10]
adduct_types = ['[M+H]', '[M+Na]', '[2M+H]']
encoder = OrdinalEncoder()
adducts_2d = np.array(adduct_types).reshape(-1, 1)
encoded_adducts = encoder.fit_transform(adducts_2d)
print(encoded_adducts)

adducts_map = {'[M+H]': encoded_adducts[0], '[M+Na]': encoded_adducts[1], '[2M+H]': encoded_adducts[2]}
mapped_adducts = adducts.map(adducts_map)
print(mapped_adducts)

[[1.]
 [2.]
 [0.]]
0     [1.0]
1     [1.0]
2     [1.0]
3     [1.0]
4     [1.0]
      ...  
95    [1.0]
96    [1.0]
97    [1.0]
98    [1.0]
99    [1.0]
Name: Adduct, Length: 100, dtype: object


In [14]:
import torch

fingerprints = data.iloc[:, 21:121]
mz = data.iloc[:, 9]
fingerprints.insert(0, 'm/z', mz)
X = fingerprints # faltan los adducts 
y = data.iloc[:, 7]
print(y)

0     176.63
1     192.26
2     211.12
3     204.22
4     174.47
       ...  
95    196.76
96    178.12
97    167.71
98    189.91
99    189.10
Name: CCS_AVG, Length: 100, dtype: float64


In [4]:
import os

# Display the name of the active environment
print(f"Active environment: {os.environ['CONDA_DEFAULT_ENV']}")

Active environment: base


In [25]:
from models import SAINT
from data_openml import data_prep_openml,task_dset_ids,DataSetCatCon, data_split
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
import torch.optim as optim
from augmentations import embed_data_mask
from sklearn.preprocessing import LabelEncoder


categorical_indicator = [False] + [True] * 100 # faltan los adducts
categorical_columns = X.columns[1:101].tolist()
# categorical_columns = X.columns[list(np.where(np.array(categorical_indicator)==True)[0])].tolist()
cont_columns = list(set(X.columns.tolist()) - set(categorical_columns))

cat_dims = [2] * 100 # 2: fingerprints solo pueden tomar dos valores (binario)
cat_dims = np.append(np.array([1]),np.array(cat_dims)).astype(int) #Appending 1 for CLS token, this is later used to generate embeddings.
 
cat_idxs = list(np.where(np.array(categorical_indicator)==True)[0])
con_idxs = list(set(range(len(X.columns))) - set(cat_idxs))

for col in categorical_columns:
    X[col] = X[col].astype("category")

X["Set"] = np.random.choice(["train", "valid", "test"], p = [0.65, 0.15, 0.2], size=(X.shape[0],))

train_indices = X[X.Set=="train"].index
valid_indices = X[X.Set=="valid"].index
test_indices = X[X.Set=="test"].index
# print(train_indices, valid_indices, test_indices)

y = pd.DataFrame(y).values

X = X.drop(columns=['Set'])

nan_mask = X.isna().astype(int)

X_train, y_train = data_split(X,y,nan_mask,train_indices)
X_valid, y_valid = data_split(X,y,nan_mask,valid_indices)
X_test, y_test = data_split(X,y,nan_mask,test_indices)

train_mean, train_std = np.array(X_train['data'][:,con_idxs],dtype=np.float32).mean(0), np.array(X_train['data'][:,con_idxs],dtype=np.float32).std(0)
train_std = np.where(train_std < 1e-6, 1e-6, train_std)


continuous_mean_std = np.array([train_mean,train_std]).astype(np.float32) 
print(continuous_mean_std)

train_ds = DataSetCatCon(X_train, y_train, cat_idxs,'reg',continuous_mean_std)
trainloader = DataLoader(train_ds, batch_size=256, shuffle=True,num_workers=4)

valid_ds = DataSetCatCon(X_valid, y_valid, cat_idxs,'reg', continuous_mean_std)
validloader = DataLoader(valid_ds, batch_size=256, shuffle=False,num_workers=4)
print("validloader: ", validloader)

test_ds = DataSetCatCon(X_test, y_test, cat_idxs,'reg', continuous_mean_std)
testloader = DataLoader(test_ds, batch_size=256, shuffle=False,num_workers=4)

[[398.45184]
 [ 76.58002]]
validloader:  <torch.utils.data.dataloader.DataLoader object at 0x00000283FBA30C10>


In [40]:
from torch import nn
from utils import count_parameters, classification_scores, mean_sq_error

model = SAINT(
    categories = tuple(cat_dims), 
    num_continuous = len(con_idxs),                
    dim = 32, # default                         
    dim_out = 1,                       
    depth = 6, # default                        
    heads = 8, # default                         
    attn_dropout = 0.1, # default              
    ff_dropout = 0.1, # default                  
    mlp_hidden_mults = (4, 2),       
    cont_embeddings = 'MLP', # default 
    attentiontype = 'colrow', # default 
    final_mlp_style = 'sep', # default
    y_dim = 1 # porque es regression 
)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f"Device is {device}.")

vision_dset = True
scheduler = 'cosine' # default ?¿?¿?¿?¿?¿?¿?

optimizer = optim.SGD(model.parameters(), lr=0.0001, momentum=0.9, weight_decay=5e-4)
criterion = nn.MSELoss().to(device)
model.to(device)

for epoch in range(100):
    model.train()
    running_loss = 0.0
    for i, data in enumerate(trainloader, 0):
        optimizer.zero_grad()
        x_categ, x_cont, y_gts, cat_mask, con_mask = data[0].to(device), data[1].to(device),data[2].to(device),data[3].to(device),data[4].to(device)

        # checkeo de lo que hace embed_data_mask antes del error (todo ok)
        print(x_categ)
        print("x_categ shape:", x_categ.shape)
        print("model.categories_offset shape:", model.categories_offset.shape)
        x_categ2 = x_categ + model.categories_offset.type_as(x_categ)
        print(x_categ2.shape)
        num_embeddings = model.embeds.num_embeddings
        print("Number of embeddings:", num_embeddings)
        if x_categ.max().item() >= num_embeddings or x_categ.min().item() < 0:
            print("Error: Indices out of range after addition.")

        _ , x_categ_enc, x_cont_enc = embed_data_mask(x_categ, x_cont, cat_mask, con_mask, model, vision_dset)           
        reps = model.transformer(x_categ_enc, x_cont_enc)
        y_reps = reps[:,0,:]
        y_outs = model.mlpfory(y_reps)
        loss = criterion(y_outs,y_gts) 
        loss.backward()
        optimizer.step()
    #         scheduler.step() # ??????????????
        running_loss += loss.item()
    model.eval()
    with torch.no_grad():
        valid_rmse = mean_sq_error(model, validloader, device,vision_dset)    
        test_rmse = mean_sq_error(model, testloader, device,vision_dset)  
        print('[EPOCH %d] VALID RMSE: %.3f' %
            (epoch + 1, valid_rmse ))
        print('[EPOCH %d] TEST RMSE: %.3f' %
            (epoch + 1, test_rmse ))
        model.train()

Device is cpu.
tensor([[0, 0, 1,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 1, 0,  ..., 0, 0, 0],
        [0, 1, 0,  ..., 0, 0, 0]])
x_categ shape: torch.Size([65, 101])
model.categories_offset shape: torch.Size([101])
torch.Size([65, 101])
Number of embeddings: 201
Error: Indices out of range after addition.


IndexError: index out of range in self

In [39]:
valid_rmse = mean_sq_error(model, validloader, device,vision_dset) # porque aquí no sale el error??????
test_rmse = mean_sq_error(model, testloader, device,vision_dset)  
print(valid_rmse, test_rmse)

IndexError: index out of range in self

In [None]:
from sklearn.model_selection import train_test_split
import torch.optim as optim
import tqdm
import copy
import matplotlib.pyplot as plt

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, shuffle=True)

X_train = torch.tensor(X_train, dtype=torch.float32)
y_train = torch.tensor(y_train, dtype=torch.float32).reshape(-1, 1)
X_test = torch.tensor(X_test, dtype=torch.float32)
y_test = torch.tensor(y_test, dtype=torch.float32).reshape(-1, 1)

model = nn.Sequential(
    nn.Linear(2218, 24),
    nn.ReLU(),
    nn.Linear(24, 12),
    nn.ReLU(),
    nn.Linear(12, 6),
    nn.ReLU(),
    nn.Linear(6, 1)
)

loss_fn = nn.MSELoss()  # mean square error
optimizer = optim.Adam(model.parameters(), lr=0.0001)

n_epochs = 100   # number of epochs to run
batch_size = 10  # size of each batch
batch_start = torch.arange(0, len(X_train), batch_size)

best_mse = np.inf   # init to infinity
best_weights = None
history = []

for epoch in range(n_epochs):
    model.train()
    with tqdm.tqdm(batch_start, unit="batch", mininterval=0, disable=True) as bar:
        bar.set_description(f"Epoch {epoch}")
        for start in bar:
            # take a batch
            X_batch = X_train[start:start+batch_size]
            y_batch = y_train[start:start+batch_size]
            # forward pass
            y_pred = model(X_batch)
            loss = loss_fn(y_pred, y_batch)
            # backward pass
            optimizer.zero_grad()
            loss.backward()
            # update weights
            optimizer.step()
            # print progress
            bar.set_postfix(mse=float(loss))
    # evaluate accuracy at end of each epoch
    model.eval()
    y_pred = model(X_test)
    mse = loss_fn(y_pred, y_test)
    mse = float(mse)
    history.append(mse)
    if mse < best_mse:
        best_mse = mse
        best_weights = copy.deepcopy(model.state_dict())


model.load_state_dict(best_weights)
print("MSE: %.2f" % best_mse)
print("RMSE: %.2f" % np.sqrt(best_mse))
plt.plot(history)
plt.show()

In [None]:
import time
import numpy as np
import torch
from torch import nn
from d2l import torch as d2l
from torch.utils.data import TensorDataset, DataLoader

class MyDataModule(d2l.DataModule):
    def __init__(self, X, y, p_train=0.7): # training data: 70% of dataset
        super().__init__()
        self.save_hyperparameters()
        self.X = X
        self.y = y

    def get_tensorloader(self, tensors, train, indices=slice(0, None)):
        #tensors = tuple(a[indices] for a in tensors)
        dataset = torch.utils.data.TensorDataset(*tensors)
        return torch.utils.data.DataLoader(dataset, shuffle=train)
    
    def get_dataloader(self, train):
        i = slice(0, self.p_train*len(self.X)) if train else slice(self.p_train*len(self.X), None)
        return self.get_tensorloader((self.X, self.y), train, i)


class LinearRegressionScratch(d2l.Module):  #@save
    """The linear regression model implemented from scratch."""
    def __init__(self, num_inputs, lr, sigma=0.01):
        super().__init__()
        self.save_hyperparameters()
        self.w = torch.normal(0, sigma, (num_inputs, 1), requires_grad=True)
        self.b = torch.zeros(1, requires_grad=True)

    def forward(self, X):
        return torch.matmul(X.double(), self.w.double()) + self.b.double()

    def loss(self, y_hat, y):
        l = (y_hat - y) ** 2 / 2
        return l.mean()

    def configure_optimizers(self):
        return SGD([self.w, self.b], self.lr)


class SGD(d2l.HyperParameters):  #@save
    """Minibatch stochastic gradient descent."""
    def __init__(self, params, lr):
        self.save_hyperparameters()

    def step(self):
        for param in self.params:
            param -= self.lr * param.grad

    def zero_grad(self):
        for param in self.params:
            if param.grad is not None:
                param.grad.zero_()


my_data = MyDataModule(X, y)
model = LinearRegressionScratch(X.shape[1], lr=0.001)
trainer = d2l.Trainer(max_epochs=3)
trainer.fit(model, my_data)