In [1]:
from data import load_data, data_prep
from train import train, plot_learning_curve, objective
from data_openml import DataSetCatCon
from torch.utils.data import DataLoader
from models import SAINT
import torch.optim as optim
from sklearn.feature_selection import SelectFromModel
from catboost import CatBoostClassifier
import pandas as pd
import optuna
from optuna.trial import Trial


X1, y = load_data(200, 500)

# select best fingerprints 
fgpts = X1.iloc[:, 2:-1]
model2 = CatBoostClassifier(iterations=1, learning_rate=0.1, depth=2, verbose=1)  # FIXME
model2.fit(fgpts, y)
fgpts_selector = SelectFromModel(model2, prefit=True, threshold="mean", max_features=100)
X = pd.concat([X1.iloc[:, :2], X1.loc[:, fgpts.columns[fgpts_selector.get_support()]]], axis=1)


0:	learn: 6.1755479	total: 674ms	remaining: 0us


In [3]:
import sys
import os
import platform

# Python version
print("Python version:", sys.version)

# Installed packages
print("\nInstalled packages:")
!pip list

# Notebook path
print("\nNotebook path:", os.getcwd())

# Environment variables
print("\nEnvironment variables:")
print(os.environ)

# Conda environment check
print("\nEnvironment prefix:", sys.prefix)

# System information
print("\nSystem information:")
print("Platform:", platform.platform())
print("System:", platform.system())
print("Release:", platform.release())


Python version: 3.11.7 | packaged by Anaconda, Inc. | (main, Dec 15 2023, 18:05:47) [MSC v.1916 64 bit (AMD64)]

Installed packages:
Package                           Version
--------------------------------- ------------
aext_assistant                    0.4.0
aext_assistant_server             0.4.0
aext_core                         0.4.0
aext_core_server                  0.4.0
aext_shared                       0.4.0
aiobotocore                       2.7.0
aiofiles                          22.1.0
aiohttp                           3.9.3
aioitertools                      0.7.1
aiosignal                         1.2.0
aiosqlite                         0.18.0
alabaster                         0.7.12
alembic                           1.13.1
altair                            5.0.1
anaconda-anon-usage               0.4.3
anaconda-catalogs                 0.2.0
anaconda-client                   1.12.3
anaconda-cloud-auth               0.5.1
anaconda-navigator                2.6.0
anaconda-proj

In [2]:
cat_dims, cat_idxs, con_idxs, X_train, y_train, X_valid, y_valid, X_test, y_test, train_mean, train_std, continuous_mean_std = data_prep(X, y, datasplit=[.65, .15, .2])

train_ds = DataSetCatCon(X_train, y_train, cat_idxs,'reg',continuous_mean_std)
trainloader = DataLoader(train_ds, batch_size=256, shuffle=True,num_workers=4)

valid_ds = DataSetCatCon(X_valid, y_valid, cat_idxs,'reg', continuous_mean_std)
validloader = DataLoader(valid_ds, batch_size=256, shuffle=False,num_workers=4)

test_ds = DataSetCatCon(X_test, y_test, cat_idxs,'reg', continuous_mean_std)
testloader = DataLoader(test_ds, batch_size=256, shuffle=False,num_workers=4)

# select best hyperparameters
study_name = "saint"  # Unique identifier of the study.
storage_name = "sqlite:///{}.db".format(study_name)
study = optuna.create_study(study_name=study_name, storage=storage_name, load_if_exists=True)
study.optimize(lambda trial: objective(trial, cat_dims, con_idxs, trainloader, validloader, testloader), n_trials=2) # FIXME
optim_params = study.best_params 


[I 2024-06-18 13:49:26,649] Using an existing study with name 'saint' instead of creating a new one.
[W 2024-06-18 13:50:11,713] Trial 2 failed with parameters: {'dim': 43, 'depth': 1, 'heads': 6, 'attn_dropout': 0.8330618505595276, 'ff_dropout': 0.13212328353300987, 'mlp_hidden_mults': [16, 8], 'attentiontype': 'attn', 'final_mlp_style': 'common', 'lr': 0.0006251820770518503} because of the following error: The number of the values 2 did not match the number of the objectives 1.
[W 2024-06-18 13:50:11,716] Trial 2 failed with value [200.25241, 199.52892].
[W 2024-06-18 13:50:57,696] Trial 3 failed with parameters: {'dim': 55, 'depth': 2, 'heads': 7, 'attn_dropout': 0.4726446565607283, 'ff_dropout': 0.7524523327489069, 'mlp_hidden_mults': [8, 4], 'attentiontype': 'attnmlp', 'final_mlp_style': 'sep', 'lr': 0.00013144785395869963} because of the following error: The number of the values 2 did not match the number of the objectives 1.
[W 2024-06-18 13:50:57,699] Trial 3 failed with value 

ValueError: Record does not exist.

In [None]:
optim_params["mlp_hidden_mults"] = build_hidden_mults(optim_params["mlp_hidden_mults"])  # TODO: importar funcion
model = SAINT(
    categories = tuple(cat_dims), 
    num_continuous = len(con_idxs),                
    dim = optim_params['dim'],                         
    dim_out = 1,                       
    depth = optim_params['depth'],                        
    heads = optim_params['heads'],                         
    attn_dropout = optim_params['attn_dropout'],             
    ff_dropout = optim_params['ff_dropout'],                  
    mlp_hidden_mults = optim_params['mlp_hidden_mults'],       
    cont_embeddings = 'MLP',
    attentiontype = optim_params['attentiontype'],
    final_mlp_style = optim_params['final_mlp_style'],
    y_dim = 1 # porque es regression 
)

optimizer = optim.AdamW(model.parameters(),lr=optim_params['lr'])
scheduler = 'cosine' # default 


epochs = 100

valid_rmse, test_rmse = train(model, optimizer, scheduler, epochs, trainloader, validloader, testloader)

plot_learning_curve(valid_rmse, test_rmse)