In [None]:
import pandas as pd
# Load the dataset CSV into a DataFrame.
df = pd.read_csv(filepath_or_buffer="data/input.csv")

In [None]:
# Keep a handle on the band-gap column (the regression target) as a Series.
target = df.loc[:, "band_gap"]

In [None]:
# Remove the target and the material identifier from the feature frame.
# Reassigning (instead of inplace=True) together with errors="ignore" keeps the
# cell idempotent: running it a second time is a no-op rather than a KeyError.
df = df.drop(columns=["band_gap", "material_id"], errors="ignore")

In [None]:
# Remove the formula string column from the feature frame.
# Reassigning (instead of inplace=True) together with errors="ignore" keeps the
# cell idempotent: running it a second time is a no-op rather than a KeyError.
df = df.drop(columns=["formula_pretty"], errors="ignore")

In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from rtdl import FTTransformer
from sklearn.preprocessing import StandardScaler
import gc

# Reproducibility: seed the NumPy and PyTorch global generators before any
# stochastic step (DataLoader shuffling, weight initialisation, dropout).
# NOTE(review): some CUDA kernels may still be non-deterministic; seeding only
# fixes the random number streams.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# Device Configuration
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# --- 1. DATA LOADING ---
# Dataset location, regression target, and identifier columns that are not features.
CSV_PATH = "data/input.csv"
TARGET = "band_gap"
ID_COLS = ["material_id", "formula_pretty"]

# Read the table and discard whichever identifier columns it contains
# (errors="ignore" skips labels that are absent from the file).
df = pd.read_csv(CSV_PATH).drop(columns=ID_COLS, errors="ignore")

# Target as floats; features are all remaining columns with a numeric dtype.
y = df[TARGET].astype(float)
X = df.drop(columns=TARGET).select_dtypes(include=["number"])

# --- 2. LOAD SPLIT INDICES ---
# Using pre-defined indices to ensure consistency with other models in your ensemble
# Each split file provides an "idx" column that is used as positional row numbers (.iloc).
idx_tr = pd.read_csv("data/splits/split_train.csv")["idx"].to_numpy()
idx_va = pd.read_csv("data/splits/split_val.csv")["idx"].to_numpy()
idx_te = pd.read_csv("data/splits/split_test.csv")["idx"].to_numpy()

# The feature frames are modified later (scaled columns are assigned back into
# them), so take explicit copies: assigning into a bare .iloc selection triggers
# pandas' SettingWithCopyWarning and leaves it ambiguous whether X is touched.
X_tr, y_tr = X.iloc[idx_tr].copy(), y.iloc[idx_tr]
X_va, y_va = X.iloc[idx_va].copy(), y.iloc[idx_va]
X_te, y_te = X.iloc[idx_te].copy(), y.iloc[idx_te]

# --- 3. NUMERICAL & CATEGORICAL PREPROCESSING ---
# Columns whose name begins with "is" are treated as binary/categorical;
# every other column is treated as numerical.
cat_cols = [name for name in X_tr.columns if name.startswith("is")]
num_cols = [name for name in X_tr.columns if not name.startswith("is")]

# Fit the standardiser on the training rows only, then apply the same
# transformation to the numerical columns of every split.
scaler = StandardScaler().fit(X_tr[num_cols])
X_tr[num_cols] = scaler.transform(X_tr[num_cols])
X_va[num_cols] = scaler.transform(X_va[num_cols])
X_te[num_cols] = scaler.transform(X_te[num_cols])

def prepare_tensors(X, y, numeric_columns=None, categorical_columns=None):
    """Convert one data split into the tensors consumed by the model.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature frame containing the numerical and categorical columns.
    y : pandas.Series
        Targets aligned with the rows of ``X``.
    numeric_columns : list of str, optional
        Columns to emit as float32 features. Defaults to the notebook-level
        ``num_cols`` so existing two-argument calls behave as before.
    categorical_columns : list of str, optional
        Columns to emit as int64 codes. Defaults to the notebook-level
        ``cat_cols``.

    Returns
    -------
    tuple of torch.Tensor
        ``(X_num, X_cat, y)`` with dtypes float32, int64 and float32. When
        there are no categorical columns, ``X_cat`` has shape ``(len(X), 0)``.
    """
    # Fall back to the notebook globals only when the caller gave no columns;
    # an explicitly passed empty list is respected.
    if numeric_columns is None:
        numeric_columns = num_cols
    if categorical_columns is None:
        categorical_columns = cat_cols

    X_num = X[numeric_columns].values.astype(np.float32)
    # Handle case where no categorical columns exist
    # NOTE(review): the model may expect x_cat=None rather than a zero-width
    # tensor when it was built without categorical features — confirm with rtdl.
    if categorical_columns:
        X_cat = X[categorical_columns].values.astype(np.int64)
    else:
        X_cat = np.zeros((len(X), 0), dtype=np.int64)
    return torch.tensor(X_num), torch.tensor(X_cat), torch.tensor(y.values).float()

# Turn each split into its (numeric features, categorical codes, target) tensors.
(
    (X_num_tr, X_cat_tr, y_tr_tensor),
    (X_num_va, X_cat_va, y_va_tensor),
    (X_num_te, X_cat_te, y_te_tensor),
) = (
    prepare_tensors(features, labels)
    for features, labels in ((X_tr, y_tr), (X_va, y_va), (X_te, y_te))
)

# Create Datasets and Loaders
train_dataset = TensorDataset(X_num_tr, X_cat_tr, y_tr_tensor)
val_dataset = TensorDataset(X_num_va, X_cat_va, y_va_tensor)
test_dataset = TensorDataset(X_num_te, X_cat_te, y_te_tensor)

# Options shared by all three loaders; only the training loader shuffles.
batch_size = 128
_loader_options = {"batch_size": batch_size, "pin_memory": True}
train_loader = DataLoader(train_dataset, shuffle=True, **_loader_options)
val_loader = DataLoader(val_dataset, shuffle=False, **_loader_options)
test_loader = DataLoader(test_dataset, shuffle=False, **_loader_options)

# Define cardinalities for categorical embeddings (binary columns = 2)
cat_cardinalities = [2 for _ in cat_cols]

# --- 4. MODEL INITIALIZATION ---
# FT-Transformer hyperparameters gathered in one mapping so they can be read
# (and tuned) at a glance; d_out=1 yields one regression output per row.
ftt_config = dict(
    n_num_features=X_num_tr.shape[1],
    cat_cardinalities=cat_cardinalities,
    d_token=64,
    n_blocks=3,
    attention_dropout=0.2,
    ffn_d_hidden=256,
    ffn_dropout=0.1,
    residual_dropout=0.1,
    last_layer_query_idx=[-1],
    kv_compression_ratio=None,
    kv_compression_sharing=None,
    d_out=1,
)
model = FTTransformer.make_baseline(**ftt_config)
model = model.to(device)

# Mean-squared-error objective optimised with Adam; the learning rate is cut
# by a factor of 10 once the monitored loss has not improved for 5 epochs.
loss_fn = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    mode="min",
    factor=0.1,
    patience=5,
)

# Early Stopping Parameters
# Training stops after `patience` consecutive epochs without a new best
# validation loss; the best weights are written to `checkpoint_path`.
best_val_loss = float('inf')
patience = 10
patience_counter = 0
checkpoint_path = "best_ftt_model.pth"

# --- 5. TRAINING LOOP ---
for epoch in range(100):
    model.train()
    train_loss = 0.0

    for X_num, X_cat, targets in train_loader:
        X_num, X_cat, targets = X_num.to(device), X_cat.to(device), targets.to(device)

        optimizer.zero_grad()
        # squeeze(-1) drops only the trailing output dimension. A bare
        # squeeze() would also collapse the batch dimension when the final
        # batch holds a single row, producing a 0-d prediction that no longer
        # matches the (1,)-shaped target.
        preds = model(X_num, X_cat).squeeze(-1)
        loss = loss_fn(preds, targets)
        loss.backward()
        optimizer.step()
        # Weight the batch mean by batch size so the epoch value is a per-sample mean.
        train_loss += loss.item() * X_num.size(0)

    train_loss /= len(train_dataset)

    # Validation Phase
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for X_num, X_cat, targets in val_loader:
            X_num, X_cat, targets = X_num.to(device), X_cat.to(device), targets.to(device)
            preds = model(X_num, X_cat).squeeze(-1)
            loss = loss_fn(preds, targets)
            val_loss += loss.item() * X_num.size(0)

    val_loss /= len(val_dataset)

    # The learning rate is printed before the scheduler reacts to this epoch's validation loss.
    print(f"Epoch {epoch+1} | Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f} | LR: {optimizer.param_groups[0]['lr']:.6f}")
    scheduler.step(val_loss)

    # Checkpoint and Early Stopping
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        patience_counter = 0
        torch.save(model.state_dict(), checkpoint_path)
        print(f"  --> Best model saved: {checkpoint_path}")
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print(f"Early stopping triggered at epoch {epoch+1}")
            break

    # Release Python garbage and cached CUDA blocks between epochs.
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

# --- 6. TEST INFERENCE ---
# Restore the best checkpoint written during training. map_location places the
# weights on the active device, so a checkpoint saved from a GPU run still
# loads when the notebook is re-executed on a CPU-only kernel.
model.load_state_dict(torch.load(checkpoint_path, map_location=device))
model.eval()

# Collect per-batch outputs; the test loader is not shuffled, so rows stay in
# the order of test_dataset.
test_preds = []
with torch.no_grad():
    for X_num, X_cat, _ in test_loader:
        X_num, X_cat = X_num.to(device), X_cat.to(device)
        preds = model(X_num, X_cat).cpu().numpy()
        test_preds.append(preds)

# Stack the batch outputs row-wise into one NumPy array.
test_preds = np.vstack(test_preds)

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

# --- 7. TEST METRICS ---
# Ground truth is y_te; test_preds holds the stacked model outputs.
# Both are flattened to 1-D so the metric inputs have the same explicit shape.
y_true = np.asarray(y_te, dtype=float).reshape(-1)
y_pred = np.asarray(test_preds).reshape(-1)

# Calculating standard regression error metrics
mse  = mean_squared_error(y_true, y_pred)
rmse = np.sqrt(mse)
mae  = mean_absolute_error(y_true, y_pred)
r2   = r2_score(y_true, y_pred)

print("\n--- Final Test Results ---")
print(f"MSE  : {mse:.4f}")
print(f"RMSE : {rmse:.4f}")
print(f"MAE  : {mae:.4f}")
# The label previously contained a mis-decoded superscript two.
print(f"R²   : {r2:.4f}")

In [None]:
from torch.utils.data import DataLoader, TensorDataset

# Evaluation loader over the test set; shuffle=False keeps predictions in
# dataset order, and batches of 32 rows are used for this pass.
test_loader_eval = DataLoader(dataset=test_dataset, batch_size=32, shuffle=False, pin_memory=True)

# Switch the model to evaluation mode before predicting.
model.eval()

# Run every batch through the model without tracking gradients. Inputs are
# moved to the device with non_blocking=True, outputs are brought back to the
# CPU as NumPy arrays, and all batches are stacked row-wise.
with torch.no_grad():
    test_preds = np.vstack(
        [
            model(
                X_num.to(device, non_blocking=True),
                X_cat.to(device, non_blocking=True),
            )
            .cpu()
            .numpy()
            for X_num, X_cat, _ in test_loader_eval
        ]
    )

In [None]:
# Unshuffled loader over the training set so predictions line up row-for-row
# with train_dataset.
train_loader_eval = DataLoader(train_dataset, batch_size=batch_size, shuffle=False, pin_memory=True)

# Set evaluation mode explicitly instead of relying on an earlier cell having
# done so; in training mode the dropout layers would perturb the predictions.
model.eval()

train_preds = []
with torch.no_grad():
    for X_num, X_cat, _ in train_loader_eval:
        X_num = X_num.to(device, non_blocking=True)
        X_cat = X_cat.to(device, non_blocking=True)
        preds = model(X_num, X_cat).cpu().numpy()
        train_preds.append(preds)

# Stack the batch outputs row-wise into one NumPy array.
train_preds = np.vstack(train_preds)


In [None]:
# Standardize target shape to a 1D vector (-1)
# The visible cells of this notebook never define `y_train`; the training
# targets are `y_tr`, the same rows (in the same order) that back
# train_dataset. Convert them to a NumPy array first: a pandas Series has no
# .reshape, and a tensor must be detached and moved to the CPU.
if torch.is_tensor(y_tr):
    y_train = y_tr.detach().cpu().numpy()
else:
    y_train = np.asarray(y_tr)
y_true_train = y_train.reshape(-1)

In [None]:
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import numpy as np

# Ensure both predictions and ground truth are flattened to 1D vectors (N,)
# Ground truth comes from y_tr, the training targets that back train_dataset;
# np.asarray turns the pandas Series into an array that supports .reshape.
y_pred_train = np.asarray(train_preds).reshape(-1)
y_true_train = np.asarray(y_tr, dtype=float).reshape(-1)

# Calculate standard regression metrics
r2 = r2_score(y_true_train, y_pred_train)
mae = mean_absolute_error(y_true_train, y_pred_train)
mse = mean_squared_error(y_true_train, y_pred_train)
rmse = np.sqrt(mse)

# Console output
print(f"Train set R2 Score: {r2:.4f}")
print(f"Train set MAE:      {mae:.4f}")
print(f"Train set MSE:      {mse:.4f}")
print(f"Train set RMSE:     {rmse:.4f}")

In [None]:
import mlflow.pytorch

# `model` is a PyTorch module, so persist it with MLflow's PyTorch flavor. The
# scikit-learn flavor would record it as an sklearn estimator, which it is not.
# NOTE(review): the forward pass takes two inputs (numeric, categorical), so
# reload with mlflow.pytorch.load_model rather than the generic pyfunc
# interface — confirm against how the model is consumed downstream.
mlflow.pytorch.save_model(model, "mlruns/FTTransformer_model_final1")