This portfolio presents a data analysis project for the Geology Forecast Challenge, aiming to predict 1D layer-depth sequences from high-dimensional geophysical measurements using the publicly available Kaggle dataset ([https://www.kaggle.com/competitions/geology-forecast-challenge-open/data](https://www.kaggle.com/competitions/geology-forecast-challenge-open/data)).

Key steps:

1. **Data Preparation**
   • Loaded 3,300 feature columns (measurements at depths −299 to 300 across ten “realizations”).
   • Imputed missing values with column means, applied a log-transform (`log(31 + x)`), and standardized all features.
   • Extracted 3,000-dimensional targets per sample and standardized them for numerical stability.

2. **Modeling**
   • Built a six-layer PyTorch DNN: 3,300 → 1,024 → 512 → 256 → 128 → 64 → 3,000, using BatchNorm, GELU, and dropout to balance expressiveness and regularization.
   • Trained with 5-fold cross-validation, Adam optimizer (LR=1e-3, weight decay=1e-5), ReduceLROnPlateau, and early stopping (patience=5).
   • Collected out-of-fold (OOF) predictions to compute overall RMSE on the original scale.

3. **Ensembling & Post-Processing**
   • Averaged test predictions across all folds, inverted target scaling, and applied an “average-trick”: for each of the 300 depths, averaged the ten realization channels and repeated that mean to stabilize outputs.

**Key finding:** The deep, progressively compressing network plus careful regularization and post-processing outperformed simpler baselines, yielding improved OOF RMSE and a competitive leaderboard position.

   
**@author**  YI LUO
**@date**  20250601


In [1]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from tqdm import tqdm

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset

In [2]:
# Section 1: Load Data
TRAIN_PATH = './00_data/train.csv'
TEST_PATH  = './00_data/test.csv'
SUB_PATH   = './00_data/sample_submission.csv'

train = pd.read_csv(TRAIN_PATH)
test  = pd.read_csv(TEST_PATH)
sub   = pd.read_csv(SUB_PATH)

FEATURES = [c for c in test.columns if c != 'geology_id']
TARGETS  = [c for c in sub.columns if c != 'geology_id']

In [3]:
# Section 2: Preprocessing (Impute + Log + Scale)
train_feats = train[FEATURES].copy()
test_feats  = test[FEATURES].copy()

train_feats = train_feats.fillna(train_feats.mean())
test_feats  = test_feats.fillna(test_feats.mean())

X_raw      = np.log(31.0 + train_feats.values)
X_test_raw = np.log(31.0 + test_feats.values)

feature_scaler = StandardScaler()
X_train = feature_scaler.fit_transform(X_raw)
X_test  = feature_scaler.transform(X_test_raw)

y_raw = train[TARGETS].copy().values.astype(np.float32)
target_scaler = StandardScaler()
y_scaled = target_scaler.fit_transform(y_raw)

In [4]:
# Section 3: Dataset / DataLoader Helper
def create_dataloader(X_data, y_data=None, batch_size=128, shuffle=False):
    if y_data is not None:
        ds = TensorDataset(
            torch.tensor(X_data, dtype=torch.float32),
            torch.tensor(y_data, dtype=torch.float32)
        )
    else:
        ds = TensorDataset(torch.tensor(X_data, dtype=torch.float32))
    return DataLoader(ds, batch_size=batch_size, shuffle=shuffle)


In [5]:
# Section 4: DNN Model Definition
class DNN(nn.Module):
    def __init__(self, input_dim, output_dim):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, 1024),
            nn.BatchNorm1d(1024),
            nn.GELU(),
            nn.Dropout(0.2),

            nn.Linear(1024, 512),
            nn.BatchNorm1d(512),
            nn.GELU(),
            nn.Dropout(0.2),

            nn.Linear(512, 256),
            nn.GELU(),

            nn.Linear(256, 128),
            nn.GELU(),

            nn.Linear(128, 64),
            nn.GELU(),

            nn.Linear(64, output_dim)
        )

    def forward(self, x):
        return self.net(x)


In [7]:
# Section 5: Training Loop with K-Fold CV, Early Stopping, LR Scheduler
device       = torch.device("cuda" if torch.cuda.is_available() else "cpu")
FOLDS        = 5
EPOCHS       = 50
BATCH_SIZE   = 128
LR           = 1e-3
PATIENCE     = 5

n_train = X_train.shape[0]
n_test  = X_test.shape[0]
n_tgt   = len(TARGETS)

test_preds = np.zeros((n_test, n_tgt), dtype=np.float32)
oof_preds  = np.zeros((n_train, n_tgt), dtype=np.float32)

kf = KFold(n_splits=FOLDS, shuffle=True, random_state=42)

for fold, (train_idx, val_idx) in enumerate(kf.split(X_train), start=1):
    print(f"\n🕐 Fold {fold}/{FOLDS}")

    X_tr, y_tr = X_train[train_idx], y_scaled[train_idx]
    X_val, y_val = X_train[val_idx], y_scaled[val_idx]

    train_loader = create_dataloader(X_tr, y_tr, batch_size=BATCH_SIZE, shuffle=True)
    val_loader   = create_dataloader(X_val, y_val, batch_size=BATCH_SIZE, shuffle=False)
    test_loader  = create_dataloader(X_test, batch_size=BATCH_SIZE, shuffle=False)

    model     = DNN(input_dim=X_train.shape[1], output_dim=n_tgt).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=LR, weight_decay=1e-5)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, mode='min', patience=2, factor=0.5
    )
    loss_fn   = nn.MSELoss()

    best_val_loss = np.inf
    epochs_no_improve = 0
    best_state = None

    for epoch in range(1, EPOCHS+1):
        model.train()
        train_losses = []
        for xb, yb in train_loader:
            xb, yb = xb.to(device), yb.to(device)
            optimizer.zero_grad()
            out = model(xb)
            loss = loss_fn(out, yb)
            loss.backward()
            optimizer.step()
            train_losses.append(loss.item())

        avg_train_loss = np.mean(train_losses)

        model.eval()
        val_preds_fold = []
        with torch.no_grad():
            for xb, _ in val_loader:
                xb = xb.to(device)
                pred = model(xb).cpu().numpy()
                val_preds_fold.append(pred)
        val_preds_fold = np.vstack(val_preds_fold)
        val_loss = mean_squared_error(y_val, val_preds_fold)

        print(f"Epoch {epoch:02d} | Train Loss {avg_train_loss:.6f} | Val Loss {val_loss:.6f}")

        scheduler.step(val_loss)
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            epochs_no_improve = 0
            best_state = model.state_dict()
        else:
            epochs_no_improve += 1
            if epochs_no_improve >= PATIENCE:
                print("⏹ Early stopping triggered.")
                break

    model.load_state_dict(best_state)

    model.eval()
    with torch.no_grad():
        fold_val_preds = []
        for xb, _ in val_loader:
            xb = xb.to(device)
            pred = model(xb).cpu().numpy()
            fold_val_preds.append(pred)
        fold_val_preds = np.vstack(fold_val_preds)
        oof_preds[val_idx, :] = fold_val_preds

    model.eval()
    fold_test_preds = []
    with torch.no_grad():
        for xb, in test_loader:
            xb = xb.to(device)
            pred = model(xb).cpu().numpy()
            fold_test_preds.append(pred)
        fold_test_preds = np.vstack(fold_test_preds)
        test_preds += fold_test_preds / FOLDS



🕐 Fold 1/5
Epoch 01 | Train Loss 0.928717 | Val Loss 0.770148
Epoch 02 | Train Loss 0.553296 | Val Loss 0.453052
Epoch 03 | Train Loss 0.341373 | Val Loss 0.258033
Epoch 04 | Train Loss 0.237070 | Val Loss 0.223953
Epoch 05 | Train Loss 0.230055 | Val Loss 0.274731
Epoch 06 | Train Loss 0.214801 | Val Loss 0.239698
Epoch 07 | Train Loss 0.218853 | Val Loss 0.193959
Epoch 08 | Train Loss 0.204219 | Val Loss 0.199415
Epoch 09 | Train Loss 0.200646 | Val Loss 0.251565
Epoch 10 | Train Loss 0.199135 | Val Loss 0.186487
Epoch 11 | Train Loss 0.180643 | Val Loss 0.198261
Epoch 12 | Train Loss 0.184164 | Val Loss 0.173220
Epoch 13 | Train Loss 0.179829 | Val Loss 0.183646
Epoch 14 | Train Loss 0.208639 | Val Loss 0.193868
Epoch 15 | Train Loss 0.195149 | Val Loss 0.181759
Epoch 16 | Train Loss 0.180730 | Val Loss 0.181050
Epoch 17 | Train Loss 0.192935 | Val Loss 0.178563
⏹ Early stopping triggered.

🕐 Fold 2/5
Epoch 01 | Train Loss 0.947533 | Val Loss 0.699316
Epoch 02 | Train Loss 0.596679

In [8]:
# Section 6: Inverse Transform & Prepare Submission DataFrame
oof_preds_inv  = target_scaler.inverse_transform(oof_preds)
test_preds_inv = target_scaler.inverse_transform(test_preds)

oof_score = mean_squared_error(train[TARGETS].values, oof_preds_inv)
print(f"OOF RMSE (raw): {np.sqrt(oof_score):.6f}")

sub_df = pd.DataFrame(test_preds_inv, columns=TARGETS)
sub_df.insert(0, 'geology_id', test['geology_id'])

OOF RMSE (raw): 2.860820


In [9]:
# Section 7: “Average Trick” Post‐Processing
df_sub = sub_df.copy()
numeric_values = df_sub.iloc[:, 1:].values
n_samples = numeric_values.shape[0]
data_reshaped = numeric_values.reshape(n_samples, 10, 300)
mean_across_realizations = data_reshaped.mean(axis=1)
mean_repeated = np.tile(mean_across_realizations[:, None, :], (1, 10, 1))
mean_repeated = mean_repeated.reshape(n_samples, 3000)
df_sub.iloc[:, 1:] = mean_repeated

df_sub.to_csv("submission_refined.csv", index=False)
df_sub.head()

Unnamed: 0,geology_id,1,2,3,4,5,6,7,8,9,...,r_9_pos_291,r_9_pos_292,r_9_pos_293,r_9_pos_294,r_9_pos_295,r_9_pos_296,r_9_pos_297,r_9_pos_298,r_9_pos_299,r_9_pos_300
0,g_4a52df537a,-0.006917,-0.01303,-0.019454,-0.027568,-0.032346,-0.039223,-0.046534,-0.056739,-0.05877,...,-1.441678,-1.524261,-1.582406,-1.557901,-1.414614,-1.386626,-1.417733,-1.498469,-1.480004,-1.477566
1,g_1e4b5a1509,0.013678,0.0282,0.042944,0.054485,0.070491,0.083501,0.09702,0.1077,0.125039,...,3.692461,3.666321,3.558129,3.647701,3.779645,3.833497,3.800299,3.756183,3.747805,3.832135
2,g_5919c0bea3,0.028208,0.056853,0.085918,0.112302,0.142245,0.169681,0.198137,0.224085,0.254682,...,7.240481,7.264407,7.187561,7.268591,7.342052,7.424016,7.407117,7.399234,7.420797,7.484445
3,g_9a665aae6d,0.039626,0.079531,0.119691,0.158211,0.198463,0.237721,0.277626,0.315779,0.356901,...,10.013875,10.048609,10.039421,10.076058,10.135363,10.206509,10.220245,10.213495,10.247083,10.356003
4,g_ba4abe1b9e,0.019238,0.039122,0.059521,0.076708,0.097912,0.116377,0.135869,0.152322,0.174366,...,5.051709,5.045511,4.957505,5.036664,5.160219,5.224001,5.184262,5.160882,5.156343,5.232307
