In [91]:
from transformers import Trainer, TrainingArguments,EarlyStoppingCallback
import torch
from torch import nn
from torch.utils.data import Dataset
import numpy as np
import math


In [92]:
import torch, torchvision, torchaudio
# Report the PyTorch build and the CUDA devices visible to this kernel.
# The device index / name are only queried when CUDA is actually available.
cuda_ok = torch.cuda.is_available()
print("PyTorch:", torch.__version__)
print("CUDA available?", cuda_ok)
print("Device count:", torch.cuda.device_count())
if cuda_ok:
    print("Current device:", torch.cuda.current_device())
    print("Device name:", torch.cuda.get_device_name(0))
else:
    print("Current device:", None)
    print("Device name:", None)


PyTorch: 2.8.0+cu128
CUDA available? True
Device count: 1
Current device: 0
Device name: NVIDIA RTX A5000


In [93]:
import numpy as np

# Read the file as a plain numeric table; column 0 is treated as x and
# column 1 as y by the cells that follow.
# NOTE(review): the file carries a .yaml extension but is parsed as
# whitespace-delimited numbers -- confirm the extension is intentional.
data = np.loadtxt("ENDF_B-VIII.1_AU-197(N,G)AU-198.yaml")
print(np.shape(data))

(20330, 2)


In [94]:
# Split the loaded table into its two columns as float32 arrays.
x_raw = data[:, 0].astype(np.float32)
y_raw = data[:, 1].astype(np.float32)

# log10 is only defined for strictly positive values. A single zero,
# negative or non-finite sample would silently become -inf/NaN and poison
# the train statistics and the loss later on, so fail loudly here instead.
for _name, _column in (("x", x_raw), ("y", y_raw)):
    _valid = np.isfinite(_column) & (_column > 0)
    if not np.all(_valid):
        raise ValueError(
            f"column {_name!r} has {int(np.size(_valid) - np.count_nonzero(_valid))} "
            "non-positive or non-finite value(s); cannot take log10"
        )

# Work in log10 space for both the input and the target.
x_log = np.log10(x_raw)
y_log = np.log10(y_raw)

In [95]:
from datasets import Dataset

# `Dataset` is the Hugging Face `datasets.Dataset` imported in this cell; it
# shadows the `torch.utils.data.Dataset` imported at the top of the notebook.
full_ds = Dataset.from_dict(dict(x=x_log.tolist(), y=y_log.tolist()))

# Hold out 10% of the rows for evaluation; the fixed seed keeps the split
# the same from run to run.
ds = full_ds.train_test_split(test_size=0.1, seed=42)
train_ds = ds["train"]
eval_ds = ds["test"]

In [96]:
# ---------- 4) Standardize using TRAIN stats ----------
# Statistics are always read from the untouched split in `ds`, so this cell
# can be re-run safely even after `train_ds` / `eval_ds` have been replaced
# by their standardized versions (which no longer carry "x" / "y").
x_mean, x_std = float(np.mean(ds["train"]["x"])), float(np.std(ds["train"]["x"]))
y_mean, y_std = float(np.mean(ds["train"]["y"])), float(np.std(ds["train"]["y"]))

if x_std == 0.0 or y_std == 0.0:
    # A constant column would make the division below produce inf/NaN.
    raise ValueError("cannot standardize: training x or y has zero standard deviation")


def standardize(example):
    """Map one ``{"x", "y"}`` row to standardized ``{"input", "labels"}``.

    Both values are centred and scaled with the TRAIN split statistics
    (``x_mean``/``x_std`` and ``y_mean``/``y_std``) so the evaluation split
    is transformed with exactly the same constants.

    Args:
        example: a single dataset row providing numeric ``"x"`` and ``"y"``.

    Returns:
        A dict with the new ``"input"`` and ``"labels"`` values. The raw
        columns are dropped by the ``remove_columns`` argument of ``map``.
    """
    return {
        "input": (example["x"] - x_mean) / x_std,
        "labels": (example["y"] - y_mean) / y_std,
    }


train_ds = ds["train"].map(standardize, remove_columns=["x", "y"])
eval_ds = ds["test"].map(standardize, remove_columns=["x", "y"])


Map:   0%|          | 0/18297 [00:00<?, ? examples/s]

Map:   0%|          | 0/2033 [00:00<?, ? examples/s]

In [97]:
device = torch.accelerator.current_accelerator().type if torch.accelerator.is_available() else "cpu"
print(f"Using {device} device")

Using cuda device


In [98]:
class MLP(nn.Module):
    """Residual two-layer perceptron block computing ``activation(x + block(x))``.

    Args:
        dim: width of the input, hidden and output features.
        activation: module applied after the first linear layer and again
            after the residual sum; the same instance is used in both places.
        p_drop: dropout probability used after the hidden activation and
            after the second linear layer.
    """

    def __init__(self, dim=512, activation=nn.Tanh(), p_drop=0.05):
        super().__init__()

        # Linear -> activation -> dropout -> Linear -> dropout
        layers = [
            nn.Linear(dim, dim),
            activation,
            nn.Dropout(p_drop),
            nn.Linear(dim, dim),
            nn.Dropout(p_drop),
        ]
        self.block = nn.Sequential(*layers)
        self.activation = activation

    def forward(self, x):
        """Apply the block to ``x`` and return a tensor of the same shape."""
        residual = self.block(x)
        return self.activation(x + residual)


In [99]:
class FourierMLP(nn.Module):
    """Residual block that re-encodes its input with random Fourier features.

    The input is projected through a fixed random matrix ``B`` (stored as a
    buffer, so it is not a trainable parameter), turned into concatenated
    ``sin``/``cos`` features, passed through a two-layer perceptron and added
    back to the input before a final activation.

    Args:
        dim: width of the input and output features.
        activation: module used inside the perceptron and after the
            residual sum (the same instance in both places).
        p_drop: dropout probability inside the perceptron.
        num_frequencies: number of columns of ``B``; defaults to ``dim // 2``.
        fourier_scale: multiplier applied to the standard-normal entries of ``B``.
    """

    def __init__(self, dim=512, activation=nn.Tanh(), p_drop=0.05,
                 num_frequencies=None, fourier_scale=10.0):
        super().__init__()

        self.dim = dim
        self.num_frequencies = dim // 2 if num_frequencies is None else num_frequencies

        # Random projection of shape (dim, num_frequencies), drawn once at
        # construction time.
        # NOTE(review): B multiplies the full `dim`-wide hidden state, so the
        # size of the projected angles grows with both the hidden-state norm
        # and `fourier_scale` -- worth checking whether this is intended.
        projection = torch.randn(dim, self.num_frequencies) * fourier_scale
        self.register_buffer('B', projection)

        # Maps the 2 * num_frequencies sin/cos features back to `dim`.
        self.block = nn.Sequential(
            nn.Linear(2 * self.num_frequencies, dim),
            activation,
            nn.Dropout(p_drop),
            nn.Linear(dim, dim),
            nn.Dropout(p_drop),
        )
        self.activation = activation

    def forward(self, x):
        """Encode ``x`` of shape ``(batch, dim)`` and return the same shape."""
        angles = (2 * np.pi) * torch.matmul(x, self.B)  # (batch, num_frequencies)
        encoded = torch.cat((torch.sin(angles), torch.cos(angles)), dim=-1)
        return self.activation(x + self.block(encoded))

In [100]:
class kamma_2(nn.Module):
    """Scalar regressor built from residual Fourier and MLP blocks.

    Layout: ``Linear(in_dim, dim)`` -> ``num_fourier_blocks`` x ``FourierMLP``
    -> ``depth - num_fourier_blocks`` x ``MLP`` -> ``Linear(dim, 1)``.

    Args:
        in_dim: number of input features per sample.
        dim: hidden width shared by every block.
        depth: total number of residual blocks (Fourier + MLP).
        num_fourier_blocks: how many of the leading blocks are ``FourierMLP``.
        fourier_scale: scale forwarded to every ``FourierMLP``.
        p_drop: dropout probability forwarded to every block.
        activation: activation module forwarded to every block.

    Raises:
        ValueError: if ``num_fourier_blocks`` is negative or exceeds ``depth``
            (the model would otherwise silently end up with more blocks
            than ``depth``).
    """

    def __init__(
        self,
        in_dim=1,
        dim=128,
        depth=3,
        num_fourier_blocks=1,
        fourier_scale=10.0,
        p_drop=0.05,
        activation=nn.Tanh()
    ):
        super().__init__()

        if num_fourier_blocks < 0 or depth < num_fourier_blocks:
            raise ValueError(
                f"depth ({depth}) must be >= num_fourier_blocks "
                f"({num_fourier_blocks}) and num_fourier_blocks must be non-negative"
            )

        self.inp = nn.Linear(in_dim, dim)
        self.blocks = nn.ModuleList()

        # Fourier blocks come first ...
        for _ in range(num_fourier_blocks):
            self.blocks.append(
                FourierMLP(dim, activation, p_drop, fourier_scale=fourier_scale)
            )

        # ... followed by plain residual MLP blocks up to `depth` in total.
        for _ in range(depth - num_fourier_blocks):
            self.blocks.append(
                MLP(dim, activation, p_drop)
            )

        self.out = nn.Linear(dim, 1)
        self.loss_fn = nn.MSELoss()

        self._init_weights()

    def _init_weights(self):
        """Xavier-normal weights and zero biases for every ``nn.Linear``."""
        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.xavier_normal_(m.weight)
                if m.bias is not None:
                    nn.init.zeros_(m.bias)

    def forward(self, input: torch.Tensor, labels: torch.Tensor = None):
        """Run the network and, when ``labels`` is given, compute the MSE loss.

        Args:
            input: tensor whose last dimension is ``in_dim``.
            labels: optional regression targets with one value per sample,
                shaped either like the logits (``(batch, 1)``) or flat
                (``(batch,)``).

        Returns:
            ``{"logits": ...}`` or ``{"loss": ..., "logits": ...}``.
        """
        h = self.inp(input)
        for blk in self.blocks:
            h = blk(h)
        logits = self.out(h)

        if labels is None:
            return {"logits": logits}

        # Align the targets with the (batch, 1) logits. Without this, flat
        # (batch,) labels broadcast against the logits to (batch, batch)
        # inside MSELoss and yield a wrong loss with only a warning.
        labels = labels.reshape(logits.shape).to(logits.dtype)
        loss = self.loss_fn(logits, labels)
        return {"loss": loss, "logits": logits}


In [101]:
# Seed the global RNG before building the model: the linear weights and the
# random Fourier projection matrices are all drawn at construction time, so
# without a seed every kernel restart trains a different network.
torch.manual_seed(42)

model = kamma_2(
    in_dim=1,
    dim=512,
    depth=8,               # total number of residual blocks
    num_fourier_blocks=3,  # how many of those are FourierMLP blocks
    fourier_scale=25.0,
    p_drop=0.03,
    activation=nn.Tanh()
).to(
    "cuda" if torch.cuda.is_available() else "cpu"
)
print(model)

kamma_2(
  (inp): Linear(in_features=1, out_features=512, bias=True)
  (blocks): ModuleList(
    (0-2): 3 x FourierMLP(
      (block): Sequential(
        (0): Linear(in_features=512, out_features=512, bias=True)
        (1): Tanh()
        (2): Dropout(p=0.03, inplace=False)
        (3): Linear(in_features=512, out_features=512, bias=True)
        (4): Dropout(p=0.03, inplace=False)
      )
      (activation): Tanh()
    )
    (3-7): 5 x MLP(
      (block): Sequential(
        (0): Linear(in_features=512, out_features=512, bias=True)
        (1): Tanh()
        (2): Dropout(p=0.03, inplace=False)
        (3): Linear(in_features=512, out_features=512, bias=True)
        (4): Dropout(p=0.03, inplace=False)
      )
      (activation): Tanh()
    )
  )
  (out): Linear(in_features=512, out_features=1, bias=True)
  (loss_fn): MSELoss()
)


In [102]:
# Report where the first parameter lives; the rest are normally on the same
# device because the whole module was moved at once.
_first = next(iter(model.named_parameters()), None)
if _first is not None:
    name, param = _first
    print(f"{name}: {param.device}")

inp.weight: cuda:0


In [103]:
def num_of_param(model):
    """Return the total number of scalar entries across ``model.parameters()``."""
    total = 0
    for tensor in model.parameters():
        total += tensor.numel()
    return total
num_of_param(model)

4204033

In [104]:
def compute_metrics(eval_pred):
    """Compute regression metrics from a prediction/label pair.

    Args:
        eval_pred: object exposing ``predictions`` (an array, or a tuple whose
            first item is the array) and ``label_ids`` (an array).

    Returns:
        Dict with ``"mse"``, ``"rmse"``, ``"mae"`` and ``"r2"`` as floats.
        ``"r2"`` is reported as 0.0 when the labels have no variance.
    """
    raw = eval_pred.predictions
    if isinstance(raw, tuple):
        raw = raw[0]

    # Compare everything as flat vectors.
    y_hat = np.array(raw).reshape(-1)
    y_true = eval_pred.label_ids.reshape(-1)

    errors = y_hat - y_true
    squared = errors ** 2

    mse = float(np.mean(squared))
    rmse = float(math.sqrt(mse))
    mae = float(np.mean(np.abs(errors)))

    ss_res = float(np.sum(squared))
    ss_tot = float(np.sum((y_true - np.mean(y_true)) ** 2))
    if ss_tot > 0:
        r2 = 1.0 - ss_res / ss_tot
    else:
        r2 = 0.0

    return {"mse": mse, "rmse": rmse, "mae": mae, "r2": r2}


def collate_scalar_to_column(batch):
    """Stack scalar ``"input"``/``"labels"`` entries into ``(batch, 1)`` float32 tensors.

    Args:
        batch: sequence of mappings, each holding a scalar under ``"input"``
            and under ``"labels"``.

    Returns:
        Dict with keys ``"input"`` and ``"labels"``, each a float32 column tensor.
    """
    def _column(key):
        values = [example[key] for example in batch]
        return torch.tensor(values, dtype=torch.float32).unsqueeze(-1)

    return {"input": _column("input"), "labels": _column("labels")}


In [108]:
# Training configuration. `save_steps` is not set here, so the library
# default applies (the original note cites 500 and says the save step should
# be a multiple of the eval step).
training_args = TrainingArguments(
    output_dir='./results',
    # --- optimisation ---
    learning_rate=1e-3,
    weight_decay=0.01,
    lr_scheduler_type="cosine",
    warmup_ratio=0.1,
    max_steps=30000,
    # --- batching ---
    per_device_train_batch_size=512,
    per_device_eval_batch_size=512,
    # --- evaluation cadence ---
    eval_strategy='steps',
    eval_steps=2500,
    # metric_for_best_model="mse",
)

# ---------------- 7) Trainer ----------------
# The custom collator turns the scalar dataset columns into (batch, 1) tensors.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=eval_ds,
    data_collator=collate_scalar_to_column,
    compute_metrics=compute_metrics,
)

In [109]:
# Show which device the Trainer resolved from its arguments.
print(f"Trainer device: {trainer.args.device}")

Trainer device: cuda:0


In [None]:
# Start training with the configuration held by `trainer`.
trainer.train()

Step,Training Loss,Validation Loss,Mse,Rmse,Mae,R2
2500,1.1822,1.092094,1.092094,1.045033,0.823996,-0.112196
5000,1.0817,0.993672,0.993672,0.996831,0.791861,-0.011962
7500,1.0544,0.998444,0.998444,0.999222,0.790555,-0.016823
10000,1.0379,1.002492,1.002492,1.001245,0.799004,-0.020945
12500,1.0305,0.970952,0.970952,0.985369,0.782545,0.011176
15000,1.0195,0.980741,0.980741,0.990324,0.785089,0.001206


In [None]:
# Bare expression: the notebook displays the repr of the training dataset.
train_ds

In [None]:
# Run inference over the training split; the result exposes `.predictions`.
pred_train = trainer.predict(train_ds)

In [None]:
# Keep only the raw model outputs from the prediction result.
out_pred=pred_train.predictions

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Standardized log10(x) inputs of the training split, and the model's
# predictions for the same rows.
x = np.array(train_ds["input"])
# assumes out_pred is a single (N, 1) array of logits -- flatten it so x and
# y are both 1-D with one entry per row.
y = np.asarray(out_pred).reshape(-1)

# The axes themselves are linear; the plotted values are standardized log10
# quantities, so the labels say so rather than claiming a log-log plot.
plt.figure(figsize=(10, 6))
plt.plot(x, y, 'o', alpha=0.7, markersize=1)
plt.xlabel('Standardized log10(x)')
plt.ylabel('Predicted standardized log10(y)')
plt.title('Model predictions on the training split')
plt.grid(True, alpha=0.3)
plt.show()