# 🤗 최신 Transformer & Foundation Models for Tabular Data

## 사용 모델:
1. **TabTransformer** - Amazon (AWS AI)
2. **FT-Transformer** - Feature Tokenizer Transformer
3. **SAINT** - Self-Attention and Intersample Attention
4. **TabNet** - Google Cloud AI
5. **AutoGluon-Tabular** - AWS AutoML
6. **Hugging Face TabularTransformer**
7. **XGBoost with Transformers** - Hybrid Model

## 1. 최신 라이브러리 설치

In [None]:
# Transformer-based tabular models and the libraries they depend on.
# Each line uses the IPython/Jupyter "!" shell escape to run pip; "-q" is pip's quiet flag.
!pip install transformers datasets accelerate -q
!pip install pytorch-tabnet -q
!pip install autogluon.tabular -q
!pip install tab-transformer-pytorch -q
!pip install rtdl -q  # Revisiting Deep Learning for Tabular Data
# NOTE(review): confirm this package name provides the `saint_pytorch` module imported below.
!pip install saint-pytorch -q
!pip install torch torchvision -q
!pip install einops -q

In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import warnings
warnings.filterwarnings('ignore')

# Transformer models
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from tab_transformer_pytorch import TabTransformer, FTTransformer
from saint_pytorch import SAINT
from pytorch_tabnet.tab_model import TabNetRegressor

# AutoML
from autogluon.tabular import TabularPredictor

# RTDL (Revisiting Deep Learning)
import rtdl

# 기본 ML
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import r2_score, mean_squared_error

import matplotlib.pyplot as plt
import seaborn as sns

# Report the PyTorch build and whether a CUDA device is visible.
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")

# Pick the compute device once; later cells move their models and tensors to it.
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")

## 2. 데이터 준비

In [None]:
# Load the raw spreadsheet.
df = pd.read_excel('강원도및경기일부.xlsx')
print(f"데이터 크기: {df.shape}")

# Columns are chosen by position: 3, 4 and 5 are the features, 1 is the target.
# (The original author's note labels them 100번이동 / 10번이동 / 3번이동 and 사정율 —
# TODO confirm against the sheet.)
X_columns = df.columns[[3, 4, 5]]
y_column = df.columns[1]

# Keep only the modelling columns and drop every row with a missing value in any of them.
data = df[[*X_columns, y_column]].dropna()
X = data[[*X_columns]].values
y = data[y_column].values

# 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Standardise the features; the statistics are fitted on the training split only.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(f"학습 데이터: {X_train.shape}")
print(f"테스트 데이터: {X_test.shape}")
print(f"특성 개수: {X.shape[1]}")

## 3. TabTransformer (Amazon AWS AI)

In [None]:
# TabTransformer regressor: three continuous inputs, no categorical inputs.
tab_transformer = TabTransformer(
    # Input / output layout.
    categories=(),      # continuous variables only
    num_continuous=3,   # three continuous variables
    dim_out=1,          # single regression output
    # Transformer encoder.
    dim=32,
    depth=6,            # number of transformer layers
    heads=8,            # attention heads
    attn_dropout=0.1,
    ff_dropout=0.1,
    # MLP head.
    mlp_hidden_mults=(4, 2),
    mlp_act=nn.ReLU(),
)
tab_transformer = tab_transformer.to(device)

# Summary of the configuration above plus the total parameter count.
print("TabTransformer 아키텍처:")
print(f"  - Transformer Depth: 6")
print(f"  - Attention Heads: 8")
print(f"  - Hidden Dimension: 32")
print(f"  - Parameters: {sum(p.numel() for p in tab_transformer.parameters()):,}")

In [None]:
# TabTransformer 학습
def train_transformer_model(model, X_train, y_train, X_test, y_test, epochs=100):
    """Train a tabular transformer regressor full-batch and score it on a hold-out set.

    The model is called as ``model(x_categ, x_cont)``. Only continuous features are
    supplied, so ``x_categ`` is an empty ``(n_rows, 0)`` integer tensor.

    Args:
        model: Module accepting ``(x_categ, x_cont)``. It is trained in place and is
            expected to already be on the notebook-level ``device``.
        X_train: Training features, convertible by ``torch.FloatTensor``.
        y_train: Training targets (assumed 1-D; a trailing axis is added with
            ``unsqueeze(1)`` — TODO confirm against callers).
        X_test: Hold-out features.
        y_test: Hold-out targets (same assumption as ``y_train``).
        epochs: Number of full-batch optimisation steps; also used as ``T_max`` of the
            cosine learning-rate schedule.

    Returns:
        ``(model, r2, rmse, train_losses)`` where ``r2`` and ``rmse`` are computed on
        the hold-out set after training and ``train_losses`` holds one MSE value per
        epoch.
    """
    # Move every array to the compute device as float32 tensors.
    X_train_tensor = torch.FloatTensor(X_train).to(device)
    y_train_tensor = torch.FloatTensor(y_train).unsqueeze(1).to(device)
    X_test_tensor = torch.FloatTensor(X_test).to(device)
    y_test_tensor = torch.FloatTensor(y_test).unsqueeze(1).to(device)

    # "No categorical columns" must be an empty integer tensor with one row per sample:
    # tab-transformer-pytorch's forward() inspects ``x_categ.shape``, so passing None
    # raises AttributeError before any computation happens.
    cat_train = torch.empty((X_train_tensor.shape[0], 0), dtype=torch.long, device=device)
    cat_test = torch.empty((X_test_tensor.shape[0], 0), dtype=torch.long, device=device)

    optimizer = optim.AdamW(model.parameters(), lr=0.001)
    criterion = nn.MSELoss()
    scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=epochs)

    train_losses = []

    for epoch in range(epochs):
        # One full-batch gradient step.
        model.train()
        optimizer.zero_grad()
        outputs = model(cat_train, X_train_tensor)
        loss = criterion(outputs, y_train_tensor)
        loss.backward()
        optimizer.step()
        scheduler.step()

        train_losses.append(loss.item())

        # Periodic progress report on the hold-out set (every 20th epoch).
        if epoch % 20 == 0:
            model.eval()
            with torch.no_grad():
                test_outputs = model(cat_test, X_test_tensor)
            r2 = r2_score(y_test_tensor.cpu().numpy(), test_outputs.cpu().numpy())
            print(f"Epoch {epoch}: Train Loss={loss.item():.4f}, Test R²={r2:.4f}")

    # Final evaluation.
    model.eval()
    with torch.no_grad():
        predictions = model(cat_test, X_test_tensor).cpu().numpy()
    r2 = r2_score(y_test, predictions)
    rmse = np.sqrt(mean_squared_error(y_test, predictions))

    return model, r2, rmse, train_losses

# Train the TabTransformer built above (default epoch count) and report hold-out metrics.
tab_transformer_trained, tab_r2, tab_rmse, tab_losses = train_transformer_model(
    model=tab_transformer,
    X_train=X_train_scaled,
    y_train=y_train,
    X_test=X_test_scaled,
    y_test=y_test,
)

print(f"\n✅ TabTransformer 최종 성능: R²={tab_r2:.4f}, RMSE={tab_rmse:.4f}")

## 4. FT-Transformer (Feature Tokenizer Transformer)

In [None]:
# FT-Transformer regressor: three continuous inputs, no categorical inputs.
ft_transformer = FTTransformer(
    # Input / output layout.
    categories=(),      # no categorical variables
    num_continuous=3,
    dim_out=1,
    # Transformer encoder.
    dim=32,
    depth=3,
    heads=8,
    attn_dropout=0.1,
    ff_dropout=0.1,
)
ft_transformer = ft_transformer.to(device)

print("FT-Transformer (Feature Tokenizer):")
print("  - 각 특성을 개별 토큰으로 변환")
print("  - Self-attention으로 특성 간 상호작용 학습")
print(f"  - Parameters: {sum(p.numel() for p in ft_transformer.parameters()):,}")

# Train with the shared helper and report hold-out metrics.
ft_transformer_trained, ft_r2, ft_rmse, ft_losses = train_transformer_model(
    model=ft_transformer,
    X_train=X_train_scaled,
    y_train=y_train,
    X_test=X_test_scaled,
    y_test=y_test,
    epochs=100,
)

print(f"\n✅ FT-Transformer 최종 성능: R²={ft_r2:.4f}, RMSE={ft_rmse:.4f}")

## 5. SAINT (Self-Attention and Intersample Attention)

In [None]:
# SAINT model (the notebook describes it as row & column attention).
saint_model = SAINT(
    num_features=3,
    num_classes=1,  # regression, so a single output
    dim=32,
    depth=6,
    heads=8,
    dim_head=16,
    dropout=0.1
).to(device)

print("SAINT (Self-Attention & Intersample):")
print("  - Row-wise attention: 샘플 간 관계 학습")
print("  - Column-wise attention: 특성 간 관계 학습")
print(f"  - Parameters: {sum(p.numel() for p in saint_model.parameters()):,}")

# SAINT is trained on shuffled mini-batches.
batch_size = 32
train_dataset = TensorDataset(
    torch.FloatTensor(X_train_scaled),
    torch.FloatTensor(y_train)
)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Training setup.
optimizer = optim.AdamW(saint_model.parameters(), lr=0.001)
criterion = nn.MSELoss()

saint_model.train()
for epoch in range(50):
    epoch_loss = 0
    for batch_X, batch_y in train_loader:
        batch_X, batch_y = batch_X.to(device), batch_y.to(device)

        optimizer.zero_grad()
        # Flatten to (batch,) with reshape(-1) rather than squeeze(): a bare squeeze()
        # turns a final batch of size 1 into a 0-d tensor that no longer matches the
        # (1,) target. Assumes one output value per row — TODO confirm.
        outputs = saint_model(batch_X).reshape(-1)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

    # Mean batch loss, reported every 10th epoch.
    if epoch % 10 == 0:
        print(f"Epoch {epoch}: Loss={epoch_loss/len(train_loader):.4f}")

# Evaluation on the hold-out set, as a single batch.
saint_model.eval()
with torch.no_grad():
    X_test_tensor = torch.FloatTensor(X_test_scaled).to(device)
    # reshape(-1) keeps the predictions 1-D even when the test set has a single row.
    predictions = saint_model(X_test_tensor).cpu().numpy().reshape(-1)
    saint_r2 = r2_score(y_test, predictions)
    saint_rmse = np.sqrt(mean_squared_error(y_test, predictions))

print(f"\n✅ SAINT 최종 성능: R²={saint_r2:.4f}, RMSE={saint_rmse:.4f}")

## 6. TabNet (Google Cloud AI)

In [None]:
# TabNet regressor (attention-based feature selection, per the notebook's description).
tabnet_model = TabNetRegressor(
    n_d=8,  # Width of decision prediction layer
    n_a=8,  # Width of attention embedding
    n_steps=3,  # Number of steps
    gamma=1.3,  # Relaxation parameter
    cat_idxs=[],  # indices of categorical columns (none)
    cat_dims=[],  # cardinalities of categorical columns (none)
    cat_emb_dim=1,
    n_independent=2,
    n_shared=2,
    epsilon=1e-15,
    momentum=0.02,
    lambda_sparse=1e-3,  # Sparsity
    optimizer_fn=torch.optim.Adam,
    optimizer_params=dict(lr=0.02),
    scheduler_params={"step_size": 50, "gamma": 0.9},
    scheduler_fn=torch.optim.lr_scheduler.StepLR,
    mask_type='entmax',  # "sparsemax" or "entmax"
    verbose=0
)

print("TabNet (Explainable DL):")
print("  - Sequential attention for feature selection")
print("  - Interpretable feature importance")
print("  - No preprocessing required")

# Early stopping needs its own validation data. Monitoring the test split would pick the
# stopping epoch using the very rows the final score is reported on, which inflates that
# score. Carve the validation rows out of the training split instead.
X_tabnet_fit, X_tabnet_valid, y_tabnet_fit, y_tabnet_valid = train_test_split(
    X_train_scaled, y_train, test_size=0.2, random_state=42
)

# Training; targets are reshaped to a column, as in the original fit call.
tabnet_model.fit(
    X_train=X_tabnet_fit,
    y_train=y_tabnet_fit.reshape(-1, 1),
    eval_set=[(X_tabnet_valid, y_tabnet_valid.reshape(-1, 1))],
    eval_metric=['rmse'],
    max_epochs=200,
    patience=20,
    batch_size=256,
    virtual_batch_size=128
)

# Evaluation on the untouched test split.
tabnet_pred = tabnet_model.predict(X_test_scaled).flatten()
tabnet_r2 = r2_score(y_test, tabnet_pred)
tabnet_rmse = np.sqrt(mean_squared_error(y_test, tabnet_pred))

print(f"\n✅ TabNet 최종 성능: R²={tabnet_r2:.4f}, RMSE={tabnet_rmse:.4f}")

# Per-feature importances exposed by the fitted model, listed by column position.
feature_importances = tabnet_model.feature_importances_
print("\nTabNet Feature Importances:")
for i, imp in enumerate(feature_importances):
    print(f"  Feature {i}: {imp:.4f}")

## 7. RTDL Models (Revisiting Deep Learning for Tabular Data)

In [None]:
# RTDL ResNet baseline for tabular regression: 3 inputs, 1 output.
rtdl_resnet = rtdl.ResNet.make_baseline(
    d_in=3,
    d_out=1,
    n_blocks=2,
    d_main=128,
    d_hidden=256,
    dropout_first=0.2,
    dropout_second=0.1,
)
rtdl_resnet = rtdl_resnet.to(device)

print("RTDL ResNet for Tabular:")
print("  - Residual connections for tabular data")
print("  - Deep architecture without degradation")
print(f"  - Parameters: {sum(p.numel() for p in rtdl_resnet.parameters()):,}")

# Float32 tensors on the compute device; the target gets a trailing axis.
X_train_tensor = torch.tensor(X_train_scaled, dtype=torch.float32).to(device)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32).unsqueeze(1).to(device)
X_test_tensor = torch.tensor(X_test_scaled, dtype=torch.float32).to(device)

# Full-batch training with AdamW and MSE loss.
criterion = nn.MSELoss()
optimizer = optim.AdamW(rtdl_resnet.parameters(), lr=0.001, weight_decay=1e-5)

rtdl_resnet.train()
for epoch in range(100):
    optimizer.zero_grad()
    outputs = rtdl_resnet(X_train_tensor)
    loss = criterion(outputs, y_train_tensor)
    loss.backward()
    optimizer.step()

    # Progress line every 20th epoch.
    if not epoch % 20:
        print(f"Epoch {epoch}: Loss={loss.item():.4f}")

# Evaluation on the hold-out set.
rtdl_resnet.eval()
with torch.no_grad():
    predictions = rtdl_resnet(X_test_tensor).cpu().numpy().squeeze()
rtdl_r2 = r2_score(y_test, predictions)
rtdl_rmse = np.sqrt(mean_squared_error(y_test, predictions))

print(f"\n✅ RTDL ResNet 최종 성능: R²={rtdl_r2:.4f}, RMSE={rtdl_rmse:.4f}")

## 8. AutoGluon (AWS AutoML)

In [None]:
# AutoGluon works on DataFrames: wrap the (unscaled) splits and attach the label column
# to the training frame.
train_data = pd.DataFrame(X_train, columns=['feat_0', 'feat_1', 'feat_2'])
train_data['target'] = y_train
test_data = pd.DataFrame(X_test, columns=['feat_0', 'feat_1', 'feat_2'])

print("AutoGluon TabularPredictor:")
print("  - Automatic model selection & ensembling")
print("  - Neural networks + Boosted trees + More")
print("  - Automatic hyperparameter tuning")

# Training: 'best_quality' preset, capped at 120 seconds.
predictor = TabularPredictor(
    label='target',
    eval_metric='r2',
    verbosity=0
).fit(
    train_data=train_data,
    presets='best_quality',
    time_limit=120
)

# Evaluation on the hold-out features.
autogluon_pred = predictor.predict(test_data)
autogluon_r2 = r2_score(y_test, autogluon_pred)
autogluon_rmse = np.sqrt(mean_squared_error(y_test, autogluon_pred))

print(f"\n✅ AutoGluon 최종 성능: R²={autogluon_r2:.4f}, RMSE={autogluon_rmse:.4f}")

# Model leaderboard. `leaderboard(data)` scores each model on `data`, so the frame must
# contain the label column; `test_data` has features only, hence the labelled copy.
print("\nAutoGluon Model Leaderboard:")
leaderboard = predictor.leaderboard(test_data.assign(target=y_test))
print(leaderboard[['model', 'score_val', 'score_test']].head(10))

## 9. Hybrid: XGBoost + Transformer Features

In [None]:
import xgboost as xgb

# Hybrid approach: derive features with the trained transformer, then fit XGBoost on them.
print("Hybrid Model: Transformer Feature Extractor + XGBoost")
print("  - Transformer로 고차원 특성 학습")
print("  - XGBoost로 최종 예측")

# Use the trained TabTransformer's final output (its prediction) as an extra feature
class FeatureExtractor(nn.Module):
    """Append a tabular transformer's output to the raw continuous features.

    The wrapped model is called as ``transformer_model(x_categ, x_cont)`` with an
    empty categorical input, and its output is concatenated to ``x`` along dim 1.
    No intermediate representation is tapped: the extra columns are whatever the
    wrapped model returns.
    """

    def __init__(self, transformer_model):
        """Store the model (typically already trained) whose output is appended."""
        super().__init__()
        self.transformer = transformer_model

    def forward(self, x):
        """Return ``x`` with the wrapped model's output appended as extra columns.

        Args:
            x: Continuous features, one row per sample (2-D, since the result is
                concatenated along dim 1).

        Returns:
            Tensor ``[x, transformer(x_categ, x)]`` joined on dim 1; the transformer
            part is computed without gradient tracking.
        """
        # "No categorical columns" is an empty (n_rows, 0) integer tensor on x's device.
        # tab-transformer-pytorch's forward() reads ``x_categ.shape``, so None would
        # raise AttributeError.
        no_categories = torch.empty((x.shape[0], 0), dtype=torch.long, device=x.device)
        with torch.no_grad():
            transformed = self.transformer(no_categories, x)
        return torch.cat([x, transformed], dim=1)

# Wrap the trained TabTransformer and switch it to inference mode.
extractor = FeatureExtractor(tab_transformer_trained)
extractor = extractor.to(device)
extractor.eval()

# Build the augmented matrices (scaled features + transformer output) for both splits.
with torch.no_grad():
    X_train_enhanced, X_test_enhanced = (
        extractor(torch.FloatTensor(split).to(device)).cpu().numpy()
        for split in (X_train_scaled, X_test_scaled)
    )

print(f"Enhanced features shape: {X_train_enhanced.shape}")

# Gradient-boosted trees fitted on the augmented features.
xgb_hybrid = xgb.XGBRegressor(
    random_state=42,
    n_estimators=200,
    learning_rate=0.1,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
)
xgb_hybrid.fit(X_train_enhanced, y_train)

# Hold-out metrics.
hybrid_pred = xgb_hybrid.predict(X_test_enhanced)
hybrid_rmse = np.sqrt(mean_squared_error(y_test, hybrid_pred))
hybrid_r2 = r2_score(y_test, hybrid_pred)

print(f"\n✅ Hybrid (Transformer+XGBoost) 최종 성능: R²={hybrid_r2:.4f}, RMSE={hybrid_rmse:.4f}")

## 10. 모든 모델 비교

In [None]:
# Collect the hold-out metrics of every model, keyed by display name.
results = {
    'TabTransformer': {'R2': tab_r2, 'RMSE': tab_rmse},
    'FT-Transformer': {'R2': ft_r2, 'RMSE': ft_rmse},
    'SAINT': {'R2': saint_r2, 'RMSE': saint_rmse},
    'TabNet': {'R2': tabnet_r2, 'RMSE': tabnet_rmse},
    'RTDL ResNet': {'R2': rtdl_r2, 'RMSE': rtdl_rmse},
    'AutoGluon': {'R2': autogluon_r2, 'RMSE': autogluon_rmse},
    'Hybrid (Trans+XGB)': {'R2': hybrid_r2, 'RMSE': hybrid_rmse}
}

# One row per model, best R² first.
results_df = pd.DataFrame(results).T.sort_values('R2', ascending=False)

print("="*70)
print("🏆 최신 Transformer 모델 성능 비교")
print("="*70)
print(results_df)

# Side-by-side horizontal bar charts: R² on the left, RMSE on the right.
fig, axes = plt.subplots(1, 2, figsize=(14, 6))
panels = (
    ('R2', 'skyblue', 'R² Score', 'Model Performance (R²)'),
    ('RMSE', 'coral', 'RMSE', 'Model Error (RMSE)'),
)
for ax, (metric, bar_color, x_label, panel_title) in zip(axes, panels):
    ax.barh(results_df.index, results_df[metric], color=bar_color)
    ax.set_xlabel(x_label)
    ax.set_title(panel_title)
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# The first row after sorting is the model with the highest R².
top_row = results_df.iloc[0]
best_model = top_row.name
best_r2 = top_row['R2']
best_rmse = top_row['RMSE']

print(f"\n🥇 최고 성능 모델: {best_model}")
print(f"   R² Score: {best_r2:.4f}")
print(f"   RMSE: {best_rmse:.4f}")

## 11. 모델 해석 및 결론

In [None]:
# Final summary: per-model notes, a recommendation based on the best R², and next steps.
print("="*70)
print("📊 최신 Transformer 모델 분석 결론")
print("="*70)

print("\n🔬 모델별 특징:")

# (heading, bullet lines) per model. The TabTransformer heading now credits Amazon:
# the paper comes from Amazon's AWS AI group, not Google.
model_notes = [
    ("\n1. TabTransformer (Amazon AWS)", [
        "   - Transformer를 tabular data에 적용한 선구자",
        "   - 범주형 변수에 강점",
    ]),
    ("\n2. FT-Transformer", [
        "   - 각 특성을 토큰으로 변환",
        "   - 특성 간 복잡한 상호작용 포착",
    ]),
    ("\n3. SAINT", [
        "   - Row & Column attention",
        "   - 샘플 간 관계까지 학습",
    ]),
    ("\n4. TabNet (Google Cloud)", [
        "   - 해석 가능한 특성 선택",
        "   - 전처리 불필요",
    ]),
    ("\n5. AutoGluon (AWS)", [
        "   - 자동 앙상블",
        "   - 실무에서 즉시 사용 가능",
    ]),
    ("\n6. Hybrid Model", [
        "   - Transformer + XGBoost 결합",
        "   - 딥러닝과 부스팅의 장점 결합",
    ]),
]
for heading, bullets in model_notes:
    print(heading)
    for bullet in bullets:
        print(bullet)

# Recommend the best model only when it reaches R² > 0.5.
# NOTE(review): the percentage printed below is R² * 100, not a classification-style
# accuracy, and it can be negative — consider relabelling it.
print("\n💡 추천:")
if best_r2 > 0.5:
    print(f"   ✅ {best_model} 사용 권장")
    print(f"   - 예측 정확도: {best_r2*100:.1f}%")
else:
    print(f"   ⚠️ 추가 특성 엔지니어링 필요")
    print(f"   - 현재 최고 성능: {best_r2*100:.1f}%")

print("\n📈 성능 개선 방안:")
for suggestion in (
    "   1. 더 많은 특성 추가",
    "   2. 시계열 특성 활용",
    "   3. 도메인 지식 기반 특성 엔지니어링",
    "   4. 더 큰 모델 & 더 긴 학습",
):
    print(suggestion)