In [1]:
import pandas as pd
import numpy as np
import torch
from kan import KAN
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import r2_score, mean_squared_log_error

# Veri yükleme ve ön işleme
df = pd.read_csv("train.csv")
y = df["SalePrice"]
X = df.drop(columns=["SalePrice"])

# Eksik değerleri doldurma (XGBoost subsample=0.82'ye uyumlu)
numeric_cols = X.select_dtypes(include=np.number).columns
X[numeric_cols] = X[numeric_cols].fillna(X[numeric_cols].median())

# Kategorik değişkenleri Label Encoding (colsample_bytree=0.89'a uyumlu)
for col in X.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col].astype(str))

# Özellik seçimi (toplam özelliğin ~%89'u)
selected_cols = X.columns[:int(len(X.columns)*0.89)]
X = X[selected_cols]

# Veriyi bölme
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Ölçeklendirme
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Hedef değişken log dönüşümü (RMSLE için optimize)
y_train_log = np.log1p(y_train)
y_test_log = np.log1p(y_test)

# PyTorch tensörlerine dönüştürme
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train_log.values.reshape(-1,1), dtype=torch.float32)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test_log.values.reshape(-1,1), dtype=torch.float32)

dataset = {
    "train_input": X_train_tensor,
    "train_label": y_train_tensor,
    "test_input": X_test_tensor,
    "test_label": y_test_tensor
}

# Fortuna sonuçlarına göre optimize edilmiş KAN modeli
model = KAN(
    width=[X_train.shape[1], 16, 8, 1],  # max_depth=4'e denk (3 katman)
    grid=5, 
    k=4,
    seed=42
)

# XGBoost optimal parametrelerinin KAN'a uyarlanmış hali
optimized_params = {
    "opt": "LBFGS",
    "steps": 222,        # n_estimators=222
    "lr": 0.063,         # learning_rate=0.063
    "lamb": 0.0012,      # gamma=0.0012
    "lamb_entropy": 0.8, # subsample=0.82'yi dengelemek için
    "update_grid": False # Hata önleme
}

# Model eğitimi
model.fit(dataset, **optimized_params)

# Tahminler ve değerlendirme
with torch.no_grad():
    y_pred_train = model(X_train_tensor).numpy().flatten()
    y_pred_test = model(X_test_tensor).numpy().flatten()

# Log dönüşümünü geri alma
y_pred_train = np.expm1(y_pred_train)
y_pred_test = np.expm1(y_pred_test)

# Metrik hesaplama
def calculate_metrics(y_true, y_pred):
    r2 = r2_score(y_true, y_pred)
    rmsle = np.sqrt(mean_squared_log_error(y_true, y_pred))
    return r2, rmsle

train_r2, train_rmsle = calculate_metrics(y_train, y_pred_train)
test_r2, test_rmsle = calculate_metrics(y_test, y_pred_test)

print("\n" + "="*50)
print("Optimize Edilmiş KAN Modeli Sonuçları")
print("="*50)
print(f"Eğitim Seti R²: {train_r2:.5f} | RMSLE: {train_rmsle:.5f}")
print(f"Test Seti R²: {test_r2:.5f} | RMSLE: {test_rmsle:.5f}")
print("="*50 + "\n")

# Modeli kaydetme
torch.save(model.state_dict(), "optimized_kan_model.pth")

checkpoint directory created: ./model
saving model version 0.0


| train_loss: 6.83e-02 | test_loss: 1.44e-01 | reg: 7.31e+00 | : 100%|█| 222/222 [08:50<00:00,  2.39


saving model version 0.1

Optimize Edilmiş KAN Modeli Sonuçları
Eğitim Seti R²: 0.96958 | RMSLE: 0.06825
Test Seti R²: 0.91395 | RMSLE: 0.14434

