In [3]:
# ==========================================
# Computer Prices 2025 – Strong CV Baseline
# Model: CatBoostRegressor with 5-Fold CV
# Target: log1p(price) for training
# ==========================================

import os
import numpy as np
import pandas as pd

from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

from catboost import CatBoostRegressor, Pool

# ------------------------
# Input locations
# ------------------------
INPUT_DIR = "/kaggle/input/computer-prices-2025"

# All three competition files live directly under INPUT_DIR.
train_path, test_path, sub_path = (
    os.path.join(INPUT_DIR, file_name)
    for file_name in (
        "computer_prices_all.csv",
        "computer_prices_test.csv",
        "sample_submission.csv",
    )
)

# Listing the directory up front makes a wrong/missing dataset obvious in the log.
print("Input directory contents:", os.listdir(INPUT_DIR))

# ------------------------
# Read the CSV files
# ------------------------
train = pd.read_csv(train_path)
test = pd.read_csv(test_path)
sample_sub = pd.read_csv(sub_path)

# Quick sanity report on what was loaded.
print("Train shape:", train.shape)
print("Test shape:", test.shape)
print("Sample submission head:")
print(sample_sub.head())

TARGET_COL = "price"

# ------------------------
# Basic clean-up
# ------------------------
# Identifier-style columns are removed from the feature set.
# Only names that actually occur in train are selected for dropping.
id_cols = ["id", "ID", "row_id", "Row_ID", "index", "Unnamed: 0"]

cols_to_drop = [name for name in id_cols if name in train.columns]
train = train.drop(columns=cols_to_drop, errors="ignore")
# errors="ignore" lets the same list be applied to test even if a column is absent there.
test = test.drop(columns=cols_to_drop, errors="ignore")

# Fail early if the target is missing from the training frame.
assert TARGET_COL in train.columns, f"{TARGET_COL} not found in train!"

# ------------------------
# Train/Test feature alignment
# ------------------------
# Train holds target + features while test may hold features only, so the
# model is restricted to the columns the two frames share.
feature_cols = [name for name in train.columns if name != TARGET_COL]

# Preserve train's column order while keeping only shared columns.
common_features = [name for name in feature_cols if name in test.columns]
train = train[[*common_features, TARGET_COL]]
test = test[common_features]

print("\nNumber of features:", len(common_features))
print("Features:", common_features)

# ------------------------
# Split features into categorical vs numeric
# ------------------------
# Object-dtype columns are passed to CatBoost as categorical features;
# everything else is treated as numeric.
cat_features = [name for name in common_features if train[name].dtype == "object"]
num_features = [name for name in common_features if name not in cat_features]

print("\nCategorical features:", cat_features)
print("Numeric features:", num_features)

# ------------------------
# Optional: simple extra features (very light feature engineering)
# You can extend this section later.
# ------------------------

def add_simple_features(df):
    """Return a copy of ``df`` with a few hand-crafted hardware features added.

    Each feature is created only when all of its source columns are present,
    so frames with a different schema pass through unchanged (apart from
    being copied). Missing source values are treated as 0 before combining.

    Columns added when their inputs exist:
      * ``total_memory_gb``  = ``ram_gb`` + ``vram_gb``
      * ``cpu_power_score``  = ``cpu_cores`` * ``cpu_base_ghz``
      * ``gpu_perf_score``   = ``gpu_tier`` * ``vram_gb``
      * ``storage_per_ram``  = ``storage_gb`` / ``ram_gb`` (NaN when ``ram_gb``
        is 0 or missing, to avoid division by zero)

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame containing the original columns plus any derived ones.
    """
    # Work on a copy: the caller's frame may be a slice of another frame, and
    # assigning new columns to it in place triggers SettingWithCopyWarning and
    # makes re-running the cell non-idempotent.
    df = df.copy()
    columns = df.columns

    # total_memory = RAM + VRAM (if present)
    if "ram_gb" in columns and "vram_gb" in columns:
        df["total_memory_gb"] = df["ram_gb"].fillna(0) + df["vram_gb"].fillna(0)

    # cpu_power = cores * base_freq
    if "cpu_cores" in columns and "cpu_base_ghz" in columns:
        df["cpu_power_score"] = df["cpu_cores"].fillna(0) * df["cpu_base_ghz"].fillna(0)

    # gpu performance proxy
    if "gpu_tier" in columns and "vram_gb" in columns:
        df["gpu_perf_score"] = df["gpu_tier"].fillna(0) * df["vram_gb"].fillna(0)

    # storage per ram; a zero RAM value becomes NaN so the ratio is NaN, not inf
    if "storage_gb" in columns and "ram_gb" in columns:
        df["storage_per_ram"] = df["storage_gb"].fillna(0) / (df["ram_gb"].replace(0, np.nan))

    return df

# Run the same feature engineering over both frames so their schemas stay aligned.
train, test = (add_simple_features(frame) for frame in (train, test))

# Rebuild every feature list so the newly added columns are picked up.
feature_cols = [name for name in train.columns if name != TARGET_COL]
common_features = [name for name in feature_cols if name in test.columns]
# Categorical = object dtype; the remainder is considered numeric.
cat_features = [name for name in common_features if train[name].dtype == "object"]
num_features = [name for name in common_features if name not in cat_features]

print("\nUpdated number of features:", len(common_features))

# ------------------------
# Target transformation: log1p
# ------------------------
# The model is trained on log1p(price); predictions are mapped back with expm1.
y = train[TARGET_COL].astype(float)
y_log = np.log1p(y)

# Design matrices restricted to the shared feature columns.
X = train[common_features]
X_test = test[common_features]

# ------------------------
# K-Fold CV setup
# ------------------------
N_FOLDS = 5
RANDOM_SEED = 42

# Shuffled folds with a fixed seed so the split is repeatable.
kf = KFold(n_splits=N_FOLDS, shuffle=True, random_state=RANDOM_SEED)

# Accumulators: out-of-fold predictions, fold-averaged test predictions, per-fold scores.
oof_pred = np.zeros(train.shape[0])
test_pred = np.zeros(test.shape[0])
fold_scores = []

# ------------------------
# CatBoost parameters
# ------------------------
# If competition metric is MAE, use:
#   loss_function="MAE", eval_metric="MAE"
# and change RMSE code accordingly.
CAT_PARAMS = {
    "loss_function": "RMSE",
    "eval_metric": "RMSE",
    "depth": 8,
    "learning_rate": 0.05,
    "l2_leaf_reg": 3,
    "random_seed": RANDOM_SEED,
    "iterations": 3000,
    # Overfitting detector: od_type="Iter" with a wait of 200 iterations.
    "od_type": "Iter",
    "od_wait": 200,
    "task_type": "CPU",   # set to "GPU" if GPU is available in your session
    "verbose": 200,
}

print("\nStarting K-Fold training...")

# Train one CatBoost model per fold. Each model:
#   * is fit on log1p(price) with the held-out fold as eval set,
#   * fills its slice of the out-of-fold (OOF) prediction vector,
#   * contributes 1/N_FOLDS of the averaged (log-space) test prediction.
for fold, (trn_idx, val_idx) in enumerate(kf.split(X, y_log), 1):
    print(f"\n========== Fold {fold}/{N_FOLDS} ==========")
    X_trn, X_val = X.iloc[trn_idx], X.iloc[val_idx]
    y_trn, y_val = y_log.iloc[trn_idx], y_log.iloc[val_idx]

    train_pool = Pool(X_trn, y_trn, cat_features=cat_features)
    valid_pool = Pool(X_val, y_val, cat_features=cat_features)

    model = CatBoostRegressor(**CAT_PARAMS)
    # use_best_model=True keeps the iteration that scored best on valid_pool.
    model.fit(train_pool, eval_set=valid_pool, use_best_model=True)

    # OOF predictions in log space
    val_pred_log = model.predict(X_val)
    oof_pred[val_idx] = val_pred_log

    # Transform back to original price space for metric
    val_pred = np.expm1(val_pred_log)
    val_true = np.expm1(y_val)

    # RMSE is computed as sqrt(MSE): the `squared=False` keyword of
    # mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
    # so this form works on both old and new versions.
    rmse = float(np.sqrt(mean_squared_error(val_true, val_pred)))
    fold_scores.append(rmse)
    print(f"Fold {fold} RMSE: {rmse:.4f}")

    # Test predictions (average over folds, still in log space, then transform)
    test_pred += model.predict(X_test) / N_FOLDS

# ------------------------
# Overall CV performance
# ------------------------
# OOF in original space
oof_pred_price = np.expm1(oof_pred)
# Pooled RMSE over all out-of-fold rows (not the same as the mean of fold RMSEs).
rmse_cv = float(np.sqrt(mean_squared_error(y, oof_pred_price)))
print("\n========== CV Results ==========")
print("Fold RMSEs:", [f"{s:.4f}" for s in fold_scores])
# Report the pooled OOF score and the per-fold mean/std separately so the
# labels match the numbers they describe.
print(
    f"OOF RMSE: {rmse_cv:.4f} | "
    f"Mean fold RMSE: {np.mean(fold_scores):.4f}, Std: {np.std(fold_scores):.4f}"
)

# ------------------------
# Map averaged log-space test predictions back to prices
# ------------------------
test_pred_price = np.expm1(test_pred)

# ------------------------
# Assemble the submission file
# ------------------------
# Start from the sample submission so column names and row order are kept.
submission = sample_sub.copy()

# The prediction column is whichever column is not an identifier
# (compared case-insensitively).
pred_cols = [
    name for name in submission.columns
    if name.lower() not in ["id", "row_id", "index"]
]

if len(pred_cols) == 1:
    pred_col = pred_cols[0]
else:
    # Zero or several candidates: refuse to guess.
    raise ValueError(
        f"Could not infer prediction column from sample_submission. "
        f"Found: {pred_cols}. Set it manually."
    )

print(f"\nUsing '{pred_col}' as prediction column in submission.")

# Overwrite the placeholder values with the model's predictions.
submission[pred_col] = test_pred_price

submission_file = "submission.csv"
submission.to_csv(submission_file, index=False)
print(f"Saved submission to {submission_file}")
print(submission.head())

Input directory contents: ['sample_submission.csv', 'computer_prices_all.csv', 'computer_prices_test.csv']
Train shape: (100000, 34)
Test shape: (50000, 33)
Sample submission head:
       ID    price
0  100000  1927.99
1  100001  1927.99
2  100002  1927.99
3  100003  1927.99
4  100004  1927.99

Number of features: 32
Features: ['device_type', 'brand', 'model', 'release_year', 'os', 'form_factor', 'cpu_brand', 'cpu_model', 'cpu_tier', 'cpu_cores', 'cpu_threads', 'cpu_base_ghz', 'cpu_boost_ghz', 'gpu_brand', 'gpu_model', 'gpu_tier', 'vram_gb', 'ram_gb', 'storage_type', 'storage_gb', 'storage_drive_count', 'display_type', 'display_size_in', 'resolution', 'refresh_hz', 'battery_wh', 'charger_watts', 'psu_watts', 'wifi', 'bluetooth', 'weight_kg', 'warranty_months']

Categorical features: ['device_type', 'brand', 'model', 'os', 'form_factor', 'cpu_brand', 'cpu_model', 'gpu_brand', 'gpu_model', 'storage_type', 'display_type', 'resolution', 'wifi']
Numeric features: ['release_year', 'cpu_tier'

In [4]:
# Re-export the submission. This cell relies on `submission`, `pred_col` and
# `test_pred_price` already existing in the kernel from the previous cell.
submission_file = "submission.csv"

submission[pred_col] = test_pred_price
submission.to_csv(submission_file, index=False)

print(f"Saved submission to {submission_file}")
print(submission.head())

Saved submission to submission.csv
       ID        price
0  100000  2908.201923
1  100001  2454.600520
2  100002  1242.135335
3  100003  1466.217551
4  100004  3254.772244
