
# House Pricing — PyTorch DNN Regression (Local CSV)

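**Dataset:** `/mnt/data/House_Pricing.csv` (local CSV uploaded in the originating chat session; update `LOCAL_PATH` in the config cell if your copy is stored elsewhere)  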

This notebook:
1. Loads the dataset from the local file  
2. Cleans & preprocesses (missing values, numeric/categorical handling, scaling, one-hot encoding)  
3. Splits into Train/Validation/Test  
4. Builds and trains a PyTorch Deep Neural Network with BatchNorm & Dropout  
5. Evaluates on the Test set with **MAE**, **MSE**, **RMSE**, **R²**  
6. Plots learning curves


In [None]:

# If needed on Colab:
# !pip install -q torch torchvision torchaudio scikit-learn pandas numpy matplotlib

import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

# Reproducibility: seed NumPy and PyTorch up front. RANDOM_STATE is also the
# value handed to the train/val/test splitting calls further down.
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)
torch.manual_seed(RANDOM_STATE)

# Config
LOCAL_PATH = "/mnt/data/House_Pricing.csv"
TARGET_COL = "price"        # Change if target is named differently
TEST_SIZE = 0.15            # Fraction of the FULL dataset held out for testing
VAL_SIZE = 0.15             # Fraction of the FULL dataset held out for validation
assert 0.0 < TEST_SIZE < 1.0 and 0.0 < VAL_SIZE < 1.0 - TEST_SIZE, (
    f"TEST_SIZE={TEST_SIZE} and VAL_SIZE={VAL_SIZE} must be positive and leave room for training data"
)
# Validation is carved out of what remains after the test split, so it has to be
# expressed relative to that remainder. Deriving it (instead of hard-coding the
# rounded 0.1765) keeps the split correct whenever TEST_SIZE or VAL_SIZE changes.
VAL_SIZE_WITHIN_TRAIN = VAL_SIZE / (1.0 - TEST_SIZE)
BATCH_SIZE = 64
LR = 1e-3
EPOCHS = 60                 # Reasonable runtime; adjust as needed
PATIENCE = 12               # Early stopping patience


In [None]:

# Read the CSV into a DataFrame; stop immediately with a clear message if the
# configured path does not exist.
csv_found = os.path.exists(LOCAL_PATH)
assert csv_found, f"File not found: {LOCAL_PATH}"
df = pd.read_csv(LOCAL_PATH)
print("Data shape:", df.shape)
# Last expression of the cell: renders the first rows as the cell output
df.head()


In [None]:

# Quick audit: per-column missing counts (largest first) and summary statistics
print("Missing values per column:\n", df.isna().sum().sort_values(ascending=False))
display(df.describe(include='all').transpose())

# Ensure target exists
assert TARGET_COL in df.columns, f"TARGET_COL='{TARGET_COL}' not found in columns: {list(df.columns)}"

# Rows without a target value cannot supervise training: one NaN label is enough
# to turn the MSE loss into NaN. Filter them out here; `df` itself is not modified.
target_missing = df[TARGET_COL].isna()
n_target_missing = int(target_missing.sum())
if n_target_missing:
    print(f"Dropping {n_target_missing} row(s) with missing '{TARGET_COL}'")
df_labeled = df.loc[~target_missing]

# Separate features/target
y = df_labeled[TARGET_COL].astype(float)

# Drop ID-like columns (named exactly "id" or "index", case-insensitive).
# str() guards against non-string column labels such as integers.
id_like = [c for c in df_labeled.columns if str(c).lower() in {"id", "index"}]
X = df_labeled.drop(columns=[TARGET_COL] + id_like, errors="ignore")

# Identify numeric & categorical columns: everything that is not a NumPy numeric
# dtype is treated as categorical.
num_cols = X.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = X.select_dtypes(exclude=[np.number]).columns.tolist()
print(f"Numeric columns ({len(num_cols)}):", num_cols[:12], "..." if len(num_cols) > 12 else "")
print(f"Categorical columns ({len(cat_cols)}):", cat_cols[:12], "..." if len(cat_cols) > 12 else "")


In [None]:

# Two-stage split: hold out the test set first, then carve the validation set
# out of the remaining rows. Both stages use the same seed.
X_train_full, X_test, y_train_full, y_test = train_test_split(
    X,
    y,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_full,
    y_train_full,
    test_size=VAL_SIZE_WITHIN_TRAIN,
    random_state=RANDOM_STATE,
)

# Report the shape of every split
print("Splits:")
for split_label, split_X, split_y in (
    ("Train", X_train, y_train),
    ("Val  ", X_val, y_val),
    ("Test ", X_test, y_test),
):
    print(f"  {split_label}:", split_X.shape, split_y.shape)


In [None]:

# Preprocessing
# Numeric branch: fill gaps with the column median, then standardise.
numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Categorical branch: fill gaps with the most frequent value, then one-hot encode
# into a dense array. handle_unknown="ignore" means categories absent at fit time
# do not raise at transform time.
categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
])

# Route each column group to its branch; columns in neither list are dropped.
column_branches = [
    ("num", numeric_transformer, num_cols),
    ("cat", categorical_transformer, cat_cols),
]
preprocess = ColumnTransformer(column_branches, remainder="drop")

# Fit on the training split only; validation and test reuse the fitted transform.
X_train_prep = preprocess.fit_transform(X_train)
X_val_prep, X_test_prep = (preprocess.transform(part) for part in (X_val, X_test))

input_dim = X_train_prep.shape[1]
print("Input dimension after preprocessing:", input_dim)


In [None]:

# PyTorch dataset/dataloader
class TabularDataset(Dataset):
    """In-memory dataset pairing a feature matrix with a single-column target.

    Both inputs are converted to float32 tensors at construction time. The
    target is reshaped into a column, so each sample's label has shape ``(1,)``.

    Args:
        X: Array-like of features accepted by ``torch.tensor``.
        y: Array-like of targets; passed through ``np.asarray`` first.
    """

    def __init__(self, X, y):
        features = torch.tensor(X, dtype=torch.float32)
        targets = torch.tensor(np.asarray(y), dtype=torch.float32)
        self.X = features
        self.y = targets.reshape(-1, 1)

    def __len__(self):
        # Sample count is the size of the feature tensor's first dimension
        return self.X.shape[0]

    def __getitem__(self, idx):
        sample = (self.X[idx], self.y[idx])
        return sample

# Wrap each preprocessed split in a dataset (train, then validation, then test)
train_ds, val_ds, test_ds = (
    TabularDataset(split_X, split_y)
    for split_X, split_y in (
        (X_train_prep, y_train),
        (X_val_prep, y_val),
        (X_test_prep, y_test),
    )
)

# Only the training loader shuffles; evaluation loaders keep the dataset order.
train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
val_loader, test_loader = (
    DataLoader(eval_ds, batch_size=BATCH_SIZE, shuffle=False)
    for eval_ds in (val_ds, test_ds)
)


In [None]:

# Model
class RegressionDNN(nn.Module):
    """Fully connected regressor: input_dim -> 256 -> 128 -> 64 -> 1.

    The 256- and 128-unit layers are each followed by batch normalisation, ReLU
    and dropout (p=0.2). The 64-unit layer uses ReLU only, and the final layer
    emits one value per sample with no activation.

    Args:
        input_dim: Number of input features per sample.
    """

    def __init__(self, input_dim):
        super().__init__()
        layers = []
        fan_in = input_dim
        # Regularised blocks: Linear -> BatchNorm -> ReLU -> Dropout
        for width in (256, 128):
            layers += [
                nn.Linear(fan_in, width),
                nn.BatchNorm1d(width),
                nn.ReLU(),
                nn.Dropout(0.2),
            ]
            fan_in = width
        # Plain hidden layer followed by the single-output head
        layers += [nn.Linear(fan_in, 64), nn.ReLU(), nn.Linear(64, 1)]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        prediction = self.net(x)
        return prediction

# Use the GPU when one is available, otherwise fall back to CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = RegressionDNN(input_dim).to(device)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LR)
# Halve the learning rate once the monitored metric has failed to improve for more
# than `patience` (5) epochs. `verbose` is deliberately not passed: it was deprecated
# in PyTorch 2.2 and newer releases reject it with a TypeError. Read the current rate
# from optimizer.param_groups[0]["lr"] if it needs to be logged.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=5)


In [None]:

# Training loop with early stopping
best_val = float("inf")       # lowest validation MSE seen so far
best_state = None             # CPU copy of the weights that achieved best_val
train_losses, val_losses = [], []
epochs_no_improve = 0

for epoch in range(1, EPOCHS + 1):
    # --- training pass ---
    model.train()
    running = 0.0
    seen = 0
    for xb, yb in train_loader:
        # BatchNorm1d cannot compute batch statistics from a single sample in
        # training mode and raises. That happens when the training set size
        # leaves a remainder of exactly one after batching, so skip that batch.
        if xb.size(0) < 2:
            continue
        xb, yb = xb.to(device), yb.to(device)
        optimizer.zero_grad()
        preds = model(xb)
        loss = criterion(preds, yb)
        loss.backward()
        optimizer.step()
        # criterion returns the batch mean; weight by batch size for a per-sample average
        running += loss.item() * xb.size(0)
        seen += xb.size(0)
    # Average over the samples actually used (equals the dataset size unless a
    # single-sample batch was skipped above)
    train_loss = running / max(seen, 1)

    # --- validation pass (no gradients, BatchNorm/Dropout in eval mode) ---
    model.eval()
    running_val = 0.0
    with torch.no_grad():
        for xb, yb in val_loader:
            xb, yb = xb.to(device), yb.to(device)
            vpred = model(xb)
            vloss = criterion(vpred, yb)
            running_val += vloss.item() * xb.size(0)
    val_loss = running_val / len(val_loader.dataset)

    # The plateau scheduler is driven by the validation loss
    scheduler.step(val_loss)
    train_losses.append(train_loss)
    val_losses.append(val_loss)

    # Keep a snapshot of the best weights; the small margin ignores float noise
    if val_loss + 1e-8 < best_val:
        best_val = val_loss
        best_state = {k: v.cpu().clone() for k, v in model.state_dict().items()}
        epochs_no_improve = 0
    else:
        epochs_no_improve += 1

    if epoch % 10 == 0 or epoch == 1:
        print(f"Epoch {epoch:03d} | Train MSE: {train_loss:.6f} | Val MSE: {val_loss:.6f}")

    if epochs_no_improve >= PATIENCE:
        print(f"Early stopping at epoch {epoch}. Best Val MSE: {best_val:.6f}")
        break

# Restore the best snapshot so later cells evaluate/save the best model,
# not the weights from the last epoch
if best_state is not None:
    model.load_state_dict(best_state)


In [None]:

# Learning curves: per-epoch training vs. validation loss on one set of axes
fig, ax = plt.subplots()
for loss_history, curve_label in ((train_losses, "Train MSE"), (val_losses, "Val MSE")):
    ax.plot(loss_history, label=curve_label)
ax.set_xlabel("Epoch")
ax.set_ylabel("MSE")
ax.set_title("Learning Curves")
ax.legend()
plt.show()


In [None]:

# Test evaluation: collect predictions batch by batch, then score once
model.eval()
pred_chunks, true_chunks = [], []
with torch.no_grad():
    for batch_X, batch_y in test_loader:
        outputs = model(batch_X.to(device))
        pred_chunks.append(outputs.cpu().numpy().ravel())
        true_chunks.append(batch_y.numpy().ravel())

# Widen to float64 so the metrics see the same values as plain Python floats
y_true = np.concatenate(true_chunks).astype(np.float64)
y_pred = np.concatenate(pred_chunks).astype(np.float64)

mae = mean_absolute_error(y_true, y_pred)
mse = mean_squared_error(y_true, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_true, y_pred)

print("=== Test Metrics ===")
for metric_label, metric_value in (("MAE ", mae), ("MSE ", mse), ("RMSE", rmse), ("R^2 ", r2)):
    print(f"{metric_label}: {metric_value:.4f}")


In [None]:

# Optional: persist the model's state_dict (parameters and buffers) to disk
weights_path = "house_price_dnn_local.pt"
torch.save(model.state_dict(), weights_path)
print(f"Saved weights -> {weights_path}")
