### 1. Importing the Libraries

In [36]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

import torch
import torch.nn as nn
import torch.optim as optim

import scipy.sparse
from torch.utils.data import DataLoader, TensorDataset



### 2. Loading the Data

In [37]:
train_data_path = '../datasets/house-prices-advanced-regression-techniques/train.csv'
test_data_path = '../datasets/house-prices-advanced-regression-techniques/test.csv'

train_data = pd.read_csv(train_data_path)
test_data = pd.read_csv(test_data_path)

print(train_data.shape)

train_data.head(5)

(1460, 81)


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


### 3. Data Preprocessing

In [38]:
# Identify categorical and numerical columns
categorical_cols = train_data.select_dtypes(include=['object']).columns.tolist()
numerical_cols = train_data.select_dtypes(exclude=['object']).columns.tolist()

# Remove the target variable from numerical columns
numerical_cols.remove('SalePrice')

# Create transformers for the preprocessing steps
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine transformers into a preprocessor with ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# Create a preprocessing and modeling pipeline
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor)
])

# Split data into features and target variable
X = train_data.drop('SalePrice', axis=1)
y = train_data['SalePrice']

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Apply preprocessing
X_train_prepared = pipeline.fit_transform(X_train)
X_test_prepared = pipeline.transform(X_test)

# Show the shape of the processed data
X_train_prepared.shape, X_test_prepared.shape


((1168, 283), (292, 283))

### 4. Model Training

In [39]:
class HousePricePredictor(nn.Module):
    def __init__(self):
        super(HousePricePredictor, self).__init__()
        self.network = nn.Sequential(
            nn.Linear(283, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 1)
        )

    def forward(self, x):
        return self.network(x)


model = HousePricePredictor()

loss_fn = nn.MSELoss() 
optimizer = optim.Adam(model.parameters(), lr=0.001)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

HousePricePredictor(
  (network): Sequential(
    (0): Linear(in_features=283, out_features=128, bias=True)
    (1): ReLU()
    (2): Linear(in_features=128, out_features=64, bias=True)
    (3): ReLU()
    (4): Linear(in_features=64, out_features=1, bias=True)
  )
)

In [40]:
# transform the sparse matrix to dense matrix
if isinstance(X_train_prepared, scipy.sparse.csr_matrix):
    X_train_prepared = X_train_prepared.toarray()

if isinstance(X_test_prepared, scipy.sparse.csr_matrix):
    X_test_prepared = X_test_prepared.toarray()

# transfer PyTorch tensors
X_train_tensor = torch.tensor(X_train_prepared, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values.reshape(-1, 1), dtype=torch.float32)
X_test_tensor = torch.tensor(X_test_prepared, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.values.reshape(-1, 1), dtype=torch.float32)

# DataLoaders
train_data = DataLoader(TensorDataset(X_train_tensor, y_train_tensor), batch_size=32, shuffle=True)
test_data = DataLoader(TensorDataset(X_test_tensor, y_test_tensor), batch_size=32)

In [46]:
# training the model
def train_model(model, train_data, test_data, loss_fn, optimizer, epochs=100):
    for epoch in range(epochs):
        model.train()
        for X_batch, y_batch in train_data:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            optimizer.zero_grad()
            predictions = model(X_batch)
            loss = loss_fn(predictions, y_batch)
            loss.backward()
            optimizer.step()

        # Evaluate the model on the test data
        model.eval()
        with torch.no_grad():
            test_loss = sum(loss_fn(model(X.to(device)), y.to(device)).item() for X, y in test_data) / len(test_data)

        if (epoch + 1) % 10 == 0:
            print(f'Epoch {epoch+1}, Loss: {loss.item():.4f}, Test Loss: {test_loss:.4f}')


train_model(model, train_data, test_data, loss_fn, optimizer, epochs=250)


Epoch 10, Loss: 307048800.0000, Test Loss: 3068908355.2000
Epoch 20, Loss: 96858872.0000, Test Loss: 3082096600.0000
Epoch 30, Loss: 169385664.0000, Test Loss: 3088700243.2000
Epoch 40, Loss: 394482880.0000, Test Loss: 3089830593.6000
Epoch 50, Loss: 167771728.0000, Test Loss: 3090012081.6000
Epoch 60, Loss: 90636368.0000, Test Loss: 3088349633.6000
Epoch 70, Loss: 1795566976.0000, Test Loss: 3109867612.8000
Epoch 80, Loss: 272184448.0000, Test Loss: 3107638756.8000
Epoch 90, Loss: 888845056.0000, Test Loss: 3108026902.4000
Epoch 100, Loss: 135068048.0000, Test Loss: 3127627798.4000
Epoch 110, Loss: 82273856.0000, Test Loss: 3125341148.8000
Epoch 120, Loss: 353975776.0000, Test Loss: 3140690892.8000
Epoch 130, Loss: 119888464.0000, Test Loss: 3130423206.4000
Epoch 140, Loss: 230913440.0000, Test Loss: 3135767539.2000
Epoch 150, Loss: 141166384.0000, Test Loss: 3167364241.6000
Epoch 160, Loss: 469276576.0000, Test Loss: 3160558512.0000
Epoch 170, Loss: 105843696.0000, Test Loss: 3156267