In [140]:
import os
import math
import numpy as np
import pandas as pd
import torch
from torch import nn
import matplotlib.pyplot as plt

In [141]:
# Directory that holds the competition's train.csv / test.csv files.
# Raw string literal: the Windows path contains backslashes (\D, \K, \h) that
# would otherwise be parsed as (invalid) escape sequences and trigger a
# DeprecationWarning / SyntaxWarning on recent Python versions.
data_root = r"D:\Downloads\Data\Kaggle\house-prices-advanced-regression-techniques"

train_data = pd.read_csv(os.path.join(data_root, 'train.csv'))
test_data = pd.read_csv(os.path.join(data_root, 'test.csv'))

# Sanity check: frame shapes, then the first four rows restricted to the
# first four and last three columns of each frame.
print(train_data.shape)
print(test_data.shape)
print(train_data.iloc[0:4, [0, 1, 2, 3, -3, -2, -1]])
print(test_data.iloc[0:4, [0, 1, 2, 3, -3, -2, -1]])

(1460, 81)
(1459, 80)
   Id  MSSubClass MSZoning  LotFrontage SaleType SaleCondition  SalePrice
0   1          60       RL         65.0       WD        Normal     208500
1   2          20       RL         80.0       WD        Normal     181500
2   3          60       RL         68.0       WD        Normal     223500
3   4          70       RL         60.0       WD       Abnorml     140000
     Id  MSSubClass MSZoning  LotFrontage  YrSold SaleType SaleCondition
0  1461          20       RH         80.0    2010       WD        Normal
1  1462          20       RL         81.0    2010       WD        Normal
2  1463          60       RL         74.0    2010       WD        Normal
3  1464          60       RL         78.0    2010       WD        Normal


In [142]:
# Stack the train and test feature columns into one frame so both receive the
# same preprocessing. The first column is dropped from both frames and the
# last column is additionally dropped from the training frame.
all_features = pd.concat((train_data.iloc[:, 1:-1], test_data.iloc[:, 1:]))

# Every column whose dtype is not 'object' is treated as numeric.
numeric_features = all_features.dtypes[all_features.dtypes != 'object'].index

# Standardize each numeric column to zero mean / unit standard deviation, then
# replace missing values with 0 (the mean of a standardized column).
all_features[numeric_features] = (
    all_features[numeric_features]
    .apply(lambda col: (col - col.mean()) / (col.std()))
    .fillna(0)
)

# One-hot encode the remaining columns; dummy_na=True adds an explicit
# "<column>_nan" indicator for missing values.
all_features = pd.get_dummies(all_features, dummy_na=True)
# Multiplying by 1 turns boolean indicator columns into integers.
all_features = all_features * 1
print(all_features.iloc[0:4, [0, 1, 2, 3, -3, -2, -1]])

   MSSubClass  LotFrontage   LotArea  OverallQual  SaleCondition_Normal  \
0    0.067320    -0.184443 -0.217841     0.646073                     1   
1   -0.873466     0.458096 -0.072032    -0.063174                     1   
2    0.067320    -0.055935  0.137173     0.646073                     1   
3    0.302516    -0.398622 -0.078371     0.646073                     0   

   SaleCondition_Partial  SaleCondition_nan  
0                      0                  0  
1                      0                  0  
2                      0                  0  
3                      0                  0  


In [143]:
# The first n_train rows of all_features came from train_data; the rest from test_data.
n_train = len(train_data)
train_features = torch.tensor(all_features.iloc[:n_train].to_numpy(),
                              dtype=torch.float32)
test_features = torch.tensor(all_features.iloc[n_train:].to_numpy(),
                             dtype=torch.float32)
# Labels are kept as a column vector of shape (n_train, 1).
train_labels = torch.tensor(train_data['SalePrice'].to_numpy().reshape(-1, 1),
                            dtype=torch.float32)

In [144]:
# Mean-squared-error criterion used as the training objective.
loss = nn.MSELoss(reduction='mean')
# Number of units in the single hidden layer of the network.
hidden_layer = 10
# Input width of the network: one input per preprocessed feature column.
in_features = train_features.size(1)

def get_net():
    """Build a freshly initialized regression network.

    The model is a two-layer perceptron: ``in_features`` inputs, one ReLU
    hidden layer of ``hidden_layer`` units, and a single scalar output.
    Both sizes are read from module-level variables.
    """
    layers = [
        nn.Linear(in_features, hidden_layer),
        nn.ReLU(),
        nn.Linear(hidden_layer, 1),
    ]
    return nn.Sequential(*layers)

In [145]:
def log_rmse(net, features, labels):
    """Return the root-mean-squared error between log-predictions and log-labels.

    Args:
        net: Callable mapping ``features`` to predictions with the same shape
            as ``labels``.
        features: Input tensor passed to ``net``.
        labels: Target tensor; must be strictly positive for the logarithm
            to be finite.

    Returns:
        The log-RMSE as a Python ``float``.
    """
    # This is a pure evaluation metric, so run it without autograd: no graph
    # is recorded for the (potentially full-dataset) forward pass.
    with torch.no_grad():
        # Clamp predictions to at least 1 so the logarithm is defined and
        # non-negative even when the network outputs values <= 0.
        clipped_preds = torch.clamp(net(features), 1, float('inf'))
        # Use an explicit mean-squared error so the metric stays a log-RMSE
        # regardless of which criterion is used for training.
        log_mse = nn.functional.mse_loss(torch.log(clipped_preds),
                                         torch.log(labels))
        rmse = torch.sqrt(log_mse)
    return rmse.item()

In [146]:
def load_array(data_arrays, batch_size, is_train=True):
    """Wrap tensors in a mini-batch ``DataLoader``.

    Args:
        data_arrays: Iterable of tensors; they are unpacked into a
            ``TensorDataset``, so each batch is a tuple with one slice per
            tensor.
        batch_size: Number of samples per batch.
        is_train: When true the samples are shuffled; otherwise they are
            yielded in their original order.

    Returns:
        A ``torch.utils.data.DataLoader`` over the wrapped tensors.
    """
    tensor_dataset = torch.utils.data.TensorDataset(*data_arrays)
    loader = torch.utils.data.DataLoader(
        tensor_dataset,
        batch_size=batch_size,
        shuffle=is_train,
    )
    return loader

def train(net, train_features, train_labels, test_features, test_labels,
          num_epochs, learning_rate, weight_decay, batch_size):
    """Fit ``net`` with Adam on mini-batches and track the log-RMSE per epoch.

    Args:
        net: Model to optimize in place.
        train_features, train_labels: Training tensors.
        test_features, test_labels: Optional held-out tensors; evaluation on
            them is skipped when ``test_labels`` is ``None``.
        num_epochs: Number of passes over the training data.
        learning_rate: Adam step size.
        weight_decay: Weight-decay coefficient passed to Adam.
        batch_size: Mini-batch size.

    Returns:
        ``(train_ls, test_ls)``: per-epoch log-RMSE on the training data and
        on the held-out data (the latter is empty when no labels are given).
    """
    optimizer = torch.optim.Adam(net.parameters(),
                                 lr=learning_rate,
                                 weight_decay=weight_decay)
    train_iter = load_array((train_features, train_labels), batch_size)
    train_ls, test_ls = [], []

    for epoch in range(num_epochs):
        # One optimization pass over all mini-batches.
        for batch_X, batch_y in train_iter:
            optimizer.zero_grad()
            batch_loss = loss(net(batch_X), batch_y)
            batch_loss.backward()
            optimizer.step()

        train_ls.append(log_rmse(net, train_features, train_labels))

        # Report every int(sqrt(num_epochs)) epochs; the printed label is the
        # zero-based epoch index.
        report_every = int(math.sqrt(num_epochs))
        if (epoch + 1) % report_every == 0:
            print(f"[{epoch}] log rmse:{train_ls[-1]:.5f}")

        if test_labels is not None:
            test_ls.append(log_rmse(net, test_features, test_labels))

    return train_ls, test_ls

In [147]:
def get_k_fold_data(k, i, X, y):
    """Split ``X`` / ``y`` into ``k`` contiguous folds and hold out fold ``i``.

    Folds are consecutive row ranges of size ``X.shape[0] // k``; the last
    fold additionally absorbs the ``X.shape[0] % k`` trailing rows so that
    every sample belongs to exactly one fold (previously those rows were
    silently dropped).

    Args:
        k: Number of folds; must be greater than 1.
        i: Index of the validation fold, ``0 <= i < k``.
        X: Feature tensor, indexed by row along dimension 0.
        y: Label tensor with the same number of rows as ``X``.

    Returns:
        ``(X_train, y_train, X_valid, y_valid)`` where the training tensors
        are the concatenation of all folds except fold ``i``.

    Raises:
        AssertionError: If ``k <= 1`` or ``i`` is not a valid fold index.
    """
    assert k > 1
    # Without this check an out-of-range i would leave the validation fold
    # unassigned and fail later with an unrelated UnboundLocalError.
    assert 0 <= i < k, f"fold index i={i} must satisfy 0 <= i < k={k}"

    n_samples = X.shape[0]
    fold_size = n_samples // k
    X_valid, y_valid = None, None
    X_parts, y_parts = [], []
    for j in range(k):
        start = j * fold_size
        # The final fold runs to the end so remainder rows are not discarded.
        stop = n_samples if j == k - 1 else start + fold_size
        idx = slice(start, stop)
        X_part, y_part = X[idx, :], y[idx]
        if j == i:
            X_valid, y_valid = X_part, y_part
        else:
            X_parts.append(X_part)
            y_parts.append(y_part)
    # Concatenate once instead of re-copying the accumulated data per fold.
    X_train = torch.cat(X_parts, 0)
    y_train = torch.cat(y_parts, 0)
    return X_train, y_train, X_valid, y_valid

In [148]:
# Training hyperparameters for the final fit.
num_epochs = 1_000   # passes over the training data
batch_size = 64      # mini-batch size
lr = 0.5             # Adam learning rate
weight_decay = 0     # weight-decay coefficient (0 disables it)
# Presumably the fold count for get_k_fold_data; not used in the cells shown here.
k = 5

def train_and_pred(train_features, test_features, train_labels, test_data,
                   num_epochs, lr, weight_decay, batch_size):
    """Train a fresh network on all training data and write a submission CSV.

    Args:
        train_features, train_labels: Tensors used to fit the network.
        test_features: Tensor of inputs to predict on; assumed to be in the
            same row order as ``test_data``.
        test_data: DataFrame providing the ``Id`` column of the submission.
            It is only read, never modified.
        num_epochs, lr, weight_decay, batch_size: Forwarded to ``train``.

    Side effects:
        Prints the final training log-RMSE and writes ``submission.csv``
        (columns ``Id`` and ``SalePrice``) into ``data_root``.
    """
    net = get_net()
    train_ls, _ = train(net, train_features, train_labels, None, None,
                        num_epochs, lr, weight_decay, batch_size)
    print(f'final log rmse：{float(train_ls[-1]):f}')

    # Inference only: no autograd graph is needed for the predictions.
    with torch.no_grad():
        preds = net(test_features).numpy().reshape(-1)

    # Build the submission as a new frame instead of adding a 'SalePrice'
    # column to the caller's test_data: mutating it would leak predictions
    # into the features if the preprocessing cell is re-run. Pairing the
    # values positionally also avoids index-alignment surprises.
    submission = pd.DataFrame({'Id': test_data['Id'].to_numpy(),
                               'SalePrice': preds})
    submission.to_csv(os.path.join(data_root, 'submission.csv'), index=False)

# Fit on the full training set and write the submission file.
train_and_pred(
    train_features,
    test_features,
    train_labels,
    test_data,
    num_epochs=num_epochs,
    lr=lr,
    weight_decay=weight_decay,
    batch_size=batch_size,
)

[30] log rmse:0.11434
[61] log rmse:0.10649
[92] log rmse:0.10092
[123] log rmse:0.09818
[154] log rmse:0.09607
[185] log rmse:0.09431
[216] log rmse:0.09421
[247] log rmse:0.09446
[278] log rmse:0.09242
[309] log rmse:0.09149
[340] log rmse:0.09126
[371] log rmse:0.09217
[402] log rmse:0.09192
[433] log rmse:0.08874
[464] log rmse:0.08863
[495] log rmse:0.09144
[526] log rmse:0.08963
[557] log rmse:0.09049
[588] log rmse:0.09354
[619] log rmse:0.08856
[650] log rmse:0.08804
[681] log rmse:0.08706
[712] log rmse:0.08973
[743] log rmse:0.08667
[774] log rmse:0.08655
[805] log rmse:0.08638
[836] log rmse:0.08633
[867] log rmse:0.08692
[898] log rmse:0.09281
[929] log rmse:0.08573
[960] log rmse:0.08720
[991] log rmse:0.08573
final log rmse：0.089369
