In [1]:
# https://www.kaggle.com/c/house-prices-advanced-regression-techniques/

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from torch.utils.tensorboard import SummaryWriter

In [3]:
# Load the data: read the training and test tables from the local data/ directory
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

In [4]:
# Stack the feature columns of train and test into one frame so that the
# preprocessing below is applied to both at once.
# The first column of each frame is dropped, and for train the last column too
# (presumably the Id column and the SalePrice target — confirm against the CSV layout).
all_feature = pd.concat([train.iloc[:, 1:-1], test.iloc[:, 1:]])

In [5]:
# Clean the data
# NOTE: this cell overwrites `all_feature`; re-run the concat cell first if it
# has to be executed again, otherwise the dummy columns get standardised too.

# Standardise every numeric column with a z-score. Columns are picked by an
# explicit "is numeric" test rather than "is not object", so string/category
# dtypes are never fed into the arithmetic.
numeric_cols = all_feature.select_dtypes(include='number').columns
for col in numeric_cols:
    column = all_feature[col]
    # Missing values (and columns whose std is 0/NaN) become 0, i.e. the mean
    # of the standardised column.
    all_feature[col] = ((column - column.mean()) / column.std()).fillna(0)

# One-hot encode all remaining (non-numeric) columns; dummy_na=True adds an
# indicator column for missing values.
all_feature = pd.get_dummies(all_feature, dummy_na=True, dtype=float)
print(all_feature.shape)

(2919, 330)


In [None]:
# Split the data into training, validation and test parts. The test part is
# only used to produce the Kaggle submission.
n_labelled = train.shape[0]
n_valid = n_labelled // 5          # the last fifth of the labelled rows is held out
n_train = n_labelled - n_valid

# Labelled rows, split positionally
train_data = train.iloc[:n_train]
valid_data = train.iloc[n_train:n_labelled]

# Feature tensors: all_feature holds the train rows first, then the test rows
train_features = torch.tensor(all_feature.iloc[:n_train].values)
valid_features = torch.tensor(all_feature.iloc[n_train:n_labelled].values)
test_features = torch.tensor(all_feature.iloc[n_labelled:].values)

# Targets as column vectors of shape (n, 1)
train_labels = torch.tensor(train_data[['SalePrice']].values)
valid_labels = torch.tensor(valid_data[['SalePrice']].values)

In [7]:
class HousePricesDataset(Dataset):
    """Map-style dataset over a feature container and an optional label container.

    Args:
        features: Indexable container of samples (supports ``len`` and ``[]``).
        label: Optional indexable container of targets, indexed like
            ``features``. When omitted, items are features only.
    """

    def __init__(self, features, label=None):
        super().__init__()
        self.features = features
        self.label = label

    def __getitem__(self, index):
        """Return ``(features[index], label[index])``, or only the features if no label was given."""
        # Identity test on purpose: `label != None` is an element-wise
        # comparison for array-like labels and has no single truth value.
        if self.label is not None:
            return self.features[index], self.label[index]
        return self.features[index]

    def __len__(self):
        """Number of samples, taken from the feature container."""
        return len(self.features)


In [None]:
# Define a fully connected network with three hidden layers (four linear layers in total)
class MLP(nn.Module):
    """Fully connected network: input -> 256 -> 128 -> 64 -> output, with ReLU after each hidden layer."""

    def __init__(self, input_size, output_size):
        super().__init__()
        widths = (input_size, 256, 128, 64)
        layers = []
        # Hidden stack: a Linear followed by a ReLU for each consecutive width pair
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            layers.append(nn.Linear(fan_in, fan_out))
            layers.append(nn.ReLU())
        # Output projection, no activation
        layers.append(nn.Linear(widths[-1], output_size))
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` through the sequential stack and return its output."""
        return self.model(x)

In [9]:
# Hyperparameters
input_size = train_features.shape[1]
output_size = 1
bs = 16
learning_rate = 0.001
epochs = 500

# Wrap the tensors in datasets / loaders.
# Only the training loader shuffles; validation data is served in a fixed
# order because shuffling brings nothing to evaluation.
train_dataset = HousePricesDataset(train_features, train_labels)
train_loader = DataLoader(train_dataset, batch_size=bs, shuffle=True)
valid_dataset = HousePricesDataset(valid_features, valid_labels)
valid_loader = DataLoader(valid_dataset, batch_size=bs, shuffle=False)

In [10]:
# Loss function: plain MSE, used for training and inside the log-RMSE metric
loss_fn = nn.MSELoss()

def log_rmse(net, features, labels):
    """Return the root-mean-square error between log-predictions and log-labels.

    Args:
        net: Callable mapping a float feature tensor to predictions.
        features: Feature tensor; cast to float32 before being passed to ``net``.
        labels: Target tensor with the same shape as the predictions; cast to
            float32 before the logarithm is taken.

    Returns:
        The metric as a Python float.
    """
    # Pure evaluation: disable autograd so no graph is built for the whole dataset.
    with torch.no_grad():
        # Predictions below 1 are raised to 1 to keep the logarithm stable.
        clipped_preds = torch.clamp(net(features.float()), 1, float('inf'))
        rmse = torch.sqrt(loss_fn(torch.log(clipped_preds), torch.log(labels.float())))
    return rmse.item()

In [None]:
# Initialise the model
model = MLP(input_size, output_size)

total_train_step = 0
writer = SummaryWriter('logs_train')

# Define the optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=1e-3)

# Train the model. The writer is closed in `finally` so the event file is
# flushed even if training is interrupted or raises.
try:
    for epoch in tqdm(range(epochs)):

        if epoch % 50 == 0:
            print(f'---{epoch+1}th round start---')

        for batch_x, batch_y in train_loader:
            batch_x = batch_x.float()
            batch_y = batch_y.float()

            # Forward pass and loss
            optimizer.zero_grad()
            output = model(batch_x)
            loss = loss_fn(output, batch_y)

            # Backward pass and parameter update
            loss.backward()
            optimizer.step()

            total_train_step += 1
            # Evaluate once every 500 steps; every 5000th step (a multiple of
            # 500) additionally prints the same numbers instead of
            # recomputing them.
            if total_train_step % 500 == 0:
                train_logrmse = log_rmse(model, train_features, train_labels)
                valid_logrmse = log_rmse(model, valid_features, valid_labels)
                if total_train_step % 5000 == 0:
                    print(f'Step:{total_train_step}, train logrmse:{train_logrmse}, valid logrmse:{valid_logrmse}')
                writer.add_scalar('train_loss', train_logrmse, total_train_step)
                writer.add_scalar('valid_loss', valid_logrmse, total_train_step)
finally:
    writer.close()

  0%|          | 0/500 [00:00<?, ?it/s]

---1th round start---


 10%|█         | 52/500 [00:03<00:34, 12.93it/s]

---51th round start---


 14%|█▍        | 70/500 [00:05<00:32, 13.40it/s]

Step:5000, train logrmse:0.11090034246444702, valid logrmse:0.16012156009674072


 20%|██        | 102/500 [00:07<00:32, 12.07it/s]

---101th round start---


 28%|██▊       | 138/500 [00:10<00:36, 10.00it/s]

Step:10000, train logrmse:0.09872469305992126, valid logrmse:0.16051173210144043


 30%|███       | 151/500 [00:12<00:41,  8.50it/s]

---151th round start---


 40%|████      | 202/500 [00:16<00:23, 12.76it/s]

---201th round start---


 42%|████▏     | 208/500 [00:17<00:22, 13.00it/s]

Step:15000, train logrmse:0.09050535410642624, valid logrmse:0.1624620258808136


 50%|█████     | 252/500 [00:20<00:18, 13.34it/s]

---251th round start---


 55%|█████▌    | 276/500 [00:22<00:16, 13.18it/s]

Step:20000, train logrmse:0.0854841023683548, valid logrmse:0.1670057326555252


 60%|██████    | 302/500 [00:24<00:15, 12.86it/s]

---301th round start---


 69%|██████▉   | 344/500 [00:27<00:12, 12.86it/s]

Step:25000, train logrmse:0.07761896401643753, valid logrmse:0.1698838174343109


 70%|███████   | 352/500 [00:28<00:11, 13.15it/s]

---351th round start---


 80%|████████  | 402/500 [00:32<00:07, 13.28it/s]

---401th round start---


 82%|████████▏ | 412/500 [00:32<00:06, 13.23it/s]

Step:30000, train logrmse:0.07220405340194702, valid logrmse:0.17689871788024902


 90%|█████████ | 452/500 [00:35<00:03, 13.27it/s]

---451th round start---


 96%|█████████▋| 482/500 [00:38<00:01, 13.41it/s]

Step:35000, train logrmse:0.05962911620736122, valid logrmse:0.18072952330112457


100%|██████████| 500/500 [00:39<00:00, 12.68it/s]


In [12]:
# Predict on the test features; no gradients are needed for inference
with torch.no_grad():
    preds = model(test_features.float()).numpy()

# Flatten the (n, 1) prediction column and attach it to the test table
test['SalePrice'] = pd.Series(preds.reshape(-1))

# Submission table: Id and predicted SalePrice only
submission = test[['Id', 'SalePrice']].copy()

In [13]:
# Write the submission table to a CSV file, leaving out the DataFrame index
submission.to_csv('submission3.csv', index=False)

In [None]:
# 最后在kaggle上的得分大概是0.19，这里尝试了多种不同的网络与参数，结果都在0.19附近，模型收敛较快，增加训练次数只会导致过拟合，但实际效果一般。