In [1]:
import hashlib
import os
import tarfile
import zipfile
import requests

#@save
DATA_HUB = dict()
DATA_URL = 'http://d2l-data.s3-accelerate.amazonaws.com/'
def download(name, cache_dir=os.path.join('..', 'data')):  #@save
    """下载一个DATA_HUB中的文件，返回本地文件名"""
    assert name in DATA_HUB, f"{name} 不存在于 {DATA_HUB}"
    url, sha1_hash = DATA_HUB[name]
    os.makedirs(cache_dir, exist_ok=True)
    fname = os.path.join(cache_dir, url.split('/')[-1])
    if os.path.exists(fname):
        sha1 = hashlib.sha1()
        with open(fname, 'rb') as f:
            while True:
                data = f.read(1048576)
                if not data:
                    break
                sha1.update(data)
        if sha1.hexdigest() == sha1_hash:
            return fname  # 命中缓存
    print(f'正在从{url}下载{fname}...')
    r = requests.get(url, stream=True, verify=True)
    with open(fname, 'wb') as f:
        f.write(r.content)
    return fname
def download_extract(name, folder=None):  #@save
    """下载并解压zip/tar文件"""
    fname = download(name)
    base_dir = os.path.dirname(fname)
    data_dir, ext = os.path.splitext(fname)
    if ext == '.zip':
        fp = zipfile.ZipFile(fname, 'r')
    elif ext in ('.tar', '.gz'):
        fp = tarfile.open(fname, 'r')
    else:
        assert False, '只有zip/tar文件可以被解压缩'
    fp.extractall(base_dir)
    return os.path.join(base_dir, folder) if folder else data_dir

def download_all():  #@save
    """下载DATA_HUB中的所有文件"""
    for name in DATA_HUB:
        download(name)

In [2]:
import numpy as np
import pandas as pd
import torch
from torch import nn
from d2l import torch as d2l
DATA_HUB['kaggle_house_train'] = (  #@save
    DATA_URL + 'kaggle_house_pred_train.csv',
    '585e9cc93e70b39160e7921475f9bcd7d31219ce')

DATA_HUB['kaggle_house_test'] = (  #@save
    DATA_URL + 'kaggle_house_pred_test.csv',
    'fa19780a7b011d9b009e8bff8e99922a8ee2eb90')
train_data = pd.read_csv(download('kaggle_house_train'))
test_data = pd.read_csv(download('kaggle_house_test'))

正在从http://d2l-data.s3-accelerate.amazonaws.com/kaggle_house_pred_train.csv下载../data/kaggle_house_pred_train.csv...
正在从http://d2l-data.s3-accelerate.amazonaws.com/kaggle_house_pred_test.csv下载../data/kaggle_house_pred_test.csv...


In [6]:
print(train_data.shape, test_data.shape)
idx = slice(0, 5)
train_data.iloc[idx, [0,1,2,-3,-2,-1]]

(1460, 81) (1459, 80)


Unnamed: 0,Id,MSSubClass,MSZoning,SaleType,SaleCondition,SalePrice
0,1,60,RL,WD,Normal,208500
1,2,20,RL,WD,Normal,181500
2,3,60,RL,WD,Normal,223500
3,4,70,RL,WD,Abnorml,140000
4,5,60,RL,WD,Normal,250000


In [67]:
all_features = pd.concat((train_data.iloc[:, 1:-1], test_data.iloc[:, 1:]))

In [68]:
numeric_features = all_features.dtypes[all_features.dtypes != 'object'].index
all_features[numeric_features] = all_features[numeric_features].fillna(0)
all_features[numeric_features] = all_features[numeric_features].apply(lambda x : (x - x.mean()) / x.std())
all_features.iloc[0:4, [0,1,2,-2,-1]]

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,SaleType,SaleCondition
0,0.06732,RL,0.216038,WD,Normal
1,-0.873466,RL,0.664045,WD,Normal
2,0.06732,RL,0.305639,WD,Normal
3,0.302516,RL,0.066702,WD,Abnorml


In [70]:
all_features = pd.get_dummies(all_features, dummy_na=True)

In [90]:
all_features.iloc[1:3, [0,1,2,3,4,5,-4,-3,-2,-1]]
nan_index = (all_features.dtypes[all_features.dtypes.index.str.find('nan') != -1]).index
nan_ratio = all_features[nan_index].sum() / all_features[nan_index].count()

Alley_nan          0.932169
PoolQC_nan         0.996574
Fence_nan          0.804385
MiscFeature_nan    0.964029
dtype: float64

In [212]:
all_data = torch.tensor(all_features.values, dtype=torch.float32)
all_label = torch.tensor( train_data['SalePrice'].values.reshape(-1, 1), dtype=torch.float32 )

print(train_data.shape, all_data.shape, all_label.shape)
print(all_label)

(1460, 81) torch.Size([2919, 331]) torch.Size([1460, 1])
tensor([[208500.],
        [181500.],
        [223500.],
        ...,
        [266500.],
        [142125.],
        [147500.]])


In [208]:
import math
#net = nn.Sequential(nn.Linear(all_data.shape[-1], 10), nn.ReLU(), nn.Dropout(0.5),  nn.Linear(10, 1))
net = nn.Sequential(nn.Linear(all_data.shape[-1], 1))

# 参数初始化
def init_xavier(m):
    if type(m) == nn.Linear:
        nn.init.normal_(m.weight, 0, math.sqrt(2.0/(m.in_features + m.out_features)) )
        #nn.init.xavier_normal_(m.weight)
        nn.init.zeros_(m.bias)
net.apply(init_xavier)

print(net[0].weight.data)

tensor([[ 0.0868,  0.0697, -0.0990,  0.1098,  0.0456,  0.0592,  0.1206, -0.0163,
          0.1262,  0.0941, -0.0820,  0.0355,  0.0891, -0.0068, -0.1066, -0.0386,
          0.0166, -0.0087,  0.0041, -0.0511,  0.0326,  0.0976,  0.0671, -0.0915,
         -0.0118, -0.0797,  0.0873, -0.0331, -0.0725, -0.1953,  0.0236,  0.0285,
         -0.0328,  0.0433,  0.1137,  0.0135, -0.0781,  0.0733,  0.0229,  0.2119,
          0.0268,  0.0339, -0.0320,  0.0886, -0.0826,  0.0053, -0.0453, -0.1077,
         -0.0933,  0.0160, -0.0286,  0.0731,  0.0294, -0.0008, -0.0010, -0.0331,
         -0.0376, -0.0192,  0.0597, -0.0733,  0.1456,  0.1272,  0.0619,  0.1579,
          0.1299, -0.0601,  0.0074,  0.0599,  0.0023, -0.1633, -0.1630,  0.0052,
         -0.0652,  0.1650,  0.0847,  0.0415,  0.0471,  0.0506, -0.0056, -0.0507,
          0.0125, -0.0814,  0.0328, -0.0171, -0.0063, -0.0565, -0.1157, -0.0393,
          0.0153,  0.1577,  0.1045,  0.0832, -0.2002, -0.0207, -0.0070, -0.0138,
          0.0395,  0.0891, -

In [213]:
batch_size = 64
t_train = train_data.shape[0]
n_train = int(t_train * 0.8)
n_epoch = 1000

train_iter = d2l.load_array((all_data[:n_train], all_label[:n_train]), batch_size)
test_iter = d2l.load_array((all_data[n_train:t_train], all_label[n_train:t_train]), batch_size, is_train=False)

updater = torch.optim.Adam(net.parameters(), lr = 5, weight_decay=0)
criteria = nn.MSELoss()
criteria_l1 = nn.L1Loss()
def log_rmse(pre, label):
    return (torch.log(pre) - torch.log(label)).pow(2).mean().sqrt()

for epoch in range(n_epoch):
    for feature, label in train_iter:
        y_pre = net(feature)
        loss = criteria(y_pre, label.reshape(y_pre.shape))
        loss.backward()
        updater.step()
        updater.zero_grad()
    if epoch % 100 == 0:
        with torch.no_grad():
            y_pre = net(all_data[n_train:t_train])
            loss1 = log_rmse(y_pre, all_label[n_train:t_train].reshape(y_pre.shape))
            loss_l1 = criteria_l1(y_pre, all_label[n_train:t_train].reshape(y_pre.shape))
            loss_l2 = criteria(y_pre, all_label[n_train:t_train].reshape(y_pre.shape))
            print(f'{float(loss1.item()):f}', f'{float(loss_l1.item()):f}', f'{float(loss_l2.item()):f}')


0.167382 19987.271484 2018316032.000000
0.166714 19963.232422 2015700864.000000
0.167245 19997.984375 2025715456.000000
0.167187 20016.787109 2029350016.000000
0.167500 20031.115234 2032445568.000000
0.167594 20062.832031 2036752896.000000
0.167670 20054.486328 2034133120.000000
0.167517 20085.353516 2038554496.000000
0.167922 20104.513672 2046333952.000000
0.167917 20123.525391 2051719808.000000


In [158]:
def log_rmse(pre, label):
    return (torch.log(pre) - torch.log(label)).pow(2).mean().sqrt()

pre = torch.tensor([10, 500, 3], dtype=torch.float32)
label = torch.tensor([1000, 50, 3], dtype=torch.float32)

print( log_rmse(pre, label) )

print( pre.log(), label.log(), pre.log() - label.log(), (pre.log() - label.log()).pow(2) , sep="\n")

tensor(2.9726)
tensor([2.3026, 6.2146, 1.0986])
tensor([6.9078, 3.9120, 1.0986])
tensor([-4.6052,  2.3026,  0.0000])
tensor([21.2076,  5.3019,  0.0000])


In [198]:
all_label[n_train:t_train].shape

torch.Size([292])