In [1]:
import os
import requests
import zipfile
import tarfile
import hashlib

DATA_HUB = dict()  #@save
DATA_URL = 'http://d2l-data.s3-accelerate.amazonaws.com/'  #@save

In [2]:
def download(name, cache_dir=os.path.join('..', 'data')):  #@save
    """Download a file inserted into DATA_HUB, return the local filename."""
    assert name in DATA_HUB, f"{name} does not exist in {DATA_HUB}."
    url, sha1_hash = DATA_HUB[name]
    d2l.mkdir_if_not_exist(cache_dir)
    fname = os.path.join(cache_dir, url.split('/')[-1])
    if os.path.exists(fname):
        sha1 = hashlib.sha1()
        with open(fname, 'rb') as f:
            while True:
                data = f.read(1048576)
                if not data:
                    break
                sha1.update(data)
        if sha1.hexdigest() == sha1_hash:
            return fname  # Hit cache
    print(f'Downloading {fname} from {url}...')
    r = requests.get(url, stream=True, verify=True)
    with open(fname, 'wb') as f:
        f.write(r.content)
    return fname

In [3]:
def download_extract(name, folder=None):  #@save
    """Download and extract a zip/tar file."""
    fname = download(name)
    base_dir = os.path.dirname(fname)
    data_dir, ext = os.path.splitext(fname)
    if ext == '.zip':
        fp = zipfile.ZipFile(fname, 'r')
    elif ext in ('.tar', '.gz'):
        fp = tarfile.open(fname, 'r')
    else:
        assert False, 'Only zip/tar files can be extracted.'
    fp.extractall(base_dir)
    return os.path.join(base_dir, folder) if folder else data_dir

def download_all():  #@save
    """Download all files in the DATA_HUB."""
    for name in DATA_HUB:
        download(name)

In [4]:
# If pandas is not installed, please uncomment the following line:
# !pip install pandas

%matplotlib inline
from d2l import tensorflow as d2l
import tensorflow as tf
import pandas as pd
import numpy as np

In [5]:
DATA_HUB['kaggle_house_train'] = (  #@save
    DATA_URL + 'kaggle_house_pred_train.csv',
    '585e9cc93e70b39160e7921475f9bcd7d31219ce')

DATA_HUB['kaggle_house_test'] = (  #@save
    DATA_URL + 'kaggle_house_pred_test.csv',
    'fa19780a7b011d9b009e8bff8e99922a8ee2eb90')

In [6]:
train_data = pd.read_csv(download('kaggle_house_train'))
test_data = pd.read_csv(download('kaggle_house_test'))

Downloading ..\data\kaggle_house_pred_train.csv from http://d2l-data.s3-accelerate.amazonaws.com/kaggle_house_pred_train.csv...
Downloading ..\data\kaggle_house_pred_test.csv from http://d2l-data.s3-accelerate.amazonaws.com/kaggle_house_pred_test.csv...


In [7]:
print(train_data.shape)
print(test_data.shape)

(1460, 81)
(1459, 80)


In [8]:
print(train_data.iloc[0:4, [0, 1, 2, 3, -3, -2, -1]])

Id  MSSubClass MSZoning  LotFrontage SaleType SaleCondition  SalePrice
0   1          60       RL         65.0       WD        Normal     208500
1   2          20       RL         80.0       WD        Normal     181500
2   3          60       RL         68.0       WD        Normal     223500
3   4          70       RL         60.0       WD       Abnorml     140000


In [16]:
all_features = pd.concat((train_data.iloc[:, 1:-1], test_data.iloc[:, 1:]))

In [17]:
numeric_features = all_features.dtypes[all_features.dtypes != 'object'].index
all_features[numeric_features] = all_features[numeric_features].apply(lambda x: (x - x.mean()) / (x.std()))
all_features[numeric_features] = all_features[numeric_features].fillna(0)

In [18]:
all_features = pd.get_dummies(all_features, dummy_na=True)


In [19]:
n_train = train_data.shape[0]
train_features = tf.constant(all_features[:n_train].values, dtype=tf.float32)
test_features = tf.constant(all_features[n_train:].values, dtype=tf.float32)
train_labels = tf.constant(
    train_data.SalePrice.values.reshape(-1, 1), dtype=tf.float32)

In [27]:
loss = tf.keras.losses.MeanSquaredError()

def get_net():
    net = tf.keras.models.Sequential()
    net.add(tf.keras.layers.Dense(1, kernel_regularizer = tf.keras.regularizers.l2(0)))
    return net


In [28]:
def log_rmse(y_true, y_pred):
    # To further stabilize the value when the logarithm is taken, set the
    # value less than 1 as 1
    clipped_preds = tf.clip_by_value(y_pred, 1, float('inf'))
    return tf.sqrt(tf.reduce_mean(loss(
        tf.math.log(y_true), tf.math.log(clipped_preds))))

In [29]:
def load_data(data, batch_size, is_train = True):
    dataset = tf.data.Dataset.from_tensor_slices(data)
    dataset = dataset.shuffle(buffer_size = 1000)
    dataset = dataset.batch(batch_size)
    return dataset

In [32]:
num_epochs, lr, batch_size = 50, 0.05, 32
optimizer = tf.keras.optimizers.Adam(learning_rate=lr)
net = get_net()
data_iter = load_data((train_features, train_labels), batch_size)
net.compile(loss = loss, optimizer = optimizer)
train_loss = []
for epoch in range(num_epochs):
    for X, y in data_iter:
        with tf.GradientTape() as tape:
            yhat = net(X)
            l = loss(y, yhat)
        
        params = net.trainable_variables
        grads = tape.gradient(l, params)

        optimizer.apply_gradients(zip(grads, params))
    train_loss.append(log_rmse(train_labels, net(train_features)))
    print(train_loss[epoch].numpy())       

7.481093
6.783854
6.377819
6.089277
5.8658514
5.6828547
5.528625
5.3951416
5.277648
5.1722207
5.0769157
4.9905744
4.910351
4.8363504
4.7675905
4.703025
4.642667
4.5855556
4.531809
4.480516
4.432027
4.3855505
4.3412795
4.298783
4.258199
4.2192273
4.181665
4.1453395
4.1105256
4.0766225
4.043927
4.012403
3.9817514
3.952043
3.923318
3.8951156
3.8679621
3.841394
3.8154852
3.7903109
3.7657144
3.7417583
3.718305
3.6955266
3.6730855
3.6513274
3.6298718
3.6089153
3.5884154
3.568285


## K Fold

In [33]:
def get_k_fold_data(k, i, X, y):
    assert k > 1
    fold_size = X.shape[0] // k
    X_train, y_train = None, None
    for j in range(k):
        idx = slice(j * fold_size, (j + 1) * fold_size)
        X_part, y_part = X[idx, :], y[idx]
        if j == i:
            X_valid, y_valid = X_part, y_part
        elif X_train is None:
            X_train, y_train = X_part, y_part
        else:
            X_train = tf.concat([X_train, X_part], 0)
            y_train = tf.concat([y_train, y_part], 0)
    return X_train, y_train, X_valid, y_valid

In [37]:
k, num_epochs, lr, weight_decay, batch_size = 5, 10, 5, 0, 64
train_loss_sum, test_loss_sum = 0, 0
for i in range(k):
    print(i)
    train_features_kf, train_labels_kf, test_features_kf, test_labels_kf = get_k_fold_data(k, i, train_features, train_labels)
    net = get_net()

    data_iter = load_data((train_features_kf, train_labels_kf), batch_size)
    net.compile(loss = loss, optimizer = optimizer)
    train_loss = []; test_loss = []
    for epoch in range(num_epochs):
        for X, y in data_iter:
            with tf.GradientTape() as tape:
                yhat = net(X)
                l = loss(y, yhat)
            
            params = net.trainable_variables
            grads = tape.gradient(l, params)

            optimizer.apply_gradients(zip(grads, params))
        train_loss.append(log_rmse(train_labels_kf, net(train_features_kf)))
        test_loss.append(log_rmse(test_labels_kf, net(test_features_kf)))
        # print(train_loss[epoch].numpy())       
    
    train_loss_sum += train_loss[-1]
    test_loss_sum += test_loss[-1]
    print(f'fold {i + 1}, train log rmse {float(train_loss[-1]):f}, '
            f'valid log rmse {float(test_loss[-1]):f}')

avg_train_loss = train_loss_sum/k
avg_test_loss = test_loss_sum/k

0
fold 1, train log rmse 4.759342, valid log rmse 4.773881
1
fold 2, train log rmse 4.751072, valid log rmse 4.761989
2
fold 3, train log rmse 4.742665, valid log rmse 4.757630
3
fold 4, train log rmse 4.748435, valid log rmse 4.724545
4
fold 5, train log rmse 4.739539, valid log rmse 4.742434


In [40]:
print(avg_test_loss.numpy(), avg_train_loss.numpy())

4.752095 4.7482104
