In [12]:
import pandas as pd
import torch
from torch import nn
from d2l import torch as d2l

In [13]:
class KaggleHouse(d2l.DataModule):
    """Data module for the Kaggle house-price prediction CSV files.

    Args:
        batch_size: Batch size kept on the instance for later use.
        train: Optional ready-made training frame. When it is ``None`` the
            raw CSV files are downloaded and loaded instead.
        val: Optional ready-made validation frame.

    When ``train`` is ``None`` the instance gets two pandas DataFrames:
    ``raw_train`` (from ``kaggle_house_pred_train.csv``) and ``raw_val``
    (from ``kaggle_house_pred_test.csv``).
    """

    def __init__(self, batch_size, train=None, val=None):
        super().__init__()
        # Makes the constructor arguments available as attributes;
        # ``self.train`` is read right below.
        self.save_hyperparameters()

        # Frames were supplied by the caller: nothing to download.
        if self.train is not None:
            return

        train_csv = d2l.download(
            d2l.DATA_URL + 'kaggle_house_pred_train.csv', self.root,
            sha1_hash='585e9cc93e70b39160e7921475f9bcd7d31219ce')
        self.raw_train = pd.read_csv(train_csv)

        test_csv = d2l.download(
            d2l.DATA_URL + 'kaggle_house_pred_test.csv', self.root,
            sha1_hash='fa19780a7b011d9b009e8bff8e99922a8ee2eb90')
        self.raw_val = pd.read_csv(test_csv)

In [14]:
# Build the data module without ready-made frames, then report the shape and
# type of each raw frame, one item per output line.
data = KaggleHouse(batch_size=64)
print(
    data.raw_train.shape,
    type(data.raw_train),
    data.raw_val.shape,
    type(data.raw_val),
    sep="\n",
)

(1460, 81)
<class 'pandas.core.frame.DataFrame'>
(1459, 80)
<class 'pandas.core.frame.DataFrame'>


In [15]:
# Preview the first five rows of the raw training frame.
print(data.raw_train.head(5))

   Id  MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape  \
0   1          60       RL         65.0     8450   Pave   NaN      Reg   
1   2          20       RL         80.0     9600   Pave   NaN      Reg   
2   3          60       RL         68.0    11250   Pave   NaN      IR1   
3   4          70       RL         60.0     9550   Pave   NaN      IR1   
4   5          60       RL         84.0    14260   Pave   NaN      IR1   

  LandContour Utilities  ... PoolArea PoolQC Fence MiscFeature MiscVal MoSold  \
0         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      2   
1         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      5   
2         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      9   
3         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      2   
4         Lvl    AllPub  ...        0    NaN   NaN         NaN       0     12   

  YrSold  SaleType  SaleCondition  SalePrice  
0   2008        WD   

In [16]:
# Add a preprocessing method to the KaggleHouse class

@d2l.add_to_class(KaggleHouse)
def preprocess(self):
    """Turn ``raw_train`` / ``raw_val`` into model-ready feature frames.

    Steps, applied to the training and validation rows stacked together:

    1. Drop the ``Id`` column (and the ``SalePrice`` label from the
       training rows).
    2. Standardize every numeric column to zero mean and unit standard
       deviation, using statistics of the stacked frame.
    3. Replace missing numeric values by 0 (the mean after standardizing).
    4. One-hot encode the remaining columns, with an extra indicator
       column for missing values (``dummy_na=True``).

    Sets:
        self.train: Encoded features of the training rows plus the
            ``SalePrice`` column copied from ``raw_train``.
        self.val: Encoded features of the validation rows (no label).
    """
    label = "SalePrice"
    num_train = self.raw_train.shape[0]

    # Remove the ID and label columns and stack both splits so that scaling
    # and one-hot encoding yield identical columns for train and validation.
    features = pd.concat(
        [self.raw_train.drop(columns=["Id", label]), self.raw_val.drop(columns=["Id"])])

    # Pick numeric columns by dtype family instead of `dtype != "object"`:
    # text or categorical columns that are not stored as `object` (pandas
    # string dtype, `category`) must go to the one-hot step below rather than
    # into mean()/std(), which would fail on them.
    numeric_features = features.select_dtypes(include="number").columns

    # Standardize the numerical features
    features[numeric_features] = features[numeric_features].apply(
        lambda x: (x - x.mean()) / x.std(), axis=0)

    # Replace NaN numerical features by 0
    features[numeric_features] = features[numeric_features].fillna(0)

    # Replace discrete features by one-hot encoding
    features = pd.get_dummies(features, dummy_na=True)

    # Save preprocessed features; the first `num_train` rows are the training
    # rows because they were stacked first.
    self.train = features.iloc[:num_train, :].copy()
    self.train[label] = self.raw_train[label]
    self.val = features.iloc[num_train:, :].copy()

In [18]:
# Run the preprocessing step, then show the resulting training frame's shape
# followed by its first rows.
data.preprocess()

print(data.train.shape, data.train.head(), sep="\n")

(1460, 331)
   MSSubClass  LotFrontage   LotArea  OverallQual  OverallCond  YearBuilt  \
0    0.067320    -0.184443 -0.217841     0.646073    -0.507197   1.046078   
1   -0.873466     0.458096 -0.072032    -0.063174     2.187904   0.154737   
2    0.067320    -0.055935  0.137173     0.646073    -0.507197   0.980053   
3    0.302516    -0.398622 -0.078371     0.646073    -0.507197  -1.859033   
4    0.067320     0.629439  0.518814     1.355319    -0.507197   0.947040   

   YearRemodAdd  MasVnrArea  BsmtFinSF1  BsmtFinSF2  ...  SaleType_WD  \
0      0.896679    0.523038    0.580708    -0.29303  ...         True   
1     -0.395536   -0.569893    1.177709    -0.29303  ...         True   
2      0.848819    0.333448    0.097840    -0.29303  ...         True   
3     -0.682695   -0.569893   -0.494771    -0.29303  ...         True   
4      0.753100    1.381770    0.468770    -0.29303  ...         True   

   SaleType_nan  SaleCondition_Abnorml  SaleCondition_AdjLand  \
0         False      

In [None]:
@d2l.add_to_class(KaggleHouse)
def get_dataloader(self, train):
    """Build a tensor loader for the training or validation frame.

    Args:
        train: Truthy selects ``self.train``, falsy selects ``self.val``.

    Returns:
        The result of ``self.get_tensorloader`` on an ``(X, Y)`` pair, where
        ``X`` holds every column except ``SalePrice`` as float32 and ``Y`` is
        the natural log of ``SalePrice`` reshaped to a single column.
        ``None`` when the selected frame has no ``SalePrice`` column.
    """
    target = 'SalePrice'
    frame = self.train if train else self.val
    if target not in frame:
        return None

    def to_float_tensor(obj):
        # Cast through float first so non-float columns convert cleanly.
        return torch.tensor(obj.values.astype(float), dtype=torch.float32)

    inputs = to_float_tensor(frame.drop(columns=[target]))  # X
    # Logarithm of prices
    log_prices = torch.log(to_float_tensor(frame[target])).reshape((-1, 1))  # Y
    return self.get_tensorloader((inputs, log_prices), train)

In [None]:
def k_fold_data(data, k):
    """Split ``data.train`` into ``k`` train/validation data modules.

    Fold ``j`` uses rows at positions ``[j * fold_size, (j + 1) * fold_size)``
    for validation, where ``fold_size = len(data.train) // k``, and all other
    rows for training. Rows left over when the length is not divisible by
    ``k`` are never used for validation; they stay in every training split.

    Rows are selected by position, so the result does not depend on the
    index labels of ``data.train``.

    Args:
        data: Object exposing ``batch_size`` and a ``train`` DataFrame.
        k: Number of folds; must satisfy ``1 <= k <= len(data.train)``.

    Returns:
        A list of ``k`` ``KaggleHouse`` instances, one per fold.

    Raises:
        ValueError: If ``k`` is outside ``1..len(data.train)``.
    """
    num_rows = data.train.shape[0]
    if not 1 <= k <= num_rows:
        raise ValueError(
            f"k must be between 1 and the number of training rows "
            f"({num_rows}), got {k}")

    fold_size = num_rows // k
    folds = []
    for j in range(k):
        start, stop = j * fold_size, (j + 1) * fold_size
        # Positional row numbers: everything outside [start, stop) trains,
        # the slice itself validates.
        train_rows = [*range(start), *range(stop, num_rows)]
        val_rows = list(range(start, stop))
        folds.append(KaggleHouse(data.batch_size,
                                 data.train.iloc[train_rows],
                                 data.train.iloc[val_rows]))
    return folds