In [4]:
%matplotlib inline
import d2lzh as d2l
from mxnet import autograd, gluon, init, nd
from mxnet.gluon import data as gdata, loss as gloss, nn
import numpy as np
import pandas as pd

In [8]:
# Read the training and test splits from the local house-prices directory.
train_data, test_data = (
    pd.read_csv('house-prices/train.csv'),
    pd.read_csv('house-prices/test.csv'),
)

In [10]:
# Dimensions of the training frame as (rows, columns).
train_data.shape

(1460, 81)

In [11]:
# Dimensions of the test frame as (rows, columns).
test_data.shape

(1459, 80)

In [12]:
# Preview the first five rows of the training data.
train_data.head(5)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [13]:
# Preview the first five rows of the test data.
test_data.head(5)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal


In [15]:
# Preview the first five training rows, restricted to the first four and
# the last three columns (selected by position).
train_data.iloc[:5, list(range(4)) + list(range(-3, 0))]

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,WD,Normal,208500
1,2,20,RL,80.0,WD,Normal,181500
2,3,60,RL,68.0,WD,Normal,223500
3,4,70,RL,60.0,WD,Abnorml,140000
4,5,60,RL,84.0,WD,Normal,250000


In [16]:
# Preview the first five test rows, restricted to the first four and
# the last three columns (selected by position).
test_data.iloc[:5, list(range(4)) + list(range(-3, 0))]

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,2010,WD,Normal
1,1462,20,RL,81.0,2010,WD,Normal
2,1463,60,RL,74.0,2010,WD,Normal
3,1464,60,RL,78.0,2010,WD,Normal
4,1465,120,RL,43.0,2010,WD,Normal


In [17]:
# Stack the features of both splits into a single frame.
# The first column (Id) is dropped from each split; the last column of the
# training data (SalePrice, absent from the test data) is dropped as well.
all_features = pd.concat(
    [train_data.iloc[:, 1:-1], test_data.iloc[:, 1:]],
    axis=0,
)

In [19]:
# Column labels of the numeric features, which are the ones to standardize.
# select_dtypes(include='number') picks columns by numeric kind, so columns
# with any other non-numeric dtype (not only 'object') are left out and never
# reach the mean/std arithmetic applied to these columns.
numeric_features = all_features.select_dtypes(include='number').columns
print(numeric_features)

Index(['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond',
       'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2',
       'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
       'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces',
       'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF',
       'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal',
       'MoSold', 'YrSold'],
      dtype='object')


In [21]:
def standardize_column(column):
    """Center a column on its mean and divide it by its standard deviation."""
    return (column - column.mean()) / column.std()


# Standardize every numeric column, then replace missing values with 0.
# A standardized column has mean 0, so filling with 0 imputes the column mean.
all_features[numeric_features] = (
    all_features[numeric_features].apply(standardize_column).fillna(0)
)
print(all_features.iloc[0:5])

   MSSubClass MSZoning  LotFrontage   LotArea Street Alley LotShape  \
0    0.067320       RL    -0.202033 -0.217841   Pave   NaN      Reg   
1   -0.873466       RL     0.501785 -0.072032   Pave   NaN      Reg   
2    0.067320       RL    -0.061269  0.137173   Pave   NaN      IR1   
3    0.302516       RL    -0.436639 -0.078371   Pave   NaN      IR1   
4    0.067320       RL     0.689469  0.518814   Pave   NaN      IR1   

  LandContour Utilities LotConfig  ... ScreenPorch  PoolArea PoolQC Fence  \
0         Lvl    AllPub    Inside  ...   -0.285886 -0.063139    NaN   NaN   
1         Lvl    AllPub       FR2  ...   -0.285886 -0.063139    NaN   NaN   
2         Lvl    AllPub    Inside  ...   -0.285886 -0.063139    NaN   NaN   
3         Lvl    AllPub    Corner  ...   -0.285886 -0.063139    NaN   NaN   
4         Lvl    AllPub       FR2  ...   -0.285886 -0.063139    NaN   NaN   

  MiscFeature   MiscVal    MoSold    YrSold  SaleType  SaleCondition  
0         NaN -0.089577 -1.551918  0.15

In [25]:
# Discrete features: one-hot encode the non-numeric columns.
# dummy_na=True also adds an indicator column for missing values, so NaN is
# treated as a category of its own instead of being ignored.
all_features = pd.get_dummies(all_features, dummy_na=True)
# Dimensions after encoding, as (rows, columns).
all_features.shape

(2919, 331)

In [None]:
# One-hot encoding increases the number of feature columns from 79 to 331.