In [1]:
import os
import tarfile
import zipfile

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import tensorflow as tf

In [3]:
def extract(fname, folder=None):
    """Extract a zip/tar archive into the directory that contains it.

    Args:
        fname: Path to the archive. The extension must be ``.zip``, ``.tar``
            or ``.gz`` (e.g. ``data.tar.gz``).
        folder: Optional directory name. When given, the returned path is
            ``<directory of fname>/<folder>``.

    Returns:
        ``<directory of fname>/<folder>`` if ``folder`` is truthy, otherwise
        the archive path with its extension removed (``.tar.gz`` is removed
        as a whole, so ``data.tar.gz`` yields ``data``).

    Raises:
        AssertionError: If the extension is not one of the supported ones.
    """
    base_dir = os.path.dirname(fname)
    data_dir, ext = os.path.splitext(fname)
    if ext == '.zip':
        open_archive = zipfile.ZipFile
    elif ext in ('.tar', '.gz'):
        open_archive = tarfile.open
    else:
        # Raised explicitly instead of `assert False` so the check is not
        # stripped under `python -O`; the exception type is unchanged.
        raise AssertionError('Only zip/tar files can be extracted.')
    # splitext only removes the last suffix: "x.tar.gz" -> "x.tar".
    if ext == '.gz' and data_dir.endswith('.tar'):
        data_dir = data_dir[:-len('.tar')]
    # Context manager guarantees the archive handle is closed, even when
    # extraction fails part-way.
    with open_archive(fname, 'r') as archive:
        archive.extractall(base_dir)
    return os.path.join(base_dir, folder) if folder else data_dir

In [4]:
# !kaggle competitions download -c house-prices-advanced-regression-techniques

In [5]:
# extract('house-prices-advanced-regression-techniques.zip')

### Data

In [6]:
# Load the train and test CSV files from the working directory.
train_data, test_data = (
    pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv')
)

In [7]:
# Preview the raw training frame as loaded from train.csv.
train_data

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,RL,62.0,7917,Pave,,Reg,Lvl,AllPub,...,0,,,,0,8,2007,WD,Normal,175000
1456,1457,20,RL,85.0,13175,Pave,,Reg,Lvl,AllPub,...,0,,MnPrv,,0,2,2010,WD,Normal,210000
1457,1458,70,RL,66.0,9042,Pave,,Reg,Lvl,AllPub,...,0,,GdPrv,Shed,2500,5,2010,WD,Normal,266500
1458,1459,20,RL,68.0,9717,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2010,WD,Normal,142125


In [8]:
# Features are every column except the leading `Id` and the trailing
# `SalePrice` target. `.copy()` gives X_train its own data, so the column
# assignments in later cells do not operate on a slice of `train_data`
# (which makes pandas emit SettingWithCopyWarning).
X_train = train_data.iloc[:, 1:-1].copy()
X_train

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,2,2008,WD,Normal
1,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,0,,,,0,5,2007,WD,Normal
2,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,,,0,9,2008,WD,Normal
3,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,0,,,,0,2,2006,WD,Abnorml
4,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,0,,,,0,12,2008,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,60,RL,62.0,7917,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,8,2007,WD,Normal
1456,20,RL,85.0,13175,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,MnPrv,,0,2,2010,WD,Normal
1457,70,RL,66.0,9042,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,GdPrv,Shed,2500,5,2010,WD,Normal
1458,20,RL,68.0,9717,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,4,2010,WD,Normal


In [9]:
# Target: the last column (`SalePrice`), kept as a one-column DataFrame so
# that `.values` has shape (n_samples, 1).
y_train = train_data.iloc[:, [-1]]
y_train

Unnamed: 0,SalePrice
0,208500
1,181500
2,223500
3,140000
4,250000
...,...
1455,175000
1456,210000
1457,266500
1458,142125


### Data Preprocessing

In [10]:
# Summarize column dtypes and non-null counts to spot columns with missing values.
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 79 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   MSSubClass     1460 non-null   int64  
 1   MSZoning       1460 non-null   object 
 2   LotFrontage    1201 non-null   float64
 3   LotArea        1460 non-null   int64  
 4   Street         1460 non-null   object 
 5   Alley          91 non-null     object 
 6   LotShape       1460 non-null   object 
 7   LandContour    1460 non-null   object 
 8   Utilities      1460 non-null   object 
 9   LotConfig      1460 non-null   object 
 10  LandSlope      1460 non-null   object 
 11  Neighborhood   1460 non-null   object 
 12  Condition1     1460 non-null   object 
 13  Condition2     1460 non-null   object 
 14  BldgType       1460 non-null   object 
 15  HouseStyle     1460 non-null   object 
 16  OverallQual    1460 non-null   int64  
 17  OverallCond    1460 non-null   int64  
 18  YearBuil

In [11]:
# Pick numeric columns by a positive dtype match. The previous test
# ("dtype is not object") would also admit bool, category, datetime or
# dedicated string-dtype columns, which the scaler cannot standardize.
numeric_features = X_train.select_dtypes(include='number').columns
numeric_features

Index(['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond',
       'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2',
       'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
       'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces',
       'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF',
       'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal',
       'MoSold', 'YrSold'],
      dtype='object')

In [12]:
# Show the feature frame before standardization, for comparison with the scaled frame displayed further down.
X_train

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,2,2008,WD,Normal
1,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,0,,,,0,5,2007,WD,Normal
2,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,,,0,9,2008,WD,Normal
3,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,0,,,,0,2,2006,WD,Abnorml
4,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,0,,,,0,12,2008,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,60,RL,62.0,7917,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,8,2007,WD,Normal
1456,20,RL,85.0,13175,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,MnPrv,,0,2,2010,WD,Normal
1457,70,RL,66.0,9042,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,GdPrv,Shed,2500,5,2010,WD,Normal
1458,20,RL,68.0,9717,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,4,2010,WD,Normal


In [13]:
# Inspect the numeric columns as a raw NumPy array -- the same expression that is passed to the scaler.
X_train[numeric_features].values

array([[6.000e+01, 6.500e+01, 8.450e+03, ..., 0.000e+00, 2.000e+00,
        2.008e+03],
       [2.000e+01, 8.000e+01, 9.600e+03, ..., 0.000e+00, 5.000e+00,
        2.007e+03],
       [6.000e+01, 6.800e+01, 1.125e+04, ..., 0.000e+00, 9.000e+00,
        2.008e+03],
       ...,
       [7.000e+01, 6.600e+01, 9.042e+03, ..., 2.500e+03, 5.000e+00,
        2.010e+03],
       [2.000e+01, 6.800e+01, 9.717e+03, ..., 0.000e+00, 4.000e+00,
        2.010e+03],
       [2.000e+01, 7.500e+01, 9.937e+03, ..., 0.000e+00, 6.000e+00,
        2.008e+03]])

In [14]:
# Fit the standardizer on the training numeric columns and write the
# standardized values back into the same columns. The fitted `scaler` is
# kept so the identical transform can be applied to the test features.
train_numeric_values = X_train[numeric_features].values
scaler = StandardScaler().fit(train_numeric_values)
X_train[numeric_features] = scaler.transform(train_numeric_values)

In [15]:
# Standardization centers every numeric feature at zero, so replacing a
# missing value with 0 is equivalent to imputing that feature's mean.
X_train[numeric_features] = X_train[numeric_features].fillna(0)

In [16]:
# Numeric columns are now standardized; categorical columns are still raw labels / NaN.
X_train

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,0.073375,RL,-0.208034,-0.207142,Pave,,Reg,Lvl,AllPub,Inside,...,-0.270208,-0.068692,,,,-0.087688,-1.599111,0.138777,WD,Normal
1,-0.872563,RL,0.409895,-0.091886,Pave,,Reg,Lvl,AllPub,FR2,...,-0.270208,-0.068692,,,,-0.087688,-0.489110,-0.614439,WD,Normal
2,0.073375,RL,-0.084449,0.073480,Pave,,IR1,Lvl,AllPub,Inside,...,-0.270208,-0.068692,,,,-0.087688,0.990891,0.138777,WD,Normal
3,0.309859,RL,-0.414011,-0.096897,Pave,,IR1,Lvl,AllPub,Corner,...,-0.270208,-0.068692,,,,-0.087688,-1.599111,-1.367655,WD,Abnorml
4,0.073375,RL,0.574676,0.375148,Pave,,IR1,Lvl,AllPub,FR2,...,-0.270208,-0.068692,,,,-0.087688,2.100892,0.138777,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,0.073375,RL,-0.331620,-0.260560,Pave,,Reg,Lvl,AllPub,Inside,...,-0.270208,-0.068692,,,,-0.087688,0.620891,-0.614439,WD,Normal
1456,-0.872563,RL,0.615871,0.266407,Pave,,Reg,Lvl,AllPub,Inside,...,-0.270208,-0.068692,,MnPrv,,-0.087688,-1.599111,1.645210,WD,Normal
1457,0.309859,RL,-0.166839,-0.147810,Pave,,Reg,Lvl,AllPub,Inside,...,-0.270208,-0.068692,,GdPrv,Shed,4.953112,-0.489110,1.645210,WD,Normal
1458,-0.872563,RL,-0.084449,-0.080160,Pave,,Reg,Lvl,AllPub,Inside,...,-0.270208,-0.068692,,,,-0.087688,-0.859110,1.645210,WD,Normal


In [17]:
# One-hot encode the categorical columns; `dummy_na=True` adds a
# `<feature>_nan` indicator so missing categories are encoded too.
# The dtype is pinned to uint8 because pandas >= 2.0 defaults to bool
# indicators, and a mixed float/bool frame turns `.values` into an object
# array that Keras cannot convert to a tensor.
X_train = pd.get_dummies(X_train, dummy_na=True, dtype=np.uint8)

In [18]:
# After one-hot encoding: standardized numeric columns plus indicator columns (including a `<feature>_nan` column per categorical feature).
X_train

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,SaleType_Oth,SaleType_WD,SaleType_nan,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial,SaleCondition_nan
0,0.073375,-0.208034,-0.207142,0.651479,-0.517200,1.050994,0.878668,0.510015,0.575425,-0.288653,...,0,1,0,0,0,0,0,1,0,0
1,-0.872563,0.409895,-0.091886,-0.071836,2.179628,0.156734,-0.429577,-0.572835,1.171992,-0.288653,...,0,1,0,0,0,0,0,1,0,0
2,0.073375,-0.084449,0.073480,0.651479,-0.517200,0.984752,0.830215,0.322174,0.092907,-0.288653,...,0,1,0,0,0,0,0,1,0,0
3,0.309859,-0.414011,-0.096897,0.651479,-0.517200,-1.863632,-0.720298,-0.572835,-0.499274,-0.288653,...,0,1,0,1,0,0,0,0,0,0
4,0.073375,0.574676,0.375148,1.374795,-0.517200,0.951632,0.733308,1.360826,0.463568,-0.288653,...,0,1,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,0.073375,-0.331620,-0.260560,-0.071836,-0.517200,0.918511,0.733308,-0.572835,-0.973018,-0.288653,...,0,1,0,0,0,0,0,1,0,0
1456,-0.872563,0.615871,0.266407,-0.071836,0.381743,0.222975,0.151865,0.084610,0.759659,0.722112,...,0,1,0,0,0,0,0,1,0,0
1457,0.309859,-0.166839,-0.147810,0.651479,3.078570,-1.002492,1.024029,-0.572835,-0.369871,-0.288653,...,0,1,0,0,0,0,0,1,0,0
1458,-0.872563,-0.084449,-0.080160,-0.795151,0.381743,-0.704406,0.539493,-0.572835,-0.865548,6.092188,...,0,1,0,0,0,0,0,1,0,0


In [19]:
# Single-hidden-layer regression network:
#   input -> Dense(200, ReLU) -> Dropout(0.5) -> Dense(1)
# Both Dense kernels carry an L2 penalty with coefficient `lambd`.
lambd = 0.01
layers = tf.keras.layers
model = tf.keras.models.Sequential([
    layers.Flatten(input_shape=(X_train.shape[-1],)),
    layers.Dense(
        200,
        activation='relu',
        kernel_regularizer=tf.keras.regularizers.l2(lambd),
    ),
    layers.Dropout(0.5),
    layers.Dense(1, kernel_regularizer=tf.keras.regularizers.l2(lambd)),
])

In [20]:
# Print the layer-by-layer architecture and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten (Flatten)            (None, 331)               0         
_________________________________________________________________
dense (Dense)                (None, 200)               66400     
_________________________________________________________________
dropout (Dropout)            (None, 200)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 201       
Total params: 66,601
Trainable params: 66,601
Non-trainable params: 0
_________________________________________________________________


In [21]:
# Mean squared error on the raw sale price, optimized with Adam.
# `learning_rate` is the supported keyword; the old `lr` alias is deprecated
# in TF 2.x and rejected by newer Keras releases.
model.compile(
    loss=tf.keras.losses.MeanSquaredError(),
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.01),
)

In [22]:
# Train for 150 epochs, holding out 20% of the rows for validation.
# The returned History object is the cell's displayed value.
model.fit(
    x=X_train.values,
    y=y_train.values,
    epochs=150,
    validation_split=0.2,
)

Epoch 1/150
Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150
Epoch 7/150
Epoch 8/150
Epoch 9/150
Epoch 10/150
Epoch 11/150
Epoch 12/150
Epoch 13/150
Epoch 14/150
Epoch 15/150
Epoch 16/150
Epoch 17/150
Epoch 18/150
Epoch 19/150
Epoch 20/150
Epoch 21/150
Epoch 22/150
Epoch 23/150
Epoch 24/150
Epoch 25/150
Epoch 26/150
Epoch 27/150
Epoch 28/150
Epoch 29/150
Epoch 30/150
Epoch 31/150
Epoch 32/150
Epoch 33/150
Epoch 34/150
Epoch 35/150
Epoch 36/150
Epoch 37/150
Epoch 38/150
Epoch 39/150
Epoch 40/150
Epoch 41/150
Epoch 42/150
Epoch 43/150
Epoch 44/150
Epoch 45/150
Epoch 46/150
Epoch 47/150
Epoch 48/150
Epoch 49/150
Epoch 50/150
Epoch 51/150
Epoch 52/150
Epoch 53/150
Epoch 54/150
Epoch 55/150
Epoch 56/150
Epoch 57/150
Epoch 58/150
Epoch 59/150
Epoch 60/150
Epoch 61/150
Epoch 62/150
Epoch 63/150
Epoch 64/150
Epoch 65/150
Epoch 66/150
Epoch 67/150
Epoch 68/150
Epoch 69/150
Epoch 70/150


Epoch 71/150
Epoch 72/150
Epoch 73/150
Epoch 74/150
Epoch 75/150
Epoch 76/150
Epoch 77/150
Epoch 78/150
Epoch 79/150
Epoch 80/150
Epoch 81/150
Epoch 82/150
Epoch 83/150
Epoch 84/150
Epoch 85/150
Epoch 86/150
Epoch 87/150
Epoch 88/150
Epoch 89/150
Epoch 90/150
Epoch 91/150
Epoch 92/150
Epoch 93/150
Epoch 94/150
Epoch 95/150
Epoch 96/150
Epoch 97/150
Epoch 98/150
Epoch 99/150
Epoch 100/150
Epoch 101/150
Epoch 102/150
Epoch 103/150
Epoch 104/150
Epoch 105/150
Epoch 106/150
Epoch 107/150
Epoch 108/150
Epoch 109/150
Epoch 110/150
Epoch 111/150
Epoch 112/150
Epoch 113/150
Epoch 114/150
Epoch 115/150
Epoch 116/150
Epoch 117/150
Epoch 118/150
Epoch 119/150
Epoch 120/150
Epoch 121/150
Epoch 122/150
Epoch 123/150
Epoch 124/150
Epoch 125/150
Epoch 126/150
Epoch 127/150
Epoch 128/150
Epoch 129/150
Epoch 130/150
Epoch 131/150
Epoch 132/150
Epoch 133/150
Epoch 134/150
Epoch 135/150
Epoch 136/150
Epoch 137/150
Epoch 138/150
Epoch 139/150


Epoch 140/150
Epoch 141/150
Epoch 142/150
Epoch 143/150
Epoch 144/150
Epoch 145/150
Epoch 146/150
Epoch 147/150
Epoch 148/150
Epoch 149/150
Epoch 150/150


<tensorflow.python.keras.callbacks.History at 0x1fee638d608>

In [23]:
# Preview the raw test frame as loaded from test.csv (it has no SalePrice column).
test_data

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,2915,160,RM,21.0,1936,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,6,2006,WD,Normal
1455,2916,160,RM,21.0,1894,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,4,2006,WD,Abnorml
1456,2917,20,RL,160.0,20000,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,9,2006,WD,Abnorml
1457,2918,85,RL,62.0,10441,Pave,,Reg,Lvl,AllPub,...,0,0,,MnPrv,Shed,700,7,2006,WD,Normal


In [24]:
# Test features are every column except the leading `Id`. `.copy()` gives
# X_test its own data, so the column assignments in later cells do not
# operate on a slice of `test_data` (which makes pandas emit
# SettingWithCopyWarning).
X_test = test_data.iloc[:, 1:].copy()
X_test

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,Inside,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,Corner,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,,,0,6,2010,WD,Normal
4,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,Inside,...,144,0,,,,0,1,2010,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,160,RM,21.0,1936,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,6,2006,WD,Normal
1455,160,RM,21.0,1894,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,4,2006,WD,Abnorml
1456,20,RL,160.0,20000,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,9,2006,WD,Abnorml
1457,85,RL,62.0,10441,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,MnPrv,Shed,700,7,2006,WD,Normal


In [25]:
# Apply the scaler that was fitted on the training features (transform only, no refit),
# so the test values are standardized with the training statistics.
X_test[numeric_features] = scaler.transform(X_test[numeric_features].values)

In [26]:
# Same zero-fill as for the training features: missing numeric values become 0 on the standardized scale.
X_test[numeric_features] = X_test[numeric_features].fillna(0)

In [27]:
# Test features after standardization and zero-fill; categorical columns are still raw labels / NaN.
X_test

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,-0.872563,RH,0.409895,0.110763,Pave,,Reg,Lvl,AllPub,Inside,...,1.882709,-0.068692,,MnPrv,,-0.087688,-0.119110,1.645210,WD,Normal
1,-0.872563,RL,0.451090,0.375850,Pave,,IR1,Lvl,AllPub,Corner,...,-0.270208,-0.068692,,,Gar2,25.116309,-0.119110,1.645210,WD,Normal
2,0.073375,RL,0.162723,0.332053,Pave,,IR1,Lvl,AllPub,Inside,...,-0.270208,-0.068692,,MnPrv,,-0.087688,-1.229111,1.645210,WD,Normal
3,0.073375,RL,0.327504,-0.054002,Pave,,IR1,Lvl,AllPub,Inside,...,-0.270208,-0.068692,,,,-0.087688,-0.119110,1.645210,WD,Normal
4,1.492282,RL,-1.114330,-0.552407,Pave,,IR1,HLS,AllPub,Inside,...,2.313293,-0.068692,,,,-0.087688,-1.969111,1.645210,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,2.438219,RM,-2.020626,-0.859988,Pave,,Reg,Lvl,AllPub,Inside,...,-0.270208,-0.068692,,,,-0.087688,-0.119110,-1.367655,WD,Normal
1455,2.438219,RM,-2.020626,-0.864197,Pave,,Reg,Lvl,AllPub,Inside,...,-0.270208,-0.068692,,,,-0.087688,-0.859110,-1.367655,WD,Abnorml
1456,-0.872563,RL,3.705515,0.950423,Pave,,Reg,Lvl,AllPub,Inside,...,-0.270208,-0.068692,,,,-0.087688,0.990891,-1.367655,WD,Abnorml
1457,0.664586,RL,-0.331620,-0.007600,Pave,,Reg,Lvl,AllPub,Inside,...,-0.270208,-0.068692,,MnPrv,Shed,1.323736,0.250891,-1.367655,WD,Normal


In [28]:
# One-hot encode the test categoricals the same way as the training ones.
# The dtype is pinned to uint8 because pandas >= 2.0 defaults to bool
# indicators, and a mixed float/bool frame is not a purely numeric input
# for the model.
X_test = pd.get_dummies(X_test, dummy_na=True, dtype=np.uint8)
X_test

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,SaleType_Oth,SaleType_WD,SaleType_nan,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial,SaleCondition_nan
0,-0.872563,0.409895,0.110763,-0.795151,0.381743,-0.340077,-1.156380,-0.572835,0.053428,0.604293,...,0,1,0,0,0,0,0,1,0,0
1,-0.872563,0.451090,0.375850,-0.071836,0.381743,-0.439440,-1.301740,0.023838,1.051363,-0.288653,...,0,1,0,0,0,0,0,1,0,0
2,0.073375,0.162723,0.332053,-0.795151,-0.517200,0.852269,0.636400,-0.572835,0.761852,-0.288653,...,0,1,0,0,0,0,0,1,0,0
3,0.073375,0.327504,-0.054002,-0.071836,0.381743,0.885390,0.636400,-0.462340,0.347326,-0.288653,...,0,1,0,0,0,0,0,1,0,0
4,1.492282,-1.114330,-0.552407,1.374795,-0.517200,0.686666,0.345679,-0.572835,-0.396190,-0.288653,...,0,1,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,2.438219,-2.020626,-0.859988,-1.518467,1.280685,-0.041991,-0.720298,-0.572835,-0.973018,-0.288653,...,0,1,0,0,0,0,0,1,0,0
1455,2.438219,-2.020626,-0.864197,-1.518467,-0.517200,-0.041991,-0.720298,-0.572835,-0.420316,-0.288653,...,0,1,0,1,0,0,0,0,0,0
1456,-0.872563,3.705515,0.950423,-0.795151,1.280685,-0.373198,0.539493,-0.572835,1.711535,-0.288653,...,0,1,0,1,0,0,0,0,0,0
1457,0.664586,-0.331620,-0.007600,-0.795151,-0.517200,0.686666,0.345679,-0.572835,-0.233889,-0.288653,...,0,1,0,0,0,0,0,1,0,0


In [29]:
# Align the test columns with the training columns, in the same order:
# indicator columns that exist only in the test set are dropped, and ones
# seen only in training are added as all-zero columns.
X_test = X_test.reindex(X_train.columns, axis='columns', fill_value=0)
X_test

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,SaleType_Oth,SaleType_WD,SaleType_nan,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial,SaleCondition_nan
0,-0.872563,0.409895,0.110763,-0.795151,0.381743,-0.340077,-1.156380,-0.572835,0.053428,0.604293,...,0,1,0,0,0,0,0,1,0,0
1,-0.872563,0.451090,0.375850,-0.071836,0.381743,-0.439440,-1.301740,0.023838,1.051363,-0.288653,...,0,1,0,0,0,0,0,1,0,0
2,0.073375,0.162723,0.332053,-0.795151,-0.517200,0.852269,0.636400,-0.572835,0.761852,-0.288653,...,0,1,0,0,0,0,0,1,0,0
3,0.073375,0.327504,-0.054002,-0.071836,0.381743,0.885390,0.636400,-0.462340,0.347326,-0.288653,...,0,1,0,0,0,0,0,1,0,0
4,1.492282,-1.114330,-0.552407,1.374795,-0.517200,0.686666,0.345679,-0.572835,-0.396190,-0.288653,...,0,1,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,2.438219,-2.020626,-0.859988,-1.518467,1.280685,-0.041991,-0.720298,-0.572835,-0.973018,-0.288653,...,0,1,0,0,0,0,0,1,0,0
1455,2.438219,-2.020626,-0.864197,-1.518467,-0.517200,-0.041991,-0.720298,-0.572835,-0.420316,-0.288653,...,0,1,0,1,0,0,0,0,0,0
1456,-0.872563,3.705515,0.950423,-0.795151,1.280685,-0.373198,0.539493,-0.572835,1.711535,-0.288653,...,0,1,0,1,0,0,0,0,0,0
1457,0.664586,-0.331620,-0.007600,-0.795151,-0.517200,0.686666,0.345679,-0.572835,-0.233889,-0.288653,...,0,1,0,0,0,0,0,1,0,0


In [30]:
# Predicted sale prices for the test rows, returned as a 2-D array with one column.
model.predict(X_test)

array([[117345.25],
       [162339.66],
       [182480.67],
       ...,
       [185170.78],
       [114333.87],
       [230564.47]], dtype=float32)

In [31]:
# Take the single output column to get a flat 1-D vector of predictions.
preds = model.predict(X_test)[:, 0]
preds

array([117345.25, 162339.66, 182480.67, ..., 185170.78, 114333.87,
       230564.47], dtype=float32)

In [32]:
# Attach the predictions to test_data so Id and SalePrice can be taken from one frame.
# NOTE: this mutates test_data in place; if the cell that builds X_test from
# test_data.iloc[:, 1:] is re-run afterwards, that slice will include SalePrice.
test_data['SalePrice'] = preds

In [33]:
# Submission frame: one row per test house with its Id and predicted SalePrice.
submission = test_data[['Id', 'SalePrice']].copy()
submission

Unnamed: 0,Id,SalePrice
0,1461,117345.250000
1,1462,162339.656250
2,1463,182480.671875
3,1464,198784.390625
4,1465,191491.906250
...,...,...
1454,2915,79379.695312
1455,2916,64674.140625
1456,2917,185170.781250
1457,2918,114333.867188


In [34]:
# Write the submission file; index=False keeps the row index out of the CSV so only Id and SalePrice are written.
submission.to_csv('submission.csv', index=False)

In [35]:
# !kaggle competitions submit -c "house-prices-advanced-regression-techniques" -f "submission.csv" -m "base model"