In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
import PIL

import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures

%matplotlib inline

## Data preprocessing

In [2]:
df = pd.read_csv('./train.csv')
df = df.drop('Id', 1)
df.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,2,2008,WD,Normal,208500
1,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,,,,0,5,2007,WD,Normal,181500
2,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,,,,0,9,2008,WD,Normal,223500
3,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,,,,0,2,2006,WD,Abnorml,140000
4,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,,,,0,12,2008,WD,Normal,250000


In [3]:
enc_features = ['MSSubClass','MSZoning','Street','Alley','LotShape','LandContour','Utilities','LotConfig',
               'LandSlope','Neighborhood','Condition1','Condition2','BldgType','HouseStyle','OverallQual',
               'OverallCond','RoofStyle','RoofMatl','Exterior1st','Exterior2nd','MasVnrType','ExterQual',
               'ExterCond','Foundation','BsmtQual','BsmtCond','BsmtExposure','BsmtFinType1','BsmtFinType2',
               'Heating','HeatingQC','CentralAir','Electrical','KitchenQual','Functional','FireplaceQu',
               'GarageType','GarageFinish','GarageQual','GarageCond','PavedDrive','PoolQC','Fence','MiscFeature',
               'SaleType','SaleCondition']
for enc in enc_features:
    one_hot = pd.get_dummies(df[enc], prefix=enc)
    df = df.drop(enc,axis = 1)
    df = df.join(one_hot)
df.head()

Unnamed: 0,LotFrontage,LotArea,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,1stFlrSF,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,65.0,8450,2003,2003,196.0,706,0,150,856,856,...,0,0,0,1,0,0,0,0,1,0
1,80.0,9600,1976,1976,0.0,978,0,284,1262,1262,...,0,0,0,1,0,0,0,0,1,0
2,68.0,11250,2001,2002,162.0,486,0,434,920,920,...,0,0,0,1,0,0,0,0,1,0
3,60.0,9550,1915,1970,0.0,216,0,540,756,961,...,0,0,0,1,1,0,0,0,0,0
4,84.0,14260,2000,2000,350.0,655,0,490,1145,1145,...,0,0,0,1,0,0,0,0,1,0


In [4]:
real_features = ['LotFrontage','LotArea','YearBuilt','YearRemodAdd','MasVnrArea','BsmtFinSF1','BsmtUnfSF',
                'TotalBsmtSF','1stFlrSF','2ndFlrSF','LowQualFinSF','GrLivArea','BsmtFullBath','BsmtHalfBath',
                'FullBath','HalfBath','BedroomAbvGr','KitchenAbvGr','TotRmsAbvGrd','Fireplaces','GarageYrBlt','GarageCars',
                'GarageArea','WoodDeckSF','OpenPorchSF','EnclosedPorch','3SsnPorch','ScreenPorch','PoolArea',
                'MiscVal','MoSold','YrSold']
cat_features = list(set(df.columns.values.tolist()) - set(real_features))
cat_features.remove('SalePrice')
print (real_features)
print (cat_features)

['LotFrontage', 'LotArea', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal', 'MoSold', 'YrSold']
['MSSubClass_90', 'GarageQual_Po', 'Condition2_Feedr', 'Condition1_PosA', 'GarageCond_Fa', 'OverallCond_8', 'RoofMatl_Tar&Grv', 'OverallQual_4', 'SaleType_ConLw', 'BsmtQual_Gd', 'RoofMatl_Metal', 'FireplaceQu_Po', 'PoolQC_Gd', 'SaleType_ConLI', 'OverallQual_2', 'Exterior1st_Plywood', 'OverallQual_7', 'Exterior2nd_Stone', 'MSZoning_RM', 'KitchenQual_TA', 'Exterior1st_MetalSd', 'Exterior2nd_Other', 'BsmtExposure_Mn', 'GarageCond_TA', 'OverallCond_4', 'PavedDrive_Y', 'Condition1_Feedr', 'MiscFeature_Shed', 'RoofMatl_CompShg', 'MSZoning_C (all)', 'Firepla

In [5]:
df[real_features].describe()

Unnamed: 0,LotFrontage,LotArea,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtUnfSF,TotalBsmtSF,1stFlrSF,2ndFlrSF,...,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold
count,1201.0,1460.0,1460.0,1460.0,1452.0,1460.0,1460.0,1460.0,1460.0,1460.0,...,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0
mean,70.049958,10516.828082,1971.267808,1984.865753,103.685262,443.639726,567.240411,1057.429452,1162.626712,346.992466,...,472.980137,94.244521,46.660274,21.95411,3.409589,15.060959,2.758904,43.489041,6.321918,2007.815753
std,24.284752,9981.264932,30.202904,20.645407,181.066207,456.098091,441.866955,438.705324,386.587738,436.528436,...,213.804841,125.338794,66.256028,61.119149,29.317331,55.757415,40.177307,496.123024,2.703626,1.328095
min,21.0,1300.0,1872.0,1950.0,0.0,0.0,0.0,0.0,334.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2006.0
25%,59.0,7553.5,1954.0,1967.0,0.0,0.0,223.0,795.75,882.0,0.0,...,334.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,2007.0
50%,69.0,9478.5,1973.0,1994.0,0.0,383.5,477.5,991.5,1087.0,0.0,...,480.0,0.0,25.0,0.0,0.0,0.0,0.0,0.0,6.0,2008.0
75%,80.0,11601.5,2000.0,2004.0,166.0,712.25,808.0,1298.25,1391.25,728.0,...,576.0,168.0,68.0,0.0,0.0,0.0,0.0,0.0,8.0,2009.0
max,313.0,215245.0,2010.0,2010.0,1600.0,5644.0,2336.0,6110.0,4692.0,2065.0,...,1418.0,857.0,547.0,552.0,508.0,480.0,738.0,15500.0,12.0,2010.0


In [6]:
df[cat_features].describe()

Unnamed: 0,MSSubClass_90,GarageQual_Po,Condition2_Feedr,Condition1_PosA,GarageCond_Fa,OverallCond_8,RoofMatl_Tar&Grv,OverallQual_4,SaleType_ConLw,BsmtQual_Gd,...,HouseStyle_SFoyer,OverallQual_1,SaleCondition_Alloca,LandSlope_Sev,BsmtFinType1_Rec,BsmtFinType2_GLQ,ExterQual_Ex,PoolQC_Ex,BsmtFinSF2,LandContour_Lvl
count,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,...,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0
mean,0.035616,0.002055,0.00411,0.005479,0.023973,0.049315,0.007534,0.079452,0.003425,0.423288,...,0.025342,0.00137,0.008219,0.008904,0.091096,0.009589,0.035616,0.00137,46.549315,0.897945
std,0.185395,0.045299,0.063996,0.073846,0.153016,0.216599,0.086502,0.270536,0.05844,0.494249,...,0.157217,0.036999,0.090317,0.093973,0.287844,0.097486,0.185395,0.036999,161.319273,0.302824
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1474.0,1.0


In [7]:
print (df.shape)

(1460, 320)


In [8]:
for ft in real_features:
    df[ft] = df[ft].fillna(0)
for ft in cat_features:
    df[ft] = df[ft].fillna(0)    

In [9]:
y = df['SalePrice'].to_numpy()
df = df.drop('SalePrice', 1)

X_real = df[real_features].to_numpy()
X_cat = df[cat_features].to_numpy()

print ("X_real: {} ".format(X_real[0]))
print ("X_cat: {} ".format(X_cat[0]))
print ("y: {} ".format(y[0]))

X_real: [6.500e+01 8.450e+03 2.003e+03 2.003e+03 1.960e+02 7.060e+02 1.500e+02
 8.560e+02 8.560e+02 8.540e+02 0.000e+00 1.710e+03 1.000e+00 0.000e+00
 2.000e+00 1.000e+00 3.000e+00 1.000e+00 8.000e+00 0.000e+00 2.003e+03
 2.000e+00 5.480e+02 0.000e+00 6.100e+01 0.000e+00 0.000e+00 0.000e+00
 0.000e+00 0.000e+00 2.000e+00 2.008e+03] 
X_cat: [0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 1 0 0 1 0 0 0 0 1 0 0 1
 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 1 1 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0
 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 1 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0
 0 0 1 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 1 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 1 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 1 0 0 0 0 1 1
 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0
 1 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1] 
y: 208500 


In [10]:
transform = PolynomialFeatures(2)
X_poly_real = transform.fit_transform(X_real)

In [11]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_real_scaled = scaler.fit_transform(X_poly_real)

In [12]:
print (X_real_scaled.shape)
print (X_cat.shape)
print (y.shape)

(1460, 561)
(1460, 287)
(1460,)


In [13]:
(X_train_real, X_test_real, X_train_cat, X_test_cat, 
 y_train, y_test) = train_test_split(X_real_scaled, X_cat, y, test_size=0.3, random_state=0)
print (X_train_real.shape, X_test_real.shape)
print (X_train_cat.shape, X_test_cat.shape)

print (y_train.shape, y_test.shape)

(1022, 561) (438, 561)
(1022, 287) (438, 287)
(1022,) (438,)


In [14]:
X_train = np.hstack((X_train_real, X_train_cat))
X_test = np.hstack((X_test_real, X_test_cat))
X = np.hstack((X_real_scaled, X_cat))

In [15]:
xi = np.isfinite(X) 
print (xi[np.where(xi == False)].sum())

xn = np.isnan(X) 
print (xn[np.where(xn == False)].sum())

0
0


In [16]:
yi = np.isfinite(y) 
print (yi[np.where(yi == False)].sum())

yn = np.isnan(y) 
print (yn[np.where(yn == False)].sum())

0
0


## Features selection with Lasso

In [17]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, LassoLars
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score

In [18]:
param_grid = {'alpha': [30.0], 'max_iter': [100000]}
estimator = Lasso()

optimizer = GridSearchCV(estimator, param_grid, cv=10)
optimizer.fit(X, y)


GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=Lasso(alpha=1.0, copy_X=True, fit_intercept=True,
                             max_iter=1000, normalize=False, positive=False,
                             precompute=False, random_state=None,
                             selection='cyclic', tol=0.0001, warm_start=False),
             iid='warn', n_jobs=None,
             param_grid={'alpha': [20.0], 'max_iter': [100000]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [19]:
c = 0
not_null_idx = []
for cf in optimizer.best_estimator_.coef_:
    if cf != 0:
        not_null_idx.append(c)
    c += 1
print (len(not_null_idx))    
print (not_null_idx)    

440
[15, 16, 22, 32, 34, 35, 37, 38, 39, 40, 41, 42, 45, 47, 48, 49, 50, 51, 52, 53, 55, 56, 57, 59, 63, 66, 68, 69, 70, 72, 74, 75, 76, 83, 85, 86, 87, 88, 89, 90, 91, 93, 94, 96, 97, 99, 101, 109, 110, 119, 121, 134, 140, 147, 152, 153, 155, 156, 157, 159, 163, 164, 165, 166, 167, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 184, 187, 188, 192, 194, 196, 197, 200, 201, 202, 203, 207, 210, 213, 216, 217, 218, 219, 220, 221, 223, 224, 225, 227, 228, 229, 230, 231, 234, 241, 242, 245, 246, 247, 248, 253, 254, 256, 258, 259, 261, 264, 265, 266, 267, 268, 271, 272, 274, 277, 278, 279, 282, 283, 292, 294, 297, 298, 301, 302, 303, 304, 310, 311, 312, 313, 315, 317, 319, 321, 322, 323, 324, 325, 326, 327, 332, 333, 334, 336, 338, 343, 349, 351, 352, 353, 354, 355, 356, 357, 358, 359, 361, 362, 363, 365, 366, 368, 369, 371, 372, 373, 374, 376, 377, 380, 381, 383, 384, 385, 386, 387, 388, 390, 391, 392, 395, 398, 399, 400, 401, 402, 403, 406, 407, 408, 409, 411, 412, 413, 414, 415, 4

In [20]:
X = X[:, not_null_idx]
print (X.shape)

(1460, 440)


## Linear regression

### Ridge

In [21]:
rg_model = Ridge(alpha=.5)

score = cross_val_score(rg_model, X, y, cv=10)
score_mean = score.mean()

In [22]:
print (score_mean)

0.8238732869751754


In [23]:
param_grid = {'alpha': [0.01, 0.02, 0.05, 0.07, 0.08, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1, 5, 10]}
estimator = Ridge()

rg2_model = GridSearchCV(estimator, param_grid, cv=10)
rg2_model.fit(X, y)

GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=Ridge(alpha=1.0, copy_X=True, fit_intercept=True,
                             max_iter=None, normalize=False, random_state=None,
                             solver='auto', tol=0.001),
             iid='warn', n_jobs=None,
             param_grid={'alpha': [0.01, 0.02, 0.05, 0.07, 0.08, 0.1, 0.2, 0.3,
                                   0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1, 5, 10]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [24]:
score = cross_val_score(rg2_model, X, y, cv=10)
score_mean = score.mean()



In [25]:
print (score_mean)

0.8404950194532568


### ElasticNet

In [26]:
el_model = ElasticNet(alpha=.5, l1_ratio=.5, max_iter=100000)

score = cross_val_score(el_model, X, y, cv=10)
score_mean = score.mean()

In [27]:
print (score_mean)

0.7937317756145086


In [28]:
param_grid = {'alpha': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1., 5., 10.], 
              'l1_ratio': [0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8],
              'max_iter': [100000]}
estimator = ElasticNet()

el2_model = GridSearchCV(estimator, param_grid, cv=10)
el2_model.fit(X, y)

GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=ElasticNet(alpha=1.0, copy_X=True, fit_intercept=True,
                                  l1_ratio=0.5, max_iter=1000, normalize=False,
                                  positive=False, precompute=False,
                                  random_state=None, selection='cyclic',
                                  tol=0.0001, warm_start=False),
             iid='warn', n_jobs=None,
             param_grid={'alpha': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9,
                                   1.0, 5.0, 10.0],
                         'l1_ratio': [0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8],
                         'max_iter': [100000]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [29]:
score = cross_val_score(el2_model, X, y, cv=10)
score_mean = score.mean()



In [30]:
print (score_mean)

0.8370641116996907


### LassoLars

In [31]:
la_model = LassoLars(alpha=.5, max_iter=30)

score = cross_val_score(la_model, X, y, cv=10)
score_mean = score.mean()

In [32]:
print (score_mean)

0.8007965494555039


In [33]:
param_grid = {'alpha': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1., 5., 10.], 'max_iter': [30, 40]}
estimator = LassoLars()

la2_model = GridSearchCV(estimator, param_grid, cv=10)
la2_model.fit(X, y)















GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=LassoLars(alpha=1.0, copy_X=True,
                                 eps=2.220446049250313e-16, fit_intercept=True,
                                 fit_path=True, max_iter=500, normalize=True,
                                 positive=False, precompute='auto',
                                 verbose=False),
             iid='warn', n_jobs=None,
             param_grid={'alpha': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9,
                                   1.0, 5.0, 10.0],
                         'max_iter': [50, 100]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [34]:
score = cross_val_score(la2_model, X, y, cv=10)
score_mean = score.mean()























































































































In [35]:
print (score_mean)

0.8065903201769025


### Voting

In [36]:
from sklearn.ensemble import VotingRegressor

In [65]:
vt_model = VotingRegressor([('rg2', rg2_model), ('el', el2_model), ('la', la2_model)])

vt_model.fit(X, y)

VotingRegressor(estimators=[('rg2',
                             GridSearchCV(cv=10,
                                          error_score='raise-deprecating',
                                          estimator=Ridge(alpha=1.0,
                                                          copy_X=True,
                                                          fit_intercept=True,
                                                          max_iter=None,
                                                          normalize=False,
                                                          random_state=None,
                                                          solver='auto',
                                                          tol=0.001),
                                          iid='warn', n_jobs=None,
                                          param_grid={'alpha': [0.01, 0.02,
                                                                0.05, 0.07,
                                   

### LR submission

In [66]:
df_train = df.copy()

df = pd.read_csv('./test.csv')
house_ids = df['Id'].to_numpy()
df = df.drop('Id', 1)
df.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,Inside,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,Corner,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,,,0,6,2010,WD,Normal
4,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,Inside,...,144,0,,,,0,1,2010,WD,Normal


In [67]:
for enc in enc_features:
    one_hot = pd.get_dummies(df[enc], prefix=enc)
    df = df.drop(enc,axis = 1)
    df = df.join(one_hot)
df.head()

Unnamed: 0,LotFrontage,LotArea,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,1stFlrSF,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,80.0,11622,1961,1961,0.0,468.0,144.0,270.0,882.0,896,...,0,0,0,1,0,0,0,0,1,0
1,81.0,14267,1958,1958,108.0,923.0,0.0,406.0,1329.0,1329,...,0,0,0,1,0,0,0,0,1,0
2,74.0,13830,1997,1998,0.0,791.0,0.0,137.0,928.0,928,...,0,0,0,1,0,0,0,0,1,0
3,78.0,9978,1998,1998,20.0,602.0,0.0,324.0,926.0,926,...,0,0,0,1,0,0,0,0,1,0
4,43.0,5005,1992,1992,0.0,263.0,0.0,1017.0,1280.0,1280,...,0,0,0,1,0,0,0,0,1,0


In [68]:
features_diff = list(set(df_train.columns.values.tolist()) - set(df.columns.values.tolist()))
print (len(features_diff), features_diff)
zeros = np.zeros(df.shape[0])

for ft in features_diff:
    df[ft] = pd.Series(zeros)

18 ['Exterior1st_ImStucc', 'Exterior2nd_Other', 'MiscFeature_TenC', 'Exterior1st_Stone', 'RoofMatl_Metal', 'RoofMatl_Membran', 'Condition2_RRAe', 'PoolQC_Fa', 'Heating_Floor', 'GarageQual_Ex', 'RoofMatl_Roll', 'HouseStyle_2.5Fin', 'Electrical_Mix', 'Heating_OthW', 'Condition2_RRNn', 'Condition2_RRAn', 'Utilities_NoSeWa', 'RoofMatl_ClyTile']


In [69]:
df[real_features].describe()

Unnamed: 0,LotFrontage,LotArea,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtUnfSF,TotalBsmtSF,1stFlrSF,2ndFlrSF,...,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold
count,1232.0,1459.0,1459.0,1459.0,1444.0,1458.0,1458.0,1458.0,1459.0,1459.0,...,1458.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0
mean,68.580357,9819.161069,1971.357779,1983.662783,100.709141,439.203704,554.294925,1046.11797,1156.534613,325.967786,...,472.768861,93.174777,48.313914,24.243317,1.79438,17.064428,1.744345,58.167923,6.104181,2007.769705
std,22.376841,4955.517327,30.390071,21.130467,177.6259,455.268042,437.260486,442.898624,398.16582,420.610226,...,217.048611,127.744882,68.883364,67.227765,20.207842,56.609763,30.491646,630.806978,2.722432,1.30174
min,21.0,1470.0,1879.0,1950.0,0.0,0.0,0.0,0.0,407.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2006.0
25%,58.0,7391.0,1953.0,1963.0,0.0,0.0,219.25,784.0,873.5,0.0,...,318.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,2007.0
50%,67.0,9399.0,1973.0,1992.0,0.0,350.5,460.0,988.0,1079.0,0.0,...,480.0,0.0,28.0,0.0,0.0,0.0,0.0,0.0,6.0,2008.0
75%,80.0,11517.5,2001.0,2004.0,164.0,753.5,797.75,1305.0,1382.5,676.0,...,576.0,168.0,72.0,0.0,0.0,0.0,0.0,0.0,8.0,2009.0
max,200.0,56600.0,2010.0,2010.0,1290.0,4010.0,2140.0,5095.0,5095.0,1862.0,...,1488.0,1424.0,742.0,1012.0,360.0,576.0,800.0,17000.0,12.0,2010.0


In [70]:
df[cat_features].describe()

Unnamed: 0,MSSubClass_90,GarageQual_Po,Condition2_Feedr,Condition1_PosA,GarageCond_Fa,OverallCond_8,RoofMatl_Tar&Grv,OverallQual_4,SaleType_ConLw,BsmtQual_Gd,...,HouseStyle_SFoyer,OverallQual_1,SaleCondition_Alloca,LandSlope_Sev,BsmtFinType1_Rec,BsmtFinType2_GLQ,ExterQual_Ex,PoolQC_Ex,BsmtFinSF2,LandContour_Lvl
count,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,...,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1458.0,1459.0
mean,0.039068,0.001371,0.004798,0.008225,0.026731,0.049349,0.008225,0.075394,0.002056,0.405072,...,0.031528,0.001371,0.008225,0.002056,0.106237,0.013708,0.037697,0.001371,52.619342,0.898561
std,0.193823,0.037012,0.069124,0.090348,0.16135,0.21667,0.090348,0.264117,0.045314,0.491074,...,0.174801,0.037012,0.090348,0.045314,0.308247,0.116316,0.190528,0.037012,176.753926,0.302013
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1526.0,1.0


In [71]:
print (df.shape)

(1459, 320)


In [72]:
for ft in real_features:
    df[ft] = df[ft].fillna(0)
for ft in cat_features:
    df[ft] = df[ft].fillna(0) 

In [73]:
X_testing_real = df[real_features].to_numpy()
X_testing_cat = df[cat_features].to_numpy()

print ("X_real: {} ".format(X_testing_real[0]))
print ("X_cat: {} ".format(X_testing_cat[0]))

X_real: [8.0000e+01 1.1622e+04 1.9610e+03 1.9610e+03 0.0000e+00 4.6800e+02
 2.7000e+02 8.8200e+02 8.9600e+02 0.0000e+00 0.0000e+00 8.9600e+02
 0.0000e+00 0.0000e+00 1.0000e+00 0.0000e+00 2.0000e+00 1.0000e+00
 5.0000e+00 0.0000e+00 1.9610e+03 1.0000e+00 7.3000e+02 1.4000e+02
 0.0000e+00 0.0000e+00 0.0000e+00 1.2000e+02 0.0000e+00 0.0000e+00
 6.0000e+00 2.0100e+03] 
X_cat: [  0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
   0.   0.   0.   0.   0.   1.   0.   0.   0.   1.   0.   1.   1.   0.
   1.   0.   0.   0.   0.   1.   0.   0.   1.   0.   0.   0.   0.   0.
   0.   0.   1.   0.   0.   0.   0.   0.   0.   0.   0.   0.   1.   0.
   1.   0.   0.   1.   0.   0.   0.   0.   0.   0.   1.   0.   1.   0.
   1.   0.   0.   0.   0.   1.   0.   0.   0.   0.   0.   0.   0.   0.
   1.   0.   0.   1.   0.   0.   0.   1.   0.   0.   0.   0.   0.   0.
   0.   0.   1.   0.   0.   0.   0.   0.   0.   0.   1.   0.   0.   0.
   1.   0.   0.   0.   0.   0.   0.   1.   0.   0.   0.  

In [74]:
X_testing_poly_real = transform.fit_transform(X_testing_real)

In [75]:
X_testing_real_scaled = scaler.fit_transform(X_testing_poly_real)

In [76]:
X_testing = np.hstack((X_testing_real_scaled, X_testing_cat))

X_testing = X_testing[:, not_null_idx]
print (X_testing.shape)

(1459, 440)


In [77]:
predicted = vt_model.predict(X_testing)

print (predicted)
print (house_ids)

[111805.64731689 157757.81346005 198703.26567678 ... 179880.09564922
 114611.98706071 230518.65610531]
[1461 1462 1463 ... 2917 2918 2919]


In [78]:
with open("submission_lr2.txt", "w") as fout:
    fout.write("Id,SalePrice\n")
    for val in zip(house_ids, predicted):
        fout.write("{},{}\n".format(str(int(val[0])), str(int(val[1]))))