# Load Data and Libraries <a id="load"></a>

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import os
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, MinMaxScaler, StandardScaler

In [2]:
# Read only the columns listed below from the CSV; SalePrice (last entry) is
# the prediction target, the rest are candidate predictors.
data = pd.read_csv(
    'house.csv',
    usecols=[
        'MSZoning', 'Street', 'LotShape', 'LandContour',
        'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood',
        'Condition1', 'Condition2', 'BldgType', 'HouseStyle',
        'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd',
        'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation',
        'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1',
        'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir',
        'Electrical', 'KitchenQual', 'Functional', 'GarageType',
        'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive',
        'SaleType', 'SaleCondition', 'PoolQC', 'MiscFeature',
        'Alley', 'Fence', 'FireplaceQu', 'SalePrice',
    ],
)
data.head()

Unnamed: 0,MSZoning,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,...,GarageFinish,GarageQual,GarageCond,PavedDrive,PoolQC,Fence,MiscFeature,SaleType,SaleCondition,SalePrice
0,RL,Pave,,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,...,RFn,TA,TA,Y,,,,WD,Normal,208500
1,RL,Pave,,Reg,Lvl,AllPub,FR2,Gtl,Veenker,Feedr,...,RFn,TA,TA,Y,,,,WD,Normal,181500
2,RL,Pave,,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,...,RFn,TA,TA,Y,,,,WD,Normal,223500
3,RL,Pave,,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,Norm,...,Unf,TA,TA,Y,,,,WD,Abnorml,140000
4,RL,Pave,,IR1,Lvl,AllPub,FR2,Gtl,NoRidge,Norm,...,RFn,TA,TA,Y,,,,WD,Normal,250000


# Check Data <a id="check-data"></a>

Only 44 of the 81 columns in the dataset are loaded here (see `usecols` above).

In [3]:
# Number of loaded columns, followed by their names
print(data.shape[1])
data.columns

44


Index(['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities',
       'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
       'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st',
       'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation',
       'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
       'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual',
       'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual',
       'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature',
       'SaleType', 'SaleCondition', 'SalePrice'],
      dtype='object')

Check the type of each variable

In [4]:
# One row per loaded column holding its dtype, ordered by dtype (descending)
types = (
    data.dtypes
    .to_frame(name='type')
    .sort_values(by=['type'], ascending=False)
)
types

Unnamed: 0,type
MSZoning,object
Street,object
BsmtFinType1,object
BsmtFinType2,object
Heating,object
HeatingQC,object
CentralAir,object
Electrical,object
KitchenQual,object
Functional,object


In [5]:
# Split the frame into the target (SalePrice) and the predictor columns
y = data['SalePrice']
X = data.drop(columns=['SalePrice'])

Check how much data is missing

In [6]:
# Check missing values
def check_missing(df):
    """Summarise the missing values of every column in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        Indexed by the columns of ``df`` with two columns:
        ``Num``  - number of null entries in that column,
        ``Rate`` - the same count as a percentage of ``len(df)``.
    """
    null_counts = df.isnull().sum()
    null_rates = 100 * null_counts / len(df)
    return pd.DataFrame({'Num': null_counts, 'Rate': null_rates})

# Show each column's dtype next to its missing-value count and rate,
# with the highest missing rate first.
print(f"Data #{len(X)}")
cols = check_missing(X)
types.join(cols).sort_values(by="Rate", ascending=False)

Data #1460


Unnamed: 0,type,Num,Rate
PoolQC,object,1453.0,99.520548
MiscFeature,object,1406.0,96.30137
Alley,object,1369.0,93.767123
Fence,object,1179.0,80.753425
FireplaceQu,object,690.0,47.260274
GarageCond,object,81.0,5.547945
GarageQual,object,81.0,5.547945
GarageFinish,object,81.0,5.547945
GarageType,object,81.0,5.547945
BsmtExposure,object,38.0,2.60274


# Data Pre-Processing <a id="pre-processing"></a>

Drop the variables with more than 40% of their data missing.

In [7]:
# Remove, in place, the five columns whose missing rate exceeds 40%
X.drop(columns=['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu'], inplace=True)

In [8]:
# Remaining null count per predictor column
X.isna().sum()

MSZoning          0
Street            0
LotShape          0
LandContour       0
Utilities         0
LotConfig         0
LandSlope         0
Neighborhood      0
Condition1        0
Condition2        0
BldgType          0
HouseStyle        0
RoofStyle         0
RoofMatl          0
Exterior1st       0
Exterior2nd       0
MasVnrType        8
ExterQual         0
ExterCond         0
Foundation        0
BsmtQual         37
BsmtCond         37
BsmtExposure     38
BsmtFinType1     37
BsmtFinType2     38
Heating           0
HeatingQC         0
CentralAir        0
Electrical        1
KitchenQual       0
Functional        0
GarageType       81
GarageFinish     81
GarageQual       81
GarageCond       81
PavedDrive        0
SaleType          0
SaleCondition     0
dtype: int64

In [9]:
# Preview the first five rows of the predictors before encoding
X.head(5)

Unnamed: 0,MSZoning,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,...,Electrical,KitchenQual,Functional,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,SaleType,SaleCondition
0,RL,Pave,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,...,SBrkr,Gd,Typ,Attchd,RFn,TA,TA,Y,WD,Normal
1,RL,Pave,Reg,Lvl,AllPub,FR2,Gtl,Veenker,Feedr,Norm,...,SBrkr,TA,Typ,Attchd,RFn,TA,TA,Y,WD,Normal
2,RL,Pave,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,...,SBrkr,Gd,Typ,Attchd,RFn,TA,TA,Y,WD,Normal
3,RL,Pave,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,Norm,Norm,...,SBrkr,Gd,Typ,Detchd,Unf,TA,TA,Y,WD,Abnorml
4,RL,Pave,IR1,Lvl,AllPub,FR2,Gtl,NoRidge,Norm,Norm,...,SBrkr,Gd,Typ,Attchd,RFn,TA,TA,Y,WD,Normal


Process categorical variables (string)
1. Fill missing data with the most frequent value
2. One-Hot Encoding

In [10]:
# Fill missing data and replace every column with one-hot dummy columns.
# Both steps are done once for the whole frame instead of once per column,
# which avoids rebuilding the DataFrame for each of the columns.
colm = X.columns.copy()

# Fill NaN with the per-column mode. DataFrame.mode() lists each column's
# modes from row 0 downwards, so row 0 is the same value that
# X[v].mode()[0] yields for column v.
X = X.fillna(X.mode().iloc[0])

# One-Hot Encoding of all columns in `colm`; drop_first removes the first
# level of each variable. Dummy columns come out grouped per source column,
# in the order of `colm`.
X = pd.get_dummies(data=X, columns=list(colm), drop_first=True)

In [11]:
# Preview the first five rows after one-hot encoding
X.head(5)

Unnamed: 0,MSZoning_FV,MSZoning_RH,MSZoning_RL,MSZoning_RM,Street_Pave,LotShape_IR2,LotShape_IR3,LotShape_Reg,LandContour_HLS,LandContour_Low,...,SaleType_ConLI,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,0,0,1,0,1,0,0,1,0,0,...,0,0,0,0,1,0,0,0,1,0
1,0,0,1,0,1,0,0,1,0,0,...,0,0,0,0,1,0,0,0,1,0
2,0,0,1,0,1,0,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0
3,0,0,1,0,1,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,0,0,1,0,1,0,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0


Process the encoded variables (int)
1. No imputation is needed, because there's no missing data left (the mean fill in the cell below changes nothing)
2. Standardize each column

In [12]:
# Mean-impute and standardize every column of X, one column at a time
numerical_variavles = X.columns.copy()
ss = StandardScaler()

for name in numerical_variavles:
    column = X[name]
    # Replace any NaN with the column mean
    X[name] = column.fillna(column.mean())
    # Refit the scaler on this single column and store the scaled values
    X[name] = ss.fit_transform(X[[name]])

The data after processing looks like this:

In [13]:
# Ten randomly chosen rows of the processed predictors
X.sample(10)

Unnamed: 0,MSZoning_FV,MSZoning_RH,MSZoning_RL,MSZoning_RM,Street_Pave,LotShape_IR2,LotShape_IR3,LotShape_Reg,LandContour_HLS,LandContour_Low,...,SaleType_ConLI,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
1298,-0.215859,-0.105263,0.518133,-0.418955,0.064238,-0.169981,12.041595,-1.314904,-0.188311,-0.159,...,-0.058621,-0.058621,3.311678,-0.045376,-2.56218,-0.052414,-0.091035,-0.117851,-2.138345,3.268027
792,-0.215859,-0.105263,0.518133,-0.418955,0.064238,-0.169981,-0.083045,-1.314904,-0.188311,-0.159,...,-0.058621,-0.058621,-0.301962,-0.045376,0.390293,-0.052414,-0.091035,-0.117851,0.467651,-0.305995
565,-0.215859,-0.105263,0.518133,-0.418955,0.064238,-0.169981,-0.083045,0.760512,-0.188311,-0.159,...,-0.058621,-0.058621,-0.301962,-0.045376,0.390293,-0.052414,-0.091035,-0.117851,0.467651,-0.305995
66,-0.215859,-0.105263,0.518133,-0.418955,0.064238,-0.169981,-0.083045,0.760512,-0.188311,-0.159,...,-0.058621,-0.058621,-0.301962,-0.045376,0.390293,-0.052414,-0.091035,-0.117851,0.467651,-0.305995
481,-0.215859,-0.105263,0.518133,-0.418955,0.064238,-0.169981,-0.083045,-1.314904,5.310367,-0.159,...,-0.058621,-0.058621,-0.301962,-0.045376,0.390293,-0.052414,-0.091035,-0.117851,0.467651,-0.305995
268,-0.215859,-0.105263,-1.930005,2.386891,0.064238,-0.169981,-0.083045,0.760512,-0.188311,-0.159,...,-0.058621,-0.058621,-0.301962,-0.045376,0.390293,-0.052414,-0.091035,-0.117851,0.467651,-0.305995
883,-0.215859,-0.105263,0.518133,-0.418955,0.064238,-0.169981,-0.083045,0.760512,-0.188311,-0.159,...,-0.058621,-0.058621,-0.301962,-0.045376,0.390293,-0.052414,-0.091035,-0.117851,0.467651,-0.305995
1294,-0.215859,-0.105263,0.518133,-0.418955,0.064238,-0.169981,-0.083045,0.760512,-0.188311,-0.159,...,-0.058621,-0.058621,-0.301962,-0.045376,0.390293,-0.052414,-0.091035,-0.117851,0.467651,-0.305995
271,-0.215859,-0.105263,0.518133,-0.418955,0.064238,-0.169981,-0.083045,-1.314904,-0.188311,6.289321,...,-0.058621,-0.058621,-0.301962,-0.045376,0.390293,-0.052414,-0.091035,-0.117851,0.467651,-0.305995
1457,-0.215859,-0.105263,0.518133,-0.418955,0.064238,-0.169981,-0.083045,0.760512,-0.188311,-0.159,...,-0.058621,-0.058621,-0.301962,-0.045376,0.390293,-0.052414,-0.091035,-0.117851,0.467651,-0.305995


# Feature Selection

In [15]:
possible_features = X.columns.copy()

# Upper bound on how many top-ranked features are kept in `col`.
max_features = 100

# Check feature importances
# k equals the number of candidate columns, so the selector filters nothing;
# it is fitted only to obtain the per-feature p-values of f_regression.
selector = SelectKBest(score_func=f_regression, k=len(possible_features))
selector.fit(X[possible_features], y)
# Smaller p-value -> larger score.
scores = -np.log10(selector.pvalues_)
indices = np.argsort(scores)[::-1]
# Slicing (rather than range(100)) keeps this valid when fewer than
# `max_features` columns are available.
top_indices = indices[:max_features]
col = []
print('Id Feature importances:')
for i in top_indices:
    print('%.2f %s' % (scores[i], possible_features[i]))
    col.append(possible_features[i])

Id Feature importances:
136.37 ExterQual_TA
100.89 KitchenQual_TA
98.48 GarageFinish_Unf
91.85 BsmtQual_TA
91.51 Foundation_PConc
73.86 ExterQual_Gd
67.59 BsmtFinType1_GLQ
57.14 Neighborhood_NridgHt
47.17 MasVnrType_None
44.53 SaleType_New
43.66 GarageType_Detchd
43.12 SaleCondition_Partial
40.91 Foundation_CBlock
37.82 MasVnrType_Stone
37.80 Neighborhood_NoRidge
35.76 KitchenQual_Gd
35.39 BsmtExposure_No
33.75 HeatingQC_TA
32.58 BsmtExposure_Gd
32.38 Exterior2nd_VinylSd
32.08 Exterior1st_VinylSd
28.57 MSZoning_RM
24.65 LotShape_Reg
21.74 CentralAir_Y
20.68 MSZoning_RL
20.32 HouseStyle_2Story
20.27 SaleType_WD
20.23 Electrical_SBrkr
19.18 RoofStyle_Hip
19.11 GarageType_BuiltIn
19.01 BsmtQual_Gd
18.60 GarageType_Attchd
18.56 PavedDrive_Y
17.45 RoofStyle_Gable
16.05 Neighborhood_StoneBr
13.67 MasVnrType_BrkFace
12.89 Neighborhood_OldTown
12.42 Neighborhood_NAmes
11.37 Neighborhood_Edwards
10.18 GarageFinish_RFn
10.13 RoofMatl_WdShngl
9.88 Exterior1st_MetalSd
9.54 Neighborhood_IDOTRR
9.36

In [17]:
# Build the arrays handed to the model: the selected feature columns of X
# (used for both train and test here) and the SalePrice target.
train_features = X.loc[:, col].to_numpy()
test_features = X.loc[:, col].to_numpy()
train_target = y.to_numpy()

# Training and Prediction <a id="training-prediction"></a>

Here we just use SVR for prediction, tuned with GridSearchCV

In [23]:
from sklearn.svm import SVR,LinearSVR
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid searched for the RBF-kernel SVR.
svrgs_parameters = {
    'kernel': ['rbf'],
    'C':     [100,500,2000,3000,5000],
    'gamma': [0.004,0.0045,0.005]
}

# The grid tunes `kernel` and `gamma`, which are SVR parameters; LinearSVR
# accepts neither, so the estimator must be SVR for the search to run.
# 8-fold cross-validation, scored by R^2, using all available cores.
svr_cv = GridSearchCV(estimator=SVR(), param_grid=svrgs_parameters, cv=8, scoring= 'r2',n_jobs=-1)
svr_cv.fit(train_features, train_target)
print("SVR GridSearch score: "+str(svr_cv.best_score_))
print("SVR GridSearch params: ")
print(svr_cv.best_params_)

SVR GridSearch score: 0.5004725504038599
SVR GridSearch params: 
{'C': 5000, 'gamma': 0.0045, 'kernel': 'rbf'}
