# House Prices: Advanced Regression Techniques

1. Hypothesis (Before looking the data)

   Larger area, higher price
   More Bedroom, higher price
   More Bathroom, higher price
   Higher School rate, higher price
   Newer house, higher price
   Lower store, higher price
   House type (single family, townhouse, conde), price decrease
   Sale type (loan, cash), price decrease
   Others (Remodel, roof, safty)

2. Look at the data

In [523]:
# Import packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split


from scipy import stats
from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm
from statsmodels.stats.multicomp import pairwise_tukeyhsd

In [518]:
# Reading Data
train = pd.read_csv('C:\\Users\\alexliuyi\\Documents\\Kaggle\\Home Price\\train.csv')
test  = pd.read_csv('C:\\Users\\alexliuyi\\Documents\\Kaggle\\Home Price\\test.csv')
data  = train.append(test, sort=False)

print('Shape of Train', train.shape)
print('Shape of Test',  test.shape)
print('Shape of Data',  data.shape)

Shape of Train (1460, 81)
Shape of Test (1459, 80)
Shape of Data (2919, 81)


In [427]:
# Define Numerical and Categorical Variable, as well as Dependent Variable
number_var = data1.dtypes.index[data1.dtypes.values != 'object']
cate_var =   data1.dtypes.index[data1.dtypes.values == 'object']

y, number_var = number_var[-1], number_var[:-1]

print('Number of Numerical Variables:   ', len(number_var), number_var)
print('Number of Categorical Variables: ', len(cate_var), cate_var)

Number of Numerical Variables:    37 Index(['Id', 'MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual',
       'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1',
       'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd',
       'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF',
       'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea',
       'MiscVal', 'MoSold', 'YrSold'],
      dtype='object')
Number of Categorical Variables:  38 Index(['MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities',
       'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
       'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st',
       'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation',
       'BsmtQual', 'BsmtCond', 'BsmtExposure',

In [None]:
# Check Missing Values
# Numerical Variables
num_missing = round(np.sum(data[number_var].isnull())/data.shape[0]*100,2)
print(num_missing[num_missing>0].sort_values(ascending=False))

In [None]:
# Plot Scatter for LotFrontage
plt.scatter(data1['LotFrontage'],data1['SalePrice'])

In [None]:
# Impute with Median for Numberical Variables
imr = Imputer(missing_values = 'NaN', strategy = 'mean')

In [None]:
# Descriptve for Numberical Variables
data1[number_var[1:10]].describe()

In [None]:
data1[number_var[10:20]].describe()

In [None]:
data1[number_var[20:30]].describe()

In [None]:
data1[number_var[30:]].describe()

In [None]:
%matplotlib inline
plt.figure(1)
plt.plot(data1.groupby(['YearBuilt']).median()['SalePrice'], color='blue')
plt.plot(data1.groupby(['YearRemodAdd']).median()['SalePrice'], color='green')
plt.plot(data1.groupby(['GarageYrBlt']).median()['SalePrice'], color='red')
plt.plot(data1.groupby(['YrSold']).median()['SalePrice'], color='purple')

From above plot, we will just use the YearBuilt variable

In [None]:
corr_var = ['LotArea','MasVnrArea','BsmtFinSF1','BsmtFinSF2','BsmtUnfSF',
            'TotalBsmtSF','1stFlrSF','2ndFlrSF','LowQualFinSF','GrLivArea','OpenPorchSF',
            'GarageArea','PoolArea',
            'BsmtFullBath','BsmtHalfBath','FullBath','HalfBath','BedroomAbvGr','KitchenAbvGr','Fireplaces','GarageCars',
            'SalePrice']
data_corr = data1[corr_var]

corr = data_corr.corr()

# Generate a mask for the upper triangle
mask = np.zeros_like(corr, dtype=np.bool)
mask[np.triu_indices_from(mask)] = True

f, ax = plt.subplots(figsize=(20, 10))

cmap = sns.diverging_palette(220, 50, as_cmap=True)

sns.heatmap(corr, mask=mask, cmap=cmap)

In [None]:
corr[corr>0.5]

Correlation greater than 0.5 with SalePrice:
TotalBsmtSF, 1stFlrSF, GrLivArea, FullBath, GarageArea, GarageCars

However, TotalBsmtSF & 1stFlrSF, GrLivArea & FullBath, GarageArea & GarageCars may have multicollinearity

So just choose TotalBsmtSF, GrLivArea, GarageArea as final numerical predict variables.

In [None]:
# Check Distribution
boxplot_var = ['TotalBsmtSF','GrLivArea','GarageArea']
data1[boxplot_var].boxplot()

In [None]:
# Check Missing Values
# Categorical Variables
cat_missing = round(np.sum(data[cate_var].isnull())/data.shape[0]*100,2)
print(cat_missing[cat_missing>0].sort_values(ascending=False))

In [None]:
# Categorical Variables
for i in cate_var:
    print(i, '\n', pd.value_counts(data1[i]), '\n')

In [None]:
f, ax = plt.subplots(figsize=(20, 8))
sns.violinplot(data1.SaleCondition , data1.SalePrice)

In [None]:
# ANOVA test for difference
data2 = data1[np.isnan(data1['SalePrice'])==False]

model  = ols('SalePrice ~ MSZoning', data2).fit()
anovat = anova_lm(model)
print(anovat)

In [None]:
# If ANOVA test significant, move to pairwies tukey test
print(pairwise_tukeyhsd(data2.SalePrice, data2.SaleCondition))

Choose: BldgType, ExterQual, KitchenQual, Neighbor

In [428]:
var_final = ['TotalBsmtSF','GrLivArea','GarageArea','BldgType','ExterQual','KitchenQual','Neighborhood','SalePrice']

In [471]:
# Delete Variables with Missing Values >= 20%
data_raw = data[var_final]

In [430]:
data_raw.head()

Unnamed: 0,TotalBsmtSF,GrLivArea,GarageArea,BldgType,ExterQual,KitchenQual,Neighborhood,SalePrice
0,856.0,1710,548.0,1Fam,Gd,Gd,CollgCr,208500.0
1,1262.0,1262,460.0,1Fam,TA,TA,Veenker,181500.0
2,920.0,1786,608.0,1Fam,Gd,Gd,CollgCr,223500.0
3,756.0,1717,642.0,1Fam,TA,Gd,Crawfor,140000.0
4,1145.0,2198,836.0,1Fam,Gd,Gd,NoRidge,250000.0


In [472]:
# Re-categorize
neighbor_high = ['NoRidge','NridgHt','StoneBr']

for i in range(len(data_raw.Neighborhood)):
    if data_raw.Neighborhood.values[i] in neighbor_high:
        data_raw.Neighborhood.values[i] = 'High'
    else:
        data_raw.Neighborhood.values[i] = 'Low'



In [484]:
# Impute Missing Value
for i in range(len(data_raw.KitchenQual)):
    if pd.isnull(data_raw.KitchenQual.values[i])>0 :
        data_raw.KitchenQual.values[i] = 'TA'

# Continuous Variable
cont_var = ['GarageArea','TotalBsmtSF']
for i in cont_var:
    data_raw[i] = data_raw[i].replace(np.nan, np.mean(data_raw[i]))


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [474]:
# LabelEncoder
label = LabelEncoder()
label_var = ['BldgType','ExterQual','KitchenQual','Neighborhood']

for var in label_var:
    data_raw[var] = label.fit_transform(data_raw[var])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [501]:
# Train & Test data


Unnamed: 0,TotalBsmtSF,GrLivArea,GarageArea,BldgType,ExterQual,KitchenQual,Neighborhood,SalePrice
0,856.0,1710,548.0,0,2,2,1,208500.0
1,1262.0,1262,460.0,0,3,3,1,181500.0
2,920.0,1786,608.0,0,2,2,1,223500.0
3,756.0,1717,642.0,0,3,2,1,140000.0
4,1145.0,2198,836.0,0,2,2,0,250000.0


In [538]:
# Onehot Encoding
data_feature_array = data_raw[label_var].values

data_list = data_feature_array.tolist()

onehot = OneHotEncoder()

onehot.fit(data_list)

data_onehot = onehot.transform(data_list).toarray()

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [539]:
final_var = ['TotalBsmtSF','GrLivArea','GarageArea']

data_final = np.hstack((data_raw[final_var], data_onehot))

In [540]:
X_train = data_final[0:1460,:]
X_test  = data_final[1460:,:]

Y_train = data_raw.SalePrice.values[0:1460]

print(X_train.shape, X_test.shape, Y_train.shape)

(1460, 18) (1459, 18) (1460,)


In [541]:
X_model, X_validate, y_model, y_validate = train_test_split(X_train, Y_train, test_size=1/4, random_state=0)

In [542]:
scaler = MinMaxScaler()

X_model_scaled = scaler.fit_transform(X_model)
X_validate_scaled = scaler.transform(X_validate)
X_test_scaled = scaler.transform(X_test)


In [551]:
from sklearn.linear_model import LinearRegression, LassoCV, RidgeCV
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_log_error

LR = LinearRegression()
LR.fit(X_model_scaled, y_model)
MSE = mean_squared_log_error(LR.predict(X_validate_scaled), y_validate)
print(MSE)


0.038998931660799166


In [552]:
Lasso = LassoCV(cv=5)
Lasso.fit(X_model_scaled, y_model)
MSE = mean_squared_log_error(Lasso.predict(X_validate_scaled), y_validate)
print(MSE)

0.037035085721662135


In [553]:
Ridge = RidgeCV(cv=5)
Ridge.fit(X_model_scaled, y_model)
MSE = mean_squared_log_error(Ridge.predict(X_validate_scaled), y_validate)
print(MSE)

0.03872335723452591


In [563]:
predicted = Lasso.predict(X_test_scaled)

In [564]:
result = pd.DataFrame(predicted)

In [566]:
result.to_csv('C:\\Users\\alexliuyi\\Documents\\Kaggle\\Home Price\\result.csv')

In [562]:
LR.predict(X_test)

array([3.85075478e+08, 5.17631223e+08, 5.58961423e+08, ...,
       4.97689082e+08, 3.54414899e+08, 6.73569877e+08])