In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as st
import statsmodels.api as sm
# Raise pandas' display limits to 1000 rows / 100 columns so wide frames
# are not truncated when shown in the notebook.
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 100)
# Apply seaborn's white grid theme to the figures drawn below.
sns.set_style('whitegrid')
import kaggle
from zipfile import ZipFile

Configure

In [None]:
# Read both competition splits straight out of the downloaded archive.
# The archive is opened once and every handle is closed when the `with`
# blocks exit, so no file descriptors are left dangling in the kernel.
with ZipFile("house-prices-advanced-regression-techniques.zip") as archive:
    with archive.open("train.csv") as train_file:
        train = pd.read_csv(train_file)
    with archive.open("test.csv") as test_file:
        test = pd.read_csv(test_file)

Load data

In [None]:
# Display the first rows of the training frame as a quick sanity check of the load.
train.head()

Preview head

In [None]:
# Split the columns by dtype: object columns are treated as categorical,
# every other column as numeric.
is_text = (train.dtypes == 'object').to_numpy()
categorical = train.columns[is_text].tolist()
numeric = train.columns[~is_text].tolist()

# The target and the row identifier are kept out of the numeric feature list.
for excluded in ('SalePrice', 'Id'):
    numeric.remove(excluded)

Separate numeric, categorical and target from each other.

# Explore Data
1460 ids in the training set. 81 variables in the dataset. 36 numeric variables (excluding Ids and Sale Price) and 43 categorical variables.

Numeric: 
'MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual',
'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea',
'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF',
'2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath',
'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr',
'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt',
'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF',
'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal',
'MoSold', 'YrSold'

Categorical:
'MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour',
'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood',
'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle',
'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond',
'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating',
'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual',
'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish',
'GarageQual', 'GarageCond', 'PavedDrive', 'PoolQC', 'Fence',
'MiscFeature', 'SaleType', 'SaleCondition'

## Sale Price Variable
Sale Price is what we want to predict. Mean is around 180k with a couple of outliers.

In [None]:
# Target column that the rest of the notebook tries to explain.
y = train['SalePrice']

# Draw a 100-bin histogram of the target, write it to disk, then close the
# figure so it is not rendered inline as well.
hist_ax = y.hist(bins=100)
hist_fig = hist_ax.get_figure()
hist_fig.savefig('hopr-saleprice.svg')
plt.close(hist_fig);

![](hopr-saleprice-ant.svg1)
See the shape of the target data.

In [None]:
# Summary statistics of the target, to compare against the histogram.
y.describe()

The distribution is slightly right-skewed because of a few expensive outliers. It is not yet clear whether keeping them will hurt model accuracy.

## Overall Quality
SalePrice increases as OverallQual increases, as expected. Because we are working with averages, a feature with outliers that lie far from the rest of the points will skew our algorithm.

In [None]:
# One box of sale prices per overall-quality grade, saved to disk and then
# closed so the figure is not rendered inline as well.
quality_ax = sns.boxplot(data=train, x='OverallQual', y='SalePrice', color='steelblue')
quality_fig = quality_ax.get_figure()
quality_fig.savefig('hopr-overallquality.svg')
plt.close(quality_fig);

![](hopr-overallquality.svg)
In this dataset there are some outliers, but they are neither far enough from the rest nor numerous enough to threaten our averages.

## GrLivArea
GrLivArea is the "Above Grade Living Area". There are many ways to measure a house, but the most frequently used measurement is the living area above ground level. In this dataset, it has a strong relationship with SalePrice.

In [None]:
# Scatter of living area against sale price with a fitted regression line
# (no confidence band), saved to disk and then closed.
area_ax = sns.regplot(
    data=train,
    x='GrLivArea',
    y='SalePrice',
    ci=None,
    scatter_kws={'alpha': 0.5},
    line_kws={"color": "orange"},
)
area_fig = area_ax.get_figure()
area_fig.savefig('hopr-saleprice-grlivarea.svg')
plt.close(area_fig);

![](hopr-saleprice-grlivarea-ant.svg1)
The larger the house, the more you pay. Most houses follow this pattern, except for 2 very large houses (rows 523 and 1298) that sold for far less than their size would suggest.

## Correlation
Now we look at which features are most correlated with SalePrice.

In [None]:
# Correlation of every numeric feature with the target, strongest first.
saleprice_corr = train[numeric].corrwith(train['SalePrice'])
ranked_corr = saleprice_corr.sort_values(ascending=False)

# Keep the 15 strongest as a two-column table; `g` is read again by the
# heatmap cell further down.
g = ranked_corr.head(15).reset_index()
g.columns = ['Features', 'Correlation']
g.style.bar(vmax=1, vmin=0)

Overall Quality seems to be the metric most correlated with Sale Price.

In [None]:
# Pairwise correlations between the target and the top features listed in `g`.
top_features = ['SalePrice'] + list(g['Features'])[:15]
matrix = train[top_features].corr()

# Zero the diagonal so the trivial self-correlations (1.0) do not dominate
# the colour scale. The fill is done on an explicit copy: `matrix.values`
# is a read-only view when pandas copy-on-write is active, so filling it
# in place raises there.
corr_values = matrix.to_numpy(copy=True)
np.fill_diagonal(corr_values, 0)
matrix = pd.DataFrame(corr_values, index=matrix.index, columns=matrix.columns)

# Annotated heatmap; cells below 0.4 are drawn at the lightest colour.
sns.heatmap(matrix, linewidths=.01, linecolor='lightgrey', annot=True, cmap='Blues', vmin=.4, cbar=False);
plt.savefig('hopr-matrix.svg')
plt.close();

![](hopr-matrix-annot.svg1)

Calculated correlation with the rest of the features. 
- GarageCars and GarageArea have a high correlation with each other.
- There are other such pairs, which I will keep in mind when I decide to combine features into new ones.

# Cleaning Area

In [None]:
# Ordinal encoding shared by the quality/condition columns. 'NULL' is the
# placeholder for a missing value and is ranked lowest.
quality_scale = {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'NULL': 0}

cat_num = ['ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'HeatingQC', 'KitchenQual', 'FireplaceQu', 'GarageQual']
for c in cat_num:
    train[c] = (
        train[c]
        # Missing values are still NaN at this point, so they must be turned
        # into the 'NULL' label first; otherwise the 'NULL' -> 0 mapping never
        # applies and a later fillna('NULL') would inject strings into an
        # otherwise numeric column.
        .fillna('NULL')
        # Values that are already numeric pass through unchanged, which keeps
        # this cell safe to re-run.
        .map(lambda label: quality_scale.get(label, label))
        # Explicit cast instead of relying on implicit downcasting; an
        # unexpected label fails loudly here rather than lingering as text.
        .astype(int)
    )
# NOTE: after this step these columns contain 0 instead of NaN, so they are
# no longer counted by missing-value checks run afterwards.


## Missing Data

In [None]:
# Null count per column, restricted to columns with at least one null.
missing = train.isnull().sum().loc[lambda counts: counts > 0]

# Put each count next to its share of all rows, expressed in percent.
n_rows = len(train)
missing_tally = missing.to_frame('count').assign(proportion=missing / n_rows * 100)

# Show the worst-affected columns first, with in-cell bars scaled to 100 %.
(
    missing_tally
    .sort_values(by='proportion', ascending=False)
    .style.bar(vmax=100)
)

19 columns in the raw training data have missing values. A missing value can be read as the house not having that feature. Here are all of them.

## Batch Exploring Features


In [None]:
# Give missing categories an explicit 'NULL' label so they get their own box.
for column in categorical:
    train[column] = train[column].fillna('NULL')


def boxplot(x, y, **kwargs):
    """Draw one facet: a boxplot of `y` per level of `x` with vertical tick labels.

    Extra keyword arguments passed in by `FacetGrid.map` are accepted but
    not used.
    """
    sns.boxplot(x=x, y=y)
    plt.xticks(rotation=90)


# Long format: one row per (house, categorical variable) pair.
f = train.melt(id_vars=['SalePrice'], value_vars=categorical)

# One facet per categorical variable, two per row, each with its own axes.
g = sns.FacetGrid(
    f,
    col="variable",
    col_wrap=2,
    sharex=False,
    sharey=False,
    height=4,
).map(boxplot, "value", "SalePrice")

- **Neighborhood Split Class**. CollgCr seems to be average. OldTown and Edwards are commonly cheap. NridgHt, NoRidge and StonBr are at the higher end.
- **Poor-Excellent Quality**. Some categories can be turned into numeric values.
- **SaleCondition and SaleType**. The Partial SaleCondition and the New SaleType both seem to sit at higher sale prices.


In [None]:
# Frequency of each BsmtCond value after the cleaning steps above.
train['BsmtCond'].value_counts()

# Machine Learning