In [1]:
# Load packages
import numpy as np
import scipy.stats as stats
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.metrics import mean_squared_error

from sklearn.feature_selection import SelectKBest, chi2, f_classif, RFECV
from sklearn.preprocessing import StandardScaler

from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import (LinearRegression, LogisticRegression,
                                  LogisticRegressionCV)

from sklearn.model_selection import cross_val_score

sns.set_style('whitegrid')

%config InlineBackend.figure_format = 'retina'
%matplotlib inline

# Considering there are a lot of features in the dataset, make 
# the display wider to make sure every column could be shown.
pd.set_option('display.max_columns', 500)

# Load the data
house = pd.read_csv('./housing.csv')

# Estimate house value

Estimate the sale price of properties based on their "fixed" characteristics, such as neighborhood, lot size, number of stories.

##  Data cleaning

In [2]:
# First glance at data.
house.head(2)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7,5,2003,2003,Gable,CompShg,VinylSd,VinylSd,BrkFace,196.0,Gd,TA,PConc,Gd,TA,No,GLQ,706,Unf,0,150,856,GasA,Ex,Y,SBrkr,856,854,0,1710,1,0,2,1,3,1,Gd,8,Typ,0,,Attchd,2003.0,RFn,2,548,TA,TA,Y,0,61,0,0,0,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,Gtl,Veenker,Feedr,Norm,1Fam,1Story,6,8,1976,1976,Gable,CompShg,MetalSd,MetalSd,,0.0,TA,TA,CBlock,Gd,TA,Gd,ALQ,978,Unf,0,284,1262,GasA,Ex,Y,SBrkr,1262,0,0,1262,0,1,2,0,3,1,TA,6,Typ,1,TA,Attchd,1976.0,RFn,2,460,TA,TA,Y,298,0,0,0,0,0,,,,0,5,2007,WD,Normal,181500


In [7]:
# shape of house dataset
house.shape

(1460, 81)

In [8]:
# house.dtype

In [9]:
# Check basic data infomation
# house.info()

In [10]:
# Describe numeric data
# house.describe().T

### Drop unwanted rows
Non-residential properties are outside the scope of this analysis, so they should be dropped.

MSZoning: Identifies the general zoning classification of the sale.
		
       A	Agriculture
       C	Commercial
       FV	Floating Village Residential
       I	Industrial
       RH	Residential High Density
       RL	Residential Low Density
       RP	Residential Low Density Park 
       RM	Residential Medium Density

In [11]:
# Remove every non-residential property (agriculture, commercial, industrial),
# not only the single commercial code that happens to appear in this extract.
# The short codes come from the data dictionary above; 'C (all)' is the spelling
# actually present in the data.
# NOTE(review): 'A (agr)' and 'I (all)' are assumed by analogy with 'C (all)';
# confirm against the full data set.
NON_RESIDENTIAL_ZONES = ['A', 'A (agr)', 'C', 'C (all)', 'I', 'I (all)']

print('{} {}'.format(house.MSZoning.unique(), house.shape))
house = house[~house.MSZoning.isin(NON_RESIDENTIAL_ZONES)]
print('{} {}'.format(house.MSZoning.unique(), house.shape))

# In this data only 'C (all)' (commercial) is present, so 10 rows are dropped.

['RL' 'RM' 'C (all)' 'FV' 'RH'] (1460, 81)
['RL' 'RM' 'FV' 'RH'] (1450, 81)


### Drop unwanted features
A large share (roughly 47% to 99%) of the values in the Alley, FireplaceQu, PoolQC, Fence and MiscFeature features is missing, so neither fillna nor imputation would do the future model any good. These 5 columns should therefore be dropped.

In [13]:
# Report every column in which more than MISSING_PCT_THRESHOLD percent of the
# values are missing. The percentage is computed from the current row count
# instead of a hard-coded 1450, so it stays correct if the row filter changes.
MISSING_PCT_THRESHOLD = 20

null_counts = house.isnull().sum()
null_pct = 100.0 * null_counts / len(house)
for col in house.columns[(null_pct > MISSING_PCT_THRESHOLD).values]:
    print('{} Column has {} null value, which is {}% missing data'.format(
        col, null_counts[col], null_pct[col]))

# Drop Alley, FireplaceQu, PoolQC, Fence, MiscFeature,
# since most of data in those columns are missing.
# errors='ignore' keeps this cell re-runnable: on a second run the columns are
# already gone and the drop becomes a no-op instead of raising KeyError.
print(house.shape)
house = house.drop(
    ['Id', 'Alley', 'FireplaceQu', 'PoolQC', 'Fence', 'MiscFeature'],
    axis=1, errors='ignore')
print(house.shape)

# There are 6 columns dropped. The Id column is dropped as well
# since it does not help with prediction.

Alley Column has 1361 null value, which is 93.8620689655% missing data
FireplaceQu Column has 681 null value, which is 46.9655172414% missing data
PoolQC Column has 1443 null value, which is 99.5172413793% missing data
Fence Column has 1172 null value, which is 80.8275862069% missing data
MiscFeature Column has 1398 null value, which is 96.4137931034% missing data
(1450, 81)
(1450, 75)


### Fill NA

In [None]:
# LotFrontage has too many gaps for a simple fill and is left for the
# imputation section that follows. It is excluded by name here; previously it
# was excluded only because it happened to be the first column with nulls.
IMPUTE_SEPARATELY = 'LotFrontage'

# Find columns that still have null values
mc = house.columns[pd.isnull(house).sum() > 0]
for c in mc:
    if c == IMPUTE_SEPARATELY:
        continue
    print('There are {} missing value in {}.'.format(house[c].isnull().sum(), c))
    if house[c].dtype == 'object':
        # Object column: a missing entry becomes the literal string 'None'
        house[c] = house[c].fillna('None')
    else:
        # Non-object (numeric) column: a missing entry becomes the column mean
        house[c] = house[c].fillna(house[c].mean())

print('\nAfter fill na, there are still {} missing data in {} colums.'.format(
    house.isnull().sum().sum(), IMPUTE_SEPARATELY))

# Non-object columns are filled with the column mean,
# object columns are filled with the string 'None'.
# house.info()

Very little data is missing in the columns above, so I decided to fill the missing values with the column mean for numeric columns, or with 'None' for object columns.

### Imputing Missing Data: LotFrontage
259 of the 1450 values in this column are missing, which is a very large proportion, so these values need to be imputed rather than simply filled.

In [15]:
#house.LotFrontage

## Identify Fixed Feature

Fixed feature:
- MSSubClass: Identifies the type of dwelling involved in the sale.	
- LotFrontage: Linear feet of street connected to property
- LandContour: Flatness of the property
- LotConfig: Lot configuration
- LandSlope: Slope of property
- Neighborhood: Physical locations within Ames city limits
- Condition1: Proximity to various conditions
- Condition2: Proximity to various conditions (if more than one is present)
- BldgType: Type of dwelling ?
- YearBuilt: Original construction date
- YearRemodAdd: Remodel date (same as construction date if no remodeling or additions)
- BsmtQual: Evaluates the height of the basement ?
- 1stFlrSF: First Floor square feet
- 2ndFlrSF: Second floor square feet
- GrLivArea: Above grade (ground) living area square feet ?
- BsmtFullBath: Basement full bathrooms
- BsmtHalfBath: Basement half bathrooms
- FullBath: Full bathrooms above grade
- HalfBath: Half baths above grade
- BedroomAbvGr: Bedrooms above grade (does NOT include basement bedrooms)
- KitchenAbvGr: Kitchens above grade
- TotRmsAbvGrd: Total rooms above grade (does not include bathrooms)
- Fireplaces: Number of fireplaces
- GarageCars: Size of garage in car capacity
- GarageArea: Size of garage in square feet
- WoodDeckSF: Wood deck area in square feet ?
- OpenPorchSF: Open porch area in square feet
- EnclosedPorch: Enclosed porch area in square feet
- 3SsnPorch: Three season porch area in square feet
- ScreenPorch: Screen porch area in square feet
- PoolArea: Pool area in square feet

## Exploratory Data Analysis

Do time-related features have a strong or a weak correlation with sale price?

In [None]:
house['SalePrice'].plot(kind='hist', figsize=(10,5), bins=30)

According to the histogram above, the sale price is roughly normally distributed. There are clearly some outliers, which should be removed later.

In [None]:
sns.jointplot(x="YrSold", y="SalePrice", data=house, kind="reg")

The sale price of a house shows no correlation with the year in which it was sold.

In [None]:
sns.jointplot(x="MoSold", y="SalePrice", data=house, kind="reg")

Similarly to the year of sale, the month of sale has no strong correlation with sale price.

These 2 features should be dropped after feature selection.

In [None]:
sns.jointplot(x="YearBuilt", y="SalePrice", data=house, kind="reg")

In [None]:
sns.jointplot(x="YearRemodAdd", y="SalePrice", data=house, kind="reg")

In [None]:
sns.jointplot(x="GarageYrBlt", y="SalePrice", data=house, kind="reg")

Compared to YrSold and MoSold, YearBuilt, YearRemodAdd and GarageYrBlt are obviously better features, as they are more strongly associated with sale price.

- YearBuilt: Original construction date
- YearRemodAdd: Remodel date (same as construction date if no remodeling or additions)
- GarageYrBlt: Year garage was built


In [None]:
sns.pairplot(house[['YearBuilt', 'YearRemodAdd', 'GarageYrBlt', 'YrSold', 
                   'MoSold', 'SalePrice']])

In [None]:
plt.figure(figsize=(20,20))
sns.heatmap(house.corr(), annot=True)

According to the plot above, some features have a strong correlation with sale price.

- OverallQual
- TotalBsmtSF
- 1stFlrSF
- GrLivArea
- GarageCars
- GarageArea

Other finding:

- GarageYrBlt and YearBuilt are strongly correlated (78%)
- GarageArea and GarageCars (88%)
- TotRmsAbvGrd and GrLivArea (83%)
- 1stFlrSF and TotalBsmtSF (82%)

In general, the two features in each pair have a similar real-world meaning. For instance, the year the garage was built and the year the house was built are strongly correlated because a garage is usually built together with the house.

Such pairs are not good for linear regression because the two features are strongly correlated with each other. Therefore, only one feature from each pair should be chosen for linear regression.

This serves as a reference for checking the result of *feature selection*.

## Identify outliers

In [None]:
# Box plot functions helping mathod to help identify outliers
def boxplot_helper(column):
    """Show a vertical, notched box plot of a single column.

    Parameters
    ----------
    column : object with a ``name`` attribute (e.g. a pandas Series)
        Values to plot. ``column.name`` is used for both the y-axis label
        and the title.
    """
    figure = plt.figure(figsize=(6, 4))

    # Draw onto the figure's current axes; seaborn returns those same axes.
    axes = sns.boxplot(
        column,
        orient='v',
        fliersize=8,
        linewidth=1.5,
        notch=True,
        saturation=0.5,
        ax=figure.gca(),
    )

    label = column.name
    axes.set_ylabel(label, fontsize=16)
    axes.set_title(label, fontsize=20)

    plt.show()
    
def boxplot_all(df, f=(12,6)):
    """Show horizontal, notched box plots for every variable in ``df``.

    Parameters
    ----------
    df : data accepted by ``sns.boxplot(data=...)``
        One box is drawn per variable.
    f : tuple, default (12, 6)
        Figure size passed to ``plt.figure(figsize=...)``.
    """
    figure = plt.figure(figsize=f)

    axes = sns.boxplot(
        data=df,
        orient='h',
        fliersize=5,
        linewidth=3,
        notch=True,
        saturation=0.5,
        ax=figure.gca(),
    )
    axes.set_title('All variables boxplot\n')

    plt.show()

In [None]:
# boxplot of saleprice column
boxplot_helper(house.SalePrice)

In [None]:
# Separate the numeric columns from the object-dtype columns:
# keep the name of every column whose dtype is not 'object'.
numer_col = [name for name in house.columns
             if house[name].dtypes != 'object']

# Sub-frame holding only the numeric columns
house_numer_cols = house[numer_col]

In [None]:
# Standardize numeric data and boxplot it 
hnc_stand = (house_numer_cols - house_numer_cols.mean()) / house_numer_cols.std()
boxplot_all(hnc_stand, f=(12,12))

According to the plot above, we can confirm that there are many outliers. The most obvious ones are in "LotArea", "LowQualFinSF", "3SsnPorch", "PoolArea" and "MiscVal".

- LotArea: Lot size in square feet.
- LowQualFinSF: Low quality finished square feet (all floors).
- 3SsnPorch: Three season porch area in square feet.
- PoolArea: Pool area in square feet.
- MiscVal: Value of miscellaneous feature.

It is hard to come up with a realistic rule for deciding what counts as an outlier, since even the outliers seem reasonable. For instance, a house really can have a giant swimming pool while most of the others have none. However, outliers can do real damage to our model, because they can skew or bias it towards the outlying values.

Therefore, I'm going to identify the outliers and remove them from the dataset.

In [None]:
def is_outlier(points, thresh=3.5):
    """Flag observations whose modified z-score exceeds ``thresh``.

    Each observation's distance from the per-column median is computed
    (Euclidean distance across columns for 2-D input), scaled by the median
    of those distances (the median absolute deviation, MAD) and by the
    constant 0.6745, and compared against ``thresh``.

    Parameters
    ----------
    points : array-like
        A 1-D sequence of observations, or a 2-D array / DataFrame with one
        observation per row. Plain lists are accepted.
    thresh : float, default 3.5
        Observations with a modified z-score strictly greater than this
        value are flagged.

    Returns
    -------
    Boolean mask with one entry per observation; True marks an outlier.

    Notes
    -----
    When the MAD is exactly zero the score is undefined. In that case every
    observation that differs from the median is flagged. This is the same
    outcome the unguarded division produced (inf > thresh), but without
    dividing by zero.
    """
    if not hasattr(points, 'shape'):
        # Plain sequences (lists, tuples) have no .shape; coerce them first.
        points = np.asarray(points, dtype=float)
    if len(points.shape) == 1:
        # Treat 1-D input as a single column of observations. Going through
        # np.asarray avoids relying on 2-D indexing of a pandas Series.
        points = np.asarray(points)[:, None]

    median = np.median(points, axis=0)
    diff = np.sqrt(np.sum((points - median)**2, axis=1))
    med_abs_deviation = np.median(diff)

    if med_abs_deviation == 0:
        # Score undefined: any deviation from the median counts as an outlier.
        return diff > 0

    modified_z_score = 0.6745 * diff / med_abs_deviation

    return modified_z_score > thresh

Identify outliers:

In [None]:
hnc_stand_noout = hnc_stand[~ is_outlier(hnc_stand, thresh=1.2)]
print hnc_stand_noout.shape
boxplot_all(hnc_stand_noout, f=(12,12))

Compared with the previous boxplot, the range of the x-axis has narrowed significantly: it was about -5 to 30 before and is now about -4 to 8.

There are (1450 - 1392=) 58 rows dropped.

In [None]:
house_remove_outlier = house[~ is_outlier(hnc_stand, thresh=1.2)]
house_remove_outlier.shape

## Feature Selection

### Encode categorical features as dummy variables

In [None]:
# Extract the columns with object dtype: collect their names from `house`,
# then take those columns from the outlier-free frame.
object_cols = [name for name in house.columns
               if house[name].dtypes == 'object']
print(len(object_cols))

house_with_object_cols = house_remove_outlier.loc[:, object_cols]
print(house_with_object_cols.shape)

In [None]:
# One-hot encode every object column (first level dropped) and replace the
# original object columns with their dummies.
# All dummy frames are built first and joined with a single concat, instead of
# re-concatenating and dropping in place on every loop iteration.
dummy_frames = [pd.get_dummies(house_with_object_cols[[c]], drop_first=True)
                for c in object_cols]

# Column order is kept as before: dummies of the last object column first,
# followed by the remaining non-object columns in their original order.
house_fix = pd.concat(
    dummy_frames[::-1] + [house_remove_outlier.drop(object_cols, axis=1)],
    axis=1)
house_fix.shape

### Feature selection

In [None]:
# Split the data into testing and training groups depending on whether the
# house was sold in TEST_YEAR: those sales form the test set, all other sales
# form the training set.
TARGET = 'SalePrice'
TEST_YEAR = 2010

sold_in_test_year = house_fix.YrSold == TEST_YEAR
testing = house_fix[sold_in_test_year]
training = house_fix[~sold_in_test_year]

# Select the target by name and use every other column as a feature, rather
# than assuming the target is the last column of house_fix.
testing_y = testing[TARGET].values
testing_X = testing.drop(TARGET, axis=1)

training_y = training[TARGET].values
training_X = training.drop(TARGET, axis=1)

# Every row must land in exactly one of the two groups.
assert len(training) + len(testing) == len(house_fix)

## Build models

In [None]:
# Standardize the features. The scaler is fitted on the training rows only and
# the same fitted transform is then applied to the test rows.
# NOTE(review): the fill-NA cell appears to skip LotFrontage and no imputation
# for it is visible in this notebook, so NaNs may still be present here --
# confirm they are handled before scaling and fitting.
ss = StandardScaler()
Xs_train = ss.fit_transform(training_X)
Xs_test = ss.transform(testing_X)

In [None]:
# Fit a linear regression on all standardized features.
# LinearRegression is imported directly in the import cell; the previous
# `linear_model.LinearRegression` referred to a module that was never imported.
lm = LinearRegression()
model = lm.fit(Xs_train, training_y)

## Model Evaluation

In [None]:
# Result for the model with all features, evaluated on the test set.
predict_test = model.predict(Xs_test)

# Round the predictions to whole numbers. astype() returns a new array, so the
# result is assigned back; previously it was discarded and y_hat stayed float.
y_hat = np.round(predict_test).astype(int)

# R^2 uses the model's raw predictions; RMSE uses the rounded ones.
print('Test R^2: {:.4f}'.format(model.score(Xs_test, testing_y)))
print('Test RMSE: {:.2f}'.format(
    np.sqrt(mean_squared_error(testing_y, y_hat))))

#sns.jointplot(testing_y, y_hat)

y_hat

# Determine any value of *changeable* property characteristics unexplained by the *fixed* ones.


In [None]:
# A:

# What property characteristics predict an "abnormal" sale?


In [None]:
# A: