# House Prices: Advanced Regression Techniques

In [1]:
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Read the train and test CSV files, then stack them (train rows first) into
# one frame with a fresh 0..n-1 index so cleaning can be applied to both at once.
df_train = pd.read_csv('data/train.csv')
df_test = pd.read_csv('data/test.csv')
df_all = pd.concat([df_train, df_test], ignore_index=True)

In [3]:
# Size summary of the three frames. Each entry pairs an output template with
# the value to print; a trailing '\n' in a template leaves a blank line after it.
# The "y" figures are row counts; only the training one is read from 'SalePrice'.
summary_lines = [
    ('Number of Training Examples = {}', len(df_train)),
    ('Number of Test Examples = {}', len(df_test)),
    ('Number of All Examples = {}\n', len(df_all)),
    ('Training X Shape = {}', df_train.shape),
    ('Training y Shape = {}\n', len(df_train['SalePrice'])),
    ('Test X Shape = {}', df_test.shape),
    ('Test y Shape = {}\n', len(df_test)),
    ('All X Shape = {}', df_all.shape),
    ('All y Shape = {}\n', len(df_all)),
]
for template, value in summary_lines:
    print(template.format(value))

# Ad-hoc inspections, left disabled:
#df_train.info()
#df_train.head(10)
#df_train.sample(8)
#df_all['SalePrice'].head(2919)

Number of Training Examples = 1460
Number of Test Examples = 1459
Number of All Examples = 2919

Training X Shape = (1460, 81)
Training y Shape = 1460

Test X Shape = (1459, 80)
Test y Shape = 1459

All X Shape = (2919, 81)
All y Shape = 2919



### Features

    SalePrice: The property's sale price in dollars. This is the target variable to predict.
    MSSubClass: The building class
    MSZoning: The general zoning classification
    LotFrontage: Linear feet of street connected to property
    LotArea: Lot size in square feet
    Street: Type of road access
    Alley: Type of alley access
    LotShape: General shape of property
    LandContour: Flatness of the property
    Utilities: Type of utilities available
    LotConfig: Lot configuration
    LandSlope: Slope of property
    Neighborhood: Physical locations within Ames city limits
    Condition1: Proximity to main road or railroad
    Condition2: Proximity to main road or railroad (if a second is present)
    BldgType: Type of dwelling
    HouseStyle: Style of dwelling
    
    OverallQual: Overall material and finish quality
    OverallCond: Overall condition rating
    YearBuilt: Original construction date
    YearRemodAdd: Remodel date
    RoofStyle: Type of roof
    RoofMatl: Roof material
    Exterior1st: Exterior covering on house
    Exterior2nd: Exterior covering on house (if more than one material)
    
    MasVnrType: Masonry veneer type
    MasVnrArea: Masonry veneer area in square feet
    ExterQual: Exterior material quality
    ExterCond: Present condition of the material on the exterior
    Foundation: Type of foundation
    
    BsmtQual: Height of the basement
    BsmtCond: General condition of the basement
    BsmtExposure: Walkout or garden level basement walls
    BsmtFinType1: Quality of basement finished area
    BsmtFinSF1: Type 1 finished square feet
    BsmtFinType2: Quality of second finished area (if present)
    BsmtFinSF2: Type 2 finished square feet
    BsmtUnfSF: Unfinished square feet of basement area
    TotalBsmtSF: Total square feet of basement area

    BsmtFullBath: Basement full bathrooms
    BsmtHalfBath: Basement half bathrooms
    FullBath: Full bathrooms above grade
    HalfBath: Half baths above grade

    Bedroom: Number of bedrooms above basement level
    Kitchen: Number of kitchens
    KitchenQual: Kitchen quality    
    TotRmsAbvGrd: Total rooms above grade (does not include bathrooms)

    Heating: Type of heating
    HeatingQC: Heating quality and condition
    CentralAir: Central air conditioning
    Electrical: Electrical system

    1stFlrSF: First Floor square feet
    2ndFlrSF: Second floor square feet
    LowQualFinSF: Low quality finished square feet (all floors)
    GrLivArea: Above grade (ground) living area square feet
    
    Functional: Home functionality rating
    Fireplaces: Number of fireplaces
    FireplaceQu: Fireplace quality

    GarageType: Garage location
    GarageYrBlt: Year garage was built
    GarageFinish: Interior finish of the garage
    GarageCars: Size of garage in car capacity
    GarageArea: Size of garage in square feet
    GarageQual: Garage quality
    GarageCond: Garage condition
    
    PavedDrive: Paved driveway
    WoodDeckSF: Wood deck area in square feet
    
    OpenPorchSF: Open porch area in square feet
    EnclosedPorch: Enclosed porch area in square feet
    3SsnPorch: Three season porch area in square feet
    ScreenPorch: Screen porch area in square feet

    PoolArea: Pool area in square feet
    PoolQC: Pool quality
    Fence: Fence quality
    MiscFeature: Miscellaneous feature not covered in other categories
    MiscVal: $ Value of miscellaneous feature
    MoSold: Month Sold
    YrSold: Year Sold
    SaleType: Type of sale
    SaleCondition: Condition of sale

In [4]:
# Missing Values
def print_missing_values(df):
    """Print one line per column of ``df`` that contains missing values.

    Each line shows a running number, the column name, the missing count over
    the total number of rows, the missing share as a percentage and the column
    dtype. A final line reports how many columns had missing values.

    Args:
        df: pandas DataFrame to inspect.

    Returns:
        None; the report is written to standard output.
    """
    row_template = '   {} \t{:<12} {:>8}/{} \t{:.2%} \t{}'
    n_rows = len(df)
    flagged = 0
    for name in df.columns.tolist():
        n_null = df[name].isnull().sum()
        if not n_null:
            # Fully populated column: nothing to report.
            continue
        flagged += 1
        print(row_template.format(flagged, name, n_null, n_rows, n_null/n_rows, df[name].dtype))
    print('Total missing features: {}  \n'.format(flagged))

In [5]:
# Baseline report: columns of the combined train+test frame that contain missing values.
print_missing_values(df_all)

   1 	Alley            2721/2919 	93.22% 	object
   2 	BsmtCond           82/2919 	2.81% 	object
   3 	BsmtExposure       82/2919 	2.81% 	object
   4 	BsmtFinSF1          1/2919 	0.03% 	float64
   5 	BsmtFinSF2          1/2919 	0.03% 	float64
   6 	BsmtFinType1       79/2919 	2.71% 	object
   7 	BsmtFinType2       80/2919 	2.74% 	object
   8 	BsmtFullBath        2/2919 	0.07% 	float64
   9 	BsmtHalfBath        2/2919 	0.07% 	float64
   10 	BsmtQual           81/2919 	2.77% 	object
   11 	BsmtUnfSF           1/2919 	0.03% 	float64
   12 	Electrical          1/2919 	0.03% 	object
   13 	Exterior1st         1/2919 	0.03% 	object
   14 	Exterior2nd         1/2919 	0.03% 	object
   15 	Fence            2348/2919 	80.44% 	object
   16 	FireplaceQu      1420/2919 	48.65% 	object
   17 	Functional          2/2919 	0.07% 	object
   18 	GarageArea          1/2919 	0.03% 	float64
   19 	GarageCars          1/2919 	0.03% 	float64
   20 	GarageCond        159/2919 	5.45% 	object
   21 	GarageFinish

In [6]:
# Placeholder values used by the imputation cells below:
#   fil_val_int -> written into numeric columns
#   fil_val_str -> written into categorical (string) columns
fil_val_int, fil_val_str = 0, '---'

In [7]:
# Basement and garage columns: numeric gaps get fil_val_int, categorical gaps
# get fil_val_str. Each group of columns is filled in one assignment.
bsmt_numeric = ['BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath']
bsmt_categorical = ['BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2']
df_all[bsmt_numeric] = df_all[bsmt_numeric].fillna(fil_val_int)
df_all[bsmt_categorical] = df_all[bsmt_categorical].fillna(fil_val_str)

garage_numeric = ['GarageArea', 'GarageCars', 'GarageYrBlt']
garage_categorical = ['GarageType', 'GarageFinish', 'GarageQual', 'GarageCond']
df_all[garage_numeric] = df_all[garage_numeric].fillna(fil_val_int)
df_all[garage_categorical] = df_all[garage_categorical].fillna(fil_val_str)

# Show which columns still contain missing values.
print_missing_values(df_all)

   1 	Alley            2721/2919 	93.22% 	object
   2 	Electrical          1/2919 	0.03% 	object
   3 	Exterior1st         1/2919 	0.03% 	object
   4 	Exterior2nd         1/2919 	0.03% 	object
   5 	Fence            2348/2919 	80.44% 	object
   6 	FireplaceQu      1420/2919 	48.65% 	object
   7 	Functional          2/2919 	0.07% 	object
   8 	KitchenQual         1/2919 	0.03% 	object
   9 	LotFrontage       486/2919 	16.65% 	float64
   10 	MSZoning            4/2919 	0.14% 	object
   11 	MasVnrArea         23/2919 	0.79% 	float64
   12 	MasVnrType         24/2919 	0.82% 	object
   13 	MiscFeature      2814/2919 	96.40% 	object
   14 	PoolQC           2909/2919 	99.66% 	object
   15 	SalePrice        1459/2919 	49.98% 	float64
   16 	SaleType            1/2919 	0.03% 	object
   17 	Utilities           2/2919 	0.07% 	object
Total missing features: 17  



In [8]:
# Fill fence / fireplace / masonry-veneer gaps with the placeholders, then
# remove the columns that are missing in more than 90% of the rows.
df_all = (
    df_all
    .fillna({
        'Fence': fil_val_str,
        'FireplaceQu': fil_val_str,
        'MasVnrType': fil_val_str,
        'MasVnrArea': fil_val_int,
    })
    .drop(columns=['Alley', 'PoolQC', 'MiscFeature'])
)

print_missing_values(df_all)

   1 	Electrical          1/2919 	0.03% 	object
   2 	Exterior1st         1/2919 	0.03% 	object
   3 	Exterior2nd         1/2919 	0.03% 	object
   4 	Functional          2/2919 	0.07% 	object
   5 	KitchenQual         1/2919 	0.03% 	object
   6 	LotFrontage       486/2919 	16.65% 	float64
   7 	MSZoning            4/2919 	0.14% 	object
   8 	SalePrice        1459/2919 	49.98% 	float64
   9 	SaleType            1/2919 	0.03% 	object
   10 	Utilities           2/2919 	0.07% 	object
Total missing features: 10  



In [9]:
# Fill missing values in categorical features with the most frequent value
# among houses that share the same neighborhood and building class.
def _fill_with_group_mode(values, fallback):
    """Fill NaNs in ``values`` with its mode, or ``fallback`` if it has none.

    A group whose values are all missing has an empty mode, so indexing the
    mode directly would raise; the fallback covers that case.
    """
    modes = values.mode()
    return values.fillna(modes.iloc[0] if not modes.empty else fallback)

for feature in ['Electrical', 'Exterior1st', 'Exterior2nd', 'Functional', 'KitchenQual', 'MSZoning', 'SaleType', 'Utilities']:
    overall_mode = df_all[feature].mode().iloc[0]
    # transform() returns a result aligned to df_all's index, so it can be
    # assigned back directly. groupby.apply() prepends the group keys to the
    # index on pandas >= 2.0, and the assignment then fails.
    df_all[feature] = df_all.groupby(['Neighborhood', 'MSSubClass'])[feature].transform(
        lambda x: _fill_with_group_mode(x, overall_mode)
    )

# Fill missing LotFrontage values with the median LotFrontage of the neighborhood.
df_all['LotFrontage'] = df_all['LotFrontage'].fillna(
    df_all.groupby('Neighborhood')['LotFrontage'].transform('median')
)

print_missing_values(df_all)

   1 	SalePrice        1459/2919 	49.98% 	float64
Total missing features: 1  



In [10]:
# Correlations: first remove the row identifier, the building-class code and
# the year/month columns so they are left out of the correlation analysis.
features = [
    'GarageYrBlt', 'Id', 'MSSubClass', 'MoSold',
    'YearBuilt', 'YearRemodAdd', 'YrSold',
]
df_all = df_all.drop(columns=features)

In [11]:
# Long-form table of absolute pairwise correlations, strongest first.
# Only numeric columns are used: DataFrame.corr() raises on string columns in
# pandas >= 2.0, while older versions silently skipped them.
df_all_corr = (
    df_all.select_dtypes(include=[np.number])
    .corr()
    .abs()
    .unstack()
    .sort_values(kind="quicksort", ascending=False)
    .reset_index()
    .rename(columns={"level_0": "Feature 1", "level_1": "Feature 2", 0: 'Correlation Coefficient'})
)
# Remove each feature's correlation with itself. Comparing the names instead of
# testing the coefficient against 1.0 keeps genuinely perfectly correlated pairs.
df_all_corr_ = df_all_corr[df_all_corr['Feature 1'] != df_all_corr['Feature 2']]

In [14]:
# Correlation of each remaining feature with the target, strongest first.
# The mask is built from the same frame it filters; a mask taken from the
# unfiltered df_all_corr would have to be reindexed by pandas, with a warning.
df_all_corr_[df_all_corr_['Feature 1'] == 'SalePrice']

  


Unnamed: 0,Feature 1,Feature 2,Correlation Coefficient
38,SalePrice,OverallQual,0.790982
40,SalePrice,GrLivArea,0.708624
46,SalePrice,GarageCars,0.640409
51,SalePrice,GarageArea,0.623431
53,SalePrice,TotalBsmtSF,0.613581
58,SalePrice,1stFlrSF,0.605852
70,SalePrice,FullBath,0.560664
75,SalePrice,TotRmsAbvGrd,0.533723
100,SalePrice,MasVnrArea,0.472614
102,SalePrice,Fireplaces,0.466929
