# House Hunters Linear Model

Linear regression model of human-interpretable features that will make recommendations for people looking to buy or sell a house.

## Pre-Processing

In [62]:
# import necessary packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from math import exp

# set display options: None removes the cap on displayed columns, so wide frames are not truncated
pd.set_option('display.max_columns', None)

In [78]:
# Load the housing price table; the first CSV column is used as the row index.
housing_csv_path = 'data/Ames_Housing_Price_Data.csv'
housing = pd.read_csv(housing_csv_path, index_col=0)
# Not loaded here: real_estate = pd.read_csv('./data/Ames_Real_Estate_Data.csv')

#### Feature Cleaning

In [79]:
# Summary statistics for LotArea, inspected before the outlier filters below
housing.LotArea.describe()

count      2580.000000
mean      10120.153488
std        8126.937892
min        1300.000000
25%        7406.750000
50%        9391.000000
75%       11494.000000
max      215245.000000
Name: LotArea, dtype: float64

In [80]:
#Remove Outliers
# Keep sales priced within [40000, 750000] (both ends inclusive), with less than
# 4000 sq ft of above-ground living area and at least one above-ground bedroom.
plausible_rows = (
    housing['SalePrice'].between(40000, 750000)
    & housing['GrLivArea'].lt(4000)
    & housing['BedroomAbvGr'].gt(0)
)
housing = housing[plausible_rows]
# Five-car garages are folded into the four-car level.
housing.loc[housing['GarageCars'].eq(5), 'GarageCars'] = 4

#Remove Bad Classes
housing = housing[housing['Neighborhood'].ne('Landmrk')]

# Strip surrounding whitespace from the zoning codes before matching on them.
housing['MSZoning'] = housing['MSZoning'].astype('string').str.strip()
kept_zones = ["FV", "RH", "RL", "RM"]
housing = housing[housing['MSZoning'].isin(kept_zones)]

kept_functional = ["Typ", "Min1", "Min2"]
housing = housing[housing['Functional'].isin(kept_functional)]

# Same whitespace clean-up for the sale type, then keep only 'WD' sales.
housing['SaleType'] = housing['SaleType'].astype('string').str.strip()
housing = housing[housing['SaleType'] == 'WD']

housing = housing[housing['SaleCondition'] == 'Normal']


#Replace NAs
# Every remaining missing value, in any column, becomes 0.
housing = housing.fillna(0)

#### Feature Engineering

In [81]:
#Area Calculations
housing['PorchTotSF'] = (
    housing['OpenPorchSF']
    + housing['EnclosedPorch']
    + housing['3SsnPorch']
    + housing['ScreenPorch']
)
housing['BsmtSF'] = housing['BsmtFinSF1'] + housing['BsmtFinSF2']
# A zero here would become log(0) below; those rows are set to e, so their LogBsmtSF is 1.
no_finished_bsmt = housing['BsmtSF'] == 0
housing.loc[no_finished_bsmt, 'BsmtSF'] = exp(1)

#Log Transforms
# Adds LogSalePrice, LogLotArea, LogGrLivArea and LogBsmtSF (natural log).
for raw_col in ('SalePrice', 'LotArea', 'GrLivArea', 'BsmtSF'):
    housing['Log' + raw_col] = np.log(housing[raw_col])

#Categorical to Ordinal
def _ranked(levels):
    """Map each level to its 1-based position in `levels`."""
    return {level: rank for rank, level in enumerate(levels, start=1)}

# Hand-assigned integer codes; the original labels are overwritten in place.
ordinal_codes = {
    'Neighborhood': _ranked([
        'MeadowV', 'BrDale', 'IDOTRR', 'BrkSide', 'OldTown', 'Edwards', 'SWISU',
        'Landmrk', 'Sawyer', 'NPkVill', 'Blueste', 'NAmes', 'Mitchel', 'SawyerW',
        'Gilbert', 'NWAmes', 'Greens', 'Blmngtn', 'CollgCr', 'Crawfor', 'ClearCr',
        'Somerst', 'Timber', 'Veenker', 'GrnHill', 'StoneBr', 'NridgHt', 'NoRidge',
    ]),
    'BldgType': _ranked(['2fmCon', 'Twnhs', 'Duplex', '1Fam', 'TwnhsE']),
    'HouseStyle': _ranked(['1.5Unf', '1.5Fin', 'SFoyer', 'SLvl', '1Story',
                           '2.5Unf', '2Story', '2.5Fin']),
    'MoSold': {1:11, 9:10, 8:9, 6:8, 7:7, 11:6, 12:5, 2:4, 3:3, 10:2, 5:1, 4:0},
}
for coded_col, codes in ordinal_codes.items():
    housing[coded_col] = housing[coded_col].replace(codes)

#Renumber Numerical
# Half baths and basement full baths each count as half a bath.
housing['NumBath'] = (
    housing['FullBath']
    + 0.5 * housing['HalfBath']
    + 0.5 * housing['BsmtFullBath']
)

#Binary HasBLANK Categories and Binary Quality/Cond Categories
# Each mask is evaluated now, against the current columns, and stored as 1/0.
# CentralAir is replaced in place: "Y" -> 1, anything else -> 0.
binary_flags = {
    'BeenRemod': housing['YearBuilt'] != housing['YearRemodAdd'],
    'HasFinBsmt': housing['BsmtFinSF1'] > 0,
    'HasFinGarage': housing['GarageFinish'] == "Fin",
    'HasPool': housing['PoolArea'] > 0,
    'HasFireplace': housing['Fireplaces'] > 0,
    'HasPorch': housing['PorchTotSF'] > 0,
    'HasDeck': housing['WoodDeckSF'] > 0,
    'AttachedGarage': housing['GarageType'] == "Attchd",
    'GreatElectric': housing['Electrical'] == "SBrkr",
    'GreatHeat': housing['HeatingQC'] == "Ex",
    'CentralAir': housing['CentralAir'] == "Y",
}
for flag_col, flag_mask in binary_flags.items():
    housing[flag_col] = np.where(flag_mask, 1, 0)

#### Feature Selection

In [67]:
# Target plus every candidate predictor; `housing` is narrowed to these columns.
model_cols = [
    'LogSalePrice', 'LogLotArea', 'LogGrLivArea', 'LogBsmtSF',
    'OverallQual', 'OverallCond', 'YearBuilt',
    'Neighborhood', 'BldgType', 'HouseStyle', 'MoSold',
    'NumBath', 'GarageCars', 'BedroomAbvGr',
    'BeenRemod', 'HasFinBsmt', 'HasFinGarage', 'HasPool',
    'HasFireplace', 'HasPorch', 'HasDeck',
    'AttachedGarage', 'GreatElectric', 'GreatHeat', 'CentralAir',
]
housing = housing[model_cols]

# Columns left out of the feature matrix: the target itself plus seven predictors.
excluded_cols = [
    'LogSalePrice', 'LogBsmtSF', 'HasPool', 'HouseStyle',
    'BedroomAbvGr', 'BeenRemod', 'YearBuilt', 'MoSold',
]
x = housing.drop(columns=excluded_cols)
y = housing['LogSalePrice']

In [68]:
# model_selection is imported here because this cell is the first to use it;
# without it the cell raises NameError when the notebook is run top to bottom.
from sklearn import linear_model, model_selection

# 80/20 split with a fixed seed so the split is reproducible.
x_train, x_test, y_train, y_test = model_selection.train_test_split(x, y, test_size=.2, random_state=0)
# NOTE(review): alpha is hard-coded; presumably the result of an earlier tuning run — confirm.
clf = linear_model.Lasso(alpha=9.326033468832199e-05)
clf.fit(x_train,y_train)

Lasso(alpha=9.326033468832199e-05, copy_X=True, fit_intercept=True,
      max_iter=1000, normalize=False, positive=False, precompute=False,
      random_state=None, selection='cyclic', tol=0.0001, warm_start=False)

In [69]:
# Column names of the feature matrix x
x.columns

Index(['LogLotArea', 'LogGrLivArea', 'OverallQual', 'OverallCond',
       'Neighborhood', 'BldgType', 'NumBath', 'GarageCars', 'HasFinBsmt',
       'HasFinGarage', 'HasFireplace', 'HasPorch', 'HasDeck', 'AttachedGarage',
       'GreatElectric', 'GreatHeat', 'CentralAir'],
      dtype='object')

In [70]:
# Scratch check: [score, label] pairs sort by score, so the labels come out in rank order.
recommendation = [[2, 'second'], [1, 'first'], [3, 'third']]
ranked_pairs = sorted(recommendation)

recommendation_string = [label for _score, label in ranked_pairs]
recommendation_string[:2]

['first', 'second']

In [71]:
# Summary statistics for the log-transformed lot area
housing.LogLotArea.describe()

count    2256.000000
mean        9.075977
std         0.508690
min         7.170120
25%         8.910990
50%         9.143666
75%         9.339920
max        12.279532
Name: LogLotArea, dtype: float64

In [45]:
# Row count per BldgType code
housing.BldgType.value_counts()

4    1885
5     183
2      81
3      73
1      41
Name: BldgType, dtype: int64

In [87]:
# Predictor list used for the fits below (the target is not included).
model_cols = [
    'LogGrLivArea', 'LogLotArea', 'OverallQual', 'OverallCond',
    'Neighborhood', 'BldgType', 'NumBath', 'GarageCars', 'HasFinBsmt',
    'HasFinGarage', 'HasFireplace', 'HasPorch', 'HasDeck', 'AttachedGarage',
    'GreatElectric', 'GreatHeat', 'CentralAir', 'BedroomAbvGr',
]

# Pairwise correlations between the selected predictors.
test = housing.loc[:, model_cols]
test.corr()

Unnamed: 0,LogGrLivArea,LogLotArea,OverallQual,OverallCond,Neighborhood,BldgType,NumBath,GarageCars,HasFinBsmt,HasFinGarage,HasFireplace,HasPorch,HasDeck,AttachedGarage,GreatElectric,GreatHeat,CentralAir,BedroomAbvGr
LogGrLivArea,1.0,0.359543,0.567961,-0.136027,0.431351,0.012624,0.702621,0.49848,-0.05396,0.266008,0.460361,0.315289,0.176437,0.201597,0.118913,0.294311,0.126494,0.560701
LogLotArea,0.359543,1.0,0.124963,-0.022152,0.278672,0.100343,0.190021,0.272291,0.01434,0.104896,0.226225,0.094124,0.077257,0.215744,0.016135,0.072484,0.048624,0.2971
OverallQual,0.567961,0.124963,1.0,-0.145581,0.645101,0.183002,0.563104,0.566564,0.062843,0.354649,0.417697,0.323078,0.288919,0.34221,0.229679,0.439408,0.233475,0.060539
OverallCond,-0.136027,-0.022152,-0.145581,1.0,-0.293577,-0.032248,-0.260818,-0.226155,-0.004908,-0.153511,-0.101502,-0.074994,-0.060588,-0.1953,0.012248,-0.081553,0.085385,-0.008935
Neighborhood,0.431351,0.278672,0.645101,-0.293577,1.0,0.200121,0.53852,0.553692,0.145574,0.302345,0.340457,0.197568,0.265697,0.419211,0.245881,0.411336,0.239178,0.013168
BldgType,0.012624,0.100343,0.183002,-0.032248,0.200121,1.0,-0.000813,0.135406,0.054596,0.132677,0.16068,0.123637,0.077316,0.237369,0.096927,0.152008,0.201673,-0.159849
NumBath,0.702621,0.190021,0.563104,-0.260818,0.53852,-0.000813,1.0,0.526771,0.120701,0.333975,0.333601,0.264837,0.253787,0.28169,0.217934,0.358567,0.156285,0.334501
GarageCars,0.49848,0.272291,0.566564,-0.226155,0.553692,0.135406,0.526771,1.0,0.073692,0.32039,0.345969,0.183305,0.240504,0.284463,0.213795,0.315049,0.214772,0.123495
HasFinBsmt,-0.05396,0.01434,0.062843,-0.004908,0.145574,0.054596,0.120701,0.073692,1.0,0.064131,0.080552,-0.065769,0.074342,0.227432,0.129456,-0.001678,0.189721,-0.098562
HasFinGarage,0.266008,0.104896,0.354649,-0.153511,0.302345,0.132677,0.333975,0.32039,0.064131,1.0,0.28126,0.119533,0.223489,0.192051,0.137149,0.22015,0.120809,0.020881


## Model Generation

In [6]:
from sklearn.linear_model import LinearRegression
from sklearn import model_selection

In [88]:
# Target and predictors. `x` is restricted to model_cols so that the matrix
# named `x` is the one that is actually split and fit (previously `x` held every
# non-target column but the split silently used `test` instead).
y = housing['LogSalePrice']
x = housing[model_cols]
# Guard against a stale model_cols that still contains the target.
assert 'LogSalePrice' not in x.columns, "target leaked into the feature matrix"

# 80/20 split with a fixed seed so the scores below are reproducible.
x_train, x_test, y_train, y_test = model_selection.train_test_split(x, y, test_size=.2, random_state=0)

lm = LinearRegression()
lm.fit(x_train, y_train)

# Train score, test score, then the fitted coefficients in x_train column order.
print(lm.score(x_train, y_train))
print(lm.score(x_test, y_test))
print(lm.coef_)

0.9113327350306758
0.8925541146577071
[ 0.39812811  0.09716918  0.07968026  0.03152839  0.00989799  0.00947372
  0.0281395   0.05526506  0.08350009  0.02485043  0.02664231  0.00658779
  0.02437426  0.02567893  0.0212838   0.04251617  0.08749372 -0.03206053]


In [89]:
# lasso
from sklearn import linear_model

# fit() returns the estimator itself, so construction and fitting are chained.
clf = linear_model.Lasso(alpha=0.0003417).fit(x_train, y_train)

lasso_train_score = clf.score(x_train, y_train)
lasso_test_score = clf.score(x_test, y_test)
print('training score', lasso_train_score)
print('testing score', lasso_test_score)
print(clf.coef_)
# Pair each training column with its fitted coefficient.
[(column, weight) for column, weight in zip(x_train.columns, clf.coef_)]

training score 0.9112631092298309
testing score 0.8924856448409578
[ 0.38767877  0.09587699  0.08070506  0.03154928  0.01010478  0.00947587
  0.02907972  0.05554002  0.08193086  0.02343714  0.02740474  0.0055078
  0.0235406   0.02530306  0.01721136  0.0416493   0.0821488  -0.02940007]


[('LogGrLivArea', 0.38767876735252116),
 ('LogLotArea', 0.09587699274944511),
 ('OverallQual', 0.08070505726921937),
 ('OverallCond', 0.031549275403620215),
 ('Neighborhood', 0.010104780695715621),
 ('BldgType', 0.009475873803766714),
 ('NumBath', 0.029079716493941307),
 ('GarageCars', 0.05554001614651261),
 ('HasFinBsmt', 0.08193085802913853),
 ('HasFinGarage', 0.023437141603218876),
 ('HasFireplace', 0.02740474049569175),
 ('HasPorch', 0.00550779549852457),
 ('HasDeck', 0.023540602815809924),
 ('AttachedGarage', 0.02530306165289991),
 ('GreatElectric', 0.017211360781079144),
 ('GreatHeat', 0.04164930473381904),
 ('CentralAir', 0.08214880275135265),
 ('BedroomAbvGr', -0.02940006550480452)]

In [29]:
# tune alpha in lasso with gridsearchCV
from sklearn.model_selection import GridSearchCV

# Note: `clf` is rebound to a fresh, unfitted Lasso that serves as the search template.
clf = linear_model.Lasso()
param_grid = {'alpha': [1e-5, 1e-4, 1e-3, 1e-2]}
grid_search = GridSearchCV(clf, param_grid)
grid_search.fit(x_train, y_train)

# Best estimator, its parameters, and its cross-validated score.
for fitted_attr in ('best_estimator_', 'best_params_', 'best_score_'):
    print(getattr(grid_search, fitted_attr))

# list(zip(x_train.columns,clf.coef_))

Lasso(alpha=0.0001, copy_X=True, fit_intercept=True, max_iter=1000,
      normalize=False, positive=False, precompute=False, random_state=None,
      selection='cyclic', tol=0.0001, warm_start=False)
{'alpha': 0.0001}
0.916118392881971


In [34]:
# Exploratory: list the attributes of the search object. NOTE(review): looks like leftover introspection — consider removing.
dir(grid_search)

['__abstractmethods__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_abc_impl',
 '_check_is_fitted',
 '_estimator_type',
 '_format_results',
 '_get_param_names',
 '_get_tags',
 '_more_tags',
 '_pairwise',
 '_required_parameters',
 '_run_search',
 'best_estimator_',
 'best_index_',
 'best_params_',
 'best_score_',
 'classes_',
 'cv',
 'cv_results_',
 'decision_function',
 'error_score',
 'estimator',
 'fit',
 'get_params',
 'iid',
 'inverse_transform',
 'multimetric_',
 'n_jobs',
 'n_splits_',
 'param_grid',
 'pre_dispatch',
 'predict',
 'predict_log_proba',
 'predict_proba',
 'refit',
 'refit_time_',
 'return_train_score',
 'score

In [21]:
# ridge
# fit() returns the estimator itself, so construction and fitting are chained.
rlf = linear_model.Ridge(alpha=0.1).fit(x_train, y_train)

ridge_train_score = rlf.score(x_train, y_train)
ridge_test_score = rlf.score(x_test, y_test)
print('training score', ridge_train_score)
print('testing score', ridge_test_score)
print(rlf.coef_)
# Pair each training column with its fitted coefficient.
[(column, weight) for column, weight in zip(x_train.columns, rlf.coef_)]

training score 0.919930456525999
testing score 0.9237104028246876
[ 9.27895951e-02  4.66951543e-01  4.37956255e-02  7.19308545e-02
  4.59369022e-02  2.16424572e-03  7.82435825e-03  1.04761059e-02
 -7.85231853e-03 -1.32448672e-03  1.69795793e-03  4.03186796e-02
 -2.60310149e-02  1.86995131e-03 -1.51835767e-01  1.57104959e-02
  1.12266183e-01  2.45450219e-02  1.69207903e-02  1.56171690e-02
 -3.32208566e-04  1.28956221e-02  3.89007447e-02  4.29304203e-02]


[('LogLotArea', 0.09278959505309077),
 ('LogGrLivArea', 0.4669515426789206),
 ('LogBsmtSF', 0.043795625505231854),
 ('OverallQual', 0.07193085448613026),
 ('OverallCond', 0.04593690223661592),
 ('YearBuilt', 0.0021642457159206313),
 ('Neighborhood', 0.007824358252859815),
 ('BldgType', 0.010476105946723293),
 ('HouseStyle', -0.00785231852706007),
 ('MoSold', -0.0013244867242321278),
 ('NumBath', 0.0016979579256392035),
 ('GarageCars', 0.04031867955344403),
 ('BedroomAbvGr', -0.0260310148985254),
 ('BeenRemod', 0.0018699513111521617),
 ('HasFinBsmt', -0.1518357667848678),
 ('HasFinGarage', 0.015710495922836618),
 ('HasPool', 0.11226618304542663),
 ('HasFireplace', 0.024545021883393484),
 ('HasPorch', 0.016920790336176463),
 ('HasDeck', 0.015617168964217321),
 ('AttachedGarage', -0.00033220856550959793),
 ('GreatElectric', 0.01289562209086398),
 ('GreatHeat', 0.03890074467790228),
 ('CentralAir', 0.04293042031461239)]

## Model Deployment

#### Buyer Profile:

In [None]:
# Example buyer inputs.
Budget = 100000
# NOTE(review): identical to Budget; looks like a placeholder rather than a real living area — confirm.
LivArea = 100000
HasGarage = 0
HasPool = 1
HasPorch = 1

# Budget and living area are stored on the natural-log scale.
buyer_data = [np.log(Budget), np.log(LivArea), HasGarage, HasPool, HasPorch]
# NOTE(review): `keep` is not defined anywhere in the visible notebook; it must be a
# list of five column names matching buyer_data, presumably starting with 'LogSalePrice' — confirm.
buyer_cols = keep

# The values are wrapped in an outer list so they form ONE row with len(buyer_cols)
# columns; a flat list is read as five rows of one column and fails against five column names.
buyer = pd.DataFrame(data = [buyer_data], columns = buyer_cols)

#### Buyer Recommendation Tool:

In [None]:
from math import exp

def buyer_recommendation(buyer, model, features=None):
    """Price a buyer's wish list and suggest feature trade-offs against the budget.

    Parameters
    ----------
    buyer : pandas.DataFrame
        One-row profile. The ``LogSalePrice`` column holds the natural log of the
        budget; every other column is passed to ``model.predict``.
    model : object
        Fitted estimator whose ``predict`` returns log sale prices.
    features : iterable of str, optional
        Names of the 0/1 columns that may be toggled. Defaults to every profile
        column whose name starts with ``Has``.

    Returns
    -------
    list of (float, str)
        ``(dollar_change, feature)`` pairs in ``features`` order. When the
        estimate exceeds the budget these are the savings from giving up each
        feature the buyer currently wants; otherwise they are the extra cost of
        each missing feature that still keeps the estimate under budget.

    Notes
    -----
    Each feature is evaluated on its own copy of the profile, so the caller's
    frame is never modified and one toggle does not leak into the next.
    """
    #generate initial estimate
    # The budget is stored on the log scale; convert it to dollars so it can be
    # compared with the predicted prices below.
    budget = exp(float(buyer['LogSalePrice'].iloc[0]))
    profile = buyer.drop('LogSalePrice', axis=1)
    if features is None:
        features = [col for col in profile.columns if str(col).startswith('Has')]

    def price_of(candidate):
        # predict() returns an array of log prices; take the single row and undo the log.
        return exp(float(model.predict(candidate)[0]))

    def price_with(feature, setting):
        # Price of the same house with exactly one feature switched to `setting`.
        trial = profile.copy()
        trial[feature] = setting
        return price_of(trial)

    base_value = price_of(profile)

    print('Your budget is: ', budget)
    print('Based on your profile, the house you want will cost: ', base_value)

    #search for ways to find a good deal
    recommendation = []

    if base_value > budget:
        #if over budget, search for ways to reduce cost
        for feature in features:
            if profile[feature].iloc[0] == 1:
                savings = base_value - price_with(feature, 0)
                recommendation.append((savings, feature))
        print('If you are willing to compromise on these features, you could save this much money:')
        print(recommendation)
    else:
        #if under budget, search for ways to optimize cost
        for feature in features:
            if profile[feature].iloc[0] == 0:
                new_value = price_with(feature, 1)
                if new_value < budget:
                    stretch = new_value - base_value
                    recommendation.append((stretch, feature))
        print('If you want, you could add these features without going over budget:')
        print(recommendation)

    #TODO: neighborhood search

    #TODO: square footage search

    #TODO: month of year search

    return recommendation

#### Seller Profile:

In [None]:
# Example seller inputs.
# NOTE(review): 100000 looks like a placeholder rather than a real living area — confirm.
LivArea = 100000
HasGarage = 0
HasPool = 1
HasPorch = 1

# Living area is stored on the natural-log scale.
seller_data = [np.log(LivArea), HasGarage, HasPool, HasPorch]
# NOTE(review): `keep` is not defined anywhere in the visible notebook; its first entry
# (presumably the price column) is skipped because a seller has no budget — confirm.
seller_cols = keep[1:]

# Stored as `seller` (it was assigned to `buyer`, which overwrote the buyer profile), and
# wrapped in an outer list so the values form ONE row with len(seller_cols) columns.
seller = pd.DataFrame(data = [seller_data], columns = seller_cols)

#### Seller Recommendation Tool:

In [None]:
def seller_recommendation(seller, model, features=None):
    """Estimate a seller's sale price and the value added by each missing feature.

    NOTE(review): the original body was a copy of the buyer tool (it read an
    undefined ``buyer`` and a budget column the seller profile does not have).
    This version implements the seller-side reading of that search: which
    absent 0/1 features would raise the predicted price, and by how much —
    confirm this matches the intended behaviour.

    Parameters
    ----------
    seller : pandas.DataFrame
        One-row profile of the house. A ``LogSalePrice`` column, if present, is
        ignored; every other column is passed to ``model.predict``.
    model : object
        Fitted estimator whose ``predict`` returns log sale prices.
    features : iterable of str, optional
        Names of the 0/1 columns that may be toggled. Defaults to every profile
        column whose name starts with ``Has``.

    Returns
    -------
    list of (float, str)
        ``(dollar_gain, feature)`` pairs, in ``features`` order, for each feature
        the house currently lacks.

    Notes
    -----
    Each feature is evaluated on its own copy of the profile, so the caller's
    frame is never modified and one toggle does not leak into the next.
    """
    #generate initial estimate
    profile = seller.drop(columns='LogSalePrice', errors='ignore')
    if features is None:
        features = [col for col in profile.columns if str(col).startswith('Has')]

    def price_of(candidate):
        # predict() returns an array of log prices; take the single row and undo the log.
        return exp(float(model.predict(candidate)[0]))

    base_value = price_of(profile)

    print('Based on your profile, your house is estimated to sell for: ', base_value)

    #search for ways to add value
    recommendation = []
    for feature in features:
        if profile[feature].iloc[0] == 0:
            upgraded = profile.copy()
            upgraded[feature] = 1
            gain = price_of(upgraded) - base_value
            recommendation.append((gain, feature))
    print('If you add these features, your estimated sale price would change by this much:')
    print(recommendation)

    #TODO: neighborhood search

    #TODO: square footage search

    #TODO: month of year search

    return recommendation