# House Hunters Linear Model

This notebook builds a linear regression model on human-interpretable features and uses it to make recommendations for people looking to buy or sell a house.

## Pre-Processing

In [1]:
# import necessary packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from math import exp

# Display option: a max_columns of None lets pandas show every column of a wide frame
# instead of truncating the middle ones.
pd.set_option('display.max_columns', None)

In [2]:
# Load the Ames housing price data; index_col=0 uses the first CSV column as the row index.
housing = pd.read_csv('data/Ames_Housing_Price_Data.csv', index_col=0)
# Companion real-estate file, currently not loaded:
# real_estate = pd.read_csv('./data/Ames_Real_Estate_Data.csv')

#### Feature Cleaning

In [3]:
# --- Outliers ---------------------------------------------------------------
# Keep rows with SalePrice in [40000, 750000] (both ends inclusive),
# GrLivArea below 4000 and at least one above-grade bedroom.
price_in_range = housing['SalePrice'].between(40000, 750000)
housing = housing.loc[price_in_range]
housing = housing.loc[housing['GrLivArea'] < 4000]
housing = housing.loc[housing['BedroomAbvGr'] > 0]
# Cap garage capacity: 5-car garages are recoded as 4-car.
five_car_garage = housing['GarageCars'] == 5
housing.loc[five_car_garage, 'GarageCars'] = 4

# --- Bad classes ------------------------------------------------------------
housing = housing.loc[housing['Neighborhood'] != 'Landmrk']

# Zoning codes are converted to the pandas string dtype and whitespace-stripped
# before filtering to the four classes that are kept.
housing['MSZoning'] = housing['MSZoning'].astype('string').str.strip()
kept_zoning = ["FV", "RH", "RL", "RM"]
housing = housing.loc[housing['MSZoning'].isin(kept_zoning)]

kept_functional = ["Typ", "Min1", "Min2"]
housing = housing.loc[housing['Functional'].isin(kept_functional)]

# Same normalisation for sale type; only 'WD' sales are kept.
housing['SaleType'] = housing['SaleType'].astype('string').str.strip()
housing = housing.loc[housing['SaleType'] == 'WD']

housing = housing.loc[housing['SaleCondition'] == 'Normal']


# --- Missing values -----------------------------------------------------------
# Every remaining NA, in any column, becomes 0.
housing = housing.fillna(0)

#### Feature Engineering

In [4]:
# --- Combined areas -----------------------------------------------------------
housing['PorchTotSF'] = (
    housing['OpenPorchSF']
    + housing['EnclosedPorch']
    + housing['3SsnPorch']
    + housing['ScreenPorch']
)
housing['BsmtSF'] = housing['BsmtFinSF1'] + housing['BsmtFinSF2']
# Rows with zero finished basement area get e, so that their LogBsmtSF below is 1
# rather than log(0).
zero_bsmt = housing['BsmtSF'] == 0
housing.loc[zero_bsmt, 'BsmtSF'] = exp(1)

# --- Log transforms: each listed column X gains a 'LogX' companion --------------
for raw_col in ('SalePrice', 'LotArea', 'GrLivArea', 'BsmtSF'):
    housing['Log' + raw_col] = np.log(housing[raw_col])

# --- Categorical to ordinal -----------------------------------------------------
# Hard-coded integer codes per category (presumably ranked by price -- TODO confirm).
# NOTE: MoSold maps integers to integers, so re-running this cell remaps it again.
ordinal_codes = {
    'Neighborhood': {
        'MeadowV': 1, 'BrDale': 2, 'IDOTRR': 3, 'BrkSide': 4, 'OldTown': 5, 'Edwards': 6,
        'SWISU': 7, 'Landmrk': 8, 'Sawyer': 9, 'NPkVill': 10, 'Blueste': 11, 'NAmes': 12,
        'Mitchel': 13, 'SawyerW': 14, 'Gilbert': 15, 'NWAmes': 16, 'Greens': 17,
        'Blmngtn': 18, 'CollgCr': 19, 'Crawfor': 20, 'ClearCr': 21, 'Somerst': 22,
        'Timber': 23, 'Veenker': 24, 'GrnHill': 25, 'StoneBr': 26, 'NridgHt': 27,
        'NoRidge': 28,
    },
    'BldgType': {'2fmCon': 1, 'Twnhs': 2, 'Duplex': 3, '1Fam': 4, 'TwnhsE': 5},
    'HouseStyle': {
        '1.5Unf': 1, '1.5Fin': 2, 'SFoyer': 3, 'SLvl': 4,
        '1Story': 5, '2.5Unf': 6, '2Story': 7, '2.5Fin': 8,
    },
    'MoSold': {1: 11, 9: 10, 8: 9, 6: 8, 7: 7, 11: 6, 12: 5, 2: 4, 3: 3, 10: 2, 5: 1, 4: 0},
}
for coded_col, codes in ordinal_codes.items():
    housing[coded_col] = housing[coded_col].replace(codes)

# --- Bath count: half baths and basement full baths are weighted 0.5 ------------
housing['NumBath'] = (
    housing['FullBath'] + 0.5 * housing['HalfBath'] + 0.5 * housing['BsmtFullBath']
)

# --- Binary 0/1 flags -------------------------------------------------------------
# All conditions are evaluated first, then written as 1 (true) / 0 (false).
# NOTE: CentralAir is overwritten in place, so re-running this cell would turn the
# whole column to 0 (the 0/1 values no longer equal "Y").
flag_conditions = [
    # "Has<something>" indicators
    ('BeenRemod', housing['YearBuilt'] != housing['YearRemodAdd']),
    ('HasFinBsmt', housing['BsmtFinSF1'] > 0),
    ('HasFinGarage', housing['GarageFinish'] == "Fin"),
    ('HasPool', housing['PoolArea'] > 0),
    ('HasFireplace', housing['Fireplaces'] > 0),
    ('HasPorch', housing['PorchTotSF'] > 0),
    ('HasDeck', housing['WoodDeckSF'] > 0),
    # quality / condition indicators
    ('AttachedGarage', housing['GarageType'] == "Attchd"),
    ('GreatElectric', housing['Electrical'] == "SBrkr"),
    ('GreatHeat', housing['HeatingQC'] == "Ex"),
    ('CentralAir', housing['CentralAir'] == "Y"),
]
for flag_name, condition in flag_conditions:
    housing[flag_name] = np.where(condition, 1, 0)

#### Feature Selection

In [38]:
# Columns carried forward: the log-price target plus every candidate predictor.
model_cols = [
    'LogSalePrice', 'LogLotArea', 'LogGrLivArea', 'LogBsmtSF', 'OverallQual', 'OverallCond',
    'YearBuilt', 'Neighborhood', 'BldgType', 'HouseStyle', 'MoSold', 'NumBath', 'GarageCars',
    'BedroomAbvGr', 'BeenRemod', 'HasFinBsmt', 'HasFinGarage', 'HasPool', 'HasFireplace',
    'HasPorch', 'HasDeck', 'AttachedGarage', 'GreatElectric', 'GreatHeat', 'CentralAir',
]
housing = housing.loc[:, model_cols]

# Reduced design matrix: the target and the predictors listed here are left out.
excluded_cols = [
    'LogSalePrice', 'LogBsmtSF', 'HasPool', 'HouseStyle', 'BedroomAbvGr',
    'BeenRemod', 'YearBuilt', 'MoSold',
]
x = housing.drop(columns=excluded_cols)
y = housing['LogSalePrice']

In [39]:
# Fit a Lasso regression on the reduced feature set (x, y) built in the cell above.
# model_selection is imported here as well: this cell sits above the notebook's
# "Model Generation" import cell, so on a fresh Restart & Run All the name would
# otherwise be undefined (NameError).
from sklearn import linear_model, model_selection

# Hold out 20% of the rows for testing; random_state=0 makes the split repeatable.
x_train, x_test, y_train, y_test = model_selection.train_test_split(x, y, test_size=.2, random_state=0)
# NOTE(review): this alpha presumably comes from a tuning run that is not shown -- confirm.
clf = linear_model.Lasso(alpha=9.326033468832199e-05)
clf.fit(x_train,y_train)

Lasso(alpha=9.326033468832199e-05, copy_X=True, fit_intercept=True,
      max_iter=1000, normalize=False, positive=False, precompute=False,
      random_state=None, selection='cyclic', tol=0.0001, warm_start=False)

In [40]:
# Show the predictor columns that remain in x after the drops in the feature-selection cell.
x.columns

Index(['LogLotArea', 'LogGrLivArea', 'OverallQual', 'OverallCond',
       'Neighborhood', 'BldgType', 'NumBath', 'GarageCars', 'HasFinBsmt',
       'HasFinGarage', 'HasFireplace', 'HasPorch', 'HasDeck', 'AttachedGarage',
       'GreatElectric', 'GreatHeat', 'CentralAir'],
      dtype='object')

In [47]:
# Scratch cell: a bare nested-list literal that is only displayed, never assigned
# (candidate for removal).
[[1,2,3]]

[[1, 2, 3]]

In [46]:
# Summary statistics for the engineered NumBath feature.
# NOTE(review): the saved output (min 0.8, max 6.2) is not a multiple of 0.5, which the
# 0.5-weighted NumBath formula in the feature-engineering cell would appear to require
# for whole-number bath counts -- the output looks stale; re-run to confirm.
housing.NumBath.describe()

count    2263.000000
mean        2.013168
std         0.686876
min         0.800000
25%         1.500000
50%         2.000000
75%         2.500000
max         6.200000
Name: NumBath, dtype: float64

In [45]:
# Row counts per ordinal BldgType code (1-5, assigned in the feature-engineering cell).
housing.BldgType.value_counts()

4    1885
5     183
2      81
3      73
1      41
Name: BldgType, dtype: int64

## Model Generation

In [6]:
from sklearn.linear_model import LinearRegression
from sklearn import model_selection

In [7]:
# Target is LogSalePrice; every other column still in `housing` is a predictor.
x = housing.drop(columns='LogSalePrice')
y = housing['LogSalePrice']

# 80/20 train/test split with a fixed seed so the split is repeatable.
x_train, x_test, y_train, y_test = model_selection.train_test_split(
    x, y, test_size=.2, random_state=0
)

# Ordinary least-squares baseline.
lm = LinearRegression().fit(x_train, y_train)

# Report the train score, the test score, then the fitted coefficients.
for reported in (lm.score(x_train, y_train), lm.score(x_test, y_test), lm.coef_):
    print(reported)

0.9199308325342065
0.9236199354291406
[ 9.27327073e-02  4.67985294e-01  4.39133320e-02  7.18724949e-02
  4.59541323e-02  2.16733925e-03  7.82057043e-03  1.04898002e-02
 -7.85741492e-03 -1.32503140e-03  1.49201415e-03  4.02823721e-02
 -2.61653402e-02  1.84865930e-03 -1.52443546e-01  1.56838337e-02
  1.14880150e-01  2.44327410e-02  1.68908778e-02  1.56180026e-02
 -3.70010899e-04  1.29045391e-02  3.88929693e-02  4.29182814e-02]


In [33]:
# Lasso regression on the current train/test split
from sklearn import linear_model

clf = linear_model.Lasso(alpha=0.0003417).fit(x_train, y_train)

train_score = clf.score(x_train, y_train)
test_score = clf.score(x_test, y_test)
print('training score', train_score)
print('testing score', test_score)
print(clf.coef_)

# Cell output: each training column paired with its fitted coefficient.
[(column, coefficient) for column, coefficient in zip(x_train.columns, clf.coef_)]

training score 0.9195543710154819
testing score 0.9256463678959304
[ 9.33521666e-02  4.60918803e-01  3.93640048e-02  7.27481952e-02
  4.62682783e-02  2.17625115e-03  7.91376495e-03  1.01301765e-02
 -7.77654622e-03 -1.28787394e-03  2.65123650e-03  4.01807205e-02
 -2.45032575e-02  8.21034226e-05 -1.26588290e-01  1.48941531e-02
  0.00000000e+00  2.53966677e-02  1.57929192e-02  1.49578413e-02
  0.00000000e+00  9.19006581e-03  3.78467363e-02  3.53064143e-02]


[('LogLotArea', 0.09335216664699966),
 ('LogGrLivArea', 0.4609188032337754),
 ('LogBsmtSF', 0.039364004839795366),
 ('OverallQual', 0.0727481951642858),
 ('OverallCond', 0.046268278295084696),
 ('YearBuilt', 0.002176251146532969),
 ('Neighborhood', 0.007913764954765852),
 ('BldgType', 0.0101301765267008),
 ('HouseStyle', -0.007776546223356674),
 ('MoSold', -0.0012878739376956262),
 ('NumBath', 0.00265123649757247),
 ('GarageCars', 0.04018072045509322),
 ('BedroomAbvGr', -0.024503257527527335),
 ('BeenRemod', 8.2103422556146e-05),
 ('HasFinBsmt', -0.12658829011899603),
 ('HasFinGarage', 0.01489415313294915),
 ('HasPool', 0.0),
 ('HasFireplace', 0.02539666771983235),
 ('HasPorch', 0.01579291921453131),
 ('HasDeck', 0.0149578412896334),
 ('AttachedGarage', 0.0),
 ('GreatElectric', 0.00919006581197453),
 ('GreatHeat', 0.037846736337777416),
 ('CentralAir', 0.03530641429631902)]

In [29]:
# Tune the Lasso alpha with a cross-validated grid search
from sklearn.model_selection import GridSearchCV

# Note: this rebinds `clf` to a fresh Lasso with default settings, which is the
# estimator handed to the search.
clf = linear_model.Lasso()
param_grid = {'alpha': [1e-5, 1e-4, 1e-3, 1e-2]}
grid_search = GridSearchCV(clf, param_grid)
grid_search.fit(x_train, y_train)

# Best estimator, its parameters, and its cross-validated score, in that order.
for result_attr in ('best_estimator_', 'best_params_', 'best_score_'):
    print(getattr(grid_search, result_attr))

# (disabled) coefficient listing: list(zip(x_train.columns, clf.coef_))

Lasso(alpha=0.0001, copy_X=True, fit_intercept=True, max_iter=1000,
      normalize=False, positive=False, precompute=False, random_state=None,
      selection='cyclic', tol=0.0001, warm_start=False)
{'alpha': 0.0001}
0.916118392881971


In [34]:
# Exploratory/debugging aid: list every attribute name on the grid_search object.
# The output is long; candidate for removal from the finished notebook.
dir(grid_search)

['__abstractmethods__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_abc_impl',
 '_check_is_fitted',
 '_estimator_type',
 '_format_results',
 '_get_param_names',
 '_get_tags',
 '_more_tags',
 '_pairwise',
 '_required_parameters',
 '_run_search',
 'best_estimator_',
 'best_index_',
 'best_params_',
 'best_score_',
 'classes_',
 'cv',
 'cv_results_',
 'decision_function',
 'error_score',
 'estimator',
 'fit',
 'get_params',
 'iid',
 'inverse_transform',
 'multimetric_',
 'n_jobs',
 'n_splits_',
 'param_grid',
 'pre_dispatch',
 'predict',
 'predict_log_proba',
 'predict_proba',
 'refit',
 'refit_time_',
 'return_train_score',
 'score

In [21]:
# Ridge regression on the current train/test split
rlf = linear_model.Ridge(alpha=0.1).fit(x_train, y_train)

ridge_train_score = rlf.score(x_train, y_train)
ridge_test_score = rlf.score(x_test, y_test)
print('training score', ridge_train_score)
print('testing score', ridge_test_score)
print(rlf.coef_)

# Cell output: each training column paired with its fitted coefficient.
[(column, coefficient) for column, coefficient in zip(x_train.columns, rlf.coef_)]

training score 0.919930456525999
testing score 0.9237104028246876
[ 9.27895951e-02  4.66951543e-01  4.37956255e-02  7.19308545e-02
  4.59369022e-02  2.16424572e-03  7.82435825e-03  1.04761059e-02
 -7.85231853e-03 -1.32448672e-03  1.69795793e-03  4.03186796e-02
 -2.60310149e-02  1.86995131e-03 -1.51835767e-01  1.57104959e-02
  1.12266183e-01  2.45450219e-02  1.69207903e-02  1.56171690e-02
 -3.32208566e-04  1.28956221e-02  3.89007447e-02  4.29304203e-02]


[('LogLotArea', 0.09278959505309077),
 ('LogGrLivArea', 0.4669515426789206),
 ('LogBsmtSF', 0.043795625505231854),
 ('OverallQual', 0.07193085448613026),
 ('OverallCond', 0.04593690223661592),
 ('YearBuilt', 0.0021642457159206313),
 ('Neighborhood', 0.007824358252859815),
 ('BldgType', 0.010476105946723293),
 ('HouseStyle', -0.00785231852706007),
 ('MoSold', -0.0013244867242321278),
 ('NumBath', 0.0016979579256392035),
 ('GarageCars', 0.04031867955344403),
 ('BedroomAbvGr', -0.0260310148985254),
 ('BeenRemod', 0.0018699513111521617),
 ('HasFinBsmt', -0.1518357667848678),
 ('HasFinGarage', 0.015710495922836618),
 ('HasPool', 0.11226618304542663),
 ('HasFireplace', 0.024545021883393484),
 ('HasPorch', 0.016920790336176463),
 ('HasDeck', 0.015617168964217321),
 ('AttachedGarage', -0.00033220856550959793),
 ('GreatElectric', 0.01289562209086398),
 ('GreatHeat', 0.03890074467790228),
 ('CentralAir', 0.04293042031461239)]

## Model Deployment

#### Buyer Profile:

In [None]:
# Example buyer profile: a budget plus the features the buyer wants.
Budget = 100000
# NOTE(review): 100000 is far outside the GrLivArea < 4000 range kept during cleaning
# and looks copied from Budget -- confirm the intended living area.
LivArea = 100000
HasGarage = 0
HasPool = 1
HasPorch = 1

# Budget and living area are stored as natural logs, matching the log-transformed
# LogSalePrice / LogGrLivArea columns used for modelling.
buyer_data = [np.log(Budget), np.log(LivArea), HasGarage, HasPool, HasPorch]
# The column names used to come from a `keep` list that is not defined anywhere in
# the notebook (NameError), so they are spelled out here.
# NOTE(review): names are inferred from the values in buyer_data; apart from
# 'LogSalePrice' they must match the feature columns of the model that is passed
# to buyer_recommendation -- confirm.
buyer_cols = ['LogSalePrice', 'LogGrLivArea', 'HasGarage', 'HasPool', 'HasPorch']

# Wrap buyer_data in a list so it forms ONE row with five columns; a flat list is
# read as five rows of a single value and fails against five column names.
buyer = pd.DataFrame(data = [buyer_data], columns = buyer_cols)

#### Buyer Recommendation Tool:

In [None]:
from math import exp

def buyer_recommendation(buyer, model, features=None):
    """Compare a buyer's budget with the model's price estimate and suggest trade-offs.

    Parameters
    ----------
    buyer : pandas.DataFrame
        One-row frame with a 'LogSalePrice' column holding the natural log of the
        budget, plus one column per feature that ``model`` expects. Not modified.
    model : object
        Fitted regressor whose ``predict`` returns the log of the sale price.
    features : iterable of str, optional
        Names of the 0/1 feature columns that may be switched off or on.
        Defaults to every feature column whose name starts with 'Has'.

    Returns
    -------
    list of (float, str)
        ``(price_change, feature)`` pairs. When the estimate is over budget these
        are the savings from giving up each wanted feature; otherwise they are the
        extra cost of each missing feature that still keeps the price under budget.
        The same list is printed.
    """

    def estimate_price(profile):
        # The model predicts log prices; take the single prediction back to price space.
        return float(np.exp(np.ravel(model.predict(profile))[0]))

    #generate initial estimate
    # The budget is stored as a log, so convert it before comparing it with prices.
    budget = float(np.exp(buyer['LogSalePrice'].iloc[0]))
    wanted = buyer.drop('LogSalePrice', axis=1)
    base_value = estimate_price(wanted)

    print('Your budget is: ', budget)
    print('Based on your profile, the house you want will cost: ', base_value)

    if features is None:
        features = [col for col in wanted.columns if str(col).startswith('Has')]

    #search for ways to find a good deal
    recommendation = []

    if base_value > budget:
        #if over budget, try giving up each wanted feature in turn to reduce cost
        for feature in features:
            if wanted[feature].iloc[0] == 1:
                # assign() returns a modified copy, so every trial starts from the
                # original profile instead of accumulating earlier changes.
                new_value = estimate_price(wanted.assign(**{feature: 0}))
                savings = base_value - new_value
                recommendation.append((savings, feature))
        print('If you are willing to compromise on these features, you could save this much money:')
    else:
        #if under budget, try adding each missing feature that still fits the budget
        for feature in features:
            if wanted[feature].iloc[0] == 0:
                new_value = estimate_price(wanted.assign(**{feature: 1}))
                if new_value < budget:
                    stretch = new_value - base_value
                    recommendation.append((stretch, feature))
        print('If you want, you could add these features without going over budget:')
    print(recommendation)

    #TODO: neighborhood search

    #TODO: square footage search

    #TODO: month of year search

    return recommendation

#### Seller Profile:

In [None]:
# Example seller profile: the features of the house being sold (no price column).
# NOTE(review): 100000 is far outside the GrLivArea < 4000 range kept during cleaning
# -- confirm the intended living area.
LivArea = 100000
HasGarage = 0
HasPool = 1
HasPorch = 1

# Living area is stored as a natural log, matching the LogGrLivArea modelling column.
seller_data = [np.log(LivArea), HasGarage, HasPool, HasPorch]
# The column names used to come from `keep[1:]`, but `keep` is not defined anywhere in
# the notebook (NameError), so they are spelled out here.
# NOTE(review): names are inferred from the values in seller_data; they must match the
# feature columns of the model that is passed to seller_recommendation -- confirm.
seller_cols = ['LogGrLivArea', 'HasGarage', 'HasPool', 'HasPorch']

# Stored as `seller` (the original assigned to `buyer`, overwriting the buyer profile),
# and wrapped in a list so the values form ONE row with four columns.
seller = pd.DataFrame(data = [seller_data], columns = seller_cols)

#### Seller Recommendation Tool:

In [None]:
def seller_recommendation(seller, model, features=None):
    """Estimate a seller's sale price and the gain from adding each missing feature.

    The previous body was a copy of the buyer tool: it ignored ``seller``, read an
    unbound local ``buyer``, and required a budget column that a seller profile
    does not have. A seller has no budget, so this version reports the estimated
    price and how much each absent 0/1 feature would add to it.

    Parameters
    ----------
    seller : pandas.DataFrame
        One-row frame with one column per feature that ``model`` expects. A
        'LogSalePrice' column, if present, is dropped before predicting. Not modified.
    model : object
        Fitted regressor whose ``predict`` returns the log of the sale price.
    features : iterable of str, optional
        Names of the 0/1 feature columns that could be added to the house.
        Defaults to every feature column whose name starts with 'Has'.

    Returns
    -------
    list of (float, str)
        ``(price_gain, feature)`` pairs, one per listed feature that the house
        does not currently have. The same list is printed.
    """

    def estimate_price(profile):
        # The model predicts log prices; take the single prediction back to price space.
        return float(np.exp(np.ravel(model.predict(profile))[0]))

    # Accept buyer-style frames too: the price column is not a model feature.
    if 'LogSalePrice' in seller.columns:
        seller = seller.drop('LogSalePrice', axis=1)

    #generate initial estimate
    base_value = estimate_price(seller)
    print('Based on your profile, your house is estimated to sell for: ', base_value)

    if features is None:
        features = [col for col in seller.columns if str(col).startswith('Has')]

    #search for upgrades that raise the estimated price
    recommendation = []
    for feature in features:
        if seller[feature].iloc[0] == 0:
            # assign() returns a modified copy, so every trial starts from the
            # original profile instead of accumulating earlier changes.
            new_value = estimate_price(seller.assign(**{feature: 1}))
            gain = new_value - base_value
            recommendation.append((gain, feature))
    print('If you add these features, the estimated sale price would change by this much:')
    print(recommendation)

    #TODO: neighborhood search

    #TODO: square footage search

    #TODO: month of year search

    return recommendation