In [1]:
import datetime as dt
import pandas as pd
import numpy as np

Set up predictor and target dataframes, deal with unusable data

In [2]:
X = pd.read_csv('data/kc_house_data.csv')
y = X.price
X.drop(X.loc[X['sqft_basement']=='?'].index,inplace=True) # remove '?' from the data.
X.drop('price',inplace=True,axis=1)

Engineer data to present as numerical values, not objects. 

In [3]:
# convert all string types into np floats
X.sqft_basement = [float(sq) for sq in list(X.sqft_basement)]

# Replaces grade strings with numerics based on data dict. 
grade_raws = list(X.grade.unique())
# replaces a cell value with the int of the first character of its existing string
for raw in grade_raws:
    X.grade.replace(to_replace=raw,value=int(raw[0]),inplace=True)

# replaces condition objects with numerics based on data dict.
condition_dict = {'Poor':1,'Fair':2,'Average':3,'Good':4,'Very Good':5}
for key in condition_dict:
    X.condition.replace(to_replace=condition_dict,inplace=True)

# replace yr_built NaNs with numeric 0
X.yr_renovated.replace(to_replace=np.nan,value=0,inplace=True)

In [4]:
# convert waterfront into numeric boolean
waterfront_bool_list = []
for value in X.waterfront:
    if value == 'YES':
        waterfront_bool_list.append(1)
    else:
        waterfront_bool_list.append(0) 
X.waterfront = waterfront_bool_list

In [5]:
# convert view from string into categorical ordinal
view_rank_list = [] 
view_dict = {'NONE':0,'FAIR':1,'AVERAGE':2,'GOOD':3,'EXCELLENT':4}

for value in X.view:
    if value in list(view_dict.keys()):
        view_rank_list.append(view_dict[value])
    else:
        view_rank_list.append(0)
X.view = view_rank_list

Engineer date into a usable type of number (datetime ordinal)

In [6]:
# convert dates into ordinals
X.date = pd.to_datetime(X['date'])
X.date = X['date'].map(dt.datetime.toordinal)

In [7]:
null_dict = {}
for feature in X.columns:
    null_count = sum(X[feature].isna())
    null_dict[feature] = null_count
# null_dict

Investigate multicolinearity before engineering inferred features

In [8]:
# X.corr()
cor_df=X.corr().abs().stack().reset_index().sort_values(0, ascending=False)
cor_df['pairs'] = list(zip(cor_df.level_0, cor_df.level_1))
cor_df.set_index(['pairs'], inplace = True)
cor_df.drop(columns=['level_1', 'level_0'], inplace = True)
cor_df.columns = ['cc']
cor_df.drop_duplicates(inplace=True)

In [9]:
cor_df[(cor_df.cc>.70) & (cor_df.cc <1)]

Unnamed: 0_level_0,cc
pairs,Unnamed: 1_level_1
"(sqft_living, sqft_above)",0.876678
"(sqft_living15, sqft_living)",0.756389
"(sqft_living, bathrooms)",0.755278
"(sqft_above, sqft_living15)",0.731016
"(sqft_lot, sqft_lot15)",0.720649


In [10]:
colinear_drops = ['sqft_above','sqft_living15','bathrooms','sqft_lot15']

Engineering inferred feature
- bed bath ratio: ratio of bedrooms to bathrooms
- level ratios: ratio of square feet above 'grade' and below (ratio of everything else to the basement)
- live_lot_ratio: ratio of living space to lot size
- relative living space: ratio of living space to the living space of the nearest 15 houses (sqft_living :: sqft_living15)
- relatve lot size: same as living space but for lot size instead. 
- level difference: difference in square footage of living space to basement space

In [11]:
bbratios = []
lvl_ratios = []
live_lot_ratio = []

for index,row in X.iterrows(): #iterate through every record
#     print(row)
    bbratio = row.bedrooms/row.bathrooms # calculate ratio of bedrooms to bathrooms
    bbratios.append(bbratio) # append ratio to the list

    LLratio = row.sqft_living/row.sqft_lot
    live_lot_ratio.append(LLratio)

    if row.sqft_basement == 0: # sqft_basement is zero if there is no basement
        lvl_ratios.append(0) # ratio should also be zero if there is no ratio
    else:
        lvl_ratio = row.sqft_above / row.sqft_basement # calculate ratio of space above grade vs below grade
        lvl_ratios.append(lvl_ratio) # append ratio to the list

X['bed_bath_ratio'] = bbratios # create new column and asign list as its values
X['level_ratio'] = lvl_ratios # create new column and asign list as its values
X['live_lot_ratio'] = live_lot_ratio # you get the idea . . . 

In [12]:
rel_live_space = []
rel_lot_size = [] 
rel_difference = []

for index,row in X.iterrows(): # for every record 
    live_dif = row.sqft_living - row.sqft_living15 # calculate difference in sqft of the given house and the nearest 15 other houses
    rel_live_space.append(live_dif) # append it to the list

    lot_dif = row.sqft_lot - row.sqft_lot15 # calculate difference in sqft of the given lot and the nearest 15 other lots
    rel_lot_size.append(lot_dif) # append it to the list

    lvl_dif = row.sqft_above - row.sqft_basement # calculate difference between space above grade and below grade
    rel_difference.append(lvl_dif) # append it to the list

X['relative_living_space'] = rel_live_space # assign respective list to new column 
X['relative_lot_size'] = rel_lot_size
X['level_difference'] = rel_difference

In [13]:
X.columns

Index(['id', 'date', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot',
       'floors', 'waterfront', 'view', 'condition', 'grade', 'sqft_above',
       'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode', 'lat', 'long',
       'sqft_living15', 'sqft_lot15', 'bed_bath_ratio', 'level_ratio',
       'live_lot_ratio', 'relative_living_space', 'relative_lot_size',
       'level_difference'],
      dtype='object')

In [14]:
cor_df=X.corr().abs().stack().reset_index().sort_values(0, ascending=False)
cor_df['pairs'] = list(zip(cor_df.level_0, cor_df.level_1))
cor_df.set_index(['pairs'], inplace = True)
cor_df.drop(columns=['level_1', 'level_0'], inplace = True)
cor_df.columns = ['cc']
cor_df.drop_duplicates(inplace=True)

In [15]:
cor_df[(cor_df.cc>.70) & (cor_df.cc <1)]

Unnamed: 0_level_0,cc
pairs,Unnamed: 1_level_1
"(level_difference, sqft_above)",0.887606
"(sqft_living, sqft_above)",0.876678
"(sqft_living, sqft_living15)",0.756389
"(bathrooms, sqft_living)",0.755278
"(sqft_lot, relative_lot_size)",0.754121
"(sqft_living15, sqft_above)",0.731016
"(sqft_lot15, sqft_lot)",0.720649
