In [1]:
import datetime as dt
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('data/kc_house_data.csv') 
df.drop(df.loc[df['sqft_basement']=='?'].index,inplace=True) # remove '?' from the data.
df.drop(['yr_renovated','waterfront','view','lat','long'],inplace=True,axis=1) # remove data lean features

- when to buy: (date)
- where to buy: (waterfront(bool), waterfront_loc)
- what to buy:  (condition, grade) bed-bath ratio)

### Investigating relationship of condition to waterfront 

##### cleaning sqft_basement column

In [3]:
df.sqft_basement = [float(sq) for sq in list(df.sqft_basement)] # convert all string types into np floats

##### cleaning grade column

In [4]:
# Replaces grade objects with numerics based on data dict. 
grade_raws = list(df.grade.unique())

for raw in grade_raws:
    df.grade.replace(to_replace=raw,value=int(raw[0]),inplace=True) # replaces a cell value with the int of the first character of its existing string
    
df.grade.unique()

array([7, 6, 8, 1, 9, 5, 4, 3], dtype=int64)

##### cleaning condition column

In [5]:
# replaces condition objects with numerics based on data dict.
condition_dict = {'Poor':1,'Fair':2,'Average':3,'Good':4,'Very Good':5}
for key in condition_dict:
    df.condition.replace(to_replace=condition_dict,inplace=True) 

In [6]:
# df.info()

##### engineer day of year from date column, and age from yr_built column

In [7]:
day_list = []
ages = [] 

for date in df.date: # for every record in the date column
    dt_list = date.split("/") # split string into list
    dtime = dt.datetime(int(dt_list[2]),int(dt_list[0]),int(dt_list[1])) # pass into datetime
    day_year = dtime.strftime('%j') # get day of year (1-365)
    day_list.append(day_year) # append to list

for year in df.yr_built: # 
    ytime = dt.datetime(year,1,1)
    now = dt.datetime.now()
    age = now - ytime 
    ages.append(age)

df['age'] = ages # create new column and asign ages as its values 
df['day_of_year'] = day_list # create new column and asign day_list as its values

##### engineer bedroom-bathroom, above-below grade levels ratio

In [8]:
bbratios = []
lvl_ratios = []

for index,row in df.iterrows(): #iterate through every record
#     print(row)
    bbratio = row.bedrooms/row.bathrooms # calculate ratio of bedrooms to bathrooms
    bbratios.append(bbratio) # append ratio to the list

    if row.sqft_basement == 0: # sqft_basement is zero if there is no basement
        lvl_ratios.append(0) # ratio should also be zero if there is no ratio
    else:
        lvl_ratio = row.sqft_above / row.sqft_basement # calculate ratio of space above grade vs below grade
        lvl_ratios.append(lvl_ratio) # append ratio to the list

df['bed_bath_ratio'] = bbratios # create new column and asign list as its values
df['level_ratio'] = lvl_ratios # create new column and asign list as its values

##### engineer relative size features

In [9]:
rel_live_space = []
rel_lot_size = [] 
rel_difference = []

for index,row in df.iterrows(): # for every record 
    live_dif = row.sqft_living - row.sqft_living15 # calculate difference in sqft of the given house and the nearest 15 other houses
    rel_live_space.append(live_dif) # append it to the list

    lot_dif = row.sqft_lot - row.sqft_lot15 # calculate difference in sqft of the given lot and the nearest 15 other lots
    rel_lot_size.append(lot_dif) # append it to the list

    lvl_dif = row.sqft_above - row.sqft_basement # calculate difference between space above grade and below grade
    rel_difference.append(lvl_dif) # append it to the list

df['relative_living_space'] = rel_live_space # assign respective list to new column 
df['relative_lot_size'] = rel_lot_size
df['level_difference'] = rel_difference

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 21143 entries, 0 to 21596
Data columns (total 23 columns):
 #   Column                 Non-Null Count  Dtype          
---  ------                 --------------  -----          
 0   id                     21143 non-null  int64          
 1   date                   21143 non-null  object         
 2   price                  21143 non-null  float64        
 3   bedrooms               21143 non-null  int64          
 4   bathrooms              21143 non-null  float64        
 5   sqft_living            21143 non-null  int64          
 6   sqft_lot               21143 non-null  int64          
 7   floors                 21143 non-null  float64        
 8   condition              21143 non-null  int64          
 9   grade                  21143 non-null  int64          
 10  sqft_above             21143 non-null  int64          
 11  sqft_basement          21143 non-null  float64        
 12  yr_built               21143 non-null  int64  

###### export dataframe as a csv to be used in another notebook. 

In [11]:
from pathlib import Path  
filepath = Path('data/cleaned_kc.csv')  
filepath.parent.mkdir(parents=True, exist_ok=True)  
df.to_csv(filepath,index=False)

In [12]:
df = pd.read_csv('data/cleaned_kc.csv')
df.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,condition,grade,...,zipcode,sqft_living15,sqft_lot15,age,day_of_year,bed_bath_ratio,level_ratio,relative_living_space,relative_lot_size,level_difference
0,7129300520,10/13/2014,221900.0,3,1.0,1180,5650,1.0,3,7,...,98178,1340,5650,24538 days 14:04:55.984163,286,3.0,0.0,-160,0,1180.0
1,6414100192,12/9/2014,538000.0,3,2.25,2570,7242,2.0,3,7,...,98125,1690,7639,25999 days 14:04:55.984163,343,1.333333,5.425,880,-397,1770.0
2,5631500400,2/25/2015,180000.0,2,1.0,770,10000,1.0,3,6,...,98028,2720,8062,32573 days 14:04:55.984163,56,2.0,0.0,-1950,1938,770.0
3,2487200875,12/9/2014,604000.0,4,3.0,1960,5000,1.0,5,7,...,98136,1360,5000,20885 days 14:04:55.984163,343,1.333333,1.153846,600,0,140.0
4,1954400510,2/18/2015,510000.0,3,2.0,1680,8080,1.0,3,8,...,98074,1800,7503,12850 days 14:04:55.984163,49,1.5,0.0,-120,577,1680.0
