In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('data/kc_house_data.csv')

### Investigating relationship of condition to waterfront 

###### cleaning grade column

In [5]:
# Replaces grade objects with numerics based on data dict. 
grade_raws = list(df.grade.unique())

for raw in grade_raws:
    df.grade.replace(to_replace=raw,value=int(raw[0]),inplace=True)
    
df.grade.unique()

array([7, 6, 8, 1, 9, 5, 4, 3], dtype=int64)

In [6]:
water_bad_grade = df.loc[(df['waterfront']=='YES') & (df['grade']<4)]
water_mid_grade = df.loc[(df['waterfront']=='YES') & ((df['grade']>3)&(df['grade']<7))]
water_lux_grade = df.loc[(df['waterfront']=='YES') & (df['grade']>6)]

###### cleaning condition column

In [7]:
# replaces condition objects with numerics based on data dict.
condition_dict = {'Poor':1,'Fair':2,'Average':3,'Good':4,'Very Good':5}
for key in condition_dict:
    df.condition.replace(to_replace=condition_dict,inplace=True)

In [8]:
water_below_con = df.loc[(df['waterfront']=='YES') & (df['condition'] < 3)]
water_above_con = df.loc[(df['waterfront']=='YES') & (df['condition'] > 3)]
water_at_con = df.loc[(df['waterfront']=='YES') & (df['condition'] == 3)]
null_water_con = df.loc[df['waterfront'].isnull()]
none_water_con = df.loc[df['waterfront'] == 'NO']
much_water_con = df.loc[df['waterfront'] == 'YES']

In [9]:
print(f"There are {df.shape[0]} total records available")
print(f"There are {null_water_con.shape[0]} records with NaN waterfronts")
print(f"There are {none_water_con.shape[0]} records stated without waterfronts")
print("\n")
print(f"Of the {much_water_con.shape[0]} waterfront properties")
print("There are no null 'grade' or 'condition' values")
print("\n")
print(f"There are {water_below_con.shape[0]} below average properties on that are on a waterfront")
print(f"There are {water_at_con.shape[0]} average properties on that are on a waterfront")
print(f"There are {water_above_con.shape[0]} above average properties on that are on a waterfront")
print("\n")
print(f"There are {water_bad_grade.shape[0]} badly conditioned properties on that are on a waterfront")
print(f"There are {water_mid_grade.shape[0]} mildly conditioned properties on that are on a waterfront")
print(f"There are {water_lux_grade.shape[0]} well conditioned properties on that are on a waterfront")

There are 21597 total records available
There are 2376 records with NaN waterfronts
There are 19075 records stated without waterfronts


Of the 146 waterfront properties
There are no null 'grade' or 'condition' values


There are 2 below average properties on that are on a waterfront
There are 80 average properties on that are on a waterfront
There are 64 above average properties on that are on a waterfront


There are 57 badly conditioned properties on that are on a waterfront
There are 14 mildly conditioned properties on that are on a waterfront
There are 75 well conditioned properties on that are on a waterfront


###### engineer column to show nearest body of water

In [65]:
waterfront_zip_dict = {'Duwamish':[98168],
'Elliott Bay':[98119,98104,98129,98132,98127,98125,98195,98101,98134,98170,98139,98131,98181], 
'Puget Sound':[98071,98083,98013,98070,98031,98131,98063,98195,98207,98190], 
'Lake Union':[98109], 
'Ship Canal':[00000], 
'Lake Washington':[98072,98077], 
'Lake Sammamish':[98074,98075,98029], 
'other lake':[00000], 
'river/slough waterfronts':[00000]}
water_zip_list = []
for front in waterfront_zip_dict:
    water_zip_list.extend(waterfront_zip_dict[front]) # list of all waterfront zipcodes
df['waterfront_loc'] = np.nan

In [66]:
waterfront_locs = []

for index,row in df.iterrows():
    for front in waterfront_zip_dict:
        if row.zipcode in waterfront_zip_dict[front]:
            waterfront_locs.append((index,front))
            
for tup in waterfront_locs:
    df.iloc[tup[0],-1] = tup[1]
    
# waterfront_locs
df.waterfront_loc

0                   NaN
1           Elliott Bay
2                   NaN
3                   NaN
4        Lake Sammamish
              ...      
21592               NaN
21593               NaN
21594               NaN
21595               NaN
21596               NaN
Name: waterfront_loc, Length: 21597, dtype: object

In [67]:
df.waterfront_loc.unique()

array([nan, 'Elliott Bay', 'Lake Sammamish', 'Puget Sound',
       'Lake Washington', 'Duwamish', 'Lake Union'], dtype=object)

In [77]:
print('Records containing waterfront bool and zip: ',df.loc[df['waterfront_loc'].notnull() & df['waterfront'].notnull()].shape[0])
print('Records with waterfront zip: ',df.loc[df['waterfront_loc'].notnull()].shape[0])
print('Records with waterfront bool: ',df.loc[df['waterfront'].notnull()].shape[0])

Records containing waterfront bool and zip:  2627
Records with waterfront zip:  2953
Records with waterfront bool:  19221


In [68]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21597 entries, 0 to 21596
Data columns (total 22 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   id              21597 non-null  int64  
 1   date            21597 non-null  object 
 2   price           21597 non-null  float64
 3   bedrooms        21597 non-null  int64  
 4   bathrooms       21597 non-null  float64
 5   sqft_living     21597 non-null  int64  
 6   sqft_lot        21597 non-null  int64  
 7   floors          21597 non-null  float64
 8   waterfront      19221 non-null  object 
 9   view            21534 non-null  object 
 10  condition       21597 non-null  int64  
 11  grade           21597 non-null  int64  
 12  sqft_above      21597 non-null  int64  
 13  sqft_basement   21597 non-null  object 
 14  yr_built        21597 non-null  int64  
 15  yr_renovated    17755 non-null  float64
 16  zipcode         21597 non-null  int64  
 17  lat             21597 non-null 