In [1]:
import datetime as dt
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('data/kc_house_data.csv')

- when to buy: (date)
- where to buy: (waterfront(bool), waterfront_loc)
- what to buy:  (condition, grade) bed-bath ratio)

### Investigating relationship of condition to waterfront 

In [3]:
# df.head()

###### cleaning grade column

In [4]:
# Replaces grade objects with numerics based on data dict. 
grade_raws = list(df.grade.unique())

for raw in grade_raws:
    df.grade.replace(to_replace=raw,value=int(raw[0]),inplace=True)
    
df.grade.unique()

array([7, 6, 8, 1, 9, 5, 4, 3], dtype=int64)

In [5]:
water_bad_grade = df.loc[(df['waterfront']=='YES') & (df['grade']<4)]
water_mid_grade = df.loc[(df['waterfront']=='YES') & ((df['grade']>3)&(df['grade']<7))]
water_lux_grade = df.loc[(df['waterfront']=='YES') & (df['grade']>6)]

###### cleaning condition column

In [6]:
# replaces condition objects with numerics based on data dict.
condition_dict = {'Poor':1,'Fair':2,'Average':3,'Good':4,'Very Good':5}
for key in condition_dict:
    df.condition.replace(to_replace=condition_dict,inplace=True)

In [7]:
water_below_con = df.loc[(df['waterfront']=='YES') & (df['condition'] < 3)]
water_above_con = df.loc[(df['waterfront']=='YES') & (df['condition'] > 3)]
water_at_con = df.loc[(df['waterfront']=='YES') & (df['condition'] == 3)]
null_water_con = df.loc[df['waterfront'].isnull()]
none_water_con = df.loc[df['waterfront'] == 'NO']
much_water_con = df.loc[df['waterfront'] == 'YES']

###### engineer column to show nearest body of water

In [8]:
waterfront_zip_dict = {'Duwamish':[98168],
'Elliott Bay':[98119,98104,98129,98132,98127,98125,98195,98101,98134,98170,98139,98131,98181], 
'Puget Sound':[98071,98083,98013,98070,98031,98131,98063,98195,98207,98190], 
'Lake Union':[98109], 
'Ship Canal':[00000], 
'Lake Washington':[98072,98077], 
'Lake Sammamish':[98074,98075,98029], 
'other lake':[00000], 
'river/slough waterfronts':[00000]}
water_zip_list = []
for front in waterfront_zip_dict:
    water_zip_list.extend(waterfront_zip_dict[front]) # list of all waterfront zipcodes
df['waterfront_loc'] = np.nan

In [9]:
waterfront_locs = []

for index,row in df.iterrows():
    for front in waterfront_zip_dict:
        if row.zipcode in waterfront_zip_dict[front]:
            waterfront_locs.append((index,front))
            
for tup in waterfront_locs:
    df.iloc[tup[0],-1] = tup[1]
    
# waterfront_locs
df.waterfront_loc

0                   NaN
1           Elliott Bay
2                   NaN
3                   NaN
4        Lake Sammamish
              ...      
21592               NaN
21593               NaN
21594               NaN
21595               NaN
21596               NaN
Name: waterfront_loc, Length: 21597, dtype: object

In [10]:
df.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15,waterfront_loc
0,7129300520,10/13/2014,221900.0,3,1.0,1180,5650,1.0,,NONE,...,1180,0.0,1955,0.0,98178,47.5112,-122.257,1340,5650,
1,6414100192,12/9/2014,538000.0,3,2.25,2570,7242,2.0,NO,NONE,...,2170,400.0,1951,1991.0,98125,47.721,-122.319,1690,7639,Elliott Bay
2,5631500400,2/25/2015,180000.0,2,1.0,770,10000,1.0,NO,NONE,...,770,0.0,1933,,98028,47.7379,-122.233,2720,8062,
3,2487200875,12/9/2014,604000.0,4,3.0,1960,5000,1.0,NO,NONE,...,1050,910.0,1965,0.0,98136,47.5208,-122.393,1360,5000,
4,1954400510,2/18/2015,510000.0,3,2.0,1680,8080,1.0,NO,NONE,...,1680,0.0,1987,0.0,98074,47.6168,-122.045,1800,7503,Lake Sammamish


##### engineer day of year feature from date column

In [11]:
day_list = []
for date in df.date:
    dt_list = date.split("/")
    dtime = dt.datetime(int(dt_list[2]),int(dt_list[0]),int(dt_list[1]))
    day_year = dtime.strftime('%j')
    print(day_year)
    day_list.append(day_year)
df['day_of_year'] = day_list

286
343
056
343
049
132
178
015
105
071
093
147
148
280
071
024
212
149
339
114
134
238
184
136
324
307
177
335
175
061
314
335
175
314
337
164
148
364
044
171
196
223
188
301
210
199
084
197
118
070
259
048
365
036
062
132
231
097
239
054
344
240
294
341
154
324
252
282
237
163
255
005
161
191
075
309
286
110
160
082
336
356
301
028
153
318
307
169
139
247
142
070
057
206
357
251
089
255
192
136
162
300
346
280
267
092
143
204
064
239
206
063
259
328
301
325
177
174
008
131
002
323
096
308
169
061
198
198
164
096
005
118
163
132
026
225
217
149
188
127
178
189
128
105
058
337
176
218
303
308
232
099
090
091
136
209
324
219
317
276
127
068
063
171
224
342
050
083
156
162
075
002
122
016
272
041
190
150
321
007
206
104
164
204
248
325
105
125
108
339
230
309
255
176
335
135
315
204
013
287
070
316
197
216
062
216
262
036
327
198
112
198
135
014
147
015
052
184
240
296
234
156
247
261
246
155
068
149
015
062
103
167
325
324
245
161
099
079
182
149
342
274
027
117
238
043
272
057
073
112


203
219
232
041
043
140
234
287
178
219
126
237
181
301
365
132
133
209
345
274
307
174
170
077
107
170
330
090
253
027
272
091
055
321
183
058
143
027
280
170
129
232
268
300
078
353
261
139
155
049
226
048
042
206
206
254
069
189
072
006
057
057
240
205
301
161
125
182
142
364
008
171
028
125
262
197
044
133
272
097
082
113
197
289
253
342
079
124
219
118
077
205
148
191
190
190
119
139
006
232
265
183
118
217
318
139
354
120
147
275
232
324
349
357
124
195
133
295
050
328
260
065
212
266
213
092
024
097
107
223
321
218
293
199
196
188
016
106
336
057
301
345
117
281
111
036
083
058
168
082
155
086
224
107
232
197
055
089
296
110
248
188
141
141
050
090
254
282
123
304
283
213
051
171
259
096
283
210
261
203
057
265
351
125
301
281
322
339
072
171
218
345
198
072
058
223
030
063
126
321
188
267
027
029
238
227
063
028
203
055
140
129
338
097
216
104
325
089
090
078
240
232
015
315
302
260
205
329
205
206
112
216
056
049
322
198
062
259
155
300
283
350
105
196
076
293
350
351
164
117


251
162
289
252
316
178
147
223
092
016
294
258
120
055
345
118
153
155
220
345
190
197
106
169
055
104
178
006
205
337
256
027
119
292
203
037
086
133
174
005
062
232
079
168
344
268
070
301
335
181
066
028
319
101
274
343
330
265
037
195
204
076
337
360
176
170
191
163
173
204
267
089
352
037
205
330
269
196
111
076
117
289
161
072
141
177
055
057
104
232
112
192
063
026
302
086
100
307
261
211
127
082
164
297
005
122
083
190
223
063
230
106
064
053
127
286
098
096
191
027
212
211
133
161
072
169
309
037
195
029
220
232
163
336
105
129
147
239
190
068
344
238
230
206
082
255
253
195
282
265
125
177
259
110
113
153
155
329
294
055
192
104
190
083
190
056
227
318
043
112
192
126
231
275
132
248
104
161
077
120
339
365
040
167
155
204
278
084
175
111
112
210
132
251
360
202
199
223
324
030
338
357
089
273
197
035
082
325
197
176
142
131
204
310
135
084
177
198
251
083
036
127
287
325
350
223
160
120
323
198
222
241
251
205
062
098
086
252
216
177
189
049
303
261
205
086
177
096
063
120


084
078
280
064
226
267
238
013
351
289
135
352
190
223
040
040
147
114
055
131
307
164
224
196
083
205
128
262
063
069
240
198
190
089
231
282
133
103
058
237
240
121
309
002
323
153
289
044
307
076
069
176
114
282
266
041
078
223
217
035
351
283
077
154
036
082
245
117
107
351
126
119
220
184
190
259
113
190
124
139
197
196
177
136
296
138
140
343
210
028
223
283
112
189
177
241
329
324
205
307
275
183
169
224
075
160
317
344
189
296
161
181
293
127
125
290
063
128
111
161
195
350
089
321
154
189
252
178
174
217
119
174
323
167
308
246
167
356
234
160
150
099
203
023
122
132
344
132
105
276
293
269
318
122
232
083
147
288
349
113
129
161
253
302
258
184
318
100
098
111
086
119
135
150
015
153
286
301
083
134
128
100
072
281
316
157
105
127
268
171
239
051
120
293
142
112
182
106
226
117
025
240
170
178
133
311
174
096
098
247
057
056
167
231
337
127
120
142
124
162
255
049
310
210
339
266
132
098
322
295
358
223
049
328
044
026
040
232
079
323
238
184
258
246
192
294
041
217
323
240


201
287
282
274
217
096
349
121
335
262
322
147
283
122
266
360
283
157
124
050
282
314
253
232
084
253
097
259
280
276
125
050
342
126
211
160
125
084
056
316
103
314
293
350
136
247
325
323
149
092
343
132
358
248
147
140
071
365
078
216
311
126
314
132
266
082
316
134
346
028
154
123
150
110
147
141
171
196
184
007
356
323
259
322
231
328
070
023
217
014
338
086
022
203
143
213
198
238
339
050
057
176
124
114
056
157
106
258
135
113
192
092
057
136
220
064
070
230
297
092
225
048
056
203
335
006
254
177
346
281
202
268
304
304
175
155
181
085
335
140
098
084
311
300
212
069
220
169
204
195
142
086
203
129
107
041
118
091
340
129
318
213
188
335
197
063
087
182
041
143
083
035
140
068
023
120
148
162
071
216
251
020
296
192
338
121
337
028
311
013
125
135
154
302
260
065
098
131
113
163
007
126
090
225
064
268
044
212
086
273
176
321
212
289
065
176
282
058
197
226
098
069
049
351
161
147
360
169
147
084
328
328
282
075
296
107
071
199
129
195
357
075
232
097
314
037
126
062
176
238


157
231
069
299
217
203
303
181
118
234
161
293
183
245
042
113
132
063
318
226
363
170
206
075
288
089
065
171
086
262
012
198
195
288
289
267
189
057
111
163
063
290
198
140
303
113
168
148
160
267
170
049
054
297
179
093
328
237
167
128
352
129
132
209
160
363
254
350
076
154
262
290
202
260
132
232
171
106
188
346
061
161
147
075
308
142
203
178
084
099
056
360
352
245
132
108
099
048
353
036
013
068
171
259
349
042
049
131
055
218
314
337
231
230
085
295
328
206
097
167
303
163
041
125
195
099
224
075
346
113
324
140
314
143
268
317
071
252
363
261
100
223
266
086
085
061
357
316
171
113
160
213
351
235
071
158
265
107
226
147
296
318
302
142
301
060
181
167
139
209
097
097
171
326
308
302
294
247
117
224
188
082
091
339
033
119
063
147
314
258
125
209
294
190
140
107
090
129
302
086
279
125
021
114
076
081
114
170
140
198
205
144
147
251
178
153
315
114
220
156
099
350
128
223
310
148
237
124
289
079
295
288
224
324
238
170
314
100
171
329
143
209
280
288
325
351
154
182
079
160


083
209
125
161
315
211
241
259
254
097
164
262
290
047
008
241
098
012
365
321
344
198
176
128
189
245
084
253
114
155
226
133
168
364
293
188
117
211
213
134
124
160
169
352
113
237
161
303
269
248
128
079
021
041
223
129
134
210
213
070
211
084
290
352
302
189
129
262
057
128
155
199
289
083
322
125
188
050
288
198
213
050
269
202
195
132
171
121
053
350
125
124
296
188
195
231
265
196
105
335
126
325
063
077
140
283
161
128
062
085
219
290
183
086
253
022
181
206
099
076
245
167
223
196
217
268
225
233
240
076
178
195
160
008
117
311
176
343
364
237
069
147
281
124
316
076
155
225
304
100
198
090
238
125
082
226
352
234
133
253
176
126
085
238
300
338
062
065
301
068
322
213
183
176
149
216
092
118
204
051
146
100
302
223
139
005
241
290
090
049
171
219
269
064
308
072
171
143
255
164
227
275
083
246
182
164
188
055
122
202
282
234
232
181
227
344
276
175
245
225
120
363
127
353
213
144
273
163
336
005
103
160
082
169
121
308
078
345
188
209
054
061
110
107
198
114
177
114
127
365


##### engineer bedroom-bathroom ratio

In [14]:
bbratios = []
bedbath = df[['bedrooms','bathrooms']]

for index,row in bedbath.iterrows():
#     print(row)
    ratio = row.bedrooms/row.bathrooms
    bbratios.append(ratio)
    
df['bed_bath_ratio'] = bbratios

##### engineer relative size features

In [15]:
sizes = df[['sqft_living','sqft_lot','sqft_living15','sqft_lot15']]
rel_live_space = []
rel_lot_size = [] 

for index,row in sizes.iterrows():
    live_dif = row.sqft_living - row.sqft_living15
    rel_live_space.append(live_dif)
    lot_dif = row.sqft_lot - row.sqft_lot15
    rel_lot_size.append(lot_dif)

df['relative_living_space'] = rel_live_space
df['relative_lot_size'] = rel_lot_size

In [16]:
xydf = df[['price','day_of_year','relative_living_space','relative_lot_size','bed_bath_ratio','grade','waterfront_loc']]
print(xydf.shape)
xydf

(21597, 7)


Unnamed: 0,price,day_of_year,relative_living_space,relative_lot_size,bed_bath_ratio,grade,waterfront_loc
0,221900.0,286,-160,0,3.000000,7,
1,538000.0,343,880,-397,1.333333,7,Elliott Bay
2,180000.0,056,-1950,1938,2.000000,6,
3,604000.0,343,600,0,1.333333,7,
4,510000.0,049,-120,577,1.500000,8,Lake Sammamish
...,...,...,...,...,...,...,...
21592,360000.0,141,0,-378,1.200000,8,
21593,400000.0,054,480,-1387,1.600000,8,
21594,402101.0,174,0,-657,2.666667,7,
21595,400000.0,016,190,1101,1.200000,8,


In [17]:
sorted(xydf.grade.unique())

[1, 3, 4, 5, 6, 7, 8, 9]

###### export dataframe as a csv to be used in another notebook. 

In [18]:
# from pathlib import Path  
# filepath = Path('data/cleaned_kc.csv')  
# filepath.parent.mkdir(parents=True, exist_ok=True)  
# xydf.to_csv(filepath,index=False)  

In [None]:
df = pd.read_csv('data/cleaned_kc.csv')
df