# King County Housing Price Prediction

In [49]:
import pandas as pd
import numpy as np
import pickle 
pd.options.display.max_columns = 500
pd.options.display.max_rows = 500
from sklearn.preprocessing import PolynomialFeatures

In [50]:
# Read the feature file; index_col=0 uses the CSV's first column as the row index.
df = pd.read_csv("kc_house_data_test_features.csv", index_col=0)

In [51]:
df.head()

Unnamed: 0,id,date,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,1974300020,20140827T000000,4,2.5,2270,11500,1.0,0,0,3,8,1540,730,1967,0,98034,47.7089,-122.241,2020,10918
1,1974300020,20150218T000000,4,2.5,2270,11500,1.0,0,0,3,8,1540,730,1967,0,98034,47.7089,-122.241,2020,10918
2,3630020380,20141107T000000,3,2.5,1470,1779,2.0,0,0,3,8,1160,310,2005,0,98029,47.5472,-121.998,1470,1576
3,1771000290,20141203T000000,3,1.75,1280,16200,1.0,0,0,3,8,1030,250,1976,0,98077,47.7427,-122.071,1160,10565
4,5126310470,20150115T000000,4,2.75,2830,8126,2.0,0,0,3,8,2830,0,2005,0,98059,47.4863,-122.14,2830,7916


In [52]:
# The 'date' column is not used by any later cell in this notebook, so discard it.
df = df.drop(columns=['date'])

**Feature Engineering**

**Bedrooms**

In [53]:
# Value-level corrections for the bedroom count: 11 -> 4, 33 -> 3, 0 -> 3.
# The result is assigned back to the column rather than calling
# `.replace(..., inplace=True)` on `df['bedrooms']`: an inplace method on a
# column selection acts on a temporary object and does not update `df` when
# pandas Copy-on-Write is active.
bedroom_corrections = {11: 4, 33: 3, 0: 3}
df['bedrooms'] = df['bedrooms'].replace(bedroom_corrections)

# Per-listing overrides, keyed by house id.
df.loc[df['id'] == 6306400140, 'bedrooms'] = 5
df.loc[df['id'] == 2954400190, 'bedrooms'] = 4

**Bathrooms**

In [54]:
# Per-listing overrides, keyed by house id: both listings get 4.5 bathrooms.
df.loc[df['id'].isin([6306400140, 2954400190]), 'bathrooms'] = 4.5

# A bathroom count of 0 is replaced with 1.5. The result is assigned back to
# the column instead of using `inplace=True` on `df['bathrooms']`, which acts on
# a temporary object and does not update `df` under pandas Copy-on-Write.
df['bathrooms'] = df['bathrooms'].replace({0: 1.5})

**Years Old**

In [55]:
# Age of the house measured against a fixed reference year, so the feature
# does not depend on when the notebook is run.
REFERENCE_YEAR = 2020
df['years_old'] = REFERENCE_YEAR - df['yr_built']

**New Home**

In [56]:
# 1 when the house was built in 2013 or later, otherwise 0.
built_2013_or_later = df['yr_built'] >= 2013
df['new_home'] = built_2013_or_later.astype(int)

**Renovated Home**

In [57]:
# 1 when a renovation year is recorded (yr_renovated > 0), otherwise 0.
has_renovation_year = df['yr_renovated'] > 0
df['renovated_home'] = has_renovation_year.astype(int)

**Location Features**

In [58]:
def haversine(lat1, lon1, lat2, lon2):
    """Great-circle distance in miles between two points given in degrees.

    Uses the haversine formula with a sphere radius of 3959 miles. Every
    argument may be a scalar or a NumPy array; the usual NumPy broadcasting
    rules apply, so a single reference point can be compared against whole
    columns of coordinates.

    Parameters
    ----------
    lat1, lon1 : float or array-like
        Latitude and longitude of the first point(s), in decimal degrees.
    lat2, lon2 : float or array-like
        Latitude and longitude of the second point(s), in decimal degrees.

    Returns
    -------
    float or numpy.ndarray
        Distance(s) in miles.
    """
    MILES = 3959
    lat1, lon1, lat2, lon2 = (np.deg2rad(v) for v in (lat1, lon1, lat2, lon2))

    half_dlat = (lat2 - lat1) / 2
    half_dlon = (lon2 - lon1) / 2
    a = np.sin(half_dlat) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(half_dlon) ** 2

    # Floating-point rounding can push `a` marginally outside [0, 1] for
    # (near-)antipodal points, which would make arcsin return NaN.
    a = np.clip(a, 0.0, 1.0)

    central_angle = 2 * np.arcsin(np.sqrt(a))
    return MILES * central_angle

In [59]:
# Distance (via `haversine`) from every home to a set of fixed reference points.
# Keys are the output column names; values are (latitude, longitude) in degrees.
reference_points = {
    'dist_Ballard': (47.677, -122.385),
    'dist_Belltown': (47.61322, -122.3465),
    'dist_CapitolHill': (47.622942, -122.316456),
    'dist_ChinaTown': (47.591830966, -122.31916539),
    'dist_PioneerSqaure': (47.6016399, -122.3333457),
    'dist_seattle': (47.6062, -122.3321),
}

for column_name, (ref_lat, ref_lon) in reference_points.items():
    df[column_name] = haversine(ref_lat, ref_lon, df['lat'].values, df['long'].values)

In [60]:
df.head()

Unnamed: 0,id,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15,years_old,new_home,renovated_home,dist_Ballard,dist_Belltown,dist_CapitolHill,dist_ChinaTown,dist_PioneerSqaure,dist_seattle
0,1974300020,4,2.5,2270,11500,1.0,0,0,3,8,1540,730,1967,0,98034,47.7089,-122.241,2020,10918,53,0,0,7.050809,8.234968,6.899747,8.869785,8.56753,8.26648
1,1974300020,4,2.5,2270,11500,1.0,0,0,3,8,1540,730,1967,0,98034,47.7089,-122.241,2020,10918,53,0,0,7.050809,8.234968,6.899747,8.869785,8.56753,8.26648
2,3630020380,3,2.5,1470,1779,2.0,0,0,3,8,1160,310,2005,0,98029,47.5472,-121.998,1470,1576,15,0,0,20.135016,16.872076,15.737643,15.28695,16.078498,16.098281
3,1771000290,3,1.75,1280,16200,1.0,0,0,3,8,1030,250,1976,0,98077,47.7427,-122.071,1160,10565,44,0,0,15.288874,15.630836,14.101541,15.557172,15.620525,15.379637
4,5126310470,4,2.75,2830,8126,2.0,0,0,3,8,2830,0,2005,0,98059,47.4863,-122.14,2830,7916,15,0,0,17.436415,13.025328,12.524196,11.091447,12.035067,12.202989


**City Features**

In [61]:
# Distinct zipcode values present in the data, as an array and as a plain list.
unique_arr = df["zipcode"].unique()

zipcodes = list(unique_arr)

# NOTE(review): this import sits in the middle of the notebook rather than in
# the import cell at the top — consider moving it there.
from uszipcode import SearchEngine, SimpleZipcode, Zipcode

# Spot-check of the lookup on a single zipcode. The bare `zipcode.major_city`
# expression is not the last line of the cell, so its value is not displayed.
search = SearchEngine()
zipcode = search.by_zipcode(98119)
zipcode.major_city

def get_cities(zipcodes, search=None):
    """Return the `major_city` of each zipcode, in input order.

    Parameters
    ----------
    zipcodes : iterable
        Zipcodes to look up; each one is passed to ``search.by_zipcode``.
    search : object, optional
        Any object exposing ``by_zipcode(zipcode)`` whose result carries a
        ``major_city`` attribute. When omitted, a ``uszipcode.SearchEngine``
        is created for this call.

    Returns
    -------
    list
        One entry per input zipcode. The entry is ``None`` when the lookup
        result has no ``major_city`` attribute (for example when the lookup
        itself returned ``None``) instead of raising ``AttributeError``.
    """
    if search is None:
        # Imported lazily, and only the one name actually used, so callers
        # that supply their own engine do not need uszipcode installed.
        from uszipcode import SearchEngine
        search = SearchEngine()

    return [getattr(search.by_zipcode(zc), "major_city", None) for zc in zipcodes]

# Look up the city of every distinct zipcode. The return value is not stored,
# and because this is not the last line of the cell it is not displayed either.
get_cities(zipcodes)

# Zipcode -> town lookup table. Each zipcode sits next to its town, so the
# pairing no longer depends on two separate lists staying in the same order.
ZIP_TO_TOWN = {
    98058: 'Renton',
    98115: 'Seattle',
    98006: 'Bellevue',
    98034: 'Kirkland',
    98052: 'Redmond',
    98031: 'Kent',
    98007: 'Bellevue',
    98199: 'Seattle',
    98027: 'Issaquah',
    98119: 'Seattle',
    98122: 'Seattle',
    98155: 'Seattle',
    98055: 'Renton',
    98118: 'Seattle',
    98014: 'Carnation',
    98059: 'Renton',
    98004: 'Bellevue',
    98053: 'Redmond',
    98072: 'Woodinville',
    98133: 'Seattle',
    98092: 'Auburn',
    98074: 'Sammamish',
    98125: 'Seattle',
    98045: 'North Bend',
    98106: 'Seattle',
    98022: 'Enumclaw',
    98177: 'Seattle',
    98008: 'Bellevue',
    98002: 'Auburn',
    98040: 'Mercer Island',
    98029: 'Issaquah',
    98042: 'Kent',
    98001: 'Auburn',
    98065: 'Snoqualmie',
    98056: 'Renton',
    98144: 'Seattle',
    98188: 'Seattle',
    98109: 'Seattle',
    98010: 'Black Diamond',
    98075: 'Sammamish',
    98198: 'Seattle',
    98030: 'Kent',
    98105: 'Seattle',
    98023: 'Federal Way',
    98028: 'Kenmore',
    98112: 'Seattle',
    98038: 'Maple Valley',
    98116: 'Seattle',
    98103: 'Seattle',
    98178: 'Seattle',
    98168: 'Seattle',
    98005: 'Bellevue',
    98077: 'Woodinville',
    98146: 'Seattle',
    98011: 'Bothell',
    98126: 'Seattle',
    98108: 'Seattle',
    98136: 'Seattle',
    98033: 'Kirkland',
    98003: 'Federal Way',
    98117: 'Seattle',
    98107: 'Seattle',
    98019: 'Duvall',
    98102: 'Seattle',
    98032: 'Kent',
    98148: 'Seattle',
    98166: 'Seattle',
    98070: 'Vashon',
    98024: 'Fall City',
    98039: 'Medina',
}

# Zipcodes missing from the table map to NaN and are then labelled 'NA'.
df['towns'] = df['zipcode'].map(ZIP_TO_TOWN).fillna('NA')

In [62]:
df.head()

Unnamed: 0,id,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15,years_old,new_home,renovated_home,dist_Ballard,dist_Belltown,dist_CapitolHill,dist_ChinaTown,dist_PioneerSqaure,dist_seattle,towns
0,1974300020,4,2.5,2270,11500,1.0,0,0,3,8,1540,730,1967,0,98034,47.7089,-122.241,2020,10918,53,0,0,7.050809,8.234968,6.899747,8.869785,8.56753,8.26648,Kirkland
1,1974300020,4,2.5,2270,11500,1.0,0,0,3,8,1540,730,1967,0,98034,47.7089,-122.241,2020,10918,53,0,0,7.050809,8.234968,6.899747,8.869785,8.56753,8.26648,Kirkland
2,3630020380,3,2.5,1470,1779,2.0,0,0,3,8,1160,310,2005,0,98029,47.5472,-121.998,1470,1576,15,0,0,20.135016,16.872076,15.737643,15.28695,16.078498,16.098281,Issaquah
3,1771000290,3,1.75,1280,16200,1.0,0,0,3,8,1030,250,1976,0,98077,47.7427,-122.071,1160,10565,44,0,0,15.288874,15.630836,14.101541,15.557172,15.620525,15.379637,Woodinville
4,5126310470,4,2.75,2830,8126,2.0,0,0,3,8,2830,0,2005,0,98059,47.4863,-122.14,2830,7916,15,0,0,17.436415,13.025328,12.524196,11.091447,12.035067,12.202989,Renton


In [63]:
# One indicator column per distinct value of 'towns', appended to the right of
# the existing columns.
cities = pd.get_dummies(df['towns'])
df = pd.concat(
    [df, cities],
    axis=1,
)

In [64]:
# Multi-word town columns get an underscore in place of the space.
underscored_town_names = {
    'Federal Way': 'Federal_Way',
    'Maple Valley': 'Maple_Valley',
    'Mercer Island': 'Mercer_Island',
    'North Bend': 'North_Bend',
    'Black Diamond': 'Black_Diamond',
    'Fall City': 'Fall_City',
}
df = df.rename(columns=underscored_town_names)

In [65]:
df.head()

Unnamed: 0,id,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15,years_old,new_home,renovated_home,dist_Ballard,dist_Belltown,dist_CapitolHill,dist_ChinaTown,dist_PioneerSqaure,dist_seattle,towns,Auburn,Bellevue,Black_Diamond,Bothell,Carnation,Duvall,Enumclaw,Fall_City,Federal_Way,Issaquah,Kenmore,Kent,Kirkland,Maple_Valley,Medina,Mercer_Island,North_Bend,Redmond,Renton,Sammamish,Seattle,Snoqualmie,Vashon,Woodinville
0,1974300020,4,2.5,2270,11500,1.0,0,0,3,8,1540,730,1967,0,98034,47.7089,-122.241,2020,10918,53,0,0,7.050809,8.234968,6.899747,8.869785,8.56753,8.26648,Kirkland,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
1,1974300020,4,2.5,2270,11500,1.0,0,0,3,8,1540,730,1967,0,98034,47.7089,-122.241,2020,10918,53,0,0,7.050809,8.234968,6.899747,8.869785,8.56753,8.26648,Kirkland,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
2,3630020380,3,2.5,1470,1779,2.0,0,0,3,8,1160,310,2005,0,98029,47.5472,-121.998,1470,1576,15,0,0,20.135016,16.872076,15.737643,15.28695,16.078498,16.098281,Issaquah,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,1771000290,3,1.75,1280,16200,1.0,0,0,3,8,1030,250,1976,0,98077,47.7427,-122.071,1160,10565,44,0,0,15.288874,15.630836,14.101541,15.557172,15.620525,15.379637,Woodinville,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
4,5126310470,4,2.75,2830,8126,2.0,0,0,3,8,2830,0,2005,0,98059,47.4863,-122.14,2830,7916,15,0,0,17.436415,13.025328,12.524196,11.091447,12.035067,12.202989,Renton,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0


**Pickles**

In [69]:
# Load the fitted model and the list of feature columns it expects.
# Context managers close the file handles once each object has been read.
# NOTE: unpickling can execute arbitrary code embedded in the file — only load
# pickles that come from a trusted source.
with open("model.pickle", "rb") as model_file:
    final_model = pickle.load(model_file)

with open("selected_columns.pickle", "rb") as columns_file:
    selected_columns = pickle.load(columns_file)

**Predictions**

In [70]:
df.head()

Unnamed: 0,id,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15,years_old,new_home,renovated_home,dist_Ballard,dist_Belltown,dist_CapitolHill,dist_ChinaTown,dist_PioneerSqaure,dist_seattle,towns,Auburn,Bellevue,Black_Diamond,Bothell,Carnation,Duvall,Enumclaw,Fall_City,Federal_Way,Issaquah,Kenmore,Kent,Kirkland,Maple_Valley,Medina,Mercer_Island,North_Bend,Redmond,Renton,Sammamish,Seattle,Snoqualmie,Vashon,Woodinville
0,1974300020,4,2.5,2270,11500,1.0,0,0,3,8,1540,730,1967,0,98034,47.7089,-122.241,2020,10918,53,0,0,7.050809,8.234968,6.899747,8.869785,8.56753,8.26648,Kirkland,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
1,1974300020,4,2.5,2270,11500,1.0,0,0,3,8,1540,730,1967,0,98034,47.7089,-122.241,2020,10918,53,0,0,7.050809,8.234968,6.899747,8.869785,8.56753,8.26648,Kirkland,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
2,3630020380,3,2.5,1470,1779,2.0,0,0,3,8,1160,310,2005,0,98029,47.5472,-121.998,1470,1576,15,0,0,20.135016,16.872076,15.737643,15.28695,16.078498,16.098281,Issaquah,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,1771000290,3,1.75,1280,16200,1.0,0,0,3,8,1030,250,1976,0,98077,47.7427,-122.071,1160,10565,44,0,0,15.288874,15.630836,14.101541,15.557172,15.620525,15.379637,Woodinville,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
4,5126310470,4,2.75,2830,8126,2.0,0,0,3,8,2830,0,2005,0,98059,47.4863,-122.14,2830,7916,15,0,0,17.436415,13.025328,12.524196,11.091447,12.035067,12.202989,Renton,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0


In [71]:
# `df_final` is not assigned in any cell shown in this notebook (the execution
# counter jumps from In [65] to In [69]), so on a fresh kernel the original line
# raised NameError. Build the model input explicitly from `df` instead.
# NOTE(review): PolynomialFeatures is imported at the top but never used in the
# visible cells — if the model was trained on polynomial/interaction terms,
# they must be regenerated before this cell; the check below will name them.
missing_columns = [col for col in selected_columns if col not in df.columns]
if missing_columns:
    raise KeyError(
        f"Columns expected by the model are missing from df: {missing_columns}"
    )

df_final = df[list(selected_columns)]
y_pred = final_model.predict(df_final)

In [72]:
# Wrap the predictions in a DataFrame and write them to CSV with the default
# to_csv settings.
predictions = pd.DataFrame(y_pred)
predictions.to_csv('housing_preds_alexandra_bruno.csv')