# Predicting on the Holdout Dataset

## Import packages

In [1]:
import pandas as pd
import pickle
import numpy as np
import joblib
import scraper1
from geopy.distance import geodesic

## Read in the holdout dataset

In [2]:
# Load the holdout features, using the first CSV column as the index, and
# discard the 'id' column, which is not used as a feature.
holdout = pd.read_csv(
    'kc_house_data_test_features.csv',
    index_col=0,
).drop(columns=['id'])

In [3]:
# Preview the first five rows of the holdout frame.
holdout.head(n=5)

Unnamed: 0,date,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,20140827T000000,4,2.5,2270,11500,1.0,0,0,3,8,1540,730,1967,0,98034,47.7089,-122.241,2020,10918
1,20150218T000000,4,2.5,2270,11500,1.0,0,0,3,8,1540,730,1967,0,98034,47.7089,-122.241,2020,10918
2,20141107T000000,3,2.5,1470,1779,2.0,0,0,3,8,1160,310,2005,0,98029,47.5472,-121.998,1470,1576
3,20141203T000000,3,1.75,1280,16200,1.0,0,0,3,8,1030,250,1976,0,98077,47.7427,-122.071,1160,10565
4,20150115T000000,4,2.75,2830,8126,2.0,0,0,3,8,2830,0,2005,0,98059,47.4863,-122.14,2830,7916


## Modifying and creating new features

In [4]:
# Keep only the leading YYYYMMDD part of the timestamp string, then cut it
# into year / month / day substrings (still strings at this point).
holdout['date'] = holdout['date'].str[:8]
holdout['year_sold'] = holdout['date'].str[:4]
holdout['month_sold'] = holdout['date'].str[4:6]
holdout['day_sold'] = holdout['date'].str[6:8]

In [5]:
# Convert the sale-date parts from strings to integers in one pass, then
# drop the raw 'date' string column.
holdout = holdout.astype(
    {
        'year_sold': 'int64',
        'month_sold': 'int64',
        'day_sold': 'int64',
    }
).drop(columns='date')

In [6]:
# A yr_renovated of 0 is replaced by that row's construction year.
# Series.mask substitutes row by row from 'yr_built' wherever the condition
# holds; passing a Series as the replacement value of Series.replace is not
# a documented usage and is not reliable across pandas versions.
holdout['yr_renovated'] = holdout['yr_renovated'].mask(
    holdout['yr_renovated'] == 0,
    holdout['yr_built'],
)

In [7]:
# Years elapsed between the last renovation (or construction, after the
# fill above) and the hard-coded reference year 2020.
holdout['years_since_update'] = 2020 - holdout['yr_renovated']

In [8]:
# Binary flag: 1 when the basement area is non-zero, 0 otherwise.
holdout['basement'] = np.where(holdout['sqft_basement'] != 0, 1, 0)

In [9]:
# Replace 0-bedroom entries with the median bedroom count. The median is
# computed over this holdout column itself, zeros included.
holdout['bedrooms'] = holdout['bedrooms'].replace(
    to_replace=0,
    value=holdout['bedrooms'].median(),
)

In [10]:
# Quarter-1 indicator: 1 for sales in January through March, else 0.
holdout['q1'] = np.where(holdout['month_sold'].isin([1, 2, 3]), 1, 0)

In [11]:
# Quarter-2 indicator: 1 for sales in April through June, else 0.
holdout['q2'] = np.where(holdout['month_sold'].isin([4, 5, 6]), 1, 0)

In [12]:
# Quarter-3 indicator: 1 for sales in July through September, else 0.
holdout['q3'] = np.where(holdout['month_sold'].isin([7, 8, 9]), 1, 0)

In [13]:
# Fetch the top-school coordinates from the project-local scraper module.
# NOTE(review): the cells below read `.shape[0]` and the 'lat' / 'long'
# columns of this object, so it is presumably a DataFrame with one row per
# school — confirm against scraper1.top_schools.
school_coordinates = scraper1.top_schools()


In [14]:
def get_distance(lat: float, long: float) -> float:
    """Return the distance in miles from a point to the nearest top school.

    The geodesic distance from ``(lat, long)`` to every entry of the
    module-level ``school_coordinates`` table is computed and the smallest
    one is returned.

    Args:
        lat: Latitude of the point.
        long: Longitude of the point.

    Returns:
        The minimum geodesic distance, in miles, to any listed school.

    Raises:
        ValueError: If ``school_coordinates`` has no rows.
    """
    home = (lat, long)
    # Walk the two coordinate columns directly instead of looking rows up
    # with ``.loc[i]``: a 0..n-1 counter is only a valid label when the
    # table happens to carry a default integer index.
    schools = zip(school_coordinates['lat'], school_coordinates['long'])
    return min(geodesic(home, school).miles for school in schools)

In [15]:
# Distance (in miles) from each house to its nearest top school.
# The coordinate columns are iterated directly rather than looking rows up
# with ``holdout.loc[i]`` for i in 0..n-1: the frame's index comes from a
# CSV column, so a positional counter is not guaranteed to be a valid label.
# Assigning the whole column at once also keeps row order without per-cell
# ``.at`` writes.
holdout['top_school_distance'] = [
    get_distance(lat, long)
    for lat, long in zip(holdout['lat'], holdout['long'])
]

In [16]:
# Flag houses whose nearest-top-school distance lies above the 75th
# percentile of that distance within this holdout set.
holdout['far_school'] = np.where(
    holdout['top_school_distance'].gt(
        holdout['top_school_distance'].quantile(q=.75)
    ),
    1,
    0,
)

In [17]:
# Feature columns passed to the model, in this order.
# NOTE(review): presumably this must match the columns the saved model was
# fitted on — confirm against the training notebook.
selected_columns = [
    'bedrooms',
    'bathrooms',
    'sqft_living',
    'sqft_lot',
    'floors',
    'waterfront',
    'view',
    'condition',
    'grade',
    'sqft_above',
    'sqft_basement',
    'yr_built',
    'yr_renovated',
    'zipcode',
    'lat',
    'sqft_living15',
    'sqft_lot15',
    'years_since_update',
    'basement',
    'q2',
    'top_school_distance',
    'far_school',
]

In [18]:
# Restrict the frame to the chosen feature columns, in the listed order.
holdout = holdout.loc[:, selected_columns]

## Loading Scaler and Model Pickle Files

In [19]:
# model_scaler = joblib.load('scaler1.pkl') 

In [20]:
# model_scaler

In [21]:
# holdout.shape

## Predict the holdout set

In [22]:
# Load the fitted model from disk and predict on the prepared holdout
# features.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code;
# only load model files from a trusted source.
joblib_file = "model1.pkl"  
final_model = joblib.load(joblib_file)
# `holdout` has been reduced to `selected_columns` by this point.
output = final_model.predict(holdout)

## Exporting predictions

In [24]:
# Write the predictions to a comma-delimited text file.
np.savetxt(
    fname="housing_preds_amelia_dahm.csv",
    X=output,
    delimiter=",",
)