# Holdout Set

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn import linear_model
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.feature_selection import f_regression, mutual_info_regression, RFECV
import pickle
# Let pandas display up to 300 columns when rendering a DataFrame, so wide frames are not truncated.
pd.set_option('display.max_columns', 300)

### Reading Holdout Set

In [3]:
# Load the holdout features; the first CSV column is used as the row index.
holdout_csv_path = 'kc_house_data_test_features.csv'
holdout = pd.read_csv(holdout_csv_path, index_col=0)
# Preview the first five rows as the cell output.
holdout.head(5)

Unnamed: 0,id,date,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,1974300020,20140827T000000,4,2.5,2270,11500,1.0,0,0,3,8,1540,730,1967,0,98034,47.7089,-122.241,2020,10918
1,1974300020,20150218T000000,4,2.5,2270,11500,1.0,0,0,3,8,1540,730,1967,0,98034,47.7089,-122.241,2020,10918
2,3630020380,20141107T000000,3,2.5,1470,1779,2.0,0,0,3,8,1160,310,2005,0,98029,47.5472,-121.998,1470,1576
3,1771000290,20141203T000000,3,1.75,1280,16200,1.0,0,0,3,8,1030,250,1976,0,98077,47.7427,-122.071,1160,10565
4,5126310470,20150115T000000,4,2.75,2830,8126,2.0,0,0,3,8,2830,0,2005,0,98059,47.4863,-122.14,2830,7916


### Loading the Model from the Pickle File

In [4]:
# Load the object stored in 'model.pickle'; it is used below as the prediction model.
# The context manager guarantees the file handle is closed even if unpickling raises.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file —
# only load a 'model.pickle' that comes from a trusted source.
with open('model.pickle', 'rb') as model_file:
    final_model = pickle.load(model_file)

### Cleaning and Feature Engineering for Holdout Set

In [5]:
# Replace the raw 'date' column with its parsed datetime values.
holdout = holdout.assign(date=pd.to_datetime(holdout['date']))

In [6]:
# Split the sale date into separate day / month / year columns.
# The conversion is done once and its .dt accessor reused for all three parts.
sale_date_parts = pd.to_datetime(holdout['date']).dt
holdout['day_sold'] = sale_date_parts.day
holdout['month_sold'] = sale_date_parts.month
holdout['year_sold'] = sale_date_parts.year

In [7]:
# Remove the 'id' and 'date' columns from the holdout frame.
holdout = holdout.drop(['id', 'date'], axis=1)

In [8]:
# Label each row by whether it has any basement area (sqft_basement > 0).
base_dict = {False: 'No Basement', True: 'Basement'}
has_basement_area = holdout['sqft_basement'].gt(0)
holdout['basement'] = has_basement_area.map(base_dict)

In [9]:
# Difference between each row's own living / lot area and the matching '*15' column.
holdout['diff_living_15'] = holdout['sqft_living'].sub(holdout['sqft_living15'])
holdout['diff_lot_15'] = holdout['sqft_lot'].sub(holdout['sqft_lot15'])

In [10]:
# Combined footprint: living area plus lot area.
holdout['total_sqft'] = holdout['sqft_living'].add(holdout['sqft_lot'])

In [11]:
# Encode the basement indicator as integers: 1 = has a basement, 0 = no basement.
# It is built directly from sqft_basement (the same `> 0` rule the text labels use)
# with an explicit integer cast. Series.replace from strings to ints left the
# resulting dtype to pandas' implicit downcasting, which is deprecated. This form
# yields the same 0/1 values and stays correct if the cell is re-run.
holdout['basement'] = holdout['sqft_basement'].gt(0).astype('int64')

In [12]:
# One-hot encode the categorical columns, dropping the first level of each.
# NOTE(review): the levels (and so the dummy columns and the dropped baseline) are
# derived from the holdout data alone — confirm they match the columns the model
# was fit on.
categorical_columns = ['grade', 'condition', 'waterfront', 'basement', 'zipcode']
holdout_dummy = pd.get_dummies(holdout, columns=categorical_columns, drop_first=True)

In [13]:
# Flag sales with month_sold strictly between 3 and 9 (April through August) as 1,
# all other months as 0.
# The boolean mask is cast explicitly: replace([True, False], [1, 0]) relied on
# pandas' implicit dtype downcasting in replace, which is deprecated.
in_top_months = holdout_dummy['month_sold'].gt(3) & holdout_dummy['month_sold'].lt(9)
holdout_dummy['top_5_months'] = in_top_months.astype('int64')

In [14]:
# Years between the sale and the construction year.
holdout_dummy['house_age'] = holdout_dummy['year_sold'] - holdout_dummy['yr_built']
# Gap between the sale year and the recorded renovation year.
holdout_dummy['yrrenovation_yrsold'] = holdout_dummy['year_sold'] - holdout_dummy['yr_renovated']
# has_renovation is 1 unless that gap is exactly 2014 or 2015.
# NOTE(review): presumably yr_renovated == 0 marks "never renovated", which makes the
# gap equal the sale year (2014 / 2015 in the previewed rows) — confirm.
# The mask is cast explicitly: replace([True, False], [1, 0]) relied on pandas'
# implicit dtype downcasting in replace, which is deprecated.
was_renovated = ~holdout_dummy['yrrenovation_yrsold'].isin([2014, 2015])
holdout_dummy['has_renovation'] = was_renovated.astype('int64')

In [15]:
# Squared terms for the two area features.
holdout_dummy['sqft_living^2'] = holdout_dummy['sqft_living'].pow(2)
holdout_dummy['total_sqft^2'] = holdout_dummy['total_sqft'].pow(2)
# Interaction between house age and the sale-to-renovation gap.
holdout_dummy['house_age*yrrenovation_yrsold'] = holdout_dummy['house_age'].mul(
    holdout_dummy['yrrenovation_yrsold']
)

In [16]:
# Pairwise interaction terms.
holdout_dummy['top_5_months*sqft_living'] = holdout_dummy['top_5_months'].mul(holdout_dummy['sqft_living'])
holdout_dummy['house_age*has_renovation'] = holdout_dummy['house_age'].mul(holdout_dummy['has_renovation'])
# 'basement_1' is the dummy column produced by get_dummies for basement == 1.
holdout_dummy['basement_1*total_sqft'] = holdout_dummy['basement_1'].mul(holdout_dummy['total_sqft'])

# Ratio features.
# NOTE(review): despite its name, 'sqft_living_to_lot' is computed as lot / living —
# presumably this matches how the feature was built for training; confirm before changing.
holdout_dummy['sqft_living_to_lot'] = holdout_dummy['sqft_lot'].div(holdout_dummy['sqft_living'])
living_vs_living15 = holdout_dummy['sqft_living'].div(holdout_dummy['sqft_living15'])
holdout_dummy['comparing_living_living15^2'] = living_vs_living15.pow(2)
lot_vs_lot15 = holdout_dummy['sqft_lot'].div(holdout_dummy['sqft_lot15'])
holdout_dummy['comparing_lot_lot15^2'] = lot_vs_lot15.pow(2)

In [17]:
# Remove the raw coordinate columns from the feature frame.
holdout_dummy = holdout_dummy.drop(['lat', 'long'], axis=1)

### Predicting on Holdout Set

In [18]:
# Run the unpickled model on the engineered holdout features.
# NOTE(review): assumes holdout_dummy has exactly the columns (names and order) the
# model was fit on — this cannot be verified from this notebook; confirm against
# the training pipeline.
final_answers = final_model.predict(holdout_dummy)

### Creating CSV file for Predictions

In [23]:
# Wrap the predictions in a DataFrame and write them to CSV with pandas'
# default index and header.
predictions_frame = pd.DataFrame(final_answers)
predictions_frame.to_csv('housing_prediction_billylopez.csv')