In [14]:
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
import numpy as np 
import pandas as pd
pd.set_option('display.max_columns', 300)
import seaborn as sns
from datetime import datetime
import matplotlib.pyplot as plt
%matplotlib inline

## Step 1: Read in hold out data, scalers, and best model

In [15]:
holdout = pd.read_csv('kc_house_data_test_features.csv', index_col=0)

In [16]:
holdout['bedrooms'] = np.where(holdout.bedrooms <= 10, 10, holdout.bedrooms)

In [17]:
import pickle
from pickle import load

In [18]:
scaler = pd.read_pickle('scaler.pkl')
model = pd.read_pickle('model.pkl')

In [19]:
# final_scaler = read_pickle(filename)
# final_model = read_pickle(filename)

## Step 2: Feature Engineering for holdout set

Remember we have to perform the same transformations on our holdout data (feature engineering, extreme values and scaling) that we performed on the original data.  

In [20]:
df_features= holdout[['date', 'bedrooms', 'bathrooms', 'sqft_living', 'waterfront', 'condition', 'zipcode', 'yr_built', 'yr_renovated', 'floors']]

In [21]:
df_features['date'] = pd.DatetimeIndex(df_features['date']).year

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [22]:
df_features['year']= np.where(df_features['yr_renovated']== 0, df_features['yr_built'], df_features['yr_renovated'])
df_features['yearsold']= 2016 - df_features['year']
df_features = pd.concat([df_features, pd.get_dummies(df_features['zipcode'])], 1)
df_features.drop(columns= ['yr_built','yr_renovated'], inplace= True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [23]:
df_features.head()

Unnamed: 0,date,bedrooms,bathrooms,sqft_living,waterfront,condition,zipcode,floors,year,yearsold,98001,98002,98003,98004,98005,98006,98007,98008,98010,98011,98014,98019,98022,98023,98024,98027,98028,98029,98030,98031,98032,98033,98034,98038,98039,98040,98042,98045,98052,98053,98055,98056,98058,98059,98065,98070,98072,98074,98075,98077,98092,98102,98103,98105,98106,98107,98108,98109,98112,98115,98116,98117,98118,98119,98122,98125,98126,98133,98136,98144,98146,98148,98155,98166,98168,98177,98178,98188,98198,98199
0,2014,10,2.5,2270,0,3,98034,1.0,1967,49,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,2015,10,2.5,2270,0,3,98034,1.0,1967,49,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,2014,10,2.5,1470,0,3,98029,2.0,2005,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,2014,10,1.75,1280,0,3,98077,1.0,1976,40,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,2015,10,2.75,2830,0,3,98059,2.0,2005,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [24]:
# transformed_holdout = final_scaler(holdout)

In [25]:
df_features_scaled = scaler.transform(df_features)

In [26]:
df_features_scaled

array([[ 2.01400000e+03,  1.00000000e+01,  2.50000000e+00, ...,
        -1.41266088e-17,  2.08046420e-17,  1.05307447e-17],
       [ 2.01500000e+03,  1.00000000e+01,  2.50000000e+00, ...,
        -1.41266088e-17,  2.08046420e-17,  1.05307447e-17],
       [ 2.01400000e+03,  1.00000000e+01,  2.50000000e+00, ...,
        -1.41266088e-17,  2.08046420e-17,  1.05307447e-17],
       ...,
       [ 2.01400000e+03,  1.00000000e+01,  7.50000000e-01, ...,
        -1.41266088e-17,  2.08046420e-17,  1.05307447e-17],
       [ 2.01500000e+03,  1.00000000e+01,  2.50000000e+00, ...,
        -1.41266088e-17,  2.08046420e-17,  1.05307447e-17],
       [ 2.01400000e+03,  1.00000000e+01,  7.50000000e-01, ...,
        -1.41266088e-17,  2.08046420e-17,  1.05307447e-17]])

## Step 3: Predict the holdout set

In [27]:
# final_answers = final_model.predict(transformed_holdout)

In [28]:
yhat = model.predict(df_features_scaled)

In [29]:
yhat

array([-5.42180712e+19, -5.42180712e+19, -5.42150655e+19, ...,
       -5.42767324e+19, -5.42141383e+19, -5.42767324e+19])

## Step 4: Export your predictions

In [None]:
# final_answer.to_csv('housing_preds_groupinitials')

In [18]:
np.savetxt('housing_preds_ABTH.csv', yhat, delimiter=",")