In [1]:
import pandas as pd
import numpy as np
import math
from math import sin, cos, sqrt, atan2
from matplotlib import pyplot as plt
import numpy as np
from statsmodels.formula.api import ols
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.feature_selection import SelectKBest, f_regression,mutual_info_regression
from sklearn.feature_selection import RFECV
import warnings
import seaborn as sns
import pickle
from sklearn.preprocessing import StandardScaler
plt.style.use('seaborn')

warnings.filterwarnings("ignore")

pd.set_option('display.max_columns', 100)

## Step 1: Read in hold out data, scalers, and best model

In [2]:
holdout = pd.read_csv('kc_house_data_test_features.csv', index_col=0)

In [3]:
final_scaler = pickle.load(open("scaler.pickle", 'rb'))
final_model =pickle.load(open("model.pickle", 'rb'))

## Step 2: Feature Engineering for holdout set

Remember we have to perform the same transformations on our holdout data (feature engineering, extreme values, and scaling) that we performed on the original data.

In [4]:
#modifying Date fo make it more readable 
holdout['year'] = holdout['date'].apply(lambda x: int(x[0:4]))
holdout['month'] = holdout['date'].apply(lambda x: int(x[4:6]))

holdout['date'] = holdout['date'].apply(lambda x: x.strip('T000000'))
holdout['date'] = pd.to_datetime(holdout['date'], format='%Y%m%d')

#fixing outliers on bedrooms
holdout[holdout.bedrooms>7].bedrooms=holdout.bedrooms.median()
holdout[holdout.bathrooms>5].bathrooms=holdout.bathrooms.median()

In [5]:
holdout['coordinates'] = list(zip(holdout.lat, holdout.long))
holdout=holdout.drop(['lat','long'],axis=1)

In [6]:
#generated a feature that calculates the distance from city center to add more context to our location columns.
from geopy.distance import geodesic
seattle = (47.60646230355264, -122.33451886696908)
holdout['dis_from_city']=holdout['coordinates'].apply(lambda x: round(geodesic(seattle, x ).miles,2))
holdout.drop(["coordinates"],axis=1,inplace=True)

In [7]:
#binned grade to organize the rating of a house and separated into dummies in reference of how high it was scored.
holdout.grade=pd.cut(holdout.grade,bins=3,labels=[1,2,3])

In [8]:
#creating bins for the year it was built.
for x in range(len(holdout.yr_built)):
    holdout.yr_built[x]=(math.floor(holdout.yr_built[x]/10))*10

In [9]:
#used zipcode to create a column that knows if a zipcode is an expensive neighborhood or not.
high_income_zip=[98072, 98065, 98052, 98005, 8177, 98121, 98199, 98110, 98006, 98053, 98101, 98033, 98077, 98074, 98075, 98112, 98164, 98004, 98040, 98039]
holdout['high_income_zip']=0
for x in range(len(holdout.zipcode)):
    if holdout.zipcode[x] in high_income_zip:
        holdout.high_income_zip[x]=1
    else:
        holdout.high_income_zip[x]=0

In [10]:
#decided to create dummies for grade and condition to better expalin their categorical values
dummies = pd.get_dummies(holdout['grade'],prefix='grade')
dummies[list(dummies.columns)[:2]]
holdout=pd.concat([holdout, dummies[list(dummies.columns)[:2]]], axis=1, sort=False)
holdout=holdout.drop('grade',axis=1)

holdout = pd.concat([holdout, pd.get_dummies(holdout['condition'],drop_first = True,prefix = 'condition')], 1)

In [11]:
#created a binary variable that veryfies if a property has been resold or not. 
resold=holdout.id.value_counts(sort=True).to_frame()
res=list(holdout[holdout.id.isin(list(resold[resold.id>1].index))].index)
holdout['resold']=0
for x in res:
    holdout.resold[x]=1

In [12]:
#created a column that shows the age of a house since it was renovated.
conditions=[
    holdout.yr_renovated==0,
    holdout.yr_renovated!=0,
]
choises=[
    2020-holdout.yr_built,
    (2020 - holdout['yr_renovated']) 
]
holdout['age']=np.select(conditions,choises)

In [13]:
#creating polinomials to try to make a more linear relation for the features.
holdout['sqft_lot15_log'] = np.log(holdout['sqft_lot15'])
holdout['dis_from_city_log'] = np.log(holdout['dis_from_city'])
holdout['sqft_basement^2'] = holdout['sqft_basement']**2

In [14]:
#creating interactions with features that I think have relevance with eachother.
holdout['sqft_living_waterfront'] = holdout['sqft_living']*holdout['waterfront']
holdout['sqft_lot_high_income_zip'] = holdout['sqft_lot']*holdout['high_income_zip']
holdout['sqft_above_waterfront'] = holdout['sqft_above']*holdout['floors']
holdout['yr_built_sqrft_living'] = holdout['yr_built']*holdout['sqft_living']

In [15]:
#dropping unneeded columns
holdout.drop('date',axis=1,inplace=True)

In [16]:
transformed_holdout =pd.DataFrame(data=final_scaler.fit_transform(holdout),columns=holdout.columns)

## Step 3: Predict the holdout set

In [17]:
final_answers = final_model.predict(transformed_holdout)

In [18]:
transformed_holdout.head()

Unnamed: 0,id,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,sqft_living15,sqft_lot15,year,month,dis_from_city,high_income_zip,grade_1,grade_2,condition_2,condition_3,condition_4,condition_5,resold,age,sqft_lot15_log,dis_from_city_log,sqft_basement^2,sqft_living_waterfront,sqft_lot_high_income_zip,sqft_above_waterfront,yr_built_sqrft_living
0,-0.907908,0.676412,0.227557,0.089644,-0.030766,-1.260349,-0.080742,-0.282951,-0.447707,-0.442861,1.182544,-0.677785,-0.182263,-0.858916,-0.016435,0.004705,-0.686228,0.454802,-0.442625,-0.525332,-0.095413,0.340019,-0.077786,0.519952,-0.427508,-0.232112,9.750214,0.771499,0.661453,-0.133677,0.564231,-0.074374,-0.23013,-0.912553,0.059934
1,-0.907908,0.676412,0.227557,0.089644,-0.030766,-1.260349,-0.080742,-0.282951,-0.447707,-0.442861,1.182544,-0.677785,-0.182263,-0.858916,-0.016435,0.004705,1.457242,-1.46554,-0.442625,-0.525332,-0.095413,0.340019,-0.077786,0.519952,-0.427508,-0.232112,9.750214,0.771499,0.661453,-0.133677,0.564231,-0.074374,-0.23013,-0.912553,0.059934
2,-0.340884,-0.427449,0.227557,-0.73795,-0.269995,0.445769,-0.080742,-0.282951,-0.447707,-0.867374,0.154414,0.628239,-0.182263,-0.957015,-0.772049,-0.336093,-0.686228,1.414973,0.706433,-0.525332,-0.095413,0.340019,-0.077786,0.519952,-0.427508,-0.232112,-0.102562,-0.587293,-1.474408,0.810348,-0.243687,-0.074374,-0.23013,-0.556481,-0.718953
3,-0.97753,-0.427449,-0.731556,-0.934504,0.084898,-1.260349,-0.080742,-0.282951,-0.447707,-1.012602,0.007539,-0.351279,-0.182263,-0.015263,-1.197941,-0.008173,-0.686228,1.73503,0.593696,1.903557,-0.095413,0.340019,-0.077786,0.519952,-0.427508,-0.232112,-0.102562,0.431801,0.625185,0.741212,-0.305834,-0.074374,1.094555,-1.14537,-0.934886
4,0.17154,0.676412,0.547262,0.66896,-0.113799,0.445769,-0.080742,-0.282951,-0.447707,0.998248,-0.604444,0.628239,-0.182263,-0.36842,1.096379,-0.104809,1.457242,-1.785596,0.136963,-0.525332,-0.095413,0.340019,-0.077786,0.519952,-0.427508,-0.232112,-0.102562,-0.587293,0.306646,0.420103,-0.421436,-0.074374,-0.23013,0.968238,0.684818


## Step 4: Export your predictions

In [19]:
final_answers.to_csv('housing_preds_your_ignacio_ruiz.csv')

AttributeError: 'numpy.ndarray' object has no attribute 'to_csv'

In [24]:
final_answers=pd.DataFrame(final_answers)
final_answers.columns=['Predictions']
final_answers

Unnamed: 0,Predictions
0,1.490088e+06
1,1.565114e+06
2,1.291365e+06
3,1.800853e+06
4,1.443989e+06
...,...
4318,1.359922e+06
4319,1.597282e+06
4320,1.831134e+06
4321,1.418061e+06
