In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from math import sqrt

import env
import acquire
import prepare
import model

import explore
import evaluate

import warnings
warnings.filterwarnings("ignore")

import sklearn as sk
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import SelectKBest, f_regression, RFE
from sklearn.metrics import mean_squared_error

In [15]:
# Load the Zillow 2017 data through the project's acquire module; per the
# message printed below, this run was served from a local CSV.
df = acquire.zillow_2017_data()

Reading from local CSV...


In [19]:
# Run the project's first-pass preparation step on the acquired frame.
# NOTE(review): `df` is rebound to the prepared result, so the raw data is no
# longer reachable; re-running this cell on its own feeds already-prepared data
# back into prep_zillow_1 -- confirm the helper tolerates that, or re-run from
# the acquire cell.
df = prepare.prep_zillow_1(df)

In [27]:
# Summary statistics (count, mean, std, quartiles, min/max) for the sqft column.
df['sqft'].describe()

count    44476.000000
mean      1696.054231
std        604.900917
min        152.000000
25%       1240.000000
50%       1570.000000
75%       2052.000000
max       3566.000000
Name: sqft, dtype: float64

For the first iteration of the model, we will use only bedrooms, bathrooms, and sqft to estimate tax value. Other features may be added in later iterations; for now, we drop them:

In [4]:
# Keep only the first-iteration modelling columns: the three features plus the
# target. Selecting the columns to keep (rather than dropping 'tax_amount',
# 'fips' and 'age') makes this cell safe to re-run -- a second drop() would
# raise KeyError because the columns are already gone -- and stops any column
# later added in prepare from silently becoming a model feature.
df = df[['bedrooms', 'bathrooms', 'sqft', 'tax_value']]

In [5]:
# Preview the first three rows to confirm which columns remain for modelling.
df.head(3)

Unnamed: 0,bedrooms,bathrooms,sqft,tax_value
0,4.0,3.5,3100.0,1023282.0
1,2.0,1.0,1465.0,464000.0
2,3.0,2.0,1243.0,564778.0


In [6]:
# Split the prepared frame into three samples with the project's helper.
# NOTE(review): the unpacking order (train, test, validate) must match the
# helper's return order; the sizes printed below (train > validate > test) are
# consistent with that, but confirm against prepare.train_test_validate_split.
train, test, validate = prepare.train_test_validate_split(df)

train	 n = 24906
test	 n = 8896
validate n = 10674


In [7]:
# Name of the column being predicted.
target = 'tax_value'
# Every other column is a model input: mask out the target and keep the
# remaining labels in their original order.
features = df.columns[df.columns != target].tolist()

In [8]:
# Preview the training sample; the index values shown are the original row
# labels carried over from df.
train.head(3)

Unnamed: 0,bedrooms,bathrooms,sqft,tax_value
26479,3.0,2.0,1431.0,564363.0
36845,3.0,2.0,1424.0,206362.0
3289,2.0,1.0,1300.0,93981.0


In [9]:
# Separate each sample into its feature matrix (x_*) and target series (y_*).
x_train, y_train = train[features], train[target]
x_validate, y_validate = validate[features], validate[target]
x_test, y_test = test[features], test[target]

In [10]:
# establish a baseline prediction
# With return_results_df=True the project helper hands back two values: a
# results frame for the training sample (the output further down shows it with
# 'actual', 'baseline_mean' and 'RMSE_baseline' columns) and `baseline`.
# NOTE(review): `baseline` is presumably the baseline prediction value --
# confirm against model.determine_regression_baseline.
train_results, baseline = model.determine_regression_baseline(train, target, return_results_df=True)

In [12]:
# Create the model object and fit it to the training sample.
# `normalize=True` has been dropped: the argument was deprecated in
# scikit-learn 1.0 and removed in 1.2, where passing it raises a TypeError.
# Plain least-squares predictions do not depend on feature scaling, so the
# results are unchanged apart from floating-point rounding.
linreg = LinearRegression().fit(x_train, y_train)

# make predictions for the training sample
train_results['train_pred'] = linreg.predict(x_train)
# evaluate: train RMSE (a single scalar, broadcast to every row of the column)
train_results['rmse_train'] = sqrt(mean_squared_error(y_train, train_results.train_pred))

validate_results = pd.DataFrame(index=y_validate.index)
# add baseline predictions: the *training* mean, so nothing leaks from validate
validate_results['baseline'] = y_train.mean()
# make predictions for the validate sample
validate_results['validate_pred'] = linreg.predict(x_validate)
# evaluate: validate RMSE (scalar broadcast to every row, as above)
validate_results['rmse_validate'] = sqrt(mean_squared_error(y_validate, validate_results.validate_pred))


In [13]:
# One row is enough here: the RMSE columns hold the same scalar on every row.
train_results.head(1)

Unnamed: 0,actual,baseline_mean,RMSE_baseline,train_pred,rmse_train
26479,564363.0,364268.411909,232700.731416,324582.245505,208334.804066


In [14]:
# One row is enough here: 'baseline' and 'rmse_validate' are scalars repeated
# on every row; only 'validate_pred' varies.
validate_results.head(1)

Unnamed: 0,baseline,validate_pred,rmse_validate
3451,364268.411909,403529.770071,211104.144454
